From de3a8cb56136c2c02b343203a320b20495401ee3 Mon Sep 17 00:00:00 2001 From: Richard Rogers Date: Fri, 18 Oct 2024 04:37:13 +0000 Subject: [PATCH 01/41] Initial Polars support --- python/Makefile | 2 +- python/poetry.lock | 35 +- python/pyproject.toml | 12 +- python/tests/api/logger/test_logger.py | 5 + python/tests/api/logger/test_logger_polars.py | 341 ++++++++++++++++++ python/whylogs/api/logger/__init__.py | 14 +- python/whylogs/api/logger/logger.py | 21 +- python/whylogs/api/logger/rolling.py | 3 +- .../whylogs/api/logger/segment_processing.py | 8 +- python/whylogs/api/logger/transient.py | 3 +- .../api/whylabs/session/notebook_logger.py | 4 +- python/whylogs/core/dataframe_wrapper.py | 23 ++ python/whylogs/core/dataset_profile.py | 34 +- python/whylogs/core/datatypes.py | 15 +- python/whylogs/core/input_resolver.py | 19 +- python/whylogs/core/metrics/unicode_range.py | 1 + python/whylogs/core/preprocessing.py | 89 ++++- python/whylogs/core/schema.py | 21 +- python/whylogs/core/stubs.py | 30 +- 19 files changed, 610 insertions(+), 70 deletions(-) create mode 100644 python/tests/api/logger/test_logger_polars.py create mode 100644 python/whylogs/core/dataframe_wrapper.py diff --git a/python/Makefile b/python/Makefile index f89a9b07ff..a469be14dc 100644 --- a/python/Makefile +++ b/python/Makefile @@ -165,7 +165,7 @@ telemetry-opt-out: ## create opt out file install: ## Install all dependencies with poetry. @$(call i, Installing dependencies) - poetry install -E "viz s3 spark mlflow image fugue gcs embeddings proc" + poetry install -E "viz s3 spark mlflow image fugue gcs embeddings proc polars" coverage: ## Generate test coverage reports. @$(call i, Generating test coverage) diff --git a/python/poetry.lock b/python/poetry.lock index 0eedaeee12..3fbcbf4409 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -2890,6 +2890,38 @@ importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "polars" +version = "0.18.4" +description = "Blazingly fast DataFrame library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "polars-0.18.4-cp37-abi3-macosx_10_7_x86_64.whl", hash = "sha256:3adfd39f84387f8589735e5c57f466c7ba19812140bc64248b9602755915c52f"}, + {file = "polars-0.18.4-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:5658f9751d93451549ecf429eb6486b203a86130132310c520cd1336d15ca258"}, + {file = "polars-0.18.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4bbc04db1d765f7cad287204a014e8e10bb2245f1910e26cd99964333e3682c6"}, + {file = "polars-0.18.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9117544d86542954588e295127f3892c15e09db04c474a0d8d830735154a54c"}, + {file = "polars-0.18.4-cp37-abi3-win_amd64.whl", hash = "sha256:a033ee71d8fde63ac71c7579230d31372cdaddf1df4227a537d96b91a58abd29"}, + {file = "polars-0.18.4.tar.gz", hash = "sha256:136d8cdbf3c1ec33ab577536ac35a10701ec3dfd21b54cb757ee9b0e0f525a85"}, +] + +[package.dependencies] +typing_extensions = {version = ">=4.0.1", markers = "python_version < \"3.8\""} + +[package.extras] +all = ["polars[connectorx,deltalake,fsspec,matplotlib,numpy,pandas,pyarrow,sqlalchemy,timezone,xlsx2csv,xlsxwriter]"] +connectorx = ["connectorx"] +deltalake = ["deltalake (>=0.8.0)"] +fsspec = ["fsspec"] +matplotlib = ["matplotlib"] +numpy = ["numpy (>=1.16.0)"] +pandas = ["pandas", "pyarrow (>=7.0.0)"] +pyarrow = ["pyarrow (>=7.0.0)"] +sqlalchemy = ["pandas", "sqlalchemy"] +timezone = ["backports.zoneinfo", "tzdata"] +xlsx2csv = ["xlsx2csv (>=0.8.0)"] +xlsxwriter = ["xlsxwriter"] + [[package]] name = "pre-commit" version = "2.20.0" @@ -4808,7 +4840,7 @@ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] [extras] -all = ["Pillow", "Pillow", "boto3", "faster-fifo", "fugue", "google-cloud-storage", "ipython", "mlflow-skinny", "mlflow-skinny", "numpy", "numpy", "orjson", "pandas", "pyarrow", "pybars3", "pyspark", "scikit-learn", "scikit-learn", "scipy", "scipy"] +all = ["Pillow", "Pillow", "boto3", "faster-fifo", "fugue", "google-cloud-storage", "ipython", "mlflow-skinny", "mlflow-skinny", "numpy", "numpy", "orjson", "pandas", "polars", "pyarrow", "pybars3", "pyspark", "scikit-learn", "scikit-learn", "scipy", "scipy"] datasets = ["pandas"] docs = ["furo", "ipython_genutils", "myst-parser", "nbconvert", "nbsphinx", "sphinx", "sphinx-autoapi", "sphinx-autobuild", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"] embeddings = ["numpy", "numpy", "scikit-learn", "scikit-learn"] @@ -4816,6 +4848,7 @@ fugue = ["fugue"] gcs = ["google-cloud-storage"] image = ["Pillow", "Pillow", "numpy", "numpy"] mlflow = ["databricks-cli", "mlflow-skinny", "mlflow-skinny"] +polars = ["polars"] proc = ["faster-fifo", "orjson", "pandas"] proc-mp = ["orjson", "pandas"] s3 = ["boto3"] diff --git a/python/pyproject.toml b/python/pyproject.toml index 6b182b615d..624f9cdc02 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -34,7 +34,11 @@ numpy = [ ] # datasets module. -pandas = { version = "*", optional = true} +pandas = { version = "*", optional = true } + +# Polars module. +polars = { version = ">=1.8.2", optional = true } +# TODO: do we want polars-u64-idx ? # Doc dependencies sphinx = { version = "*", optional = true } @@ -110,6 +114,10 @@ spark = [ datasets = [ "pandas", ] +polars = [ + "polars", +] + gcs = [ "google-cloud-storage", ] @@ -148,6 +156,7 @@ all = [ "boto3", "google-cloud-storage", "pandas", + "polars", "pyarrow", "pyspark", "ipython", @@ -177,6 +186,7 @@ mypy-protobuf = ">=3.2.0" types-protobuf = ">=0.1.14" pandas = "*" pandas-stubs = "*" +polars = "*" ipykernel = ">=6.11" # for developing in Jupyter notebook types-python-dateutil = "^2.8.12" moto = ">4.2" diff --git a/python/tests/api/logger/test_logger.py b/python/tests/api/logger/test_logger.py index f2d6a8d6d1..d75e303755 100644 --- a/python/tests/api/logger/test_logger.py +++ b/python/tests/api/logger/test_logger.py @@ -15,6 +15,9 @@ from whylogs.core.resolvers import Resolver from whylogs.core.schema import DatasetSchema +pd.set_option("display.max_columns", None) +pd.set_option("display.max_rows", None) + FLOAT_TYPES = [float, np.float16, np.float32, np.float64, np.floating, np.float_, np.longdouble] INTEGER_TYPES = [int, np.intc, np.uintc, np.int_, np.uint, np.longlong, np.ulonglong] DATETIME_TYPES = [np.datetime64, pd.Timestamp] @@ -28,6 +31,8 @@ def test_basic_log_schema() -> None: results = logger.log(df, schema=DatasetSchema()) profile = results.profile() assert profile._columns["col1"]._schema.dtype == np.int64 + print(profile.view().to_pandas()) + assert False def test_basic_log_schem_constructor() -> None: diff --git a/python/tests/api/logger/test_logger_polars.py b/python/tests/api/logger/test_logger_polars.py new file mode 100644 index 0000000000..53e3cab635 --- /dev/null +++ b/python/tests/api/logger/test_logger_polars.py @@ -0,0 +1,341 @@ +import os +import tempfile +from typing import Any + +import numpy as np +import pandas as pd +import polars as pl +import pytest + +import whylogs as why +from whylogs.api.logger import write +from whylogs.api.logger.result_set import ResultSet, ResultSetReader +from whylogs.core import ColumnProfileView, MetricConfig +from whylogs.core.errors import LoggingError +from whylogs.core.metrics import StandardMetric +from whylogs.core.resolvers import Resolver +from whylogs.core.schema import DatasetSchema + +''' +FLOAT_TYPES = [float, np.float16, np.float32, np.float64, np.floating, np.float_, np.longdouble] +INTEGER_TYPES = [int, np.intc, np.uintc, np.int_, np.uint, np.longlong, np.ulonglong] +DATETIME_TYPES = [np.datetime64, pd.Timestamp] +TIMEDELTA_TYPES = ["timedelta64[s]", "timedelta64[ms]"] +''' + +pd.set_option("display.max_columns", None) +pd.set_option("display.max_rows", None) + + +def test_basic_log_schema() -> None: + d = {"col1": [1, 2]} + df = pl.DataFrame(data=d) + logger = why.logger() + results = logger.log(df, schema=DatasetSchema()) + profile = results.profile() + assert profile._columns["col1"]._schema.dtype == pl.Int64 + +def test_basic_log_schem_constructor() -> None: + d = {"col1": [1, 2]} + df = pl.DataFrame(data=d) + logger = why.logger(schema=DatasetSchema()) + results = logger.log(df) + profile = results.profile() + assert profile._columns["col1"]._schema.dtype == pl.Int64 + + +def test_basic_log() -> None: + d = {"col1": [1, 2], "col2": [3.0, 4.0], "col3": ["a", "b"]} + df = pl.DataFrame(data=d) + + results = why.log(df) + + profile = results.profile() + + assert profile._columns["col1"]._schema.dtype == pl.Int64 + assert profile._columns["col2"]._schema.dtype == pl.Float64 + assert profile._columns["col3"]._schema.dtype == pl.Utf8 + + +def test_log_nothing_raises_error() -> None: + with pytest.raises(LoggingError): + why.log() + + +def test_basic_log_row() -> None: + d = {"col1": [1, 2], "col2": [3.0, 4.0], "col3": ["a", "b"]} + + results = why.log(row=d) + + profile = results.profile() + + assert profile._columns["col1"]._schema.dtype == list + assert profile._columns["col2"]._schema.dtype == list + assert profile._columns["col3"]._schema.dtype == list + + +def test_basic_log_dict_of_lists() -> None: + d = {"col1": [np.int64(1), np.int64(2)], "col2": [3.0, 4.0], "col3": ["a", "b"]} + + results = why.log(d) + + profile = results.profile() + + assert profile._columns["col1"]._schema.dtype == list + assert profile._columns["col2"]._schema.dtype == list + assert profile._columns["col3"]._schema.dtype == list + + +def test_basic_log_dictionary() -> None: + d = {"a": 1.0, "b": 2.0} + + results = why.log(d) + + profile = results.profile() + + assert profile._columns["a"]._schema.dtype == float + assert profile._columns["b"]._schema.dtype == float + + +def test_lending_club(lending_club_df: pl.DataFrame) -> None: + res = why.log(lending_club_df) + view = res.view() + df = view.to_pandas() + assert len(df) == 151 + + +@pytest.mark.skip("type not supported yet") +def test_categorical_dtype() -> None: + data = {"can_fly": [0, 1, 0, 0], "habitat": ["forest", "forest", "river", "river"]} + + df = pl.DataFrame(data) + df["can_fly"] = df["can_fly"].astype("category") + df["habitat"] = df["habitat"].astype("category") + + results = why.log(df) + view = results.view() + metrics = view.get_column("can_fly").get_metric("counts").to_summary_dict() + assert metrics["n"] == 4 + + +def test_roundtrip_resultset(tmp_path: Any) -> None: + d = {"col1": [1, 2], "col2": [3.0, 4.0], "col3": ["a", "b"]} + df = pl.DataFrame(data=d) + + results = why.log(df) + status, path = results.writer("local", base_name="profile.bin").option(base_dir=tmp_path).write() + assert status + roundtrip_result_set = why.read(path) + assert len(results.view().to_pandas()) == len(roundtrip_result_set.view().to_pandas()) + + +def test_profile_write(tmp_path: Any) -> None: + d = {"col1": [1, 2], "col2": [3.0, 4.0], "col3": ["a", "b"]} + df = pl.DataFrame(data=d) + results = why.log(df) + profile = results.profile() + write(profile, tmp_path, "test1_profile.bin") + assert os.path.isfile(os.path.join(tmp_path, "test1_profile.bin")) + path = os.path.join(tmp_path, "test2_profile.bin") + write(profile, path) + assert os.path.isfile(path) + + +''' +@pytest.mark.parametrize("data_type", [*INTEGER_TYPES, *FLOAT_TYPES, *TIMEDELTA_TYPES]) +def test_different_integer_types(data_type) -> None: + d = {"col1": [1, 3, 2, 5]} + df = pl.DataFrame(d, dtype=data_type) + results = why.log(df) + view = results.view() + + assert isinstance(view._columns["col1"], ColumnProfileView) + assert view._columns.get("col1")._failure_count == 0 + assert view._columns.get("col1")._success_count > 0 + + view_pandas = view.to_pandas() + assert len(view_pandas) == 1 + assert len(view_pandas.columns) > 0 +''' + + +def test_counters_dataframe_vs_row() -> None: + d = {"a": 1, "b": 2.0, "c": ["foo", "bar"]} + df = pl.DataFrame(d) + + df_results = why.log(df) + row_results = why.log(d) + + df_view = df_results.view() + row_view = row_results.view() + + view_pandas = df_view.to_pandas() + assert len(view_pandas) == 3 + assert len(view_pandas.columns) > 0 + + view_row_pandas = row_view.to_pandas() + assert len(view_row_pandas) == 3 + assert len(view_row_pandas.columns) > 0 + + +@pytest.mark.parametrize( + "input", + [{"a": ["x", "y"]}, {"a": []}], # non-numeric list -> object # tensors require positive shape in every dimension +) +def test_object_count_dict(input) -> None: + row_results = why.log(input) + row_view = row_results.view() + assert row_view._columns.get("a")._success_count == 2 + assert row_view._columns.get("a")._metrics.get("types").object.value == 1 + + +@pytest.mark.parametrize( + "input,stub_np,ints,reals,bools,strs,tensors,objs", + [ + ({"a": 1}, False, 1, 0, 0, 0, 0, 0), + ({"a": 1.0}, False, 0, 1, 0, 0, 0, 0), + ({"a": True}, False, 0, 0, 1, 0, 0, 0), + ({"a": "foo"}, False, 0, 0, 0, 1, 0, 0), + ({"a": [1, 2]}, False, 0, 0, 0, 0, 1, 0), + ({"a": [[1, 2], [3, 4]]}, False, 0, 0, 0, 0, 1, 0), + ({"a": [[1, 2.5], [3.14, 4]]}, False, 0, 0, 0, 0, 1, 0), + ({"a": [[1, 2], ["x", "y"]]}, False, 0, 0, 0, 0, 0, 1), + ({"a": np.asarray([1, 2])}, False, 0, 0, 0, 0, 1, 0), + ({"a": np.asarray([[1, 2], [3, 4]])}, False, 0, 0, 0, 0, 1, 0), + ({"a": np.asarray([[1, 2.5], [3.14, 4]])}, False, 0, 0, 0, 0, 1, 0), + ({"a": np.asarray([[1, 2], ["x", "y"]])}, False, 0, 0, 0, 0, 0, 1), + ({"a": []}, False, 0, 0, 0, 0, 0, 1), + ({"a": 1}, True, 1, 0, 0, 0, 0, 0), + ({"a": 1.0}, True, 0, 1, 0, 0, 0, 0), + ({"a": True}, True, 0, 0, 1, 0, 0, 0), + ({"a": "foo"}, True, 0, 0, 0, 1, 0, 0), + ({"a": [1, 2]}, True, 0, 0, 0, 0, 0, 1), + ({"a": [[1, 2], [3, 4]]}, True, 0, 0, 0, 0, 0, 1), + ({"a": [[1, 2.5], [3.14, 4]]}, True, 0, 0, 0, 0, 0, 1), + ({"a": [[1, 2], ["x", "y"]]}, True, 0, 0, 0, 0, 0, 1), + ({"a": np.asarray([1, 2])}, True, 0, 0, 0, 0, 0, 1), + ({"a": np.asarray([[1, 2], [3, 4]])}, True, 0, 0, 0, 0, 0, 1), + ({"a": np.asarray([[1, 2.5], [3.14, 4]])}, True, 0, 0, 0, 0, 0, 1), + ({"a": np.asarray([[1, 2], ["x", "y"]])}, True, 0, 0, 0, 0, 0, 1), + ({"a": []}, True, 0, 0, 0, 0, 0, 1), + ], +) +def test_type_count_dict(input, stub_np, ints, reals, bools, strs, tensors, objs, monkeypatch) -> None: + monkeypatch.setattr("whylogs.core.preprocessing.is_not_stub", lambda x: (not stub_np)) + row_results = why.log(input) + row_view = row_results.view() + assert row_view._columns.get("a")._metrics.get("types").integral.value == ints + assert row_view._columns.get("a")._metrics.get("types").fractional.value == reals + assert row_view._columns.get("a")._metrics.get("types").boolean.value == bools + assert row_view._columns.get("a")._metrics.get("types").string.value == strs + assert row_view._columns.get("a")._metrics.get("types").tensor.value == tensors + assert row_view._columns.get("a")._metrics.get("types").object.value == objs + + +def test_bool_count(): + data = { + "animal": ["cat", "hawk", "snake", "cat"], + "fly": [False, True, False, False], + "legs": [4, 2, 0, 4], + } + + df = pl.DataFrame(data) + + results = why.log(polars=df) + prof_view = results.profile().view() + assert prof_view._columns.get("fly")._metrics.get("types").boolean.value == 4 + assert prof_view._columns.get("fly")._metrics.get("types").integral.value == 0 + + +def test_unicode_range_enabled() -> None: + strings = { + "words": ["1", "12", "123", "1234a", "abc", "abc123", "I😍emoticons"], + } # TODO: follow and create ranges for common emoji like ❤️ /u+fe0f + data = pl.DataFrame(strings) + digit_counts = [1, 2, 3, 4, 0, 3, 0] + latin_counts = [1, 2, 3, 5, 3, 6, 10] + emoticon_counts = [0, 0, 0, 0, 0, 0, 1] + configured_schema = DatasetSchema(default_configs=MetricConfig(track_unicode_ranges=True)) + prof_view = why.log(data, schema=configured_schema).view() + assert "words" in prof_view.get_columns() + column_profile = prof_view.get_column("words") + assert "unicode_range" in column_profile.get_metric_names() + metric = column_profile.get_metric("unicode_range") + + assert "digits" in metric.submetrics + assert "basic-latin" in metric.submetrics + assert "emoticon" in metric.submetrics + + assert metric.submetrics["digits"]["distribution"].mean.value == np.array(digit_counts).mean() + assert metric.submetrics["emoticon"]["distribution"].mean.value == np.array(emoticon_counts).mean() + assert metric.submetrics["basic-latin"]["distribution"].mean.value == np.array(latin_counts).mean() + + +def test_unicode_range_default_config_off() -> None: + strings = { + "words": ["1", "12", "123", "1234a", "abc", "abc123", "I😍emoticon"], + } + data = pl.DataFrame(strings) + + prof_view = why.log(data).view() + assert "words" in prof_view.get_columns() + column_profile = prof_view.get_column("words") + assert "unicode_range" not in column_profile.get_metric_names() + + +def test_frequent_items() -> None: + strings = { + "words": ["1", "12", "123"], + } + data = pl.DataFrame(strings) + + prof_view = why.log(data).view() + assert "words" in prof_view.get_columns() + column_profile = prof_view.get_column("words") + assert "frequent_items" in column_profile.get_metric_names() + + +def test_frequent_items_disabled() -> None: + strings = { + "words": ["1", "12", "123"], + } + data = pl.DataFrame(strings) + configured_schema = DatasetSchema(default_configs=MetricConfig(fi_disabled=True)) + + prof_view = why.log(data, schema=configured_schema).view() + assert "words" in prof_view.get_columns() + column_profile = prof_view.get_column("words") + assert "frequent_items" not in column_profile.get_metric_names() + + +def test_custom_resolver() -> None: + class CustomResolver(Resolver): + """Resolver that keeps distribution metrics for Fractional and frequent items for Integral, and counters and types metrics for all data types.""" + + def resolve(self, name: str, why_type, column_schema): + metrics = [] + if name == "col1": + metrics.append(StandardMetric.counts) + result = {} + for m in metrics: + result[m.name] = m.zero(column_schema.cfg) + return result + + d = {"col1": [3.0, 4.0, 5.0]} + df = pl.DataFrame(data=d) + prof_view = why.log(df, schema=DatasetSchema(resolvers=CustomResolver())).profile().view() + + assert prof_view.get_column("col1").get_metric("counts").n.value == 3 + assert not prof_view.get_column("col1").get_metric("distribution") + + +def test_result_set_reader(profile_view): + with tempfile.NamedTemporaryFile() as tmp_file: + success, path = profile_view.write(file=tmp_file) + assert success + tmp_file.flush() + tmp_file.seek(0) + reader = why.reader(name="local") + results = reader.read(path=path) + assert isinstance(reader, ResultSetReader) + assert isinstance(results, ResultSet) diff --git a/python/whylogs/api/logger/__init__.py b/python/whylogs/api/logger/__init__.py index 15f666173c..da9e67a7be 100644 --- a/python/whylogs/api/logger/__init__.py +++ b/python/whylogs/api/logger/__init__.py @@ -30,7 +30,7 @@ from whylogs.core.model_performance_metrics.model_performance_metrics import ( ModelPerformanceMetrics, ) -from whylogs.core.stubs import pd +from whylogs.core.stubs import pd, pl from whylogs.core.utils import deprecated_argument diagnostic_logger = logging.getLogger(__name__) @@ -43,6 +43,7 @@ def log( obj: Any = None, *, pandas: Optional[pd.DataFrame] = None, + polars: Optional[pl.DataFrame] = None, row: Optional[Dict[str, Any]] = None, schema: Optional[DatasetSchema] = None, name: Optional[str] = None, @@ -68,11 +69,18 @@ def log( return result_set else: result_set = TransientLogger(schema=schema).log( - obj, pandas=pandas, row=row, name=name, trace_id=trace_id, tags=tags, segment_key_values=segment_key_values + obj, + pandas=pandas, + polars=polars, + row=row, + name=name, + trace_id=trace_id, + tags=tags, + segment_key_values=segment_key_values ) if dataset_timestamp is not None: result_set.set_dataset_timestamp(dataset_timestamp) - notebook_session_log(result_set, obj, pandas=pandas, row=row, name=name) + notebook_session_log(result_set, obj, pandas=pandas, polars=polars, row=row, name=name) if debug_event is not None: if trace_id is None and WHYLABS_TRACE_ID_KEY in result_set.metadata: diff --git a/python/whylogs/api/logger/logger.py b/python/whylogs/api/logger/logger.py index 52605c581d..39e08770cd 100644 --- a/python/whylogs/api/logger/logger.py +++ b/python/whylogs/api/logger/logger.py @@ -15,13 +15,14 @@ from whylogs.api.store import ProfileStore from whylogs.api.writer import Writer, Writers from whylogs.core import DatasetProfile, DatasetSchema +from whylogs.core.dataframe_wrapper import DataFrameWrapper from whylogs.core.errors import LoggingError -from whylogs.core.input_resolver import _pandas_or_dict +from whylogs.core.input_resolver import _dataframe_or_dict from whylogs.core.metadata import ( _populate_common_profile_metadata, _safe_merge_metadata, ) -from whylogs.core.stubs import pd +from whylogs.core.stubs import pd, pl logger = logging.getLogger(__name__) @@ -70,7 +71,7 @@ def _get_matching_profiles( self, obj: Any = None, *, - pandas: Optional[pd.DataFrame] = None, + dataframe: Optional[DataFrameWrapper] = None, row: Optional[Dict[str, Any]] = None, schema: Optional[DatasetSchema] = None, ) -> List[DatasetProfile]: @@ -81,6 +82,7 @@ def log( obj: Any = None, *, pandas: Optional[pd.DataFrame] = None, + polars: Optional[pl.DataFrame] = None, row: Optional[Dict[str, Any]] = None, schema: Optional[DatasetSchema] = None, timestamp_ms: Optional[int] = None, # Not the dataset timestamp, but the timestamp of the data @@ -97,7 +99,7 @@ def log( """ if self._is_closed: raise LoggingError("Cannot log to a closed logger") - if obj is None and pandas is None and row is None: + if obj is None and pandas is None and polars is None and row is None: # TODO: check for shell environment and emit more verbose error string to let user know how to correct. raise LoggingError("log() was called without passing in any input!") @@ -106,10 +108,11 @@ def log( self._metadata = dict() self._metadata["name"] = name active_schema = schema or self._schema + dataframe, row = _dataframe_or_dict(obj, pandas, polars, row) if active_schema: - pandas, row = _pandas_or_dict(obj, pandas, row) - obj = None - pandas, row = active_schema._run_udfs(pandas, row) + dataframe, row = active_schema._run_udfs(dataframe, row) + obj = None + # If segments are defined use segment_processing to return a SegmentedResultSet if active_schema and active_schema.segments: @@ -126,10 +129,10 @@ def log( _safe_merge_metadata(default_metadata=segmented_results.metadata, incoming_metadata=active_schema.metadata) return segmented_results - profiles = self._get_matching_profiles(obj, pandas=pandas, row=row, schema=active_schema) + profiles = self._get_matching_profiles(obj, dataframe=dataframe, row=row, schema=active_schema) for prof in profiles: - prof.track(obj, pandas=pandas, row=row, execute_udfs=False) + prof.track(obj, dataframe=dataframe, row=row, execute_udfs=False) prof._metadata = _populate_common_profile_metadata(prof._metadata, trace_id=trace_id, tags=tags) if active_schema: _safe_merge_metadata(prof._metadata, active_schema.metadata) diff --git a/python/whylogs/api/logger/rolling.py b/python/whylogs/api/logger/rolling.py index 1906683a5f..b57ba02337 100644 --- a/python/whylogs/api/logger/rolling.py +++ b/python/whylogs/api/logger/rolling.py @@ -14,6 +14,7 @@ from whylogs.api.logger.segment_cache import SegmentCache from whylogs.api.writer import Writer from whylogs.core import DatasetProfile, DatasetProfileView, DatasetSchema +from whylogs.core.dataframe_wrapper import DataFrameWrapper from whylogs.core.stubs import pd from whylogs.core.view.segmented_dataset_profile_view import SegmentedDatasetProfileView @@ -147,7 +148,7 @@ def _get_matching_profiles( self, obj: Any = None, *, - pandas: Optional[pd.DataFrame] = None, + dataframe: Optional[DataFrameWrapper] = None, row: Optional[Dict[str, Any]] = None, schema: Optional[DatasetSchema] = None, ) -> List[DatasetProfile]: diff --git a/python/whylogs/api/logger/segment_processing.py b/python/whylogs/api/logger/segment_processing.py index f4bd30dfbc..8e71204b1b 100644 --- a/python/whylogs/api/logger/segment_processing.py +++ b/python/whylogs/api/logger/segment_processing.py @@ -7,14 +7,15 @@ from whylogs.api.logger.segment_cache import SegmentCache from whylogs.core import DatasetSchema from whylogs.core.dataset_profile import DatasetProfile -from whylogs.core.input_resolver import _pandas_or_dict +from whylogs.core.dataframe_wrapper import DataFrameWrapper +from whylogs.core.input_resolver import _dataframe_or_dict from whylogs.core.segment import Segment from whylogs.core.segmentation_partition import ( ColumnMapperFunction, SegmentationPartition, SegmentFilter, ) -from whylogs.core.stubs import pd +from whylogs.core.stubs import pd, pl logger = logging.getLogger(__name__) @@ -136,7 +137,8 @@ def _log_segment( segment_key_values: Optional[Dict[str, str]] = None, ) -> Dict[Segment, Any]: segments: Dict[Segment, Any] = {} - pandas, row = _pandas_or_dict(obj, pandas, row) + dataframe, row = _dataframe_or_dict(obj, pandas, row=row) + pandas = dataframe.pd_df if dataframe else pandas if partition.filter: pandas, row = _filter_inputs(partition.filter, pandas, row) if partition.simple: diff --git a/python/whylogs/api/logger/transient.py b/python/whylogs/api/logger/transient.py index 850dbc6687..f118e349ed 100644 --- a/python/whylogs/api/logger/transient.py +++ b/python/whylogs/api/logger/transient.py @@ -2,6 +2,7 @@ from whylogs.api.logger.logger import Logger from whylogs.core import DatasetProfile, DatasetSchema +from whylogs.core.dataframe_wrapper import DataFrameWrapper from whylogs.core.stubs import pd @@ -13,7 +14,7 @@ def _get_matching_profiles( self, obj: Any = None, *, - pandas: Optional[pd.DataFrame] = None, + dataframe: Optional[DataFrameWrapper] = None, row: Optional[Dict[str, Any]] = None, schema: Optional[DatasetSchema] = None, ) -> List[DatasetProfile]: diff --git a/python/whylogs/api/whylabs/session/notebook_logger.py b/python/whylogs/api/whylabs/session/notebook_logger.py index 7ba93ad9db..0ef87a7b12 100644 --- a/python/whylogs/api/whylabs/session/notebook_logger.py +++ b/python/whylogs/api/whylabs/session/notebook_logger.py @@ -6,7 +6,8 @@ from whylogs.api.whylabs.session.session_manager import get_current_session from whylogs.api.whylabs.session.session_types import InteractiveLogger as il from whylogs.api.whylabs.session.session_types import SessionType -from whylogs.core.stubs import pd +from whylogs.core.dataframe_wrapper import DataFrameWrapper +from whylogs.core.stubs import pd, pl def notebook_session_log_comparison( @@ -61,6 +62,7 @@ def notebook_session_log( obj: Any = None, *, pandas: Optional[pd.DataFrame] = None, + polars: Optional[pl.DataFrame] = None, row: Optional[Dict[str, Any]] = None, name: Optional[str] = None, ) -> None: diff --git a/python/whylogs/core/dataframe_wrapper.py b/python/whylogs/core/dataframe_wrapper.py new file mode 100644 index 0000000000..32f31792a0 --- /dev/null +++ b/python/whylogs/core/dataframe_wrapper.py @@ -0,0 +1,23 @@ +from typing import List, Optional, Union + +from whylogs.core.stubs import pd, pl + + +class DataFrameWrapper: + def __init__(self, pandas: Optional[pd.DataFrame]=None, polars: Optional[pl.DataFrame]=None): + if pandas is not None and polars is not None: + raise ValueError("Cannot pass both pandas and polars params") + if pandas is None and polars is None: + raise ValueError("Must pass either pandas or polars") + + self.pd_df = pandas + self.pl_df = polars + + self.column_names = list(pandas.columns) if pandas is not None else polars.columns + self.dtypes = pandas.dtypes if pandas is not None else polars.schema + self.empty = pandas.empty if pandas is not None else len(polars) == 0 + + def get(self, column: str) -> Optional[Union[pd.Series, pl.Series]]: + if self.pd_df is not None: + return self.pd_df.get(column) + return self.pl_df[column] if column in self.pl_df.schema else None diff --git a/python/whylogs/core/dataset_profile.py b/python/whylogs/core/dataset_profile.py index ccf609cd64..85ba7e1d7b 100644 --- a/python/whylogs/core/dataset_profile.py +++ b/python/whylogs/core/dataset_profile.py @@ -12,9 +12,10 @@ from whylogs.core.utils.utils import deprecated, deprecated_alias, ensure_timezone from .column_profile import ColumnProfile -from .input_resolver import _pandas_or_dict +from .dataframe_wrapper import DataFrameWrapper +from .input_resolver import _dataframe_or_dict from .schema import DatasetSchema -from .stubs import pd +from .stubs import pd, pl from .view import DatasetProfileView logger = logging.getLogger(__name__) @@ -109,13 +110,18 @@ def track( obj: Any = None, *, pandas: Optional[pd.DataFrame] = None, + polars: Optional[pl.DataFrame] = None, + dataframe: Optional[DataFrameWrapper] = None, row: Optional[Mapping[str, Any]] = None, execute_udfs: bool = True, ) -> None: + if dataframe is None: + dataframe, row = _dataframe_or_dict(obj, pandas, polars, row) + try: self._is_active = True self._track_count += 1 - self._do_track(obj, pandas=pandas, row=row, execute_udfs=execute_udfs) + self._do_track(obj, dataframe=dataframe, row=row, execute_udfs=execute_udfs) finally: self._is_active = False @@ -123,18 +129,17 @@ def _do_track( self, obj: Any = None, *, - pandas: Optional[pd.DataFrame] = None, + dataframe: Optional[DataFrameWrapper] = None, row: Optional[Mapping[str, Any]] = None, execute_udfs: bool = True, ) -> None: - pandas, row = _pandas_or_dict(obj, pandas, row) if execute_udfs: - pandas, row = self._schema._run_udfs(pandas, row) + dataframe, row = self._schema._run_udfs(dataframe, row) col_id: Optional[str] = getattr(self._schema.default_configs, "identity_column", None) # TODO: do this less frequently when operating at row level - dirty = self._schema.resolve(pandas=pandas, row=row) + dirty = self._schema.resolve(dataframe=dataframe, row=row) if dirty: schema_col_keys = self._schema.get_col_names() new_cols = (col for col in schema_col_keys if col not in self._columns) @@ -146,23 +151,24 @@ def _do_track( self._columns[k]._track_datum(row[k], row_id) return - elif pandas is not None: + elif dataframe is not None: # TODO: iterating over each column in order assumes single column metrics # but if we instead iterate over a new artifact contained in dataset profile: "MetricProfiles", then # each metric profile can specify which columns its tracks, and we can call like this: # metric_profile.track(pandas) - if pandas.empty: - logger.warning("whylogs was passed an empty pandas DataFrame so nothing to profile in this call.") + if dataframe.empty: + logger.warning("whylogs was passed an empty DataFrame so nothing to profile in this call.") return - for k in pandas.keys(): - column_values = pandas.get(k) + for k in dataframe.column_names: + column_values = dataframe.get(k) if column_values is None: logger.error( - f"whylogs was passed a pandas DataFrame with key [{k}] but DataFrame.get({k}) returned nothing!" + f"whylogs was passed a DataFrame with key [{k}] but DataFrame.get({k}) returned nothing!" ) return dtype = self._schema.types.get(k) + # TODO: support Polars homogeneous columns? homogeneous = ( dtype is not None and isinstance(dtype, tuple) @@ -171,7 +177,7 @@ def _do_track( and dtype[1] == ColumnProperties.homogeneous ) - id_values = pandas.get(col_id) if col_id else None + id_values = dataframe.get(col_id) if col_id else None if col_id is not None and id_values is None: logger.warning(f"identity column was passed as {col_id} but column was not found in the dataframe.") diff --git a/python/whylogs/core/datatypes.py b/python/whylogs/core/datatypes.py index 9941043e10..5755be771a 100644 --- a/python/whylogs/core/datatypes.py +++ b/python/whylogs/core/datatypes.py @@ -2,7 +2,7 @@ from decimal import Decimal from typing import Any, Generic, List, Optional, Type, TypeVar, Union -from whylogs.core.stubs import is_not_stub, np +from whylogs.core.stubs import is_not_stub, np, pl try: from pandas.core.api import CategoricalDtype @@ -51,10 +51,13 @@ def _do_match(cls, dtype_or_type: Any, maybe_type: Optional[Any]) -> bool: if maybe_type: dtype_or_type = maybe_type # type: ignore + if issubclass(type(dtype_or_type), pl.datatypes.IntegerType): + return True + if not isinstance(dtype_or_type, type): return False - if issubclass(dtype_or_type, (bool, int, np.number, np.bool_)): + if issubclass(dtype_or_type, (bool, int, np.number, np.bool_, pl.datatypes.IntegerType)): if is_not_stub(np.issubdtype) and np.issubdtype(dtype_or_type, np.floating): return False if issubclass(dtype_or_type, (np.datetime64, np.timedelta64)): @@ -73,6 +76,9 @@ def _do_match(cls, dtype_or_type: Any, maybe_type: Optional[Any]) -> bool: if maybe_type: dtype_or_type = maybe_type + if issubclass(type(dtype_or_type), (pl.Float32, pl.Float64)): + return True + if not isinstance(dtype_or_type, type): return False @@ -85,6 +91,9 @@ def __init__(self) -> None: @classmethod def _do_match(cls, dtype_or_type: Any, maybe_type: Optional[Any]) -> bool: + if issubclass(type(dtype_or_type), (pl.String, pl.Utf8)): + return True + # Pandas Categorical is Strings if CategoricalDtype is not None and isinstance(dtype_or_type, CategoricalDtype): return True @@ -101,7 +110,7 @@ def _do_match(cls, dtype_or_type: Any, maybe_type: Optional[Any]) -> bool: if not isinstance(dtype_or_type, type): return False - if issubclass(dtype_or_type, (str, np.unicode_)): + if issubclass(dtype_or_type, (str, np.unicode_, pl.String, pl.Utf8)): return True return False diff --git a/python/whylogs/core/input_resolver.py b/python/whylogs/core/input_resolver.py index d6ace01d44..99c9145560 100644 --- a/python/whylogs/core/input_resolver.py +++ b/python/whylogs/core/input_resolver.py @@ -1,14 +1,17 @@ from typing import Any, Dict, Mapping, Optional, Tuple -from whylogs.core.stubs import pd +from whylogs.core.dataframe_wrapper import DataFrameWrapper +from whylogs.core.stubs import pd, pl -def _pandas_or_dict( - obj: Any, pandas: Optional[pd.DataFrame] = None, row: Optional[Mapping[str, Any]] = None -) -> Tuple[Optional[pd.DataFrame], Optional[Mapping[str, Any]]]: +def _dataframe_or_dict( + obj: Any, pandas: Optional[pd.DataFrame] = None, polars: Optional[pl.DataFrame] = None, row: Optional[Mapping[str, Any]] = None +) -> Tuple[Optional[DataFrameWrapper], Optional[Mapping[str, Any]]]: if obj is not None: if pandas is not None: raise ValueError("Cannot pass both obj and pandas params") + if polars is not None: + raise ValueError("Cannot pass both obj and polars params") if row is not None: raise ValueError("Cannot pass both obj and row params") @@ -16,8 +19,14 @@ def _pandas_or_dict( row = obj elif pd.DataFrame is not None and isinstance(obj, pd.DataFrame): pandas = obj + elif pl.DataFrame is not None and isinstance(obj, pl.DataFrame): + polars = obj if pandas is not None and row is not None: raise ValueError("Cannot pass both pandas and row params") - return (pandas, row) + if polars is not None and row is not None: + raise ValueError("Cannot pass both polars and row params") + + df = DataFrameWrapper(pandas, polars) if (pandas is not None or polars is not None) else None + return (df, row) diff --git a/python/whylogs/core/metrics/unicode_range.py b/python/whylogs/core/metrics/unicode_range.py index 49302dccf3..f819fb8658 100644 --- a/python/whylogs/core/metrics/unicode_range.py +++ b/python/whylogs/core/metrics/unicode_range.py @@ -79,6 +79,7 @@ def columnar_update(self, view: PreprocessedColumn) -> OperationResult: view.pandas.strings.to_list() if view.pandas.strings is not None and not view.pandas.strings.empty else [] ) data = (data + view.list.strings) if view.list.strings else data + data = (data + view.numpy.strings.tolist()) if view.numpy.strings is not None else data range_data: Dict[str, List[int]] = {range_name: [] for range_name in self.range_definitions.keys()} lengths: List[int] = [] for value in data: diff --git a/python/whylogs/core/preprocessing.py b/python/whylogs/core/preprocessing.py index 02defa287d..e4651476ef 100644 --- a/python/whylogs/core/preprocessing.py +++ b/python/whylogs/core/preprocessing.py @@ -6,7 +6,7 @@ from math import isinf, isnan from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union -from whylogs.core.stubs import is_not_stub, np, pd +from whylogs.core.stubs import is_not_stub, np, pd, pl logger = logging.getLogger("whylogs.core.views") @@ -170,7 +170,7 @@ def _pandas_split(self, series: pd.Series, parse_numeric_string: bool = False) - int_mask = non_null_series.apply(lambda x: pdc.is_number(x) and pdc.is_integer(x) and not pdc.is_bool(x)) str_mask = non_null_series.apply(lambda x: isinstance(x, str)) tensor_mask = non_null_series.apply( - lambda x: isinstance(x, (list, np.ndarray)) and PreprocessedColumn._is_tensorable(x) + lambda x: isinstance(x, (list, np.ndarray)) and _is_tensorable(x) ) floats = non_null_series[float_mask] @@ -199,6 +199,54 @@ def _pandas_split(self, series: pd.Series, parse_numeric_string: bool = False) - self.bool_count = bool_count self.bool_count_where_true = bool_count_where_true + def _polars_split(self, series: pl.Series, parse_numeric_string: bool = False) -> None: + """ + Split a Polars Series into numpy array and other Polars series. + + Args: + series: the original Pandas series + parse_numeric_string: if set, this will coerce values into integer using pands.to_numeric() method. + + Returns: + SplitSeries with multiple values, including numpy arrays for numbers, and strings as a Polars Series. + """ + + # TODO: add a PolarsView, or convert PandasView to work with both Polars & Pandas + + if series is None: + return None + if pl.Series is None: + return None + + non_null_series = series.drop_nulls() + if non_null_series.len() < 1: + return + + self.null_count = series.null_count() + if series.dtype.is_numeric(): + if series.dtype.is_float(): + non_nan_series = non_null_series.drop_nans() + self.nan_count = non_null_series.len() - non_nan_series.len() + self.inf_count = non_null_series.is_infinite().sum() + self.numpy.floats = non_null_series.to_numpy() + return + else: + self.numpy.ints = non_null_series.to_numpy() + return + + if series.dtype in {pl.String, pl.Categorical, pl.Enum, pl.Utf8}: + self.numpy.strings = non_null_series.to_numpy() + return + + # TODO: tensor support + + if series.dtype == pl.Boolean: + self.bool_count = non_null_series.len() + self.bool_count_where_true = non_null_series.sum() + return + + self.list.objs = non_null_series.to_list() + def raw_iterator(self) -> Iterator[Any]: iterables = [ *self.numpy.iterables(), @@ -234,9 +282,9 @@ def _process_scalar_value(value: Any) -> "PreprocessedColumn": float_list.append(value) elif isinstance(value, str): string_list.append(value) - elif isinstance(value, list) and PreprocessedColumn._is_tensorable(value): + elif isinstance(value, list) and _is_tensorable(value): tensor_list.append(np.asarray(value)) - elif is_not_stub(np.ndarray) and PreprocessedColumn._is_tensorable(value): + elif is_not_stub(np.ndarray) and _is_tensorable(value): tensor_list.append(value) elif value is not None: obj_list.append(value) @@ -292,7 +340,7 @@ def _process_homogeneous_column(series: pd.Series) -> "PreprocessedColumn": result.bool_count = series.count() result.bool_count_where_true = series[bool_mask_where_true].count() return result - elif isinstance(value, (list, np.ndarray)) and PreprocessedColumn._is_tensorable(value): + elif isinstance(value, (list, np.ndarray)) and _is_tensorable(value): if isinstance(value, np.ndarray): result.pandas.tensors = series else: @@ -312,6 +360,11 @@ def apply(data: Any) -> "PreprocessedColumn": result.len = len(data) return result + if pl.Series is not None and isinstance(data, pl.Series): + result._polars_split(data) + result.len = len(data) + return result + if isinstance(data, np.ndarray): result.len = len(data) if issubclass(data.dtype.type, (np.number, np.str_)): @@ -343,9 +396,9 @@ def apply(data: Any) -> "PreprocessedColumn": float_list.append(x) elif isinstance(x, str): string_list.append(x) - elif isinstance(x, list) and PreprocessedColumn._is_tensorable(x): + elif isinstance(x, list) and _is_tensorable(x): tensor_list.append(np.asarray(x)) - elif isinstance(x, np.ndarray) and PreprocessedColumn._is_tensorable(x): + elif isinstance(x, np.ndarray) and _is_tensorable(x): tensor_list.append(x) elif x is not None: obj_list.append(x) @@ -378,14 +431,14 @@ def apply(data: Any) -> "PreprocessedColumn": list_format = [data] return PreprocessedColumn.apply(list_format) - @staticmethod - def _is_tensorable(value: Union[np.ndarray, List[Any]]) -> bool: - if not is_not_stub(np.ndarray): - return False - - maybe_tensor = value if isinstance(value, np.ndarray) else np.asarray(value) - return ( - len(maybe_tensor.shape) > 0 - and all([i > 0 for i in maybe_tensor.shape]) - and np.issubdtype(maybe_tensor.dtype, np.number) - ) + +def _is_tensorable(value: Union[np.ndarray, List[Any]]) -> bool: + if not is_not_stub(np.ndarray): + return False + + maybe_tensor = value if isinstance(value, np.ndarray) else np.asarray(value) + return ( + len(maybe_tensor.shape) > 0 + and all([i > 0 for i in maybe_tensor.shape]) + and np.issubdtype(maybe_tensor.dtype, np.number) + ) diff --git a/python/whylogs/core/schema.py b/python/whylogs/core/schema.py index 7cef397bd9..cbb43a3bcc 100644 --- a/python/whylogs/core/schema.py +++ b/python/whylogs/core/schema.py @@ -4,6 +4,7 @@ from typing import Any, Dict, List, Mapping, Optional, Set, Tuple, TypeVar, Union import whylogs.core.resolvers as res +from whylogs.core.dataframe_wrapper import DataFrameWrapper from whylogs.core.datatypes import StandardTypeMapper, TypeMapper from whylogs.core.metrics.metrics import Metric, MetricConfig from whylogs.core.resolvers import ( @@ -13,7 +14,7 @@ ResolverSpec, ) from whylogs.core.segmentation_partition import SegmentationPartition -from whylogs.core.stubs import pd +from whylogs.core.stubs import pd, pl from whylogs.core.validators.validator import Validator, deepcopy_validators logger = logging.getLogger(__name__) @@ -131,10 +132,14 @@ def resolve( self, *, pandas: Optional[pd.DataFrame] = None, + polars: Optional[pl.DataFrame] = None, + dataframe: Optional[DataFrameWrapper] = None, row: Optional[Mapping[str, Any]] = None, ) -> bool: - if pandas is not None: - return self._resolve_pdf(pandas) + if dataframe: + return self._resolve_dataframe(dataframe) + if pandas is not None or polars is not None: + return self._resolve_dataframe(DataFrameWrapper(pandas, polars)) if row is not None: for k, v in row.items(): @@ -151,12 +156,12 @@ def resolve( return True raise NotImplementedError - def _resolve_pdf(self, df: pd.DataFrame, force_resolve: bool = False) -> bool: + def _resolve_dataframe(self, df: DataFrameWrapper, force_resolve: bool = False) -> bool: """ Resolve ColumnSchema from the dataframe. We only resolve newly detected columns unless `force_resolve` is set to True. """ - col_names = df.dtypes.keys() + col_names = df.column_names dirty = False for col_name in col_names: if not force_resolve and col_name in self._columns: @@ -176,9 +181,9 @@ def _resolve_pdf(self, df: pd.DataFrame, force_resolve: bool = False) -> bool: return dirty def _run_udfs( - self, pandas: Optional[pd.DataFrame] = None, row: Optional[Mapping[str, Any]] = None - ) -> Tuple[Optional[pd.DataFrame], Optional[Mapping[str, Any]]]: - return pandas, row + self, df: Optional[DataFrameWrapper] = None, row: Optional[Mapping[str, Any]] = None + ) -> Tuple[Optional[DataFrameWrapper], Optional[Mapping[str, Any]]]: + return df, row def get_col_names(self) -> tuple: return tuple(self._columns.keys()) diff --git a/python/whylogs/core/stubs.py b/python/whylogs/core/stubs.py index 629807cd3f..f7949c844c 100644 --- a/python/whylogs/core/stubs.py +++ b/python/whylogs/core/stubs.py @@ -9,6 +9,11 @@ except ImportError: # noqa _pd = None # type: ignore +try: + import polars as _pl +except ImportError: # noqa + _pl = None # type: ignore + try: import numpy as _np except ImportError: # noqa @@ -55,6 +60,25 @@ class PandasStub(object): DataFrame: type = _StubClass +@dataclass(frozen=True) +class _PolarsTypeStub: + IntegerType: type = _StubClass + + +@dataclass(frozen=True) +class PolarsStub(object): + Series: type = _StubClass + DataFrame: type = _StubClass + # TODO: support more Polars types? + datatypes: type = _PolarsTypeStub + Int32: type = _StubClass + Int64: type = _StubClass + Float32: type = _StubClass + Float64: type = _StubClass + String: type = _StubClass + Utf8: type = _StubClass + + @dataclass(frozen=True) class ScipyStub: sparse: type = _StubClass @@ -72,7 +96,7 @@ def is_not_stub(stubbed_class: Any) -> bool: if ( stubbed_class and stubbed_class is not _StubClass - and not isinstance(stubbed_class, (PandasStub, NumpyStub, ScipyStub, ScikitLearnStub)) + and not isinstance(stubbed_class, (PandasStub, PolarsStub, NumpyStub, ScipyStub, ScikitLearnStub)) ): return True return False @@ -84,6 +108,9 @@ def is_not_stub(stubbed_class: Any) -> bool: if _pd is None: _pd = PandasStub() +if _pl is None: + _pl = PolarsStub() + if _sp is None: _sp = ScipyStub() @@ -99,6 +126,7 @@ def is_not_stub(stubbed_class: Any) -> bool: np = _np pd = _pd +pl = _pl sp = _sp sklp = _sklp sklc = _sklc From 9d60022439524e0f6fc06f047b4c35d129debb90 Mon Sep 17 00:00:00 2001 From: Richard Rogers Date: Tue, 29 Oct 2024 17:19:25 +0000 Subject: [PATCH 02/41] segments and udfs --- python/tests/api/logger/test_logger.py | 5 - python/tests/api/logger/test_logger_polars.py | 19 +- .../tests/api/logger/test_segments_polars.py | 546 ++++++++++++++++++ .../tests/core/metrics/test_metrics_polars.py | 317 ++++++++++ python/tests/core/test_performance_polars.py | 215 +++++++ .../core/test_udf_schema_polars.py | 370 ++++++++++++ python/whylogs/api/logger/__init__.py | 2 +- .../experimental/logger/actor/data_logger.py | 1 + .../logger/actor/thread_rolling_logger.py | 8 +- python/whylogs/api/logger/logger.py | 4 +- python/whylogs/api/logger/rolling.py | 1 - .../whylogs/api/logger/segment_processing.py | 49 +- python/whylogs/api/logger/transient.py | 1 - .../api/whylabs/session/notebook_logger.py | 1 - python/whylogs/core/dataframe_wrapper.py | 147 ++++- python/whylogs/core/input_resolver.py | 5 +- python/whylogs/core/preprocessing.py | 4 +- .../whylogs/experimental/core/udf_schema.py | 86 +-- 18 files changed, 1687 insertions(+), 94 deletions(-) create mode 100644 python/tests/api/logger/test_segments_polars.py create mode 100644 python/tests/core/metrics/test_metrics_polars.py create mode 100644 python/tests/core/test_performance_polars.py create mode 100644 python/tests/experimental/core/test_udf_schema_polars.py diff --git a/python/tests/api/logger/test_logger.py b/python/tests/api/logger/test_logger.py index d75e303755..f2d6a8d6d1 100644 --- a/python/tests/api/logger/test_logger.py +++ b/python/tests/api/logger/test_logger.py @@ -15,9 +15,6 @@ from whylogs.core.resolvers import Resolver from whylogs.core.schema import DatasetSchema -pd.set_option("display.max_columns", None) -pd.set_option("display.max_rows", None) - FLOAT_TYPES = [float, np.float16, np.float32, np.float64, np.floating, np.float_, np.longdouble] INTEGER_TYPES = [int, np.intc, np.uintc, np.int_, np.uint, np.longlong, np.ulonglong] DATETIME_TYPES = [np.datetime64, pd.Timestamp] @@ -31,8 +28,6 @@ def test_basic_log_schema() -> None: results = logger.log(df, schema=DatasetSchema()) profile = results.profile() assert profile._columns["col1"]._schema.dtype == np.int64 - print(profile.view().to_pandas()) - assert False def test_basic_log_schem_constructor() -> None: diff --git a/python/tests/api/logger/test_logger_polars.py b/python/tests/api/logger/test_logger_polars.py index 53e3cab635..f8c56a8dda 100644 --- a/python/tests/api/logger/test_logger_polars.py +++ b/python/tests/api/logger/test_logger_polars.py @@ -3,7 +3,6 @@ from typing import Any import numpy as np -import pandas as pd import polars as pl import pytest @@ -16,15 +15,8 @@ from whylogs.core.resolvers import Resolver from whylogs.core.schema import DatasetSchema -''' -FLOAT_TYPES = [float, np.float16, np.float32, np.float64, np.floating, np.float_, np.longdouble] +FLOAT_TYPES = [float, np.float32, np.float64, np.float_] INTEGER_TYPES = [int, np.intc, np.uintc, np.int_, np.uint, np.longlong, np.ulonglong] -DATETIME_TYPES = [np.datetime64, pd.Timestamp] -TIMEDELTA_TYPES = ["timedelta64[s]", "timedelta64[ms]"] -''' - -pd.set_option("display.max_columns", None) -pd.set_option("display.max_rows", None) def test_basic_log_schema() -> None: @@ -35,6 +27,7 @@ def test_basic_log_schema() -> None: profile = results.profile() assert profile._columns["col1"]._schema.dtype == pl.Int64 + def test_basic_log_schem_constructor() -> None: d = {"col1": [1, 2]} df = pl.DataFrame(data=d) @@ -141,11 +134,10 @@ def test_profile_write(tmp_path: Any) -> None: assert os.path.isfile(path) -''' -@pytest.mark.parametrize("data_type", [*INTEGER_TYPES, *FLOAT_TYPES, *TIMEDELTA_TYPES]) +@pytest.mark.parametrize("data_type", [*INTEGER_TYPES, *FLOAT_TYPES]) def test_different_integer_types(data_type) -> None: - d = {"col1": [1, 3, 2, 5]} - df = pl.DataFrame(d, dtype=data_type) + d = {"col1": [data_type(x) for x in [1, 3, 2, 5]]} + df = pl.DataFrame(d) results = why.log(df) view = results.view() @@ -156,7 +148,6 @@ def test_different_integer_types(data_type) -> None: view_pandas = view.to_pandas() assert len(view_pandas) == 1 assert len(view_pandas.columns) > 0 -''' def test_counters_dataframe_vs_row() -> None: diff --git a/python/tests/api/logger/test_segments_polars.py b/python/tests/api/logger/test_segments_polars.py new file mode 100644 index 0000000000..da979cf5b3 --- /dev/null +++ b/python/tests/api/logger/test_segments_polars.py @@ -0,0 +1,546 @@ +import math +import os +import pickle +import tempfile +from glob import glob +from logging import getLogger +from typing import Any + +import numpy as np +import polars as pl +import pytest + +import whylogs as why +from whylogs.api.logger.result_set import ( + ProfileResultSet, + SegmentedResultSet, + ViewResultSet, +) +from whylogs.core.metrics.metrics import CardinalityMetric, DistributionMetric +from whylogs.core.schema import DatasetSchema +from whylogs.core.segment import Segment +from whylogs.core.segmentation_partition import ( + ColumnMapperFunction, + SegmentationPartition, + SegmentFilter, + segment_on_column, +) +from whylogs.core.view.dataset_profile_view import DatasetProfileView +from whylogs.migration.converters import read_v0_to_view + +TEST_LOGGER = getLogger(__name__) + + +def test_single_row_segment() -> None: + segment_column = "col3" + number_of_segments = 1 + + test_segments = segment_on_column("col3") + results: SegmentedResultSet = why.log( + {"col1": 1, "col2": 1.1, "col3": "x0"}, schema=DatasetSchema(segments=test_segments) + ) + assert results.count == number_of_segments + partitions = results.partitions + assert len(partitions) == 1 + partition = partitions[0] + segments = results.segments_in_partition(partition) + assert len(segments) == number_of_segments + + first_segment = next(iter(segments)) + assert first_segment.key == ("x0",) + first_segment_profile = results.profile(first_segment) + assert first_segment_profile is not None + assert first_segment_profile._columns["col1"]._schema.dtype == int + assert first_segment_profile._columns["col2"]._schema.dtype == float + assert first_segment_profile._columns["col3"]._schema.dtype == str + segment_cardinality: CardinalityMetric = ( + first_segment_profile.view().get_column(segment_column).get_metric("cardinality") + ) + cardinality = segment_cardinality.estimate + assert cardinality is not None + assert cardinality == 1.0 + + +def test_single_column_segment() -> None: + input_rows = 100 + segment_column = "col3" + number_of_segments = 5 + d = { + "col1": [i for i in range(input_rows)], + "col2": [i * i * 1.1 for i in range(input_rows)], + segment_column: [f"x{str(i%number_of_segments)}" for i in range(input_rows)], + } + + df = pl.DataFrame(d) + test_segments = segment_on_column("col3") + results: SegmentedResultSet = why.log(df, schema=DatasetSchema(segments=test_segments)) + assert results.count == number_of_segments + partitions = results.partitions + assert len(partitions) == 1 + partition = partitions[0] + segments = results.segments_in_partition(partition) + assert len(segments) == number_of_segments + + first_segment = next(iter(segments)) + first_segment_profile = results.profile(first_segment) + assert first_segment_profile is not None + assert first_segment_profile._columns["col1"]._schema.dtype == pl.Int64 #np.int64 + assert first_segment_profile._columns["col2"]._schema.dtype == pl.Float64 #np.float64 + assert first_segment_profile._columns["col3"]._schema.dtype == pl.String #.name == "object" + segment_cardinality: CardinalityMetric = ( + first_segment_profile.view().get_column(segment_column).get_metric("cardinality") + ) + cardinality = segment_cardinality.estimate + assert cardinality is not None + assert cardinality == 1.0 + + +def test_single_column_and_manual_segment() -> None: + input_rows = 100 + segment_column = "col3" + number_of_segments = 5 + d = { + "col1": [i for i in range(input_rows)], + "col2": [i * i * 1.1 for i in range(input_rows)], + segment_column: [f"x{str(i%number_of_segments)}" for i in range(input_rows)], + } + + df = pl.DataFrame(d) + test_segments = segment_on_column("col3") + results: SegmentedResultSet = why.log( + df, schema=DatasetSchema(segments=test_segments), segment_key_values={"zzz": "foo", "ver": 1} + ) + assert results.count == number_of_segments + partitions = results.partitions + assert len(partitions) == 1 + partition = partitions[0] + segments = results.segments_in_partition(partition) + assert len(segments) == number_of_segments + + first_segment = next(iter(segments)) + #assert first_segment.key == ("x0", "1", "foo") + first_segment_profile = results.profile(first_segment) + assert first_segment_profile is not None + assert first_segment_profile._columns["col1"]._schema.dtype == pl.Int64 #np.int64 + assert first_segment_profile._columns["col2"]._schema.dtype == pl.Float64 #np.float64 + assert first_segment_profile._columns["col3"]._schema.dtype == pl.String #.name == "object" + segment_cardinality: CardinalityMetric = ( + first_segment_profile.view().get_column(segment_column).get_metric("cardinality") + ) + cardinality = segment_cardinality.estimate + assert cardinality is not None + assert cardinality == 1.0 + + +def test_throw_on_duplicate_keys() -> None: + input_rows = 100 + segment_column = "col3" + number_of_segments = 5 + d = { + "col1": [i for i in range(input_rows)], + "col2": [i * i * 1.1 for i in range(input_rows)], + segment_column: [f"x{str(i%number_of_segments)}" for i in range(input_rows)], + } + + df = pl.DataFrame(d) + test_segments = segment_on_column("col3") + + with pytest.raises(ValueError): + why.log(df, schema=DatasetSchema(segments=test_segments), segment_key_values={segment_column: "foo"}) + + +def test_single_column_segment_with_trace_id() -> None: + input_rows = 100 + segment_column = "col3" + number_of_segments = 5 + trace_id = "123-456" + d = { + "col1": [i for i in range(input_rows)], + "col2": [i * i * 1.1 for i in range(input_rows)], + segment_column: [f"x{str(i%number_of_segments)}" for i in range(input_rows)], + } + + df = pl.DataFrame(d) + test_segments = segment_on_column("col3") + results: SegmentedResultSet = why.log(df, schema=DatasetSchema(segments=test_segments), trace_id=trace_id) + assert results.count == number_of_segments + partitions = results.partitions + assert len(partitions) == 1 + partition = partitions[0] + segments = results.segments_in_partition(partition) + assert len(segments) == number_of_segments + + first_segment = next(iter(segments)) + first_segment_profile = results.profile(first_segment) + assert first_segment_profile is not None + assert first_segment_profile._columns["col1"]._schema.dtype == pl.Int64 #np.int64 + assert first_segment_profile._columns["col2"]._schema.dtype == pl.Float64 #np.float64 + assert first_segment_profile._columns["col3"]._schema.dtype == pl.String #.name == "object" + segment_cardinality: CardinalityMetric = ( + first_segment_profile.view().get_column(segment_column).get_metric("cardinality") + ) + cardinality = segment_cardinality.estimate + assert cardinality is not None + # cardinality is an estimate, and because this is the segment column, it should + # by definition contain only one unique value per segment. + assert cardinality == 1.0 + assert results.metadata is not None + assert results.metadata["whylabs.traceId"] == trace_id + + +def test_single_integer_column_segment() -> None: + input_rows = 100 + segment_column = "col3" + number_of_segments = 5 + d = { + "col1": [i for i in range(input_rows)], + "col2": [i * i * 1.1 for i in range(input_rows)], + segment_column: [(i % number_of_segments) for i in range(input_rows)], + } + + df = pl.DataFrame(d) + test_segments = segment_on_column("col3") + results: SegmentedResultSet = why.log(df, schema=DatasetSchema(segments=test_segments)) + assert results.count == number_of_segments + partitions = results.partitions + assert len(partitions) == 1 + partition = partitions[0] + segments = results.segments_in_partition(partition) + assert len(segments) == number_of_segments + + first_segment = next(iter(segments)) + first_segment_profile = results.profile(first_segment) + assert first_segment_profile is not None + assert first_segment_profile._columns["col1"]._schema.dtype == pl.Int64 #np.int64 + assert first_segment_profile._columns["col2"]._schema.dtype == pl.Float64 #np.float64 + assert first_segment_profile._columns["col3"]._schema.dtype == pl.Int64 #np.int64 + segment_cardinality: CardinalityMetric = ( + first_segment_profile.view().get_column(segment_column).get_metric("cardinality") + ) + cardinality = segment_cardinality.estimate + assert cardinality is not None + assert cardinality == 1.0 + + +@pytest.mark.skip("Haven't figured out how to curry Polars expressions yet") +def test_filtered_single_column_segment() -> None: + input_rows = 100 + segment_column = "col3" + number_of_segments = 5 + d = { + "col1": [i for i in range(input_rows)], + "col2": [i * i * 1.1 for i in range(input_rows)], + segment_column: [f"x{str(i%number_of_segments)}" for i in range(input_rows)], + } + + df = pl.DataFrame(d) + test_segments = segment_on_column(segment_column) + test_segments[segment_column].filter = SegmentFilter(filter_function=(pl.col("col1") > 49)) + + results: SegmentedResultSet = why.log(df, schema=DatasetSchema(segments=test_segments)) + assert results.count == number_of_segments + partitions = results.partitions + assert len(partitions) == 1 + partition = partitions[0] + segments = results.segments_in_partition(partition) + assert len(segments) == number_of_segments + + first_segment: Segment = next(next(iter(segments))) # polars comes out in different order + first_segment_profile = results.profile(first_segment) + assert first_segment.key == ("x0",) + assert first_segment_profile is not None + assert first_segment_profile._columns["col1"]._schema.dtype == pl.Int64 #np.int64 + assert first_segment_profile._columns["col2"]._schema.dtype == pl.Float64 #np.float64 + assert first_segment_profile._columns[segment_column]._schema.dtype == pl.String #.name == "object" + segment_distribution: DistributionMetric = ( + first_segment_profile.view().get_column("col1").get_metric("distribution") + ) + assert segment_distribution is not None + count = segment_distribution.n + assert count is not None + assert count == 10 + + +@pytest.mark.parametrize("v0", [True, False]) +def test_segment_write_roundtrip_versions(tmp_path: Any, v0) -> None: + input_rows = 10 + segment_column = "col3" + number_of_segments = 2 + trace_id = "123-456" + values_per_segment = input_rows / number_of_segments + d = { + "col1": [i for i in range(input_rows)], + "col2": [i * i * 1.1 for i in range(input_rows)], + segment_column: [f"x{str(i%number_of_segments)}" for i in range(input_rows)], + } + + df = pl.DataFrame(d) + test_segments = segment_on_column(segment_column) + + results: SegmentedResultSet = why.log(df, trace_id=trace_id, schema=DatasetSchema(segments=test_segments)) + assert results.count == number_of_segments + partitions = results.partitions + assert len(partitions) == 1 + partition = partitions[0] + segments = results.segments_in_partition(partition) + assert len(segments) == number_of_segments + + seg_i = iter(segments) # polars segments order is non-deterministic + first_segment: Segment = next(seg_i) + if first_segment.key == ("x1",): + first_segment = next(seg_i) + first_segment_profile = results.profile(first_segment) + assert first_segment.key == ("x0",) + assert first_segment_profile is not None + assert first_segment_profile._columns["col1"]._schema.dtype == pl.Int64 #np.int64 + assert first_segment_profile._columns["col2"]._schema.dtype == pl.Float64 #np.float64 + assert first_segment_profile._columns[segment_column]._schema.dtype == pl.String #.name == "object" + segment_distribution: DistributionMetric = ( + first_segment_profile.view().get_column("col1").get_metric("distribution") + ) + count = segment_distribution.n + assert count is not None + assert count == values_per_segment + + results.writer().option(base_dir=tmp_path).write(use_v0=v0) + paths = glob(os.path.join(tmp_path) + "/*x0.bin") + assert len(paths) == 1 + roundtrip_profiles = [] + for file_path in paths: + if v0: + roundtrip_profiles.append(read_v0_to_view(path=file_path)) + else: + roundtrip_profiles.append(why.read(path=file_path).view()) + assert len(roundtrip_profiles) == 1 + post_deserialization_first_view = roundtrip_profiles[0] + assert post_deserialization_first_view is not None + assert isinstance(post_deserialization_first_view, DatasetProfileView) + + # check that trace_id is preserved round trip in metadata + assert post_deserialization_first_view.metadata + assert "whylabs.traceId" in post_deserialization_first_view.metadata + assert trace_id == post_deserialization_first_view.metadata["whylabs.traceId"] + pre_serialization_first_view = first_segment_profile.view() + pre_columns = pre_serialization_first_view.get_columns() + post_columns = post_deserialization_first_view.get_columns() + + # check that the distribution looks similar for each column profile + for column_name in pre_columns: + initial_column_profile = pre_columns[column_name] + target_column_profile = post_columns[column_name] + assert initial_column_profile is not None + assert target_column_profile is not None + assert target_column_profile.get_metric("distribution").n == initial_column_profile.get_metric("distribution").n + assert ( + target_column_profile.get_metric("distribution").avg + == initial_column_profile.get_metric("distribution").avg + ) + + +def test_multi_column_segment() -> None: + input_rows = 100 + d = { + "col1": [i for i in range(input_rows)], + "col2": [i * i * 1.1 for i in range(input_rows)], + "col3": [f"x{str(i%5)}" for i in range(input_rows)], + } + + df = pl.DataFrame(d) + segmentation_partition = SegmentationPartition( + name="col1,col3", mapper=ColumnMapperFunction(col_names=["col1", "col3"]) + ) + test_segments = {segmentation_partition.name: segmentation_partition} + results: SegmentedResultSet = why.log(df, schema=DatasetSchema(segments=test_segments)) + segments = results.segments() + last_segment = segments[-1] + + # Note this segment is not useful as there is only one datapoint per segment, we have 100 rows and + # 100 segments. The segment value is a tuple of strings identifying this segment. + #assert last_segment.key == ("99", "x4") + + last_segment_profile = results.profile(last_segment) + + assert last_segment_profile._columns["col1"]._schema.dtype == pl.Int64 #np.int64 + assert last_segment_profile._columns["col2"]._schema.dtype == pl.Float64 #np.float64 + assert last_segment_profile._columns["col3"]._schema.dtype == pl.String #.name == "object" + + segment_distribution: DistributionMetric = last_segment_profile.view().get_column("col1").get_metric("distribution") + count = segment_distribution.n + assert count is not None + assert count == 1 + + +def test_multicolumn_and_manual_segment() -> None: + input_rows = 100 + d = { + "col1": [i for i in range(input_rows)], + "col2": [i * i * 1.1 for i in range(input_rows)], + "col3": [f"x{str(i%5)}" for i in range(input_rows)], + } + + df = pl.DataFrame(d) + segmentation_partition = SegmentationPartition( + name="col1,col3", mapper=ColumnMapperFunction(col_names=["col1", "col3"]) + ) + test_segments = {segmentation_partition.name: segmentation_partition} + results: SegmentedResultSet = why.log( + df, schema=DatasetSchema(segments=test_segments), segment_key_values={"ver": 42, "zzz": "bar"} + ) + segments = results.segments() + last_segment = segments[-1] + + # Note this segment is not useful as there is only one datapoint per segment, we have 100 rows and + # 100 segments. The segment value is a tuple of strings identifying this segment. + #assert last_segment.key == ("99", "x4", "42", "bar") + + last_segment_profile = results.profile(last_segment) + + assert last_segment_profile._columns["col1"]._schema.dtype == pl.Int64 #np.int64 + assert last_segment_profile._columns["col2"]._schema.dtype == pl.Float64 #np.float64 + assert last_segment_profile._columns["col3"]._schema.dtype == pl.String #.name == "object" + + segment_distribution: DistributionMetric = last_segment_profile.view().get_column("col1").get_metric("distribution") + count = segment_distribution.n + assert count is not None + assert count == 1 + + +def test_multi_column_segment_serialization_roundtrip_v0(tmp_path: Any) -> None: + input_rows = 35 + d = { + "A": [i % 7 for i in range(input_rows)], + "B": [f"x{str(i%5)}" for i in range(input_rows)], + } + + df = pl.DataFrame(d) + segmentation_partition = SegmentationPartition(name="A,B", mapper=ColumnMapperFunction(col_names=["A", "B"])) + test_segments = {segmentation_partition.name: segmentation_partition} + results: SegmentedResultSet = why.log(df, schema=DatasetSchema(segments=test_segments)) + results.writer().option(base_dir=tmp_path).write(use_v0=True) + + paths = glob(os.path.join(tmp_path) + "/*.bin") + assert len(paths) == input_rows + roundtrip_profiles = [] + for file_path in paths: + roundtrip_profiles.append(read_v0_to_view(file_path)) + assert len(roundtrip_profiles) == input_rows + TEST_LOGGER.info(roundtrip_profiles) + TEST_LOGGER.info(roundtrip_profiles[15]) + + post_deserialization_view = roundtrip_profiles[15] + assert post_deserialization_view is not None + assert isinstance(post_deserialization_view, DatasetProfileView) + + post_columns = post_deserialization_view.get_columns() + assert "A" in post_columns.keys() + assert "B" in post_columns.keys() + + +def test_merge_view() -> None: + df = pl.DataFrame({"col1": [1, 2]}) + logger = why.logger() + results = logger.log(df) + merged_results = results.merge(ViewResultSet.zero()) + view = merged_results.view() + assert view._columns["col1"]._metrics["types"].integral.value == 2 + + +def test_merge_two_result_sets() -> None: + df1 = pl.DataFrame({"col1": [1, 2]}) + df2 = pl.DataFrame({"col1": [3, 4]}) + logger = why.logger() + results1 = logger.log(df1) + results2 = logger.log(df2) + merged_results = results1.merge(results2) + view = merged_results.view() + assert view._columns["col1"]._metrics["types"].integral.value == 4 + assert view._columns["col1"]._metrics["distribution"].min == 1 + assert view._columns["col1"]._metrics["distribution"].max == 4 + + +def test_merge_result_set_zero() -> None: + df = pl.DataFrame({"col1": [1, 2]}) + logger = why.logger() + results = logger.log(df) + merged_results = results.merge(ProfileResultSet.zero()) + view = merged_results.view() + assert view._columns["col1"]._metrics["types"].integral.value == 2 + + +def test_pickle_load_merge_profile_view() -> None: + df = pl.DataFrame({"col1": [1, 2]}) + logger = why.logger() + results = logger.log(df) + view2 = logger.log({"col1": 3}).view() + pickle_loaded_view = None + with tempfile.NamedTemporaryFile() as tmp_file: + pickle.dump(results.view(), tmp_file) + tmp_file.flush() + tmp_file.seek(0) + pickle_loaded_view = pickle.load(tmp_file) + + assert pickle_loaded_view is not None + assert isinstance(pickle_loaded_view, DatasetProfileView) + + merged_view = view2.merge(pickle_loaded_view) + assert merged_view._columns["col1"]._metrics["types"].integral.value == 3 + + +def test_segment_merge_different_columns() -> None: + input_rows = 35 + d = { + "A": [i % 7 for i in range(input_rows)], + "B": [f"x{str(i%5)}" for i in range(input_rows)], + } + input_rows2 = 27 + d2 = { + "A": [i % 4 for i in range(input_rows2)], + "B": [f"x{str(i%7)}" for i in range(input_rows2)], + "C": [bool(i % 2) for i in range(input_rows2)], + } + + df = pl.DataFrame(d) + df2 = pl.DataFrame(d2) + segmentation_partition = SegmentationPartition(name="A,B", mapper=ColumnMapperFunction(col_names=["A", "B"])) + test_segments = {segmentation_partition.name: segmentation_partition} + segmented_schema = DatasetSchema(segments=test_segments) + results: SegmentedResultSet = why.log(df, schema=segmented_schema) + results2: SegmentedResultSet = why.log(df2, schema=segmented_schema) + merged_results = results.merge(results2) + + assert merged_results.count == 42 + for segment in merged_results.segments(): + segmented_view = merged_results.view(segment=segment) + if len(segmented_view._columns) == 3: + assert segmented_view._columns["C"] is not None + assert segmented_view._columns["C"]._metrics["types"].boolean.value > 0 + else: + # some segments haven't seen column 'C' and so only have two columns + assert len(segmented_view._columns) == 2 + assert segmented_view._columns["A"] is not None + assert segmented_view._columns["B"] is not None + assert segmented_view._columns["A"]._metrics["cardinality"].estimate == pytest.approx(1.0) + + +def test_segment_with_nans() -> None: + df = pl.DataFrame({"col_1": [1, 2, 3, 4, 5, 6], "col_nan": [0.0, 0.0, None, None, np.nan, math.nan]}) + column_segments = segment_on_column("col_nan") + schema = DatasetSchema(segments=column_segments) + profile_results = why.log(df, schema=schema) + assert profile_results.count == 3 # col_nan = True + segment = profile_results.segments()[0] + segmented_view = profile_results.profile(segment).view() + assert segmented_view.get_column("col_nan").get_metric("counts").to_summary_dict()["n"] == 2 + + segmentation_partition = SegmentationPartition( + name="col_1,col_nan", mapper=ColumnMapperFunction(col_names=["col_1", "col_nan"]) + ) + multi_column_segments = {segmentation_partition.name: segmentation_partition} + schema = DatasetSchema(segments=multi_column_segments) + + profile_results = why.log(df, schema=schema) + assert profile_results.count == 6 # (1,True), (2,True), (3,nan), (4,nan), (5,nan), (6,nan) + for segment in profile_results.segments(): + segmented_view = profile_results.profile(segment).view() + # each segment has n=1 + assert segmented_view.get_column("col_nan").get_metric("counts").to_summary_dict()["n"] == 1 diff --git a/python/tests/core/metrics/test_metrics_polars.py b/python/tests/core/metrics/test_metrics_polars.py new file mode 100644 index 0000000000..6f2007d6f5 --- /dev/null +++ b/python/tests/core/metrics/test_metrics_polars.py @@ -0,0 +1,317 @@ +from logging import getLogger + +import numpy as np +import pandas as pd +import pytest + +import whylogs as why +import whylogs.core.configs as cfg +from whylogs.core import ColumnProfileView, DatasetSchema +from whylogs.core.datatypes import Integral +from whylogs.core.metrics.maths import VarianceM2Result, parallel_variance_m2 +from whylogs.core.metrics.metrics import ( + CardinalityMetric, + DistributionMetric, + MetricConfig, +) +from whylogs.core.preprocessing import PreprocessedColumn +from whylogs.core.resolvers import StandardResolver +from whylogs.core.schema import ColumnSchema + +TEST_LOGGER = getLogger(__name__) + + +def test_distribution_metrics_numpy() -> None: + dist = DistributionMetric.zero(MetricConfig()) + data = list(range(0, 100)) + arr = np.array(data) + col = PreprocessedColumn.apply(arr) + dist.columnar_update(col) + + assert dist.kll.value.get_n() == 100 + assert dist.mean.value == arr.mean() + assert dist.variance == arr.var(ddof=1) + + distribution_summary = dist.to_summary_dict() + assert distribution_summary["q_01"] == 1.0 + assert distribution_summary["q_05"] == 5.0 + assert distribution_summary["q_95"] == 95.0 + assert distribution_summary["q_99"] == 99.0 + + +def test_distribution_metrics_series() -> None: + dist = DistributionMetric.zero(MetricConfig()) + data = pd.Series(list(range(100))) + col = PreprocessedColumn.apply(data) + dist.columnar_update(col) + + assert dist.kll.value.get_n() == 100 + assert dist.mean.value == data.mean() + assert dist.variance == data.var() + + +def test_distribution_variance_m2() -> None: + import statistics + + dist_list = DistributionMetric.zero(MetricConfig()) + dist_pandas = DistributionMetric.zero(MetricConfig()) + dist_numpy = DistributionMetric.zero(MetricConfig()) + test_input = [1, 2, 3, 4] + + list_test_input = PreprocessedColumn() + list_test_input.list.ints = test_input + n = len(test_input) + mean = sum(test_input) / n + variance = statistics.variance(test_input) # sample variance, uses n-1 normalization + m2 = (n - 1) * variance + TEST_LOGGER.info(f"statistic package using input {test_input} has variance={variance}, m2={m2}, n={n}") + pandas_test_input = PreprocessedColumn.apply(pd.Series(test_input)) + numpy_test_input = PreprocessedColumn.apply(np.array(test_input)) + dist_list.columnar_update(list_test_input) + dist_pandas.columnar_update(pandas_test_input) + dist_numpy.columnar_update(numpy_test_input) + + TEST_LOGGER.info(f"dist_list={dist_list.to_summary_dict()}") + TEST_LOGGER.info(f"dist_pandas={dist_pandas.to_summary_dict()}") + TEST_LOGGER.info(f"dist_numpy={dist_numpy.to_summary_dict()}") + assert dist_list.m2.value == m2 + assert dist_pandas.m2.value == m2 + assert dist_numpy.m2.value == m2 + assert dist_list.variance == variance + assert dist_pandas.variance == variance + assert dist_numpy.variance == variance + assert dist_list.avg == mean + assert dist_pandas.avg == mean + assert dist_numpy.avg == mean + + +def test_distribution_metrics_indexed_series_single_row() -> None: + dist = DistributionMetric.zero(MetricConfig()) + data = pd.Series(list(range(1)), index=[284]) + col = PreprocessedColumn.apply(data) + dist.columnar_update(col) + + assert dist.kll.value.get_n() == 1 + assert dist.mean.value == data.mean() + + +def test_distribution_metrics_list() -> None: + dist = DistributionMetric.zero(MetricConfig()) + col = PreprocessedColumn() + data = list(range(0, 100)) + col.list.ints = data + dist.columnar_update(col) + + assert dist.kll.value.get_n() == 100 + assert dist.mean.value == np.array(data).mean() + assert dist.variance == np.array(data).var(ddof=1) + + +def test_distribution_metrics_mixed_np_and_list() -> None: + dist = DistributionMetric.zero(MetricConfig()) + col = PreprocessedColumn() + col.list.ints = list(range(0, 50)) + col.numpy.ints = np.array(range(50, 100)) + dist.columnar_update(col) + + assert dist.kll.value.get_n() == 100 + a = np.array(col.list.ints) + b = col.numpy.ints + + assert dist.mean.value == np.array(np.concatenate([a, b])).mean() + + m2_a = a.var(ddof=1) * (len(a) - 1) + m2_b = b.var(ddof=1) * (len(b) - 1) + a_var = VarianceM2Result(n=len(a), mean=a.mean(), m2=m2_a) + b_var = VarianceM2Result(n=len(b), mean=b.mean(), m2=m2_b) + overall = parallel_variance_m2(first=a_var, second=b_var) + assert dist.variance == overall.m2 / (overall.n - 1) + + +def test_distribution_metrics_bool() -> None: + import whylogs.core.metrics.metrics as met + + met._BOOL_LIST_CHUNK_SIZE = 2 + + dist = DistributionMetric.zero() + p_col = PreprocessedColumn.apply([True, True, True, True, False, "foo", "bar"]) + operation_result = dist.columnar_update(p_col) + assert operation_result.ok + assert round(dist.mean.value, 3) == 0.8 + + +def test_distribution_metrics_bool_mixed() -> None: + import whylogs.core.metrics.metrics as met + + met._BOOL_LIST_CHUNK_SIZE = 2 + + dist = DistributionMetric.zero() + p_col = PreprocessedColumn.apply([True, False, 42]) + operation_result = dist.columnar_update(p_col) + assert operation_result.ok + assert dist.kll.value.get_n() == 3 + assert round(dist.avg, 3) == round(43 / 3, 3) + + +def test_track_single_values_profile_mean() -> None: + data = list(range(30)) + df = pd.DataFrame(data, columns=["col1"]) + actual_mean = df["col1"].mean() + actual_stddev = df["col1"].std() + prof_view_df = why.log(df).profile().view() + profile_mean1 = prof_view_df.get_column("col1").get_metric("distribution").mean.value + profile_stddev1 = prof_view_df.get_column("col1").get_metric("distribution").stddev + for i, d in enumerate(data): + if i == 0: + prof_track = why.log(row={"col1": d}).profile() + else: + prof_track.track({"col1": d}) + profile_mean2 = prof_track.view().get_column("col1").get_metric("distribution").mean.value + profile_stddev2 = prof_track.view().get_column("col1").get_metric("distribution").stddev + + assert round(actual_mean, 3) == round(profile_mean1, 3) + assert round(actual_mean, 3) == round(profile_mean2, 3) + assert round(actual_stddev, 3) == round(profile_stddev1, 3) + assert round(actual_stddev, 3) == round(profile_stddev2, 3) + + +def test_merge_single_values_profile_mean() -> None: + data = list(range(30)) + df = pd.DataFrame(data, columns=["col1"]) + actual_mean = df["col1"].mean() + actual_stddev = df["col1"].std() + prof_view_df = why.log(df).profile().view() + profile_mean1 = prof_view_df.get_column("col1").get_metric("distribution").mean.value + profile_stddev1 = prof_view_df.get_column("col1").get_metric("distribution").stddev + + profiles = [why.log(row={"col1": d}).profile().view() for d in data] + from functools import reduce + + merged_profile_view = reduce((lambda x, y: x.merge(y)), profiles) + profile_mean2 = merged_profile_view.get_column("col1").get_metric("distribution").mean.value + profile_stddev2 = merged_profile_view.get_column("col1").get_metric("distribution").stddev + + assert round(actual_mean, 3) == round(profile_mean1, 3) + assert round(actual_mean, 3) == round(profile_mean2, 3) + assert round(actual_stddev, 3) == round(profile_stddev1, 3) + assert round(actual_stddev, 3) == round(profile_stddev2, 3) + + +def test_merge_two_profiles_mean(lending_club_df: pd.DataFrame) -> None: + first_df = lending_club_df.head(500) + + second_df = lending_club_df.tail(500) + + actual_mean = lending_club_df["loan_amnt"].mean() + actual_mean_1 = first_df["loan_amnt"].mean() + actual_mean_2 = second_df["loan_amnt"].mean() + + first_profile: ColumnProfileView = why.log(first_df).view().get_column("loan_amnt") + first_profile_mean = first_profile.get_metric("distribution").mean.value + second_profile = why.log(second_df).view().get_column("loan_amnt") + second_profile_mean = second_profile.get_metric("distribution").mean.value + + merged_profile = first_profile.merge(second_profile) + merged_profile_mean = merged_profile.get_metric("distribution").mean.value + + assert round(merged_profile_mean, 3) == round(actual_mean, 3) + assert round(first_profile_mean, 3) == round(actual_mean_1, 3) + assert round(second_profile_mean, 3) == round(actual_mean_2, 3) + + +def test_frequent_items_handling_int_as_string() -> None: + df = pd.DataFrame({"int": [1, 1, 1]}) + + res = why.log(df).view().to_pandas()["frequent_items/frequent_strings"] + assert res.array[0][0].value == "1" # type: ignore + + +def test_frequent_items_handling_bool_as_string() -> None: + import whylogs.core.metrics.metrics as met + + met._BOOL_LIST_CHUNK_SIZE = 2 + df = pd.DataFrame({"bool": [True, True, True, True, False]}) + + res = why.log(df).view().to_pandas()["frequent_items/frequent_strings"] + assert res.array[0][0].value == "True" # type: ignore + assert res.array[0][1].value == "False" # type: ignore + + +def test_frequent_items_bounds_order() -> None: + df_gamma = pd.DataFrame({"feature1": np.random.gamma(1, 2, 1000).astype(int)}) + df_rand = pd.DataFrame({"feature1": np.random.randint(10000, size=9000)}) + df = df_gamma.append(df_rand) + + res = why.log(df).view().to_pandas()["frequent_items/frequent_strings"] + fi_tuple = res.array[0][0] + assert fi_tuple.lower <= fi_tuple.est <= fi_tuple.upper + + +@pytest.mark.parametrize( + "config, limit", + [ + (MetricConfig(), MetricConfig().max_frequent_item_size), + (MetricConfig(max_frequent_item_size=50), 50), + ], +) +def test_frequent_item_max_size(config: MetricConfig, limit: int) -> None: + df = pd.DataFrame({"str": ["X" * 200]}) + schema = DatasetSchema(default_configs=config) + res = why.log(df, schema=schema).view().to_pandas()["frequent_items/frequent_strings"] + assert len(res.array[0][0].value) <= limit + + +def test_cardinality_metric_booleans() -> None: + cardinality = CardinalityMetric.zero(MetricConfig()) + data = pd.Series([True, False, True, True]) + col = PreprocessedColumn.apply(data) + cardinality.columnar_update(col) + + assert cardinality.estimate == pytest.approx(2, 0.1) + + +def test_cardinality_metric_row_booleans() -> None: + column_name = "col1" + data = {column_name: True} + profile = why.log(data).profile() + view = profile.view() + cardinality = view.get_column(column_name).get_metric("cardinality") + + assert cardinality is not None + assert cardinality.estimate == pytest.approx(1, 0.1) + # track a bool value of false in the same column and check that cardinality increased to near 2. + profile.track(row={column_name: False}) + assert cardinality.estimate == pytest.approx(2, 0.1) + + +def test_cardinality_metric_booleans_top_level_api() -> None: + input_rows = 5 + col_name = "p" + d = {col_name: [bool(i % 2) for i in range(input_rows)]} + df = pd.DataFrame(data=d) + + results = why.log(df) + col_prof = results.view().get_column(col_name) + cardinality: CardinalityMetric = col_prof.get_metric("cardinality") + assert cardinality is not None + assert cardinality.estimate == pytest.approx(2, 0.1) + + +def test_cardinality_metric_booleans_all_false() -> None: + df = pd.DataFrame({"b": [False for i in range(3)]}) + col_prof = why.log(df).view().get_column("b") + cardinality: CardinalityMetric = col_prof.get_metric("cardinality") + assert cardinality.estimate == pytest.approx(1, 0.1) + + +def test_configure_MetricConfig_defaults(): + c0 = MetricConfig() + assert c0.kll_k == cfg.kll_k + assert not c0.fi_disabled + assert "frequent_items" in StandardResolver().resolve("", Integral(), ColumnSchema(Integral, c0)) + cfg.fi_disabled = True + c1 = MetricConfig() + assert c1.fi_disabled + assert not c0.fi_disabled + assert "frequent_items" not in StandardResolver().resolve("", Integral(), ColumnSchema(Integral, c1)) + cfg.fi_disabled = False diff --git a/python/tests/core/test_performance_polars.py b/python/tests/core/test_performance_polars.py new file mode 100644 index 0000000000..10133f97ea --- /dev/null +++ b/python/tests/core/test_performance_polars.py @@ -0,0 +1,215 @@ +import cProfile +import pstats +import random +from dataclasses import dataclass, field +from io import StringIO +from logging import getLogger +from typing import Any, Dict + +import numpy as np +import pandas as pd +import polars as pl +import pytest +import whylogs_sketching as ds # type: ignore + +import whylogs +from whylogs.core import ColumnProfile, ColumnSchema +from whylogs.core.dataset_profile import DatasetProfile +from whylogs.core.metrics.metrics import MetricConfig +from whylogs.core.resolvers import ( + HistogramCountingTrackingResolver, + LimitedTrackingResolver, + Resolver, + StandardResolver, +) + +TEST_LOGGER = getLogger(__name__) + +_TEST_RESOLVERS = [HistogramCountingTrackingResolver(), LimitedTrackingResolver(), StandardResolver()] + + +# TODO: this is from the baseline benchmark, but its not integrated with our metrics +@dataclass +class CustomHistogramMetric: + histogram: ds.kll_floats_sketch = field( + default=ds.kll_floats_sketch(MetricConfig().kll_k), + ) + + def track(self, val: Any) -> "CustomHistogramMetric": + if pd.isna(val): + return self + self.histogram.update(val) + return self + + +@pytest.mark.load +@pytest.mark.parametrize("test_resolver", _TEST_RESOLVERS) +def test_track_column_benchmark(test_resolver: Resolver) -> None: + dataframe_shapes = [(3400000, 43)] # 10x less rows to estimate + TEST_LOGGER.info(f"Running test_track_column_benchmark with {len(dataframe_shapes)} different test dataframes") + for num_rows, num_columns in dataframe_shapes: + TEST_LOGGER.info(f"shape of test dataframe is ({num_rows},{num_columns})...") + + profiler = cProfile.Profile() + string_output_stream = StringIO() + profiler.enable() + for column_index in range(num_columns): + column_name = str(column_index) + col_df = pl.DataFrame({ column_name: np.random.random(size=(num_rows,)) }) + col_prof = ColumnProfile( + name="perf_test", schema=ColumnSchema(float, resolver=test_resolver), cache_size=1024 + ) + if column_index == 0: + TEST_LOGGER.info( + f"using the following trackers {[metric for metric in col_prof._metrics]} and {col_prof._schema.resolver}" + ) + col_prof.track_column(col_df[column_name]) + profiler.disable() + stats = pstats.Stats(profiler, stream=string_output_stream).sort_stats("cumulative") + stats.print_stats(10) + TEST_LOGGER.info( + f"track_column stats using the following trackers {[metric_name for metric_name in col_prof._metrics]} " + f"are\n{string_output_stream.getvalue()}" + ) + assert col_prof.view().get_metric("distribution") is not None + assert False + + +@pytest.mark.load +def test_track_dataset_benchmark() -> None: + dataframe_shapes = [(120000, 34), (3400000, 43)] # 10x less rows to estimate + TEST_LOGGER.info(f"Running dataset_profile.track with {len(dataframe_shapes)} shapes") + for num_rows, num_columns in dataframe_shapes: + TEST_LOGGER.info(f"shape of test dataframe is ({num_rows},{num_columns})...") + + profiler = cProfile.Profile() + string_output_stream = StringIO() + full_df = pl.DataFrame( + { str(i): np.random.random(size=(num_rows,)) for i in range(num_columns) } + ) + dataset_profile = DatasetProfile() + profiler.enable() + dataset_profile.track(full_df) + profiler.disable() + stats = pstats.Stats(profiler, stream=string_output_stream).sort_stats("cumulative") + stats.print_stats(20) + test_column_name = next(iter(dataset_profile._columns)) + TEST_LOGGER.info( + f"dataset_profile.track stats on ({num_rows},{num_columns}) using " + f"{[metric_name for metric_name in dataset_profile._columns[test_column_name]._metrics]} are" + f"\n{string_output_stream.getvalue()}" + ) + for column_name in dataset_profile._columns: + assert dataset_profile._columns[column_name].view().get_metric("distribution") is not None + + +@pytest.mark.load +def test_track_baseline_benchmark() -> None: + # dataframe_shapes = [(1200000, 34), (34000000, 43)] # full baseline + dataframe_shapes = [(120000, 34)] # 10x less rows, first test only to estimate in reasonable time + TEST_LOGGER.info(f"Running custom_metric.track with {len(dataframe_shapes)} different test dataframes") + for num_rows, num_columns in dataframe_shapes: + TEST_LOGGER.info(f"shape of test dataframe is ({num_rows},{num_columns})...") + + profiler = cProfile.Profile() + string_output_stream = StringIO() + baseline_metric = CustomHistogramMetric() + profiler.enable() + for column_index in range(num_columns): + column_name = str(column_index) + baseline_metric = CustomHistogramMetric() + col_df = pl.DataFrame( {column_name: np.random.random(size=(num_rows,)) } ) + if column_index == 0: + TEST_LOGGER.info(f"using the following trackers {baseline_metric}") + for value in col_df[column_name]: + baseline_metric.track(value) + TEST_LOGGER.info(f"\tcolumn: {column_index}") + + profiler.disable() + stats = pstats.Stats(profiler, stream=string_output_stream).sort_stats("cumulative") + stats.print_stats(20) + TEST_LOGGER.info( + f"stats for baseline_benchmark (custom_metric.track) on df({num_rows},{num_columns}) " + f"are\n{string_output_stream.getvalue()}" + ) + + +def _gen_test_row_message(i: int) -> Dict[str, Any]: + additional_fields = 40 + test_message = { + "jobtitle": "software engineer", + "employer": "whylabs", + "city": "seattle", + "state": "washington", + "country": "united states", + "date": "2022-11-02", + "optional_features": i % 100, + "nan_feature": float("nan") if i % 13 == 0 else float(i) / 3.3, + "None_feature": None if i % 3 == 0 else {"a": 1}, + "debug": True, + } + for i in range(additional_fields): + test_message[f"field_{i}"] = random.random() + return test_message + + +def _gen_test_df() -> pl.DataFrame: + num_rows = 1 + num_columns = 50 + full_df = pl.DataFrame({str(i): np.random.random(size=(num_rows,)) for i in range(num_columns)}) + return full_df + + +@pytest.mark.load +def test_rolling_logger_latency_row_benchmark() -> None: + number_of_iterations = 1000 + TEST_LOGGER.info(f"Running latency test with {number_of_iterations} iterations") + test_log = whylogs.logger(mode="rolling", interval=60, when="S", fork=True) + test_log.append_writer("local") + + profiler = cProfile.Profile() + string_output_stream = StringIO() + profiler.enable() + + for i in range(number_of_iterations): + msg = _gen_test_row_message(i) + test_log.log(msg) + + test_log.close() + profiler.disable() + stats = pstats.Stats(profiler, stream=string_output_stream).sort_stats("cumulative") + stats.print_stats(20) + TEST_LOGGER.info(f"stats for rolling latency benchmark are\n{string_output_stream.getvalue()}") + + +@pytest.mark.load +@pytest.mark.parametrize("mode", ["pandas_to_row", "row", "pandas"]) +def test_rolling_logger_latency_benchmark(mode) -> None: + number_of_iterations = 1000 + TEST_LOGGER.info(f"Running latency test with {number_of_iterations} iterations") + test_log = whylogs.logger(mode="rolling", interval=60, when="S", fork=True) + test_log.append_writer("local") + + profiler = cProfile.Profile() + string_output_stream = StringIO() + profiler.enable() + + for i in range(number_of_iterations): + data = None + if mode == "pandas": + data = _gen_test_df() + elif mode == "row": + data = _gen_test_row_message(i) + elif mode == "pandas_to_row": + df = _gen_test_df() + records = df.to_dict(as_series=False) + data = {k: v[0] for k, v in records.items()} + else: + raise ValueError(f"Mode: ({mode}) not supported, must be 'row', 'pandas'...") + test_log.log(data) + + test_log.close() + profiler.disable() + stats = pstats.Stats(profiler, stream=string_output_stream).sort_stats("cumulative") + stats.print_stats(20) + TEST_LOGGER.info(f"stats for rolling latency [{mode}]based benchmark are\n{string_output_stream.getvalue()}") diff --git a/python/tests/experimental/core/test_udf_schema_polars.py b/python/tests/experimental/core/test_udf_schema_polars.py new file mode 100644 index 0000000000..560d646e9b --- /dev/null +++ b/python/tests/experimental/core/test_udf_schema_polars.py @@ -0,0 +1,370 @@ +from typing import Any, Dict, List, Tuple, Union + +import polars as pl + +import whylogs as why +from whylogs.core.dataset_profile import DatasetProfile +from whylogs.core.datatypes import Fractional, Integral, String +from whylogs.core.metrics import ( + CardinalityMetric, + DistributionMetric, + MetricConfig, + StandardMetric, +) +from whylogs.core.preprocessing import ColumnProperties +from whylogs.core.resolvers import STANDARD_RESOLVER, MetricSpec, ResolverSpec +from whylogs.core.segmentation_partition import segment_on_column +from whylogs.experimental.core.metrics.udf_metric import register_metric_udf +from whylogs.experimental.core.udf_schema import ( + UdfSchema, + UdfSpec, + register_dataset_udf, + register_multioutput_udf, + register_type_udf, + udf_schema, + unregister_udf, +) +from whylogs.experimental.core.validators import condition_validator + + +def test_udf_polars() -> None: + schema = UdfSchema( + STANDARD_RESOLVER, + udf_specs=[UdfSpec(column_names=["col1"], udfs={"col2": lambda x: x[0], "col3": lambda x: x[0]})], + ) + data = pl.DataFrame({"col1": [42, 12, 7]}) + results = why.log(data, schema=schema).view() + col1 = results.get_column("col1").to_summary_dict() + col2 = results.get_column("col2").to_summary_dict() + col3 = results.get_column("col3").to_summary_dict() + assert col1 == col2 == col3 + assert len(data.columns) == 1 + + +@register_multioutput_udf(["xx1", "xx2"]) +def f1(x) -> pl.DataFrame: + return pl.DataFrame({"foo": x["xx1"], "bar": x["xx2"]}) + + +@register_multioutput_udf(["xx1", "xx2"], prefix="blah") +def f2(x) -> pl.DataFrame: + return pl.DataFrame({"foo": x["xx1"], "bar": x["xx2"]}) + + +@register_multioutput_udf(["xx1", "xx2"], no_prefix=True) +def no_prefix_udf(x) -> pl.DataFrame: + df = pl.DataFrame({"foo": x["xx1"], "bar": x["xx2"]}) + return df + + +def test_multioutput_udf_dataframe() -> None: + schema = udf_schema() + df = pl.DataFrame({"xx1": [42, 7], "xx2": [3.14, 2.72]}) + results = why.log(df, schema=schema).view() + assert results.get_column("f1.foo") is not None + assert results.get_column("f1.bar") is not None + assert results.get_column("blah.foo") is not None + assert results.get_column("blah.bar") is not None + assert results.get_column("foo") is not None + assert results.get_column("bar") is not None + + +def test_drop_columns() -> None: + schema = udf_schema(drop_columns={"xx1", "xx2"}) + df = pl.DataFrame({"xx1": [42, 7], "xx2": [3.14, 2.72]}) + results = why.log(df, schema=schema).view() + assert results.get_column("xx1") is None + assert results.get_column("xx2") is None + # UDFs that needed the dropped columns as input still work + assert results.get_column("f1.foo") is not None + assert results.get_column("f1.bar") is not None + assert results.get_column("blah.foo") is not None + assert results.get_column("blah.bar") is not None + assert results.get_column("foo") is not None + assert results.get_column("bar") is not None + + +@register_dataset_udf(["col1"], schema_name="unit-tests") +def add5(x) -> float: + return x[0]+5 + + +def square(x: Tuple) -> float: + print(f"square(): {type(x)}\n{x}") + return x[0] * x[0] + + +action_list = [] + + +def do_something_important(validator_name, condition_name: str, value: Any, column_id=None): + print("Validator: {}\n Condition name {} failed for value {}".format(validator_name, condition_name, value)) + action_list.append(value) + if column_id: + # this list is just to verify that the action was called with the correct column id + action_list.append(column_id) + return + + +@condition_validator(["col1", "add5"], condition_name="less_than_four", actions=[do_something_important]) +def lt_4(x): + return x < 4 + + +def test_validator_udf_polars() -> None: + global action_list + data = pl.DataFrame({"col1": [1, 3, 7]}) + schema = udf_schema() + why.log(data, schema=schema).view() + assert 7 in action_list + + +def test_validator_double_register_udf_polars() -> None: + global action_list + + @condition_validator(["col1", "add5"], condition_name="less_than_four", actions=[do_something_important]) + def lt_4_2(x): + return x < 4 + + schema = udf_schema() + # registering the same validator twice should keep only the latest registration + assert schema.validators["col1"][0].conditions["less_than_four"].__name__ == "lt_4_2" + assert len(schema.validators["col1"]) == 1 + + +def test_decorator_polars() -> None: + extra_spec = UdfSpec(["col1"], {"sqr": square}) + schema = udf_schema([extra_spec], STANDARD_RESOLVER, schema_name="unit-tests") + data = pl.DataFrame({"col1": [42, 12, 7], "col2": ["a", "b", "c"]}) + results = why.log(data, schema=schema).view() + col1_summary = results.get_column("col1").to_summary_dict() + assert "distribution/n" in col1_summary + add5_summary = results.get_column("add5").to_summary_dict() + assert "distribution/n" in add5_summary + sqr_summary = results.get_column("sqr").to_summary_dict() + assert "distribution/n" in sqr_summary + + +@register_dataset_udf( + ["col1"], "annihilate_me", anti_metrics=[CardinalityMetric, DistributionMetric], schema_name="unit-tests" +) +def plus1(x) -> float: + return x[0] + 1 + + +def test_anti_resolver() -> None: + schema = udf_schema(schema_name="unit-tests") + data = pl.DataFrame({"col1": [42, 12, 7], "col2": ["a", "b", "c"]}) + results = why.log(data, schema=schema).view() + col1_summary = results.get_column("col1").to_summary_dict() + assert "distribution/n" in col1_summary + assert "cardinality/est" in col1_summary + col2_summary = results.get_column("col2").to_summary_dict() + assert "distribution/n" in col2_summary + assert "cardinality/est" in col2_summary + add5_summary = results.get_column("add5").to_summary_dict() + assert "distribution/n" in add5_summary + assert "cardinality/est" in add5_summary + plus1_summary = results.get_column("annihilate_me").to_summary_dict() + assert "ints/max" in plus1_summary + assert "distribution/n" not in plus1_summary + assert "cardinality/est" not in plus1_summary + + +@register_dataset_udf(["col1"], "colliding_name", namespace="pluto", schema_name="unit-tests") +def a_function(x): + return x[0] + + +@register_dataset_udf(["col1"], "colliding_name", namespace="neptune", schema_name="unit-tests") +def another_function(x): + return x[0] + + +@register_dataset_udf(["col1", "col2"], "product", schema_name="unit-tests") +def times(x: Tuple) -> float: + return x[0] * x[1] + + +@register_dataset_udf( + ["col1", "col3"], metrics=[MetricSpec(StandardMetric.distribution.value)], schema_name="unit-tests" +) +def ratio(x: Tuple) -> float: + return x[0] / x[1] + + +def test_multicolumn_udf_pandas() -> None: + count_only = [ + ResolverSpec( + column_type=Integral, + metrics=[MetricSpec(StandardMetric.counts.value)], + ), + ResolverSpec( + column_type=Fractional, + metrics=[MetricSpec(StandardMetric.counts.value)], + ), + ResolverSpec( + column_type=String, + metrics=[MetricSpec(StandardMetric.counts.value)], + ), + ] + + extra_spec = UdfSpec(["col1"], {"sqr": square}) + schema = udf_schema([extra_spec], count_only, schema_name="unit-tests") + data = pl.DataFrame({"col1": [42, 12, 7], "col2": [2, 3, 4], "col3": [2, 3, 4]}) + results = why.log(data, schema=schema).view() + col1_summary = results.get_column("col1").to_summary_dict() + assert "counts/n" in col1_summary + col2_summary = results.get_column("col2").to_summary_dict() + assert "counts/n" in col2_summary + col3_summary = results.get_column("col3").to_summary_dict() + assert "counts/n" in col3_summary + add5_summary = results.get_column("add5").to_summary_dict() + assert "counts/n" in add5_summary + prod_summary = results.get_column("product").to_summary_dict() + assert prod_summary["counts/n"] == 3 + sqr_summary = results.get_column("sqr").to_summary_dict() + assert "counts/n" in sqr_summary + div_summary = results.get_column("ratio").to_summary_dict() + assert div_summary["distribution/n"] == 3 + # Integral -> counts plus registered distribution + assert results.get_column("ratio").get_metric("counts") is not None + assert results.get_column("ratio").get_metric("distribution") is not None + + +n: int = 0 + + +@register_dataset_udf(["oops"], schema_name="unit-tests") +def exothermic(x: pl.DataFrame) -> pl.Series: + global n + n += 1 + if n < 3: + raise ValueError("kaboom") + + return x["oops"] + + +def test_udf_throws_polars() -> None: + global n + n = 0 + schema = udf_schema(schema_name="unit-tests") + df = pl.DataFrame({"oops": [1, 2, 3, 4], "ok": [5, 6, 7, 8]}) + results = why.log(df, schema=schema).view() + assert "exothermic" in results.get_columns() + oops_summary = results.get_column("exothermic").to_summary_dict() + assert oops_summary["counts/nan"] > 0 + ok_summary = results.get_column("ok").to_summary_dict() + assert ok_summary["counts/n"] == 4 + + +@register_metric_udf("foo") +def bar(x: Any) -> Any: + return x + + +def test_udf_metric_resolving() -> None: + schema = udf_schema(schema_name="unit-tests") + df = pl.DataFrame({"col1": [1, 2, 3], "foo": [1, 2, 3]}) + results = why.log(df, schema=schema).view() + assert "add5" in results.get_columns() + assert results.get_column("add5").to_summary_dict()["counts/n"] == 3 + assert results.get_column("col1").to_summary_dict()["counts/n"] == 3 + foo_summary = results.get_column("foo").to_summary_dict() + assert "udf/bar:counts/n" in foo_summary + + +def test_udf_segmentation_pandas() -> None: + column_segments = segment_on_column("product") + segmented_schema = udf_schema(segments=column_segments, schema_name="unit-tests") + data = pl.DataFrame({"col1": [42, 12, 7], "col2": [2, 3, 4], "col3": [2, 3, 4]}) + results = why.log(data, schema=segmented_schema) + assert len(results.segments()) == 3 + + +def test_udf_segmentation_obj() -> None: + column_segments = segment_on_column("product") + segmented_schema = udf_schema(segments=column_segments, schema_name="unit-tests") + data = {"col1": 42, "col2": 2, "col3": 2} + results = why.log(data, schema=segmented_schema) + assert len(results.segments()) == 1 + + +def test_udf_track() -> None: + schema = udf_schema(schema_name="unit-tests") + prof = DatasetProfile(schema) + data = pl.DataFrame({"col1": [42, 12, 7], "col2": [2, 3, 4], "col3": [2, 3, 4]}) + prof.track(data) + results = prof.view() + col1_summary = results.get_column("col1").to_summary_dict() + assert "counts/n" in col1_summary + col2_summary = results.get_column("col2").to_summary_dict() + assert "counts/n" in col2_summary + col3_summary = results.get_column("col3").to_summary_dict() + assert "counts/n" in col3_summary + add5_summary = results.get_column("add5").to_summary_dict() + assert "counts/n" in add5_summary + prod_summary = results.get_column("product").to_summary_dict() + assert prod_summary["counts/n"] == 3 + div_summary = results.get_column("ratio").to_summary_dict() + assert div_summary["distribution/n"] == 3 + + +@register_dataset_udf(["schema.col1"], schema_name="bob") +def bob(x: pl.DataFrame) -> pl.Series: + return x["schema.col1"] + + +@register_metric_udf("schema.col1", schema_name="bob") +def rob(x: Any) -> Any: + return x + + +@register_dataset_udf(["schema.col1"], "add5") +def fob(x: pl.DataFrame) -> pl.Series: + return x["schema.col1"] + 5 + + +def test_direct_udfs() -> None: + schema = udf_schema(schema_name=["", "bob"]) + data = pl.DataFrame({"col1": [42, 12, 7]}) + more_data, _ = schema.apply_udfs(polars=data) + udf_columns = set(more_data.columns) + + result = why.log(data, schema=schema).view() + profile_columns = set(result.get_columns()) + assert udf_columns == profile_columns + + result = why.log(more_data, schema=schema).view() + more_columns = set(result.get_columns()) + assert more_columns == profile_columns + + +@register_type_udf(Fractional, schema_name="unit-tests") +def square_type(x: pl.Series) -> pl.Series: + return x * x + + +def test_type_udf_dataframe() -> None: + schema = udf_schema(schema_name="unit-tests") + data = pl.DataFrame({"col1": [3.14, 42.0]}) + results = why.log(data, schema=schema).view() + assert "col1.square_type" in results.get_columns().keys() + summary = results.get_column("col1.square_type").to_summary_dict() + assert summary["counts/n"] == 2 + assert summary["types/fractional"] == 2 + + +@register_type_udf(float, schema_name="unit-tests") +def square_python_type(x: pl.Series) -> pl.Series: + return x * x + + +def test_python_type_udf() -> None: + schema = udf_schema(schema_name="unit-tests") + data = pl.DataFrame({"col1": [3.14, 42.0]}) + results = why.log(data, schema=schema).view() + assert "col1.square_python_type" in results.get_columns().keys() + summary = results.get_column("col1.square_python_type").to_summary_dict() + assert summary["counts/n"] == 2 + assert summary["types/fractional"] == 2 diff --git a/python/whylogs/api/logger/__init__.py b/python/whylogs/api/logger/__init__.py index da9e67a7be..a56ebbb97c 100644 --- a/python/whylogs/api/logger/__init__.py +++ b/python/whylogs/api/logger/__init__.py @@ -76,7 +76,7 @@ def log( name=name, trace_id=trace_id, tags=tags, - segment_key_values=segment_key_values + segment_key_values=segment_key_values, ) if dataset_timestamp is not None: result_set.set_dataset_timestamp(dataset_timestamp) diff --git a/python/whylogs/api/logger/experimental/logger/actor/data_logger.py b/python/whylogs/api/logger/experimental/logger/actor/data_logger.py index 11ebec8f10..dc8b923816 100644 --- a/python/whylogs/api/logger/experimental/logger/actor/data_logger.py +++ b/python/whylogs/api/logger/experimental/logger/actor/data_logger.py @@ -1,6 +1,7 @@ from abc import abstractmethod from typing import Any, Dict, Generic, List, Optional, TypeVar, Union +# TODO: stubs? try: import pandas as pd # type: ignore except ImportError: diff --git a/python/whylogs/api/logger/experimental/logger/actor/thread_rolling_logger.py b/python/whylogs/api/logger/experimental/logger/actor/thread_rolling_logger.py index 3b68ce34a7..755eff1586 100644 --- a/python/whylogs/api/logger/experimental/logger/actor/thread_rolling_logger.py +++ b/python/whylogs/api/logger/experimental/logger/actor/thread_rolling_logger.py @@ -27,6 +27,7 @@ from whylogs.api.writer import Writer from whylogs.api.writer.writer import Writable from whylogs.core import DatasetProfile, DatasetProfileView, DatasetSchema +from whylogs.core.input_resolver import _dataframe_or_dict # pyright: ignore[reportPrivateUsage,reportUnknownVariableType] from whylogs.core.view.segmented_dataset_profile_view import SegmentedDatasetProfileView try: @@ -70,11 +71,12 @@ def _track_segments(self, data: TrackData) -> None: if self._schema: if isinstance(data, List): - input_data = [self._schema._run_udfs(pandas=None, row=it)[1] for it in data] # type: ignore + input_data = [self._schema._run_udfs(df=None, row=it)[1] for it in data] # pyright: ignore[reportPrivateUsage, reportUnknownMemberType] else: df = data if isinstance(data, pd.DataFrame) else None - row = data if isinstance(data, dict) else None # pyright: ignore[reportUnknownVariableType] - df, row = self._schema._run_udfs(df, row) # type: ignore + row = data if isinstance(data, dict) else None + df, row = _dataframe_or_dict(df, None, None, row) + df, row = self._schema._run_udfs(df, row) # pyright: ignore[reportUnknownVariableType, reportUnknownMemberType, reportPrivateUsage] input_data: TrackData = cast(TrackData, df if df is not None else row) else: input_data = data diff --git a/python/whylogs/api/logger/logger.py b/python/whylogs/api/logger/logger.py index 39e08770cd..d1d80979ba 100644 --- a/python/whylogs/api/logger/logger.py +++ b/python/whylogs/api/logger/logger.py @@ -113,13 +113,13 @@ def log( dataframe, row = active_schema._run_udfs(dataframe, row) obj = None - # If segments are defined use segment_processing to return a SegmentedResultSet if active_schema and active_schema.segments: segmented_results: SegmentedResultSet = segment_processing( schema=active_schema, obj=obj, - pandas=pandas, + pandas=dataframe.pd_df if dataframe else None, + polars=dataframe.pl_df if dataframe else None, row=row, segment_cache=self._segment_cache, segment_key_values=segment_key_values, diff --git a/python/whylogs/api/logger/rolling.py b/python/whylogs/api/logger/rolling.py index b57ba02337..2e2541c309 100644 --- a/python/whylogs/api/logger/rolling.py +++ b/python/whylogs/api/logger/rolling.py @@ -15,7 +15,6 @@ from whylogs.api.writer import Writer from whylogs.core import DatasetProfile, DatasetProfileView, DatasetSchema from whylogs.core.dataframe_wrapper import DataFrameWrapper -from whylogs.core.stubs import pd from whylogs.core.view.segmented_dataset_profile_view import SegmentedDatasetProfileView logger = logging.getLogger(__name__) diff --git a/python/whylogs/api/logger/segment_processing.py b/python/whylogs/api/logger/segment_processing.py index 8e71204b1b..2e99175164 100644 --- a/python/whylogs/api/logger/segment_processing.py +++ b/python/whylogs/api/logger/segment_processing.py @@ -6,8 +6,8 @@ from whylogs.api.logger.result_set import SegmentedResultSet from whylogs.api.logger.segment_cache import SegmentCache from whylogs.core import DatasetSchema -from whylogs.core.dataset_profile import DatasetProfile from whylogs.core.dataframe_wrapper import DataFrameWrapper +from whylogs.core.dataset_profile import DatasetProfile from whylogs.core.input_resolver import _dataframe_or_dict from whylogs.core.segment import Segment from whylogs.core.segmentation_partition import ( @@ -36,7 +36,11 @@ def _process_segment( if profile is None: profile = DatasetProfile(schema) - profile.track(segmented_data, execute_udfs=False) + if isinstance(segmented_data, DataFrameWrapper): + profile.track(dataframe=segmented_data, execute_udfs=False) + else: + profile.track(segmented_data, execute_udfs=False) + segments[segment_key] = profile @@ -63,7 +67,7 @@ def _process_simple_partition( schema: DatasetSchema, segments: Dict[Segment, Any], columns: List[str], - pandas: Optional[pd.DataFrame] = None, + dataframe: Optional[DataFrameWrapper] = None, row: Optional[Mapping[str, Any]] = None, segment_cache: Optional[SegmentCache] = None, segment_key_values: Optional[Dict[str, str]] = None, @@ -71,23 +75,22 @@ def _process_simple_partition( explicit_keys = ( tuple(str(segment_key_values[k]) for k in sorted(segment_key_values.keys())) if segment_key_values else tuple() ) - if pandas is not None: - # simple means we can segment on column values - grouped_data = pandas.groupby(columns) - for group in grouped_data.groups.keys(): + if dataframe is not None: + group_keys = dataframe.group_keys(columns) + for group in group_keys: if isinstance(group, tuple) and any([_is_nan(x) for x in group]): evaluations = [] for val, col in zip(group, columns): if _is_nan(val): - evaluations.append((pandas[col].isna())) + evaluations.append(dataframe.get_nan_mask(col)) else: - evaluations.append((pandas[col] == val)) + evaluations.append(dataframe.get_val_mask(col, val)) mask = reduce(lambda x, y: x & y, evaluations) - pandas_segment = pandas[mask] + segment_frame = dataframe.filter(mask) else: - pandas_segment = grouped_data.get_group(group) + segment_frame = dataframe.get_group(columns, group) segment_key = _get_segment_from_group_key(group, partition_id, explicit_keys) - _process_segment(pandas_segment, segment_key, segments, schema, segment_cache) + _process_segment(segment_frame, segment_key, segments, schema, segment_cache) elif row: # TODO: consider if we need to combine with the column names segment_key = Segment(tuple(str(row[element]) for element in columns) + explicit_keys, partition_id) @@ -95,18 +98,18 @@ def _process_simple_partition( def _filter_inputs( - filter: SegmentFilter, pandas: Optional[pd.DataFrame] = None, row: Optional[Mapping[str, Any]] = None + filter: SegmentFilter, dataframe: Optional[DataFrameWrapper] = None, row: Optional[Mapping[str, Any]] = None ) -> Tuple[Optional[pd.DataFrame], Optional[Dict[str, Any]]]: assert ( filter.filter_function or filter.query_string ), f"must define at least a filter function or query string when specifying a segment filter: {filter}" - filtered_pandas = None + filtered_dataframe = None filtered_row = None - if pandas is not None: + if dataframe is not None: if filter.filter_function: - filtered_pandas = pandas[filter.filter_function] + filtered_dataframe = dataframe.filter(filter.filter_function) elif filter.query_string: - filtered_pandas = pandas.query(filter.query_string) + filtered_dataframe = dataframe.query(filter.query_string) elif row is not None: if filter.filter_function: filtered_row = filter.filter_function(row) @@ -114,7 +117,7 @@ def _filter_inputs( raise ValueError( "SegmentFilter query string not supported when logging rows, either don't specify a filter or implement the filter.filter_function" ) - return (filtered_pandas, filtered_row) + return (filtered_dataframe, filtered_row) def _grouped_dataframe(partition: SegmentationPartition, pandas: pd.DataFrame): @@ -135,17 +138,17 @@ def _log_segment( row: Optional[Mapping[str, Any]] = None, segment_cache: Optional[SegmentCache] = None, segment_key_values: Optional[Dict[str, str]] = None, + polars: Optional[pl.DataFrame] = None, ) -> Dict[Segment, Any]: segments: Dict[Segment, Any] = {} - dataframe, row = _dataframe_or_dict(obj, pandas, row=row) - pandas = dataframe.pd_df if dataframe else pandas + dataframe, row = _dataframe_or_dict(obj, pandas, polars, row) if partition.filter: - pandas, row = _filter_inputs(partition.filter, pandas, row) + dataframe, row = _filter_inputs(partition.filter, dataframe, row) if partition.simple: columns = partition.mapper.col_names if partition.mapper else None if columns: _process_simple_partition( - partition.id, schema, segments, columns, pandas, row, segment_cache, segment_key_values + partition.id, schema, segments, columns, dataframe, row, segment_cache, segment_key_values ) else: logger.error( @@ -163,6 +166,7 @@ def segment_processing( row: Optional[Dict[str, Any]] = None, segment_cache: Optional[SegmentCache] = None, segment_key_values: Optional[Dict[str, str]] = None, + polars: Optional[pl.DataFrame] = None, ) -> SegmentedResultSet: number_of_partitions = len(schema.segments) logger.info(f"The specified schema defines segments with {number_of_partitions} partitions.") @@ -191,6 +195,7 @@ def segment_processing( schema=schema, obj=obj, pandas=pandas, + polars=polars, row=row, segment_cache=segment_cache, segment_key_values=segment_key_values, diff --git a/python/whylogs/api/logger/transient.py b/python/whylogs/api/logger/transient.py index f118e349ed..9321d48691 100644 --- a/python/whylogs/api/logger/transient.py +++ b/python/whylogs/api/logger/transient.py @@ -3,7 +3,6 @@ from whylogs.api.logger.logger import Logger from whylogs.core import DatasetProfile, DatasetSchema from whylogs.core.dataframe_wrapper import DataFrameWrapper -from whylogs.core.stubs import pd class TransientLogger(Logger): diff --git a/python/whylogs/api/whylabs/session/notebook_logger.py b/python/whylogs/api/whylabs/session/notebook_logger.py index 0ef87a7b12..61a49f236e 100644 --- a/python/whylogs/api/whylabs/session/notebook_logger.py +++ b/python/whylogs/api/whylabs/session/notebook_logger.py @@ -6,7 +6,6 @@ from whylogs.api.whylabs.session.session_manager import get_current_session from whylogs.api.whylabs.session.session_types import InteractiveLogger as il from whylogs.api.whylabs.session.session_types import SessionType -from whylogs.core.dataframe_wrapper import DataFrameWrapper from whylogs.core.stubs import pd, pl diff --git a/python/whylogs/core/dataframe_wrapper.py b/python/whylogs/core/dataframe_wrapper.py index 32f31792a0..0a41c7ff89 100644 --- a/python/whylogs/core/dataframe_wrapper.py +++ b/python/whylogs/core/dataframe_wrapper.py @@ -1,10 +1,12 @@ -from typing import List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union from whylogs.core.stubs import pd, pl class DataFrameWrapper: - def __init__(self, pandas: Optional[pd.DataFrame]=None, polars: Optional[pl.DataFrame]=None): + def __init__(self, pandas: Optional[pd.DataFrame] = None, polars: Optional[pl.DataFrame] = None): + # TODO: __init__(self, df: Union[pd.DataFrame, pl.DataFrame]): with isinstance + # TODO: maybe PandasDataFrame, PolarsDataFrame <: DataFrameWrapper if pandas is not None and polars is not None: raise ValueError("Cannot pass both pandas and polars params") if pandas is None and polars is None: @@ -13,11 +15,144 @@ def __init__(self, pandas: Optional[pd.DataFrame]=None, polars: Optional[pl.Data self.pd_df = pandas self.pl_df = polars - self.column_names = list(pandas.columns) if pandas is not None else polars.columns - self.dtypes = pandas.dtypes if pandas is not None else polars.schema - self.empty = pandas.empty if pandas is not None else len(polars) == 0 + self.column_names = list(pandas.columns) if pandas is not None else polars.columns # type: ignore + self.dtypes = pandas.dtypes if pandas is not None else polars.schema # type: ignore + self.empty = pandas.empty if pandas is not None else len(polars) == 0 # type: ignore + + def _update(self) -> None: + self.column_names = list(self.pd_df.columns) if self.pd_df is not None else self.pl_df.columns # type: ignore + self.dtypes = self.pd_df.dtypes if self.pd_df is not None else self.pl_df.schema # type: ignore + self.empty = self.pd_df.empty if self.pd_df is not None else len(self.pl_df) == 0 # type: ignore def get(self, column: str) -> Optional[Union[pd.Series, pl.Series]]: if self.pd_df is not None: return self.pd_df.get(column) - return self.pl_df[column] if column in self.pl_df.schema else None + return self.pl_df[column] if column in self.pl_df.schema else None # type: ignore + + def filter(self, filter: Any) -> Optional["DataFrameWrapper"]: + if self.pd_df is not None: + return DataFrameWrapper(pandas=self.pd_df[filter]) + if self.pl_df is not None: + return DataFrameWrapper(polars=self.pl_df.filter(filter)) + return None + + def query(self, query: str) -> Optional["DataFrameWrapper"]: + if self.pd_df is not None: + return DataFrameWrapper(pandas=self.pd_df.query(query)) + if self.pl_df is not None: + ctx = pl.SQLContext(population=self.pl_df, eager=True) + return ctx.execute(query) + return None + + def group_keys(self, columns: List[str]) -> List[Tuple[Any]]: + if self.pd_df is not None: + return self.pd_df.groupby(columns).groups.keys() + elif self.pl_df is not None: + return [x for x, y in self.pl_df.group_by(columns)] + return [] + + def groupby( + self, columns: List[str] + ) -> Any: # Union[pl.dataframe.group_by.GroupBy, pd.core.groupby.generic.DataFrameGroupBy] + if self.pd_df is not None: + grouped = self.pd_df.groupby(columns) + return grouped + d = {g: grouped.get_group(g) for g in grouped.groups.keys()} + return d + elif self.pl_df is not None: + return self.pl_df.group_by(columns) + + def get_nan_mask(self, column: str) -> List[bool]: + if self.pd_df is not None: + return self.pd_df[column].isna() # .to_list() + elif self.pl_df is not None: + return self.pl_df[column].is_nan() # .to_list() + return [] + + def get_val_mask(self, column: str, value: Any) -> List[bool]: + if self.pd_df is not None: + return self.pd_df[column] == value # .to_list() + elif self.pl_df is not None: + return self.pl_df[column] == value # .to_list() + return [] + + def get_group(self, columns: List[str], key: Tuple[Any]) -> Any: + if self.pd_df is not None: + grouped = self.pd_df.groupby(columns) + return grouped.get_group(key) + elif self.pl_df is not None: + grouped = self.pl_df.group_by(columns) + return {k: g for k, g in grouped}[key] + raise ValueError("Cannot group empty DataFrame") + + def concat(self, other: "DataFrameWrapper") -> None: + if self.pd_df is not None: + self.pd_df = pd.concat([self.pd_df, other.pd_df], axis=1) + self._update() + return + elif self.pl_df is not None: + self.pl_df = pl.concat([self.pl_df, other.pl_df], how="horizontal") + self._update() + return + raise ValueError("Cannot concatenate empty DataFrame") + + def drop_columns(self, columns: List[str]) -> None: + if self.pd_df is not None: + self.pd_df = self.pd_df.drop(columns=columns) + self._update() + return + elif self.pl_df is not None: + self.pl_df = self.pl_df.drop(columns) + self._update() + return + raise ValueError("Cannot drop columns from empty DataFrame") + + def __getitem__(self, key: str) -> "DataFrameWrapper": + if self.pd_df is not None: + return DataFrameWrapper(pandas=pd.DataFrame(self.pd_df[key])) + elif self.pl_df is not None: + return DataFrameWrapper(polars=pl.DataFrame(self.pl_df[key])) + raise ValueError("Cannot index empty DataFrame") + + def __setitem__(self, key: str, value: Union[pd.Series, pl.Series]) -> None: + if self.pd_df is not None: + self.pd_df[key] = value + self._update() + return + elif self.pl_df is not None: + self.pl_df = self.pl_df.with_columns(value.alias(key)) + self._update() + return + raise ValueError("Cannot index empty DataFrame") + + def apply_udf(self, udf: Callable) -> Union[pd.Series, pl.Series]: + if self.pd_df is not None: + return pd.Series(udf(self.pd_df)) + elif self.pl_df is not None: + return self.pl_df.map_rows(udf)["map"] + raise ValueError("Cannot apply UDFs to empty DataFrame") + + def apply_type_udf(self, udf: Callable) -> Union[pd.Series, pl.Series]: + if self.pd_df is not None: + return pd.Series(udf(self.pd_df[self.pd_df.columns[0]])) + elif self.pl_df is not None: + return pl.Series(self.pl_df[self.pl_df.columns[0]].map_elements(udf)) + raise ValueError("Cannot apply UDFs to empty DataFrame") + + def apply_multicolumn_udf(self, udf: Callable) -> "DataFrameWrapper": + if self.pd_df is not None: + return DataFrameWrapper(pandas=udf(self.pd_df)) + elif self.pl_df is not None: + return DataFrameWrapper(polars=udf(self.pl_df)) + raise ValueError("Cannot apply UDFs to empty DataFrame") + + def rename(self, columns: Dict[str, str]) -> None: + if self.pd_df is not None: + self.pd_df = self.pd_df.rename(columns=columns) + self._update() + return + elif self.pl_df is not None: + self.pl_df = self.pl_df.rename(columns) + self._update() + return + raise ValueError("Cannot rename an empty DataFrame") diff --git a/python/whylogs/core/input_resolver.py b/python/whylogs/core/input_resolver.py index 99c9145560..1a5be55887 100644 --- a/python/whylogs/core/input_resolver.py +++ b/python/whylogs/core/input_resolver.py @@ -5,7 +5,10 @@ def _dataframe_or_dict( - obj: Any, pandas: Optional[pd.DataFrame] = None, polars: Optional[pl.DataFrame] = None, row: Optional[Mapping[str, Any]] = None + obj: Any, + pandas: Optional[pd.DataFrame] = None, + polars: Optional[pl.DataFrame] = None, + row: Optional[Mapping[str, Any]] = None, ) -> Tuple[Optional[DataFrameWrapper], Optional[Mapping[str, Any]]]: if obj is not None: if pandas is not None: diff --git a/python/whylogs/core/preprocessing.py b/python/whylogs/core/preprocessing.py index e4651476ef..9ca11b0e65 100644 --- a/python/whylogs/core/preprocessing.py +++ b/python/whylogs/core/preprocessing.py @@ -169,9 +169,7 @@ def _pandas_split(self, series: pd.Series, parse_numeric_string: bool = False) - bool_mask_where_true = non_null_series.apply(lambda x: pdc.is_bool(x) and x) int_mask = non_null_series.apply(lambda x: pdc.is_number(x) and pdc.is_integer(x) and not pdc.is_bool(x)) str_mask = non_null_series.apply(lambda x: isinstance(x, str)) - tensor_mask = non_null_series.apply( - lambda x: isinstance(x, (list, np.ndarray)) and _is_tensorable(x) - ) + tensor_mask = non_null_series.apply(lambda x: isinstance(x, (list, np.ndarray)) and _is_tensorable(x)) floats = non_null_series[float_mask] if non_null_series[int_mask].empty: diff --git a/python/whylogs/experimental/core/udf_schema.py b/python/whylogs/experimental/core/udf_schema.py index 4cc612d45a..9f9016eeb1 100644 --- a/python/whylogs/experimental/core/udf_schema.py +++ b/python/whylogs/experimental/core/udf_schema.py @@ -16,12 +16,13 @@ Union, ) +from whylogs.core.dataframe_wrapper import DataFrameWrapper from whylogs.core.datatypes import DataType, StandardTypeMapper, TypeMapper from whylogs.core.metrics.metrics import Metric, MetricConfig from whylogs.core.resolvers import NO_FI_RESOLVER, MetricSpec, ResolverSpec from whylogs.core.schema import DeclarativeSchema from whylogs.core.segmentation_partition import SegmentationPartition -from whylogs.core.stubs import pd +from whylogs.core.stubs import pd, pl from whylogs.core.validators.validator import Validator from whylogs.experimental.core.metrics.udf_metric import ( _reset_metric_udfs, @@ -109,7 +110,7 @@ def _apply_udf_on_row( def _apply_udfs_on_dataframe( - pandas: pd.DataFrame, udfs: Dict, new_df: pd.DataFrame, input_cols: Collection[str] + df: DataFrameWrapper, udfs: Dict, new_df: DataFrameWrapper, input_cols: Collection[str] ) -> None: """multiple input columns, single output column""" for new_col, udf in udfs.items(): @@ -117,23 +118,24 @@ def _apply_udfs_on_dataframe( continue try: - new_df[new_col] = pd.Series(udf(pandas)) + tmp = df.apply_udf(udf) + new_df[new_col] = tmp except Exception as e: # noqa - new_df[new_col] = pd.Series([None]) - logger.exception(f"Evaluating UDF {new_col} failed on columns {pandas.keys()} with error {e}") + new_df[new_col] = df.apply_udf(lambda x: float("nan")) # should be None, but can't infer type + logger.exception(f"Evaluating UDF {new_col} failed on columns {df.column_names} with error {e}") def _apply_udf_on_dataframe( name: str, prefix: Optional[str], - pandas: pd.DataFrame, + df: DataFrameWrapper, udf: Callable, - new_df: pd.DataFrame, + new_df: DataFrameWrapper, input_cols: Collection[str], ) -> None: """ multiple input columns, multiple output columns - udf(Union[Dict[str, List], pd.DataFrame]) -> Union[Dict[str, List], pd.DataFrame] + udf(Union[Dict[str, List], pd.DataFrame, pl.DataFrame]) -> Union[Dict[str, List], pd.DataFrame, pl.DataFrame] """ def add_prefix(col): @@ -141,24 +143,23 @@ def add_prefix(col): try: # TODO: I think it's OKAY if udf returns a dictionary - udf_output = pd.DataFrame(udf(pandas)) - udf_output = udf_output.rename(columns={old: add_prefix(old) for old in udf_output.keys()}) - for new_col in udf_output.keys(): - new_df[new_col] = udf_output[new_col] + udf_output = df.apply_multicolumn_udf(udf) # pd.DataFrame(udf(pandas)) + udf_output.rename(columns={old: add_prefix(old) for old in udf_output.column_names}) + new_df.concat(udf_output) except Exception as e: # noqa - logger.exception(f"Evaluating UDF {name} failed on columns {pandas.keys()} with error {e}") + logger.exception(f"Evaluating UDF {name} failed on columns {df.column_names} with error {e}") return pd.DataFrame() -def _apply_type_udfs(pandas: pd.Series, udfs: Dict, new_df: pd.DataFrame, input_cols: Collection[str]) -> None: +def _apply_type_udfs(df: DataFrameWrapper, udfs: Dict, new_df: pd.DataFrame, input_cols: Collection[str]) -> None: for new_col, udf in udfs.items(): if new_col in input_cols: continue try: - new_df[new_col] = pd.Series(udf(pandas)) + new_df[new_col] = df.apply_type_udf(udf) except Exception as e: # noqa - new_df[new_col] = pd.Series([None]) + new_df[new_col] = df.apply_udf(lambda x: float("nan")) # should be None, but can't infer type logger.exception(f"Evaluating UDF {new_col} failed on column {new_col} with error {e}") @@ -222,45 +223,62 @@ def _run_udfs_on_row( udfs = {f"{column}.{key}": spec.udfs[key] for key in spec.udfs.keys()} _apply_udfs_on_row([value], udfs, new_columns, input_cols) - def _run_udfs_on_dataframe(self, pandas: pd.DataFrame, new_df: pd.DataFrame, input_cols: Collection[str]) -> None: + def _run_udfs_on_dataframe( + self, df: DataFrameWrapper, new_df: DataFrameWrapper, input_cols: Collection[str] + ) -> None: for spec in self.multicolumn_udfs: - if spec.column_names and set(spec.column_names).issubset(set(pandas.keys())): + if spec.column_names and set(spec.column_names).issubset(set(df.column_names)): if spec.udf is not None: _apply_udf_on_dataframe( - spec.name, spec.prefix, pandas[spec.column_names], spec.udf, new_df, input_cols # type: ignore + spec.name, spec.prefix, df[spec.column_names], spec.udf, new_df, input_cols # type: ignore ) else: - _apply_udfs_on_dataframe(pandas[spec.column_names], spec.udfs, new_df, input_cols) + _apply_udfs_on_dataframe(df[spec.column_names], spec.udfs, new_df, input_cols) - for column, dtype in pandas.dtypes.items(): + for column, dtype in df.dtypes.items(): why_type = type(self.type_mapper(dtype)) for spec in self.type_udfs[why_type]: udfs = {f"{column}.{key}": spec.udfs[key] for key in spec.udfs.keys()} - _apply_type_udfs(pandas[column], udfs, new_df, input_cols) + _apply_type_udfs(df[column], udfs, new_df, input_cols) def _run_udfs( - self, pandas: Optional[pd.DataFrame] = None, row: Optional[Dict[str, Any]] = None - ) -> Tuple[Optional[pd.DataFrame], Optional[Mapping[str, Any]]]: + self, df: Optional[DataFrameWrapper] = None, row: Optional[Dict[str, Any]] = None + ) -> Tuple[Optional[DataFrameWrapper], Optional[Dict[str, Any]]]: new_columns = deepcopy(row) if row else None - new_df = pd.DataFrame() + if df: + new_df = ( + DataFrameWrapper(pandas=pd.DataFrame()) + if df.pd_df is not None + else DataFrameWrapper(polars=pl.DataFrame()) + ) + else: + new_df = None + if row is not None: self._run_udfs_on_row(row, new_columns, row.keys()) # type: ignore if self.drop_columns: for col in set(row.keys()).intersection(self.drop_columns): row.pop(col) - if pandas is not None: - self._run_udfs_on_dataframe(pandas, new_df, pandas.keys()) - new_df = pd.concat([pandas, new_df], axis=1) + if df is not None: + self._run_udfs_on_dataframe(df, new_df, df.column_names) + df.concat(new_df) if self.drop_columns: - new_df = new_df.drop(columns=list(set(new_df.keys()).intersection(self.drop_columns))) - - return new_df if pandas is not None else None, new_columns + df.drop_columns(columns=list(set(df.column_names).intersection(self.drop_columns))) + return df if df is not None else None, new_columns def apply_udfs( - self, pandas: Optional[pd.DataFrame] = None, row: Optional[Dict[str, Any]] = None - ) -> Tuple[Optional[pd.DataFrame], Optional[Mapping[str, Any]]]: - return self._run_udfs(pandas, row) + self, + pandas: Optional[pd.DataFrame] = None, + row: Optional[Dict[str, Any]] = None, + polars: Optional[pl.DataFrame] = None, + ) -> Tuple[Optional[Union[pd.DataFrame, pl.DataFrame]], Optional[Mapping[str, Any]]]: + df = DataFrameWrapper(pandas, polars) if (pandas is not None or polars is not None) else None + df, row = self._run_udfs(df, row) + if df is not None: + df = df.pd_df if df.pd_df is not None else df.pl_df + + return df, row _multicolumn_udfs: Dict[str, List[UdfSpec]] = defaultdict(list) From 66abf26a1567a842d6ec3bc4a5b84268050f2428 Mon Sep 17 00:00:00 2001 From: Richard Rogers Date: Thu, 31 Oct 2024 05:30:42 +0000 Subject: [PATCH 03/41] make UDF pandas and polars tests work at the same time --- .../core/test_udf_schema_polars.py | 66 +++++++++---------- .../whylogs/experimental/core/udf_schema.py | 2 +- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/python/tests/experimental/core/test_udf_schema_polars.py b/python/tests/experimental/core/test_udf_schema_polars.py index 560d646e9b..04ba76e327 100644 --- a/python/tests/experimental/core/test_udf_schema_polars.py +++ b/python/tests/experimental/core/test_udf_schema_polars.py @@ -41,24 +41,24 @@ def test_udf_polars() -> None: assert len(data.columns) == 1 -@register_multioutput_udf(["xx1", "xx2"]) +@register_multioutput_udf(["xx1", "xx2"], schema_name="polars") def f1(x) -> pl.DataFrame: return pl.DataFrame({"foo": x["xx1"], "bar": x["xx2"]}) -@register_multioutput_udf(["xx1", "xx2"], prefix="blah") +@register_multioutput_udf(["xx1", "xx2"], prefix="blah", schema_name="polars") def f2(x) -> pl.DataFrame: return pl.DataFrame({"foo": x["xx1"], "bar": x["xx2"]}) -@register_multioutput_udf(["xx1", "xx2"], no_prefix=True) +@register_multioutput_udf(["xx1", "xx2"], no_prefix=True, schema_name="polars") def no_prefix_udf(x) -> pl.DataFrame: df = pl.DataFrame({"foo": x["xx1"], "bar": x["xx2"]}) return df def test_multioutput_udf_dataframe() -> None: - schema = udf_schema() + schema = udf_schema(schema_name="polars") df = pl.DataFrame({"xx1": [42, 7], "xx2": [3.14, 2.72]}) results = why.log(df, schema=schema).view() assert results.get_column("f1.foo") is not None @@ -70,7 +70,7 @@ def test_multioutput_udf_dataframe() -> None: def test_drop_columns() -> None: - schema = udf_schema(drop_columns={"xx1", "xx2"}) + schema = udf_schema(drop_columns={"xx1", "xx2"}, schema_name="polars") df = pl.DataFrame({"xx1": [42, 7], "xx2": [3.14, 2.72]}) results = why.log(df, schema=schema).view() assert results.get_column("xx1") is None @@ -84,13 +84,12 @@ def test_drop_columns() -> None: assert results.get_column("bar") is not None -@register_dataset_udf(["col1"], schema_name="unit-tests") +@register_dataset_udf(["col1"], schema_name="polars-unit-tests") def add5(x) -> float: return x[0]+5 def square(x: Tuple) -> float: - print(f"square(): {type(x)}\n{x}") return x[0] * x[0] @@ -106,7 +105,7 @@ def do_something_important(validator_name, condition_name: str, value: Any, colu return -@condition_validator(["col1", "add5"], condition_name="less_than_four", actions=[do_something_important]) +@condition_validator(["col1", "add5"], condition_name="less_than_four", actions=[do_something_important], schema_name="polars") def lt_4(x): return x < 4 @@ -114,7 +113,7 @@ def lt_4(x): def test_validator_udf_polars() -> None: global action_list data = pl.DataFrame({"col1": [1, 3, 7]}) - schema = udf_schema() + schema = udf_schema(schema_name="polars", include_default_schema=False) why.log(data, schema=schema).view() assert 7 in action_list @@ -122,19 +121,20 @@ def test_validator_udf_polars() -> None: def test_validator_double_register_udf_polars() -> None: global action_list - @condition_validator(["col1", "add5"], condition_name="less_than_four", actions=[do_something_important]) + @condition_validator(["col1", "add5"], condition_name="less_than_four", actions=[do_something_important], schema_name="polars") def lt_4_2(x): return x < 4 - schema = udf_schema() + schema = udf_schema(schema_name="polars", include_default_schema=False) # registering the same validator twice should keep only the latest registration assert schema.validators["col1"][0].conditions["less_than_four"].__name__ == "lt_4_2" + print(f"schema.validators['col1'] = {schema.validators['col1']}") assert len(schema.validators["col1"]) == 1 def test_decorator_polars() -> None: extra_spec = UdfSpec(["col1"], {"sqr": square}) - schema = udf_schema([extra_spec], STANDARD_RESOLVER, schema_name="unit-tests") + schema = udf_schema([extra_spec], STANDARD_RESOLVER, schema_name="polars-unit-tests") data = pl.DataFrame({"col1": [42, 12, 7], "col2": ["a", "b", "c"]}) results = why.log(data, schema=schema).view() col1_summary = results.get_column("col1").to_summary_dict() @@ -146,14 +146,14 @@ def test_decorator_polars() -> None: @register_dataset_udf( - ["col1"], "annihilate_me", anti_metrics=[CardinalityMetric, DistributionMetric], schema_name="unit-tests" + ["col1"], "annihilate_me", anti_metrics=[CardinalityMetric, DistributionMetric], schema_name="polars-unit-tests" ) def plus1(x) -> float: return x[0] + 1 def test_anti_resolver() -> None: - schema = udf_schema(schema_name="unit-tests") + schema = udf_schema(schema_name="polars-unit-tests") data = pl.DataFrame({"col1": [42, 12, 7], "col2": ["a", "b", "c"]}) results = why.log(data, schema=schema).view() col1_summary = results.get_column("col1").to_summary_dict() @@ -171,23 +171,23 @@ def test_anti_resolver() -> None: assert "cardinality/est" not in plus1_summary -@register_dataset_udf(["col1"], "colliding_name", namespace="pluto", schema_name="unit-tests") +@register_dataset_udf(["col1"], "colliding_name", namespace="pluto", schema_name="polars-unit-tests") def a_function(x): return x[0] -@register_dataset_udf(["col1"], "colliding_name", namespace="neptune", schema_name="unit-tests") +@register_dataset_udf(["col1"], "colliding_name", namespace="neptune", schema_name="polars-unit-tests") def another_function(x): return x[0] -@register_dataset_udf(["col1", "col2"], "product", schema_name="unit-tests") +@register_dataset_udf(["col1", "col2"], "product", schema_name="polars-unit-tests") def times(x: Tuple) -> float: return x[0] * x[1] @register_dataset_udf( - ["col1", "col3"], metrics=[MetricSpec(StandardMetric.distribution.value)], schema_name="unit-tests" + ["col1", "col3"], metrics=[MetricSpec(StandardMetric.distribution.value)], schema_name="polars-unit-tests" ) def ratio(x: Tuple) -> float: return x[0] / x[1] @@ -210,7 +210,7 @@ def test_multicolumn_udf_pandas() -> None: ] extra_spec = UdfSpec(["col1"], {"sqr": square}) - schema = udf_schema([extra_spec], count_only, schema_name="unit-tests") + schema = udf_schema([extra_spec], count_only, schema_name="polars-unit-tests") data = pl.DataFrame({"col1": [42, 12, 7], "col2": [2, 3, 4], "col3": [2, 3, 4]}) results = why.log(data, schema=schema).view() col1_summary = results.get_column("col1").to_summary_dict() @@ -235,7 +235,7 @@ def test_multicolumn_udf_pandas() -> None: n: int = 0 -@register_dataset_udf(["oops"], schema_name="unit-tests") +@register_dataset_udf(["oops"], schema_name="polars-unit-tests") def exothermic(x: pl.DataFrame) -> pl.Series: global n n += 1 @@ -248,7 +248,7 @@ def exothermic(x: pl.DataFrame) -> pl.Series: def test_udf_throws_polars() -> None: global n n = 0 - schema = udf_schema(schema_name="unit-tests") + schema = udf_schema(schema_name="polars-unit-tests") df = pl.DataFrame({"oops": [1, 2, 3, 4], "ok": [5, 6, 7, 8]}) results = why.log(df, schema=schema).view() assert "exothermic" in results.get_columns() @@ -264,7 +264,7 @@ def bar(x: Any) -> Any: def test_udf_metric_resolving() -> None: - schema = udf_schema(schema_name="unit-tests") + schema = udf_schema(schema_name="polars-unit-tests") df = pl.DataFrame({"col1": [1, 2, 3], "foo": [1, 2, 3]}) results = why.log(df, schema=schema).view() assert "add5" in results.get_columns() @@ -276,7 +276,7 @@ def test_udf_metric_resolving() -> None: def test_udf_segmentation_pandas() -> None: column_segments = segment_on_column("product") - segmented_schema = udf_schema(segments=column_segments, schema_name="unit-tests") + segmented_schema = udf_schema(segments=column_segments, schema_name="polars-unit-tests") data = pl.DataFrame({"col1": [42, 12, 7], "col2": [2, 3, 4], "col3": [2, 3, 4]}) results = why.log(data, schema=segmented_schema) assert len(results.segments()) == 3 @@ -284,14 +284,14 @@ def test_udf_segmentation_pandas() -> None: def test_udf_segmentation_obj() -> None: column_segments = segment_on_column("product") - segmented_schema = udf_schema(segments=column_segments, schema_name="unit-tests") + segmented_schema = udf_schema(segments=column_segments, schema_name="polars-unit-tests") data = {"col1": 42, "col2": 2, "col3": 2} results = why.log(data, schema=segmented_schema) assert len(results.segments()) == 1 def test_udf_track() -> None: - schema = udf_schema(schema_name="unit-tests") + schema = udf_schema(schema_name="polars-unit-tests") prof = DatasetProfile(schema) data = pl.DataFrame({"col1": [42, 12, 7], "col2": [2, 3, 4], "col3": [2, 3, 4]}) prof.track(data) @@ -310,23 +310,23 @@ def test_udf_track() -> None: assert div_summary["distribution/n"] == 3 -@register_dataset_udf(["schema.col1"], schema_name="bob") +@register_dataset_udf(["schema.col1"], schema_name="polars-bob") def bob(x: pl.DataFrame) -> pl.Series: return x["schema.col1"] -@register_metric_udf("schema.col1", schema_name="bob") +@register_metric_udf("schema.col1", schema_name="polars-bob") def rob(x: Any) -> Any: return x -@register_dataset_udf(["schema.col1"], "add5") +@register_dataset_udf(["schema.col1"], "add5", schema_name="polars") def fob(x: pl.DataFrame) -> pl.Series: return x["schema.col1"] + 5 def test_direct_udfs() -> None: - schema = udf_schema(schema_name=["", "bob"]) + schema = udf_schema(schema_name=["polars", "polars-bob"]) data = pl.DataFrame({"col1": [42, 12, 7]}) more_data, _ = schema.apply_udfs(polars=data) udf_columns = set(more_data.columns) @@ -340,13 +340,13 @@ def test_direct_udfs() -> None: assert more_columns == profile_columns -@register_type_udf(Fractional, schema_name="unit-tests") +@register_type_udf(Fractional, schema_name="polars-unit-tests") def square_type(x: pl.Series) -> pl.Series: return x * x def test_type_udf_dataframe() -> None: - schema = udf_schema(schema_name="unit-tests") + schema = udf_schema(schema_name="polars-unit-tests") data = pl.DataFrame({"col1": [3.14, 42.0]}) results = why.log(data, schema=schema).view() assert "col1.square_type" in results.get_columns().keys() @@ -355,13 +355,13 @@ def test_type_udf_dataframe() -> None: assert summary["types/fractional"] == 2 -@register_type_udf(float, schema_name="unit-tests") +@register_type_udf(float, schema_name="polars-unit-tests") def square_python_type(x: pl.Series) -> pl.Series: return x * x def test_python_type_udf() -> None: - schema = udf_schema(schema_name="unit-tests") + schema = udf_schema(schema_name="polars-unit-tests") data = pl.DataFrame({"col1": [3.14, 42.0]}) results = why.log(data, schema=schema).view() assert "col1.square_python_type" in results.get_columns().keys() diff --git a/python/whylogs/experimental/core/udf_schema.py b/python/whylogs/experimental/core/udf_schema.py index 9f9016eeb1..06fdcdce27 100644 --- a/python/whylogs/experimental/core/udf_schema.py +++ b/python/whylogs/experimental/core/udf_schema.py @@ -498,7 +498,7 @@ def udf_schema( for name in schema_names: resolver_specs += _resolver_specs[name] - validators = generate_validators(validators, name, include_default_schema=True) + validators = generate_validators(validators, name, include_default_schema=include_default_schema) resolver_specs += generate_udf_resolvers(schema_name, include_default_schema) return UdfSchema( From b5bf95c9f678167dd242c6ea049debe2d12ec4d5 Mon Sep 17 00:00:00 2001 From: Richard Rogers Date: Thu, 31 Oct 2024 07:37:51 +0000 Subject: [PATCH 04/41] work --- python/poetry.lock | 276 +++--------------- python/pyproject.toml | 2 +- .../tests/api/logger/test_segments_polars.py | 54 ++-- python/tests/core/test_performance_polars.py | 8 +- .../core/test_udf_schema_polars.py | 21 +- 5 files changed, 73 insertions(+), 288 deletions(-) diff --git a/python/poetry.lock b/python/poetry.lock index 3fbcbf4409..64da8cb3e6 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. [[package]] name = "2to3" @@ -97,7 +97,6 @@ files = [ [package.dependencies] lazy-object-proxy = ">=1.4.0" setuptools = ">=20.0" -typed-ast = {version = ">=1.4.0,<2.0", markers = "implementation_name == \"cpython\" and python_version < \"3.8\""} typing-extensions = {version = ">=3.10", markers = "python_version < \"3.10\""} wrapt = ">=1.11,<2" @@ -112,9 +111,6 @@ files = [ {file = "attrs-23.2.0.tar.gz", hash = "sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30"}, ] -[package.dependencies] -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} - [package.extras] cov = ["attrs[tests]", "coverage[toml] (>=5.3)"] dev = ["attrs[tests]", "pre-commit"] @@ -225,7 +221,6 @@ mypy-extensions = ">=0.4.3" pathspec = ">=0.9.0" platformdirs = ">=2" tomli = {version = ">=1.1.0", markers = "python_full_version < \"3.11.0a7\""} -typed-ast = {version = ">=1.4.2", markers = "python_version < \"3.8\" and implementation_name == \"cpython\""} typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} [package.extras] @@ -525,18 +520,6 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} - -[[package]] -name = "cloudpickle" -version = "2.2.1" -description = "Extended pickling support for Python objects" -optional = true -python-versions = ">=3.6" -files = [ - {file = "cloudpickle-2.2.1-py3-none-any.whl", hash = "sha256:61f594d1f4c295fa5cd9014ceb3a1fc4a70b0de1164b94fbc2d854ccba056f9f"}, - {file = "cloudpickle-2.2.1.tar.gz", hash = "sha256:d89684b8de9e34a2a43b3460fbca07d09d6e25ce858df4d5a44240403b6178f5"}, -] [[package]] name = "cloudpickle" @@ -989,7 +972,6 @@ files = [ ] [package.dependencies] -importlib-metadata = {version = ">=1.1.0,<4.3", markers = "python_version < \"3.8\""} mccabe = ">=0.7.0,<0.8.0" pycodestyle = ">=2.9.0,<2.10.0" pyflakes = ">=2.5.0,<2.6.0" @@ -1159,7 +1141,6 @@ files = [ [package.dependencies] gitdb = ">=4.0.1,<5" -typing-extensions = {version = ">=3.7.4.3", markers = "python_version < \"3.8\""} [package.extras] doc = ["sphinx (==4.3.2)", "sphinx-autodoc-typehints", "sphinx-rtd-theme", "sphinxcontrib-applehelp (>=1.0.2,<=1.0.4)", "sphinxcontrib-devhelp (==1.0.2)", "sphinxcontrib-htmlhelp (>=2.0.0,<=2.0.1)", "sphinxcontrib-qthelp (==1.0.3)", "sphinxcontrib-serializinghtml (==1.1.5)"] @@ -1225,7 +1206,6 @@ files = [ [package.dependencies] google-api-core = ">=1.31.6,<2.0.dev0 || >2.3.0,<3.0.0dev" google-auth = ">=1.25.0,<3.0dev" -importlib-metadata = {version = ">1.0.0", markers = "python_version < \"3.8\""} [package.extras] grpc = ["grpcio (>=1.38.0,<2.0dev)", "grpcio-status (>=1.38.0,<2.0.dev0)"] @@ -1400,7 +1380,6 @@ files = [ ] [package.dependencies] -typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} zipp = ">=0.5" [package.extras] @@ -1608,17 +1587,6 @@ files = [ {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] -[[package]] -name = "joblib" -version = "1.3.2" -description = "Lightweight pipelining with Python functions" -optional = true -python-versions = ">=3.7" -files = [ - {file = "joblib-1.3.2-py3-none-any.whl", hash = "sha256:ef4331c65f239985f3f2220ecc87db222f08fd22097a3dd5698f693875f8cbb9"}, - {file = "joblib-1.3.2.tar.gz", hash = "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1"}, -] - [[package]] name = "joblib" version = "1.4.2" @@ -1643,11 +1611,9 @@ files = [ [package.dependencies] attrs = ">=17.4.0" -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} importlib-resources = {version = ">=1.4.0", markers = "python_version < \"3.9\""} pkgutil-resolve-name = {version = ">=1.3.10", markers = "python_version < \"3.9\""} pyrsistent = ">=0.14.0,<0.17.0 || >0.17.0,<0.17.1 || >0.17.1,<0.17.2 || >0.17.2" -typing-extensions = {version = "*", markers = "python_version < \"3.8\""} [package.extras] format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] @@ -1801,7 +1767,6 @@ files = [ [package.dependencies] mdurl = ">=0.1,<1.0" -typing_extensions = {version = ">=3.7.4", markers = "python_version < \"3.8\""} [package.extras] benchmarking = ["psutil", "pytest", "pytest-benchmark"] @@ -1948,36 +1913,6 @@ files = [ {file = "mistune-3.0.2.tar.gz", hash = "sha256:fc7f93ded930c92394ef2cb6f04a8aabab4117a91449e72dcc8dfa646a508be8"}, ] -[[package]] -name = "mlflow-skinny" -version = "1.27.0" -description = "MLflow: A Platform for ML Development and Productionization" -optional = true -python-versions = ">=3.7" -files = [ - {file = "mlflow-skinny-1.27.0.tar.gz", hash = "sha256:77240a1bee2d5bbc2bf8eb9609b22fa43a2381067c3728d8c6539702962a268a"}, - {file = "mlflow_skinny-1.27.0-py3-none-any.whl", hash = "sha256:48462b9675a8365d8bd44357db29886098ee2723dbc925e35b16517e1ca43983"}, -] - -[package.dependencies] -click = ">=7.0" -cloudpickle = "*" -databricks-cli = ">=0.8.7" -entrypoints = "*" -gitpython = ">=2.1.0" -importlib-metadata = ">=3.7.0,<4.7.0 || >4.7.0" -packaging = "*" -protobuf = ">=3.12.0" -pytz = "*" -pyyaml = ">=5.1" -requests = ">=2.17.3" - -[package.extras] -aliyun-oss = ["aliyunstoreplugin"] -extras = ["azureml-core (>=1.2.0)", "boto3", "google-cloud-storage (>=1.30.0)", "kubernetes", "mlserver (>=0.5.3)", "mlserver-mlflow (>=0.5.3)", "pyarrow", "pysftp", "scikit-learn", "virtualenv"] -pipelines = ["Jinja2 (>=3.0)", "ipython (>=7.0)", "markdown (>=3.3)", "pandas-profiling (>=3.1)", "pyarrow (>=7.0)", "scikit-learn (>=1.0)", "shap (>=0.40)"] -sqlserver = ["mlflow-dbstore"] - [[package]] name = "mlflow-skinny" version = "2.13.0" @@ -2041,7 +1976,6 @@ files = [ boto3 = ">=1.9.201" botocore = ">=1.12.201" cryptography = ">=3.3.1" -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} Jinja2 = ">=2.10.1" python-dateutil = ">=2.1,<3.0.0" requests = ">=2.5" @@ -2113,7 +2047,6 @@ files = [ [package.dependencies] mypy-extensions = ">=0.4.3" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typed-ast = {version = ">=1.4.0,<2", markers = "python_version < \"3.8\""} typing-extensions = ">=3.10" [package.extras] @@ -2247,7 +2180,6 @@ files = [ [package.dependencies] fastjsonschema = "*" -importlib-metadata = {version = ">=3.6", markers = "python_version < \"3.8\""} jsonschema = ">=2.6" jupyter-core = "*" traitlets = ">=5.1" @@ -2566,9 +2498,6 @@ files = [ {file = "pandas_stubs-1.2.0.62-py3-none-any.whl", hash = "sha256:32a9e04582173104d42c090135efacc64d70e08c003405455b7dfb1540bd7e6c"}, ] -[package.dependencies] -typing-extensions = {version = ">=3.7.4.3", markers = "python_version < \"3.8\""} - [[package]] name = "pandocfilters" version = "1.5.1" @@ -2664,85 +2593,6 @@ files = [ {file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"}, ] -[[package]] -name = "pillow" -version = "9.5.0" -description = "Python Imaging Library (Fork)" -optional = true -python-versions = ">=3.7" -files = [ - {file = "Pillow-9.5.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:ace6ca218308447b9077c14ea4ef381ba0b67ee78d64046b3f19cf4e1139ad16"}, - {file = "Pillow-9.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d3d403753c9d5adc04d4694d35cf0391f0f3d57c8e0030aac09d7678fa8030aa"}, - {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ba1b81ee69573fe7124881762bb4cd2e4b6ed9dd28c9c60a632902fe8db8b38"}, - {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe7e1c262d3392afcf5071df9afa574544f28eac825284596ac6db56e6d11062"}, - {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f36397bf3f7d7c6a3abdea815ecf6fd14e7fcd4418ab24bae01008d8d8ca15e"}, - {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:252a03f1bdddce077eff2354c3861bf437c892fb1832f75ce813ee94347aa9b5"}, - {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:85ec677246533e27770b0de5cf0f9d6e4ec0c212a1f89dfc941b64b21226009d"}, - {file = "Pillow-9.5.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b416f03d37d27290cb93597335a2f85ed446731200705b22bb927405320de903"}, - {file = "Pillow-9.5.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1781a624c229cb35a2ac31cc4a77e28cafc8900733a864870c49bfeedacd106a"}, - {file = "Pillow-9.5.0-cp310-cp310-win32.whl", hash = "sha256:8507eda3cd0608a1f94f58c64817e83ec12fa93a9436938b191b80d9e4c0fc44"}, - {file = "Pillow-9.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:d3c6b54e304c60c4181da1c9dadf83e4a54fd266a99c70ba646a9baa626819eb"}, - {file = "Pillow-9.5.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:7ec6f6ce99dab90b52da21cf0dc519e21095e332ff3b399a357c187b1a5eee32"}, - {file = "Pillow-9.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:560737e70cb9c6255d6dcba3de6578a9e2ec4b573659943a5e7e4af13f298f5c"}, - {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e88745a55b88a7c64fa49bceff363a1a27d9a64e04019c2281049444a571e3"}, - {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d9c206c29b46cfd343ea7cdfe1232443072bbb270d6a46f59c259460db76779a"}, - {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cfcc2c53c06f2ccb8976fb5c71d448bdd0a07d26d8e07e321c103416444c7ad1"}, - {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:a0f9bb6c80e6efcde93ffc51256d5cfb2155ff8f78292f074f60f9e70b942d99"}, - {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:8d935f924bbab8f0a9a28404422da8af4904e36d5c33fc6f677e4c4485515625"}, - {file = "Pillow-9.5.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fed1e1cf6a42577953abbe8e6cf2fe2f566daebde7c34724ec8803c4c0cda579"}, - {file = "Pillow-9.5.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c1170d6b195555644f0616fd6ed929dfcf6333b8675fcca044ae5ab110ded296"}, - {file = "Pillow-9.5.0-cp311-cp311-win32.whl", hash = "sha256:54f7102ad31a3de5666827526e248c3530b3a33539dbda27c6843d19d72644ec"}, - {file = "Pillow-9.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfa4561277f677ecf651e2b22dc43e8f5368b74a25a8f7d1d4a3a243e573f2d4"}, - {file = "Pillow-9.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:965e4a05ef364e7b973dd17fc765f42233415974d773e82144c9bbaaaea5d089"}, - {file = "Pillow-9.5.0-cp312-cp312-win32.whl", hash = "sha256:22baf0c3cf0c7f26e82d6e1adf118027afb325e703922c8dfc1d5d0156bb2eeb"}, - {file = "Pillow-9.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:432b975c009cf649420615388561c0ce7cc31ce9b2e374db659ee4f7d57a1f8b"}, - {file = "Pillow-9.5.0-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:5d4ebf8e1db4441a55c509c4baa7a0587a0210f7cd25fcfe74dbbce7a4bd1906"}, - {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:375f6e5ee9620a271acb6820b3d1e94ffa8e741c0601db4c0c4d3cb0a9c224bf"}, - {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:99eb6cafb6ba90e436684e08dad8be1637efb71c4f2180ee6b8f940739406e78"}, - {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dfaaf10b6172697b9bceb9a3bd7b951819d1ca339a5ef294d1f1ac6d7f63270"}, - {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:763782b2e03e45e2c77d7779875f4432e25121ef002a41829d8868700d119392"}, - {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:35f6e77122a0c0762268216315bf239cf52b88865bba522999dc38f1c52b9b47"}, - {file = "Pillow-9.5.0-cp37-cp37m-win32.whl", hash = "sha256:aca1c196f407ec7cf04dcbb15d19a43c507a81f7ffc45b690899d6a76ac9fda7"}, - {file = "Pillow-9.5.0-cp37-cp37m-win_amd64.whl", hash = "sha256:322724c0032af6692456cd6ed554bb85f8149214d97398bb80613b04e33769f6"}, - {file = "Pillow-9.5.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:a0aa9417994d91301056f3d0038af1199eb7adc86e646a36b9e050b06f526597"}, - {file = "Pillow-9.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f8286396b351785801a976b1e85ea88e937712ee2c3ac653710a4a57a8da5d9c"}, - {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c830a02caeb789633863b466b9de10c015bded434deb3ec87c768e53752ad22a"}, - {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fbd359831c1657d69bb81f0db962905ee05e5e9451913b18b831febfe0519082"}, - {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8fc330c3370a81bbf3f88557097d1ea26cd8b019d6433aa59f71195f5ddebbf"}, - {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:7002d0797a3e4193c7cdee3198d7c14f92c0836d6b4a3f3046a64bd1ce8df2bf"}, - {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:229e2c79c00e85989a34b5981a2b67aa079fd08c903f0aaead522a1d68d79e51"}, - {file = "Pillow-9.5.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9adf58f5d64e474bed00d69bcd86ec4bcaa4123bfa70a65ce72e424bfb88ed96"}, - {file = "Pillow-9.5.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:662da1f3f89a302cc22faa9f14a262c2e3951f9dbc9617609a47521c69dd9f8f"}, - {file = "Pillow-9.5.0-cp38-cp38-win32.whl", hash = "sha256:6608ff3bf781eee0cd14d0901a2b9cc3d3834516532e3bd673a0a204dc8615fc"}, - {file = "Pillow-9.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:e49eb4e95ff6fd7c0c402508894b1ef0e01b99a44320ba7d8ecbabefddcc5569"}, - {file = "Pillow-9.5.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:482877592e927fd263028c105b36272398e3e1be3269efda09f6ba21fd83ec66"}, - {file = "Pillow-9.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3ded42b9ad70e5f1754fb7c2e2d6465a9c842e41d178f262e08b8c85ed8a1d8e"}, - {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c446d2245ba29820d405315083d55299a796695d747efceb5717a8b450324115"}, - {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8aca1152d93dcc27dc55395604dcfc55bed5f25ef4c98716a928bacba90d33a3"}, - {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:608488bdcbdb4ba7837461442b90ea6f3079397ddc968c31265c1e056964f1ef"}, - {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:60037a8db8750e474af7ffc9faa9b5859e6c6d0a50e55c45576bf28be7419705"}, - {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:07999f5834bdc404c442146942a2ecadd1cb6292f5229f4ed3b31e0a108746b1"}, - {file = "Pillow-9.5.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a127ae76092974abfbfa38ca2d12cbeddcdeac0fb71f9627cc1135bedaf9d51a"}, - {file = "Pillow-9.5.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:489f8389261e5ed43ac8ff7b453162af39c3e8abd730af8363587ba64bb2e865"}, - {file = "Pillow-9.5.0-cp39-cp39-win32.whl", hash = "sha256:9b1af95c3a967bf1da94f253e56b6286b50af23392a886720f563c547e48e964"}, - {file = "Pillow-9.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:77165c4a5e7d5a284f10a6efaa39a0ae8ba839da344f20b111d62cc932fa4e5d"}, - {file = "Pillow-9.5.0-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:833b86a98e0ede388fa29363159c9b1a294b0905b5128baf01db683672f230f5"}, - {file = "Pillow-9.5.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aaf305d6d40bd9632198c766fb64f0c1a83ca5b667f16c1e79e1661ab5060140"}, - {file = "Pillow-9.5.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0852ddb76d85f127c135b6dd1f0bb88dbb9ee990d2cd9aa9e28526c93e794fba"}, - {file = "Pillow-9.5.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:91ec6fe47b5eb5a9968c79ad9ed78c342b1f97a091677ba0e012701add857829"}, - {file = "Pillow-9.5.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:cb841572862f629b99725ebaec3287fc6d275be9b14443ea746c1dd325053cbd"}, - {file = "Pillow-9.5.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:c380b27d041209b849ed246b111b7c166ba36d7933ec6e41175fd15ab9eb1572"}, - {file = "Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c9af5a3b406a50e313467e3565fc99929717f780164fe6fbb7704edba0cebbe"}, - {file = "Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5671583eab84af046a397d6d0ba25343c00cd50bce03787948e0fff01d4fd9b1"}, - {file = "Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:84a6f19ce086c1bf894644b43cd129702f781ba5751ca8572f08aa40ef0ab7b7"}, - {file = "Pillow-9.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:1e7723bd90ef94eda669a3c2c19d549874dd5badaeefabefd26053304abe5799"}, - {file = "Pillow-9.5.0.tar.gz", hash = "sha256:bf548479d336726d7a0eceb6e767e179fbde37833ae42794602631a070d630f1"}, -] - -[package.extras] -docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"] -tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] - [[package]] name = "pillow" version = "10.3.0" @@ -2865,9 +2715,6 @@ files = [ {file = "platformdirs-3.11.0.tar.gz", hash = "sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3"}, ] -[package.dependencies] -typing-extensions = {version = ">=4.7.1", markers = "python_version < \"3.8\""} - [package.extras] docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"] test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"] @@ -2883,42 +2730,48 @@ files = [ {file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, ] -[package.dependencies] -importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} - [package.extras] dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] [[package]] name = "polars" -version = "0.18.4" +version = "1.8.2" description = "Blazingly fast DataFrame library" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "polars-0.18.4-cp37-abi3-macosx_10_7_x86_64.whl", hash = "sha256:3adfd39f84387f8589735e5c57f466c7ba19812140bc64248b9602755915c52f"}, - {file = "polars-0.18.4-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:5658f9751d93451549ecf429eb6486b203a86130132310c520cd1336d15ca258"}, - {file = "polars-0.18.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4bbc04db1d765f7cad287204a014e8e10bb2245f1910e26cd99964333e3682c6"}, - {file = "polars-0.18.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9117544d86542954588e295127f3892c15e09db04c474a0d8d830735154a54c"}, - {file = "polars-0.18.4-cp37-abi3-win_amd64.whl", hash = "sha256:a033ee71d8fde63ac71c7579230d31372cdaddf1df4227a537d96b91a58abd29"}, - {file = "polars-0.18.4.tar.gz", hash = "sha256:136d8cdbf3c1ec33ab577536ac35a10701ec3dfd21b54cb757ee9b0e0f525a85"}, + {file = "polars-1.8.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:114be1ebfb051b794fb9e1f15999430c79cc0824595e237d3f45632be3e56d73"}, + {file = "polars-1.8.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:e4fc36cfe48972d4c5be21a7cb119d6378fb7af0bb3eeb61456b66a1f43228e3"}, + {file = "polars-1.8.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67c1e448d6e38697650b22dd359f13c40b567c0b66686c8602e4367400e87801"}, + {file = "polars-1.8.2-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:570ee86b033dc5a6dbe2cb0df48522301642f304dda3da48f53d7488899a2206"}, + {file = "polars-1.8.2-cp38-abi3-win_amd64.whl", hash = "sha256:ce1a1c1e2150ffcc44a5f1c461d738e1dcd95abbd0f210af0271c7ac0c9f7ef9"}, + {file = "polars-1.8.2.tar.gz", hash = "sha256:42f69277d5be2833b0b826af5e75dcf430222d65c9633872856e176a0bed27a0"}, ] -[package.dependencies] -typing_extensions = {version = ">=4.0.1", markers = "python_version < \"3.8\""} - [package.extras] -all = ["polars[connectorx,deltalake,fsspec,matplotlib,numpy,pandas,pyarrow,sqlalchemy,timezone,xlsx2csv,xlsxwriter]"] -connectorx = ["connectorx"] -deltalake = ["deltalake (>=0.8.0)"] +adbc = ["adbc-driver-manager[dbapi]", "adbc-driver-sqlite[dbapi]"] +all = ["polars[async,cloudpickle,database,deltalake,excel,fsspec,graph,iceberg,numpy,pandas,plot,pyarrow,pydantic,style,timezone]"] +async = ["gevent"] +calamine = ["fastexcel (>=0.9)"] +cloudpickle = ["cloudpickle"] +connectorx = ["connectorx (>=0.3.2)"] +database = ["nest-asyncio", "polars[adbc,connectorx,sqlalchemy]"] +deltalake = ["deltalake (>=0.15.0)"] +excel = ["polars[calamine,openpyxl,xlsx2csv,xlsxwriter]"] fsspec = ["fsspec"] -matplotlib = ["matplotlib"] +gpu = ["cudf-polars-cu12"] +graph = ["matplotlib"] +iceberg = ["pyiceberg (>=0.5.0)"] numpy = ["numpy (>=1.16.0)"] -pandas = ["pandas", "pyarrow (>=7.0.0)"] +openpyxl = ["openpyxl (>=3.0.0)"] +pandas = ["pandas", "polars[pyarrow]"] +plot = ["altair (>=5.4.0)"] pyarrow = ["pyarrow (>=7.0.0)"] -sqlalchemy = ["pandas", "sqlalchemy"] -timezone = ["backports.zoneinfo", "tzdata"] +pydantic = ["pydantic"] +sqlalchemy = ["polars[pandas]", "sqlalchemy"] +style = ["great-tables (>=0.8.0)"] +timezone = ["backports-zoneinfo", "tzdata"] xlsx2csv = ["xlsx2csv (>=0.8.0)"] xlsxwriter = ["xlsxwriter"] @@ -2936,7 +2789,6 @@ files = [ [package.dependencies] cfgv = ">=2.0.0" identify = ">=1.0.0" -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} nodeenv = ">=0.11.1" pyyaml = ">=5.1" toml = "*" @@ -3268,7 +3120,6 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} -importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} iniconfig = "*" packaging = "*" pluggy = ">=0.12,<2.0" @@ -3656,7 +3507,6 @@ files = [ pyyaml = "*" requests = ">=2.30.0,<3.0" types-PyYAML = "*" -typing-extensions = {version = "*", markers = "python_version < \"3.8\""} urllib3 = ">=1.25.10,<3.0" [package.extras] @@ -4310,17 +4160,6 @@ files = [ {file = "textwrap3-0.9.2.zip", hash = "sha256:5008eeebdb236f6303dcd68f18b856d355f6197511d952ba74bc75e40e0c3414"}, ] -[[package]] -name = "threadpoolctl" -version = "3.1.0" -description = "threadpoolctl" -optional = true -python-versions = ">=3.6" -files = [ - {file = "threadpoolctl-3.1.0-py3-none-any.whl", hash = "sha256:8b99adda265feb6773280df41eece7b2e6561b772d21ffd52e372f999024907b"}, - {file = "threadpoolctl-3.1.0.tar.gz", hash = "sha256:a335baacfaa4400ae1f0d8e3a58d6674d2f8828e3716bb2802c44955ad391380"}, -] - [[package]] name = "threadpoolctl" version = "3.5.0" @@ -4441,7 +4280,6 @@ files = [ [package.dependencies] fs = "*" fsspec = "*" -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} numpy = "*" pandas = ">=1.2.0" pyarrow = "*" @@ -4472,56 +4310,6 @@ rfc3986 = ">=1.4.0" rich = ">=12.0.0" urllib3 = ">=1.26.0" -[[package]] -name = "typed-ast" -version = "1.5.5" -description = "a fork of Python 2 and 3 ast modules with type comment support" -optional = false -python-versions = ">=3.6" -files = [ - {file = "typed_ast-1.5.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4bc1efe0ce3ffb74784e06460f01a223ac1f6ab31c6bc0376a21184bf5aabe3b"}, - {file = "typed_ast-1.5.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5f7a8c46a8b333f71abd61d7ab9255440d4a588f34a21f126bbfc95f6049e686"}, - {file = "typed_ast-1.5.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:597fc66b4162f959ee6a96b978c0435bd63791e31e4f410622d19f1686d5e769"}, - {file = "typed_ast-1.5.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d41b7a686ce653e06c2609075d397ebd5b969d821b9797d029fccd71fdec8e04"}, - {file = "typed_ast-1.5.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5fe83a9a44c4ce67c796a1b466c270c1272e176603d5e06f6afbc101a572859d"}, - {file = "typed_ast-1.5.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d5c0c112a74c0e5db2c75882a0adf3133adedcdbfd8cf7c9d6ed77365ab90a1d"}, - {file = "typed_ast-1.5.5-cp310-cp310-win_amd64.whl", hash = "sha256:e1a976ed4cc2d71bb073e1b2a250892a6e968ff02aa14c1f40eba4f365ffec02"}, - {file = "typed_ast-1.5.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c631da9710271cb67b08bd3f3813b7af7f4c69c319b75475436fcab8c3d21bee"}, - {file = "typed_ast-1.5.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b445c2abfecab89a932b20bd8261488d574591173d07827c1eda32c457358b18"}, - {file = "typed_ast-1.5.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc95ffaaab2be3b25eb938779e43f513e0e538a84dd14a5d844b8f2932593d88"}, - {file = "typed_ast-1.5.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61443214d9b4c660dcf4b5307f15c12cb30bdfe9588ce6158f4a005baeb167b2"}, - {file = "typed_ast-1.5.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6eb936d107e4d474940469e8ec5b380c9b329b5f08b78282d46baeebd3692dc9"}, - {file = "typed_ast-1.5.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e48bf27022897577d8479eaed64701ecaf0467182448bd95759883300ca818c8"}, - {file = "typed_ast-1.5.5-cp311-cp311-win_amd64.whl", hash = "sha256:83509f9324011c9a39faaef0922c6f720f9623afe3fe220b6d0b15638247206b"}, - {file = "typed_ast-1.5.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:44f214394fc1af23ca6d4e9e744804d890045d1643dd7e8229951e0ef39429b5"}, - {file = "typed_ast-1.5.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:118c1ce46ce58fda78503eae14b7664163aa735b620b64b5b725453696f2a35c"}, - {file = "typed_ast-1.5.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be4919b808efa61101456e87f2d4c75b228f4e52618621c77f1ddcaae15904fa"}, - {file = "typed_ast-1.5.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:fc2b8c4e1bc5cd96c1a823a885e6b158f8451cf6f5530e1829390b4d27d0807f"}, - {file = "typed_ast-1.5.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:16f7313e0a08c7de57f2998c85e2a69a642e97cb32f87eb65fbfe88381a5e44d"}, - {file = "typed_ast-1.5.5-cp36-cp36m-win_amd64.whl", hash = "sha256:2b946ef8c04f77230489f75b4b5a4a6f24c078be4aed241cfabe9cbf4156e7e5"}, - {file = "typed_ast-1.5.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2188bc33d85951ea4ddad55d2b35598b2709d122c11c75cffd529fbc9965508e"}, - {file = "typed_ast-1.5.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0635900d16ae133cab3b26c607586131269f88266954eb04ec31535c9a12ef1e"}, - {file = "typed_ast-1.5.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57bfc3cf35a0f2fdf0a88a3044aafaec1d2f24d8ae8cd87c4f58d615fb5b6311"}, - {file = "typed_ast-1.5.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:fe58ef6a764de7b4b36edfc8592641f56e69b7163bba9f9c8089838ee596bfb2"}, - {file = "typed_ast-1.5.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d09d930c2d1d621f717bb217bf1fe2584616febb5138d9b3e8cdd26506c3f6d4"}, - {file = "typed_ast-1.5.5-cp37-cp37m-win_amd64.whl", hash = "sha256:d40c10326893ecab8a80a53039164a224984339b2c32a6baf55ecbd5b1df6431"}, - {file = "typed_ast-1.5.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fd946abf3c31fb50eee07451a6aedbfff912fcd13cf357363f5b4e834cc5e71a"}, - {file = "typed_ast-1.5.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ed4a1a42df8a3dfb6b40c3d2de109e935949f2f66b19703eafade03173f8f437"}, - {file = "typed_ast-1.5.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:045f9930a1550d9352464e5149710d56a2aed23a2ffe78946478f7b5416f1ede"}, - {file = "typed_ast-1.5.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:381eed9c95484ceef5ced626355fdc0765ab51d8553fec08661dce654a935db4"}, - {file = "typed_ast-1.5.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:bfd39a41c0ef6f31684daff53befddae608f9daf6957140228a08e51f312d7e6"}, - {file = "typed_ast-1.5.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8c524eb3024edcc04e288db9541fe1f438f82d281e591c548903d5b77ad1ddd4"}, - {file = "typed_ast-1.5.5-cp38-cp38-win_amd64.whl", hash = "sha256:7f58fabdde8dcbe764cef5e1a7fcb440f2463c1bbbec1cf2a86ca7bc1f95184b"}, - {file = "typed_ast-1.5.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:042eb665ff6bf020dd2243307d11ed626306b82812aba21836096d229fdc6a10"}, - {file = "typed_ast-1.5.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:622e4a006472b05cf6ef7f9f2636edc51bda670b7bbffa18d26b255269d3d814"}, - {file = "typed_ast-1.5.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1efebbbf4604ad1283e963e8915daa240cb4bf5067053cf2f0baadc4d4fb51b8"}, - {file = "typed_ast-1.5.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f0aefdd66f1784c58f65b502b6cf8b121544680456d1cebbd300c2c813899274"}, - {file = "typed_ast-1.5.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:48074261a842acf825af1968cd912f6f21357316080ebaca5f19abbb11690c8a"}, - {file = "typed_ast-1.5.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:429ae404f69dc94b9361bb62291885894b7c6fb4640d561179548c849f8492ba"}, - {file = "typed_ast-1.5.5-cp39-cp39-win_amd64.whl", hash = "sha256:335f22ccb244da2b5c296e6f96b06ee9bed46526db0de38d2f0e5a6597b81155"}, - {file = "typed_ast-1.5.5.tar.gz", hash = "sha256:94282f7a354f36ef5dbce0ef3467ebf6a258e370ab33d5b40c249fa996e590dd"}, -] - [[package]] name = "types-protobuf" version = "4.24.0.4" @@ -4633,7 +4421,6 @@ files = [ appdirs = ">=1.4.3,<2" distlib = ">=0.3.1,<1" filelock = ">=3.0.0,<4" -importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} six = ">=1.9.0,<2" [package.extras] @@ -4712,6 +4499,9 @@ files = [ {file = "whylogs_sketching-3.4.1.dev3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ba536fca5f9578fa34d106c243fdccfef7d75b9d1fffb9d93df0debfe8e3ebc"}, {file = "whylogs_sketching-3.4.1.dev3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:afa843c68cafa08e82624e6a33d13ab7f00ad0301101960872fe152d5af5ab53"}, {file = "whylogs_sketching-3.4.1.dev3-cp311-cp311-win_amd64.whl", hash = "sha256:303d55c37565340c2d21c268c64a712fad612504cc4b98b1d1df848cac6d934f"}, + {file = "whylogs_sketching-3.4.1.dev3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b636cebf5f4d7724437616368199c8e7b153f89dfd396f9e8279a95bf55d817"}, + {file = "whylogs_sketching-3.4.1.dev3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba4519780defebb35c4718ecc13d1b8c38894be722147a047e67b953cd2430ab"}, + {file = "whylogs_sketching-3.4.1.dev3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b4606e5360ce922e6ad770e845c75038d873300fd8a54ea856e99003b3254fc9"}, {file = "whylogs_sketching-3.4.1.dev3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:9d65fcf8dade1affe50181582b8894929993e37d7daa922d973a811790cd0208"}, {file = "whylogs_sketching-3.4.1.dev3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4845e77c208ae64ada9170e1b92ed0abe28fe311c0fc35f9d8efa6926211ca2"}, {file = "whylogs_sketching-3.4.1.dev3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:02cac1c87ac42d7fc7e6597862ac50bc035825988d21e8a2d763b416e83e845f"}, @@ -4857,5 +4647,5 @@ viz = ["Pillow", "Pillow", "ipython", "numpy", "numpy", "pybars3", "scipy", "sci [metadata] lock-version = "2.0" -python-versions = ">=3.7.1, <4" -content-hash = "9e2ecd0f225c7e843c021525c5269f8230ee356787fc82ff5345f99c8431e6a4" +python-versions = ">=3.8, <4" +content-hash = "b24a566dfd5a4d794ca6cf690e643e5999e49270861d09c0df149aa7b9f0c5e4" diff --git a/python/pyproject.toml b/python/pyproject.toml index 624f9cdc02..0c1c154f04 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -11,7 +11,7 @@ include = ["whylogs/core/proto/v0/*.py*", "whylogs/core/proto/*.py*"] [tool.poetry.dependencies] # core dependencies. Be REALLY mindful when touching this list -python = ">=3.7.1, <4" +python = ">=3.8, <4" whylogs-sketching = ">=3.4.1.dev3" protobuf = ">=3.19.4" importlib-metadata = { version = "<4.3", python = "<3.8" } diff --git a/python/tests/api/logger/test_segments_polars.py b/python/tests/api/logger/test_segments_polars.py index da979cf5b3..ff0e51bc5d 100644 --- a/python/tests/api/logger/test_segments_polars.py +++ b/python/tests/api/logger/test_segments_polars.py @@ -84,9 +84,9 @@ def test_single_column_segment() -> None: first_segment = next(iter(segments)) first_segment_profile = results.profile(first_segment) assert first_segment_profile is not None - assert first_segment_profile._columns["col1"]._schema.dtype == pl.Int64 #np.int64 - assert first_segment_profile._columns["col2"]._schema.dtype == pl.Float64 #np.float64 - assert first_segment_profile._columns["col3"]._schema.dtype == pl.String #.name == "object" + assert first_segment_profile._columns["col1"]._schema.dtype == pl.Int64 # np.int64 + assert first_segment_profile._columns["col2"]._schema.dtype == pl.Float64 # np.float64 + assert first_segment_profile._columns["col3"]._schema.dtype == pl.String # .name == "object" segment_cardinality: CardinalityMetric = ( first_segment_profile.view().get_column(segment_column).get_metric("cardinality") ) @@ -118,12 +118,12 @@ def test_single_column_and_manual_segment() -> None: assert len(segments) == number_of_segments first_segment = next(iter(segments)) - #assert first_segment.key == ("x0", "1", "foo") + # assert first_segment.key == ("x0", "1", "foo") first_segment_profile = results.profile(first_segment) assert first_segment_profile is not None - assert first_segment_profile._columns["col1"]._schema.dtype == pl.Int64 #np.int64 - assert first_segment_profile._columns["col2"]._schema.dtype == pl.Float64 #np.float64 - assert first_segment_profile._columns["col3"]._schema.dtype == pl.String #.name == "object" + assert first_segment_profile._columns["col1"]._schema.dtype == pl.Int64 # np.int64 + assert first_segment_profile._columns["col2"]._schema.dtype == pl.Float64 # np.float64 + assert first_segment_profile._columns["col3"]._schema.dtype == pl.String # .name == "object" segment_cardinality: CardinalityMetric = ( first_segment_profile.view().get_column(segment_column).get_metric("cardinality") ) @@ -173,9 +173,9 @@ def test_single_column_segment_with_trace_id() -> None: first_segment = next(iter(segments)) first_segment_profile = results.profile(first_segment) assert first_segment_profile is not None - assert first_segment_profile._columns["col1"]._schema.dtype == pl.Int64 #np.int64 - assert first_segment_profile._columns["col2"]._schema.dtype == pl.Float64 #np.float64 - assert first_segment_profile._columns["col3"]._schema.dtype == pl.String #.name == "object" + assert first_segment_profile._columns["col1"]._schema.dtype == pl.Int64 # np.int64 + assert first_segment_profile._columns["col2"]._schema.dtype == pl.Float64 # np.float64 + assert first_segment_profile._columns["col3"]._schema.dtype == pl.String # .name == "object" segment_cardinality: CardinalityMetric = ( first_segment_profile.view().get_column(segment_column).get_metric("cardinality") ) @@ -211,9 +211,9 @@ def test_single_integer_column_segment() -> None: first_segment = next(iter(segments)) first_segment_profile = results.profile(first_segment) assert first_segment_profile is not None - assert first_segment_profile._columns["col1"]._schema.dtype == pl.Int64 #np.int64 - assert first_segment_profile._columns["col2"]._schema.dtype == pl.Float64 #np.float64 - assert first_segment_profile._columns["col3"]._schema.dtype == pl.Int64 #np.int64 + assert first_segment_profile._columns["col1"]._schema.dtype == pl.Int64 # np.int64 + assert first_segment_profile._columns["col2"]._schema.dtype == pl.Float64 # np.float64 + assert first_segment_profile._columns["col3"]._schema.dtype == pl.Int64 # np.int64 segment_cardinality: CardinalityMetric = ( first_segment_profile.view().get_column(segment_column).get_metric("cardinality") ) @@ -249,9 +249,9 @@ def test_filtered_single_column_segment() -> None: first_segment_profile = results.profile(first_segment) assert first_segment.key == ("x0",) assert first_segment_profile is not None - assert first_segment_profile._columns["col1"]._schema.dtype == pl.Int64 #np.int64 - assert first_segment_profile._columns["col2"]._schema.dtype == pl.Float64 #np.float64 - assert first_segment_profile._columns[segment_column]._schema.dtype == pl.String #.name == "object" + assert first_segment_profile._columns["col1"]._schema.dtype == pl.Int64 # np.int64 + assert first_segment_profile._columns["col2"]._schema.dtype == pl.Float64 # np.float64 + assert first_segment_profile._columns[segment_column]._schema.dtype == pl.String # .name == "object" segment_distribution: DistributionMetric = ( first_segment_profile.view().get_column("col1").get_metric("distribution") ) @@ -292,9 +292,9 @@ def test_segment_write_roundtrip_versions(tmp_path: Any, v0) -> None: first_segment_profile = results.profile(first_segment) assert first_segment.key == ("x0",) assert first_segment_profile is not None - assert first_segment_profile._columns["col1"]._schema.dtype == pl.Int64 #np.int64 - assert first_segment_profile._columns["col2"]._schema.dtype == pl.Float64 #np.float64 - assert first_segment_profile._columns[segment_column]._schema.dtype == pl.String #.name == "object" + assert first_segment_profile._columns["col1"]._schema.dtype == pl.Int64 # np.int64 + assert first_segment_profile._columns["col2"]._schema.dtype == pl.Float64 # np.float64 + assert first_segment_profile._columns[segment_column]._schema.dtype == pl.String # .name == "object" segment_distribution: DistributionMetric = ( first_segment_profile.view().get_column("col1").get_metric("distribution") ) @@ -356,13 +356,13 @@ def test_multi_column_segment() -> None: # Note this segment is not useful as there is only one datapoint per segment, we have 100 rows and # 100 segments. The segment value is a tuple of strings identifying this segment. - #assert last_segment.key == ("99", "x4") + # assert last_segment.key == ("99", "x4") last_segment_profile = results.profile(last_segment) - assert last_segment_profile._columns["col1"]._schema.dtype == pl.Int64 #np.int64 - assert last_segment_profile._columns["col2"]._schema.dtype == pl.Float64 #np.float64 - assert last_segment_profile._columns["col3"]._schema.dtype == pl.String #.name == "object" + assert last_segment_profile._columns["col1"]._schema.dtype == pl.Int64 # np.int64 + assert last_segment_profile._columns["col2"]._schema.dtype == pl.Float64 # np.float64 + assert last_segment_profile._columns["col3"]._schema.dtype == pl.String # .name == "object" segment_distribution: DistributionMetric = last_segment_profile.view().get_column("col1").get_metric("distribution") count = segment_distribution.n @@ -391,13 +391,13 @@ def test_multicolumn_and_manual_segment() -> None: # Note this segment is not useful as there is only one datapoint per segment, we have 100 rows and # 100 segments. The segment value is a tuple of strings identifying this segment. - #assert last_segment.key == ("99", "x4", "42", "bar") + # assert last_segment.key == ("99", "x4", "42", "bar") last_segment_profile = results.profile(last_segment) - assert last_segment_profile._columns["col1"]._schema.dtype == pl.Int64 #np.int64 - assert last_segment_profile._columns["col2"]._schema.dtype == pl.Float64 #np.float64 - assert last_segment_profile._columns["col3"]._schema.dtype == pl.String #.name == "object" + assert last_segment_profile._columns["col1"]._schema.dtype == pl.Int64 # np.int64 + assert last_segment_profile._columns["col2"]._schema.dtype == pl.Float64 # np.float64 + assert last_segment_profile._columns["col3"]._schema.dtype == pl.String # .name == "object" segment_distribution: DistributionMetric = last_segment_profile.view().get_column("col1").get_metric("distribution") count = segment_distribution.n diff --git a/python/tests/core/test_performance_polars.py b/python/tests/core/test_performance_polars.py index 10133f97ea..0f7e49eab5 100644 --- a/python/tests/core/test_performance_polars.py +++ b/python/tests/core/test_performance_polars.py @@ -55,7 +55,7 @@ def test_track_column_benchmark(test_resolver: Resolver) -> None: profiler.enable() for column_index in range(num_columns): column_name = str(column_index) - col_df = pl.DataFrame({ column_name: np.random.random(size=(num_rows,)) }) + col_df = pl.DataFrame({column_name: np.random.random(size=(num_rows,))}) col_prof = ColumnProfile( name="perf_test", schema=ColumnSchema(float, resolver=test_resolver), cache_size=1024 ) @@ -84,9 +84,7 @@ def test_track_dataset_benchmark() -> None: profiler = cProfile.Profile() string_output_stream = StringIO() - full_df = pl.DataFrame( - { str(i): np.random.random(size=(num_rows,)) for i in range(num_columns) } - ) + full_df = pl.DataFrame({str(i): np.random.random(size=(num_rows,)) for i in range(num_columns)}) dataset_profile = DatasetProfile() profiler.enable() dataset_profile.track(full_df) @@ -118,7 +116,7 @@ def test_track_baseline_benchmark() -> None: for column_index in range(num_columns): column_name = str(column_index) baseline_metric = CustomHistogramMetric() - col_df = pl.DataFrame( {column_name: np.random.random(size=(num_rows,)) } ) + col_df = pl.DataFrame({column_name: np.random.random(size=(num_rows,))}) if column_index == 0: TEST_LOGGER.info(f"using the following trackers {baseline_metric}") for value in col_df[column_name]: diff --git a/python/tests/experimental/core/test_udf_schema_polars.py b/python/tests/experimental/core/test_udf_schema_polars.py index 04ba76e327..45f79bc1af 100644 --- a/python/tests/experimental/core/test_udf_schema_polars.py +++ b/python/tests/experimental/core/test_udf_schema_polars.py @@ -1,17 +1,11 @@ -from typing import Any, Dict, List, Tuple, Union +from typing import Any, Tuple import polars as pl import whylogs as why from whylogs.core.dataset_profile import DatasetProfile from whylogs.core.datatypes import Fractional, Integral, String -from whylogs.core.metrics import ( - CardinalityMetric, - DistributionMetric, - MetricConfig, - StandardMetric, -) -from whylogs.core.preprocessing import ColumnProperties +from whylogs.core.metrics import CardinalityMetric, DistributionMetric, StandardMetric from whylogs.core.resolvers import STANDARD_RESOLVER, MetricSpec, ResolverSpec from whylogs.core.segmentation_partition import segment_on_column from whylogs.experimental.core.metrics.udf_metric import register_metric_udf @@ -22,7 +16,6 @@ register_multioutput_udf, register_type_udf, udf_schema, - unregister_udf, ) from whylogs.experimental.core.validators import condition_validator @@ -86,7 +79,7 @@ def test_drop_columns() -> None: @register_dataset_udf(["col1"], schema_name="polars-unit-tests") def add5(x) -> float: - return x[0]+5 + return x[0] + 5 def square(x: Tuple) -> float: @@ -105,7 +98,9 @@ def do_something_important(validator_name, condition_name: str, value: Any, colu return -@condition_validator(["col1", "add5"], condition_name="less_than_four", actions=[do_something_important], schema_name="polars") +@condition_validator( + ["col1", "add5"], condition_name="less_than_four", actions=[do_something_important], schema_name="polars" +) def lt_4(x): return x < 4 @@ -121,7 +116,9 @@ def test_validator_udf_polars() -> None: def test_validator_double_register_udf_polars() -> None: global action_list - @condition_validator(["col1", "add5"], condition_name="less_than_four", actions=[do_something_important], schema_name="polars") + @condition_validator( + ["col1", "add5"], condition_name="less_than_four", actions=[do_something_important], schema_name="polars" + ) def lt_4_2(x): return x < 4 From b479979fd618b6274c9ca56a3020805d4e43f745 Mon Sep 17 00:00:00 2001 From: Richard Rogers Date: Thu, 31 Oct 2024 16:44:21 +0000 Subject: [PATCH 05/41] pre-commit --- .../logger/actor/thread_rolling_logger.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/whylogs/api/logger/experimental/logger/actor/thread_rolling_logger.py b/python/whylogs/api/logger/experimental/logger/actor/thread_rolling_logger.py index 755eff1586..9d5c264ef5 100644 --- a/python/whylogs/api/logger/experimental/logger/actor/thread_rolling_logger.py +++ b/python/whylogs/api/logger/experimental/logger/actor/thread_rolling_logger.py @@ -71,13 +71,16 @@ def _track_segments(self, data: TrackData) -> None: if self._schema: if isinstance(data, List): - input_data = [self._schema._run_udfs(df=None, row=it)[1] for it in data] # pyright: ignore[reportPrivateUsage, reportUnknownMemberType] + input_data = [ + self._schema._run_udfs(df=None, row=it)[1] # pyright: ignore[reportUnknownArgumentType,reportPrivateUsage] + for it in data # pyright: ignore[reportUnknownArgumentType,reportUnknownVariableType,reportPrivateUsage] + ] # pyright: ignore[reportPrivateUsage, reportUnknownMemberType, reportUnknownArgumentType, reportUnknownvariableType] else: df = data if isinstance(data, pd.DataFrame) else None - row = data if isinstance(data, dict) else None - df, row = _dataframe_or_dict(df, None, None, row) + row = data if isinstance(data, dict) else None # pyright: ignore[reportUnknownVariableType] + df, row = _dataframe_or_dict(df, None, None, row) # pyright: ignore[reportUnknownArgumentType] df, row = self._schema._run_udfs(df, row) # pyright: ignore[reportUnknownVariableType, reportUnknownMemberType, reportPrivateUsage] - input_data: TrackData = cast(TrackData, df if df is not None else row) + input_data: TrackData = cast(TrackData, df if df is not None else row) # type: ignore[no-redef] else: input_data = data From b974fbd97897df7cd0053a22bcc917d29b36f325 Mon Sep 17 00:00:00 2001 From: Richard Rogers Date: Fri, 1 Nov 2024 04:59:23 +0000 Subject: [PATCH 06/41] pre-commit --- python/.pre-commit-config.yaml | 11 +- python/poetry.lock | 261 ++++++++++++++++++++++++++++++++- python/pyproject.toml | 2 +- 3 files changed, 267 insertions(+), 7 deletions(-) diff --git a/python/.pre-commit-config.yaml b/python/.pre-commit-config.yaml index c267119a8b..f3e7903452 100644 --- a/python/.pre-commit-config.yaml +++ b/python/.pre-commit-config.yaml @@ -6,10 +6,12 @@ repos: hooks: - id: black exclude: python/whylogs/core/proto/|python/docs/|python/whylogs/viz/html/|java|python/whylogs/api/logger/experimental/logger + files: ^(python/whylogs/) - repo: https://github.com/pycqa/flake8 rev: 4.0.1 hooks: - id: flake8 + files: ^(python/whylogs/) args: - --max-line-length=160 - --exclude="""\.tox | @@ -30,17 +32,24 @@ repos: hooks: - id: isort args: [--filter-files] + files: ^(python/whylogs/) exclude: python/whylogs/core/proto/|python/docs/|python/whylogs/viz/html/|java|python/whylogs/api/logger/experimental/logger - repo: https://github.com/pre-commit/mirrors-mypy rev: v0.942 hooks: - id: mypy language: system - exclude: ^(python/tests/|python/examples/|python/examples/integration/|python/whylogs/core/proto/|python/docs/|python/whylogs/viz/html/|java|python/whylogs/api/logger/experimental/logger) + files: ^(python/whylogs/) + exclude: ^(python/.venv/lib/python3.8/site-packages/polars/ml/torch.py|python/tests/|python/examples/|python/examples/integration/|python/whylogs/core/proto/|python/docs/|python/whylogs/viz/html/|java|python/whylogs/api/logger/experimental/logger) + verbose: true + args: + - --exclude python/.venv + - -V - repo: https://github.com/pre-commit/mirrors-prettier rev: v2.5.1 hooks: - id: prettier + files: ^(python/whylogs/) exclude: python/tests/|python/whylogs/core/proto/|python/docs/|python/whylogs/viz/html/|java - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.1.0 diff --git a/python/poetry.lock b/python/poetry.lock index 64da8cb3e6..6579c64a64 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -97,6 +97,7 @@ files = [ [package.dependencies] lazy-object-proxy = ">=1.4.0" setuptools = ">=20.0" +typed-ast = {version = ">=1.4.0,<2.0", markers = "implementation_name == \"cpython\" and python_version < \"3.8\""} typing-extensions = {version = ">=3.10", markers = "python_version < \"3.10\""} wrapt = ">=1.11,<2" @@ -111,6 +112,9 @@ files = [ {file = "attrs-23.2.0.tar.gz", hash = "sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30"}, ] +[package.dependencies] +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} + [package.extras] cov = ["attrs[tests]", "coverage[toml] (>=5.3)"] dev = ["attrs[tests]", "pre-commit"] @@ -221,6 +225,7 @@ mypy-extensions = ">=0.4.3" pathspec = ">=0.9.0" platformdirs = ">=2" tomli = {version = ">=1.1.0", markers = "python_full_version < \"3.11.0a7\""} +typed-ast = {version = ">=1.4.2", markers = "python_version < \"3.8\" and implementation_name == \"cpython\""} typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} [package.extras] @@ -520,6 +525,18 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} + +[[package]] +name = "cloudpickle" +version = "2.2.1" +description = "Extended pickling support for Python objects" +optional = true +python-versions = ">=3.6" +files = [ + {file = "cloudpickle-2.2.1-py3-none-any.whl", hash = "sha256:61f594d1f4c295fa5cd9014ceb3a1fc4a70b0de1164b94fbc2d854ccba056f9f"}, + {file = "cloudpickle-2.2.1.tar.gz", hash = "sha256:d89684b8de9e34a2a43b3460fbca07d09d6e25ce858df4d5a44240403b6178f5"}, +] [[package]] name = "cloudpickle" @@ -972,6 +989,7 @@ files = [ ] [package.dependencies] +importlib-metadata = {version = ">=1.1.0,<4.3", markers = "python_version < \"3.8\""} mccabe = ">=0.7.0,<0.8.0" pycodestyle = ">=2.9.0,<2.10.0" pyflakes = ">=2.5.0,<2.6.0" @@ -1141,6 +1159,7 @@ files = [ [package.dependencies] gitdb = ">=4.0.1,<5" +typing-extensions = {version = ">=3.7.4.3", markers = "python_version < \"3.8\""} [package.extras] doc = ["sphinx (==4.3.2)", "sphinx-autodoc-typehints", "sphinx-rtd-theme", "sphinxcontrib-applehelp (>=1.0.2,<=1.0.4)", "sphinxcontrib-devhelp (==1.0.2)", "sphinxcontrib-htmlhelp (>=2.0.0,<=2.0.1)", "sphinxcontrib-qthelp (==1.0.3)", "sphinxcontrib-serializinghtml (==1.1.5)"] @@ -1206,6 +1225,7 @@ files = [ [package.dependencies] google-api-core = ">=1.31.6,<2.0.dev0 || >2.3.0,<3.0.0dev" google-auth = ">=1.25.0,<3.0dev" +importlib-metadata = {version = ">1.0.0", markers = "python_version < \"3.8\""} [package.extras] grpc = ["grpcio (>=1.38.0,<2.0dev)", "grpcio-status (>=1.38.0,<2.0.dev0)"] @@ -1380,6 +1400,7 @@ files = [ ] [package.dependencies] +typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} zipp = ">=0.5" [package.extras] @@ -1587,6 +1608,17 @@ files = [ {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] +[[package]] +name = "joblib" +version = "1.3.2" +description = "Lightweight pipelining with Python functions" +optional = true +python-versions = ">=3.7" +files = [ + {file = "joblib-1.3.2-py3-none-any.whl", hash = "sha256:ef4331c65f239985f3f2220ecc87db222f08fd22097a3dd5698f693875f8cbb9"}, + {file = "joblib-1.3.2.tar.gz", hash = "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1"}, +] + [[package]] name = "joblib" version = "1.4.2" @@ -1611,9 +1643,11 @@ files = [ [package.dependencies] attrs = ">=17.4.0" +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} importlib-resources = {version = ">=1.4.0", markers = "python_version < \"3.9\""} pkgutil-resolve-name = {version = ">=1.3.10", markers = "python_version < \"3.9\""} pyrsistent = ">=0.14.0,<0.17.0 || >0.17.0,<0.17.1 || >0.17.1,<0.17.2 || >0.17.2" +typing-extensions = {version = "*", markers = "python_version < \"3.8\""} [package.extras] format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] @@ -1767,6 +1801,7 @@ files = [ [package.dependencies] mdurl = ">=0.1,<1.0" +typing_extensions = {version = ">=3.7.4", markers = "python_version < \"3.8\""} [package.extras] benchmarking = ["psutil", "pytest", "pytest-benchmark"] @@ -1913,6 +1948,36 @@ files = [ {file = "mistune-3.0.2.tar.gz", hash = "sha256:fc7f93ded930c92394ef2cb6f04a8aabab4117a91449e72dcc8dfa646a508be8"}, ] +[[package]] +name = "mlflow-skinny" +version = "1.27.0" +description = "MLflow: A Platform for ML Development and Productionization" +optional = true +python-versions = ">=3.7" +files = [ + {file = "mlflow-skinny-1.27.0.tar.gz", hash = "sha256:77240a1bee2d5bbc2bf8eb9609b22fa43a2381067c3728d8c6539702962a268a"}, + {file = "mlflow_skinny-1.27.0-py3-none-any.whl", hash = "sha256:48462b9675a8365d8bd44357db29886098ee2723dbc925e35b16517e1ca43983"}, +] + +[package.dependencies] +click = ">=7.0" +cloudpickle = "*" +databricks-cli = ">=0.8.7" +entrypoints = "*" +gitpython = ">=2.1.0" +importlib-metadata = ">=3.7.0,<4.7.0 || >4.7.0" +packaging = "*" +protobuf = ">=3.12.0" +pytz = "*" +pyyaml = ">=5.1" +requests = ">=2.17.3" + +[package.extras] +aliyun-oss = ["aliyunstoreplugin"] +extras = ["azureml-core (>=1.2.0)", "boto3", "google-cloud-storage (>=1.30.0)", "kubernetes", "mlserver (>=0.5.3)", "mlserver-mlflow (>=0.5.3)", "pyarrow", "pysftp", "scikit-learn", "virtualenv"] +pipelines = ["Jinja2 (>=3.0)", "ipython (>=7.0)", "markdown (>=3.3)", "pandas-profiling (>=3.1)", "pyarrow (>=7.0)", "scikit-learn (>=1.0)", "shap (>=0.40)"] +sqlserver = ["mlflow-dbstore"] + [[package]] name = "mlflow-skinny" version = "2.13.0" @@ -1976,6 +2041,7 @@ files = [ boto3 = ">=1.9.201" botocore = ">=1.12.201" cryptography = ">=3.3.1" +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} Jinja2 = ">=2.10.1" python-dateutil = ">=2.1,<3.0.0" requests = ">=2.5" @@ -2047,6 +2113,7 @@ files = [ [package.dependencies] mypy-extensions = ">=0.4.3" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typed-ast = {version = ">=1.4.0,<2", markers = "python_version < \"3.8\""} typing-extensions = ">=3.10" [package.extras] @@ -2180,6 +2247,7 @@ files = [ [package.dependencies] fastjsonschema = "*" +importlib-metadata = {version = ">=3.6", markers = "python_version < \"3.8\""} jsonschema = ">=2.6" jupyter-core = "*" traitlets = ">=5.1" @@ -2498,6 +2566,9 @@ files = [ {file = "pandas_stubs-1.2.0.62-py3-none-any.whl", hash = "sha256:32a9e04582173104d42c090135efacc64d70e08c003405455b7dfb1540bd7e6c"}, ] +[package.dependencies] +typing-extensions = {version = ">=3.7.4.3", markers = "python_version < \"3.8\""} + [[package]] name = "pandocfilters" version = "1.5.1" @@ -2593,6 +2664,85 @@ files = [ {file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"}, ] +[[package]] +name = "pillow" +version = "9.5.0" +description = "Python Imaging Library (Fork)" +optional = true +python-versions = ">=3.7" +files = [ + {file = "Pillow-9.5.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:ace6ca218308447b9077c14ea4ef381ba0b67ee78d64046b3f19cf4e1139ad16"}, + {file = "Pillow-9.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d3d403753c9d5adc04d4694d35cf0391f0f3d57c8e0030aac09d7678fa8030aa"}, + {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ba1b81ee69573fe7124881762bb4cd2e4b6ed9dd28c9c60a632902fe8db8b38"}, + {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe7e1c262d3392afcf5071df9afa574544f28eac825284596ac6db56e6d11062"}, + {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f36397bf3f7d7c6a3abdea815ecf6fd14e7fcd4418ab24bae01008d8d8ca15e"}, + {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:252a03f1bdddce077eff2354c3861bf437c892fb1832f75ce813ee94347aa9b5"}, + {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:85ec677246533e27770b0de5cf0f9d6e4ec0c212a1f89dfc941b64b21226009d"}, + {file = "Pillow-9.5.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b416f03d37d27290cb93597335a2f85ed446731200705b22bb927405320de903"}, + {file = "Pillow-9.5.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1781a624c229cb35a2ac31cc4a77e28cafc8900733a864870c49bfeedacd106a"}, + {file = "Pillow-9.5.0-cp310-cp310-win32.whl", hash = "sha256:8507eda3cd0608a1f94f58c64817e83ec12fa93a9436938b191b80d9e4c0fc44"}, + {file = "Pillow-9.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:d3c6b54e304c60c4181da1c9dadf83e4a54fd266a99c70ba646a9baa626819eb"}, + {file = "Pillow-9.5.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:7ec6f6ce99dab90b52da21cf0dc519e21095e332ff3b399a357c187b1a5eee32"}, + {file = "Pillow-9.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:560737e70cb9c6255d6dcba3de6578a9e2ec4b573659943a5e7e4af13f298f5c"}, + {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e88745a55b88a7c64fa49bceff363a1a27d9a64e04019c2281049444a571e3"}, + {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d9c206c29b46cfd343ea7cdfe1232443072bbb270d6a46f59c259460db76779a"}, + {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cfcc2c53c06f2ccb8976fb5c71d448bdd0a07d26d8e07e321c103416444c7ad1"}, + {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:a0f9bb6c80e6efcde93ffc51256d5cfb2155ff8f78292f074f60f9e70b942d99"}, + {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:8d935f924bbab8f0a9a28404422da8af4904e36d5c33fc6f677e4c4485515625"}, + {file = "Pillow-9.5.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fed1e1cf6a42577953abbe8e6cf2fe2f566daebde7c34724ec8803c4c0cda579"}, + {file = "Pillow-9.5.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c1170d6b195555644f0616fd6ed929dfcf6333b8675fcca044ae5ab110ded296"}, + {file = "Pillow-9.5.0-cp311-cp311-win32.whl", hash = "sha256:54f7102ad31a3de5666827526e248c3530b3a33539dbda27c6843d19d72644ec"}, + {file = "Pillow-9.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfa4561277f677ecf651e2b22dc43e8f5368b74a25a8f7d1d4a3a243e573f2d4"}, + {file = "Pillow-9.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:965e4a05ef364e7b973dd17fc765f42233415974d773e82144c9bbaaaea5d089"}, + {file = "Pillow-9.5.0-cp312-cp312-win32.whl", hash = "sha256:22baf0c3cf0c7f26e82d6e1adf118027afb325e703922c8dfc1d5d0156bb2eeb"}, + {file = "Pillow-9.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:432b975c009cf649420615388561c0ce7cc31ce9b2e374db659ee4f7d57a1f8b"}, + {file = "Pillow-9.5.0-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:5d4ebf8e1db4441a55c509c4baa7a0587a0210f7cd25fcfe74dbbce7a4bd1906"}, + {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:375f6e5ee9620a271acb6820b3d1e94ffa8e741c0601db4c0c4d3cb0a9c224bf"}, + {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:99eb6cafb6ba90e436684e08dad8be1637efb71c4f2180ee6b8f940739406e78"}, + {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dfaaf10b6172697b9bceb9a3bd7b951819d1ca339a5ef294d1f1ac6d7f63270"}, + {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:763782b2e03e45e2c77d7779875f4432e25121ef002a41829d8868700d119392"}, + {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:35f6e77122a0c0762268216315bf239cf52b88865bba522999dc38f1c52b9b47"}, + {file = "Pillow-9.5.0-cp37-cp37m-win32.whl", hash = "sha256:aca1c196f407ec7cf04dcbb15d19a43c507a81f7ffc45b690899d6a76ac9fda7"}, + {file = "Pillow-9.5.0-cp37-cp37m-win_amd64.whl", hash = "sha256:322724c0032af6692456cd6ed554bb85f8149214d97398bb80613b04e33769f6"}, + {file = "Pillow-9.5.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:a0aa9417994d91301056f3d0038af1199eb7adc86e646a36b9e050b06f526597"}, + {file = "Pillow-9.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f8286396b351785801a976b1e85ea88e937712ee2c3ac653710a4a57a8da5d9c"}, + {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c830a02caeb789633863b466b9de10c015bded434deb3ec87c768e53752ad22a"}, + {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fbd359831c1657d69bb81f0db962905ee05e5e9451913b18b831febfe0519082"}, + {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8fc330c3370a81bbf3f88557097d1ea26cd8b019d6433aa59f71195f5ddebbf"}, + {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:7002d0797a3e4193c7cdee3198d7c14f92c0836d6b4a3f3046a64bd1ce8df2bf"}, + {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:229e2c79c00e85989a34b5981a2b67aa079fd08c903f0aaead522a1d68d79e51"}, + {file = "Pillow-9.5.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9adf58f5d64e474bed00d69bcd86ec4bcaa4123bfa70a65ce72e424bfb88ed96"}, + {file = "Pillow-9.5.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:662da1f3f89a302cc22faa9f14a262c2e3951f9dbc9617609a47521c69dd9f8f"}, + {file = "Pillow-9.5.0-cp38-cp38-win32.whl", hash = "sha256:6608ff3bf781eee0cd14d0901a2b9cc3d3834516532e3bd673a0a204dc8615fc"}, + {file = "Pillow-9.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:e49eb4e95ff6fd7c0c402508894b1ef0e01b99a44320ba7d8ecbabefddcc5569"}, + {file = "Pillow-9.5.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:482877592e927fd263028c105b36272398e3e1be3269efda09f6ba21fd83ec66"}, + {file = "Pillow-9.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3ded42b9ad70e5f1754fb7c2e2d6465a9c842e41d178f262e08b8c85ed8a1d8e"}, + {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c446d2245ba29820d405315083d55299a796695d747efceb5717a8b450324115"}, + {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8aca1152d93dcc27dc55395604dcfc55bed5f25ef4c98716a928bacba90d33a3"}, + {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:608488bdcbdb4ba7837461442b90ea6f3079397ddc968c31265c1e056964f1ef"}, + {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:60037a8db8750e474af7ffc9faa9b5859e6c6d0a50e55c45576bf28be7419705"}, + {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:07999f5834bdc404c442146942a2ecadd1cb6292f5229f4ed3b31e0a108746b1"}, + {file = "Pillow-9.5.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a127ae76092974abfbfa38ca2d12cbeddcdeac0fb71f9627cc1135bedaf9d51a"}, + {file = "Pillow-9.5.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:489f8389261e5ed43ac8ff7b453162af39c3e8abd730af8363587ba64bb2e865"}, + {file = "Pillow-9.5.0-cp39-cp39-win32.whl", hash = "sha256:9b1af95c3a967bf1da94f253e56b6286b50af23392a886720f563c547e48e964"}, + {file = "Pillow-9.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:77165c4a5e7d5a284f10a6efaa39a0ae8ba839da344f20b111d62cc932fa4e5d"}, + {file = "Pillow-9.5.0-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:833b86a98e0ede388fa29363159c9b1a294b0905b5128baf01db683672f230f5"}, + {file = "Pillow-9.5.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aaf305d6d40bd9632198c766fb64f0c1a83ca5b667f16c1e79e1661ab5060140"}, + {file = "Pillow-9.5.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0852ddb76d85f127c135b6dd1f0bb88dbb9ee990d2cd9aa9e28526c93e794fba"}, + {file = "Pillow-9.5.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:91ec6fe47b5eb5a9968c79ad9ed78c342b1f97a091677ba0e012701add857829"}, + {file = "Pillow-9.5.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:cb841572862f629b99725ebaec3287fc6d275be9b14443ea746c1dd325053cbd"}, + {file = "Pillow-9.5.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:c380b27d041209b849ed246b111b7c166ba36d7933ec6e41175fd15ab9eb1572"}, + {file = "Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c9af5a3b406a50e313467e3565fc99929717f780164fe6fbb7704edba0cebbe"}, + {file = "Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5671583eab84af046a397d6d0ba25343c00cd50bce03787948e0fff01d4fd9b1"}, + {file = "Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:84a6f19ce086c1bf894644b43cd129702f781ba5751ca8572f08aa40ef0ab7b7"}, + {file = "Pillow-9.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:1e7723bd90ef94eda669a3c2c19d549874dd5badaeefabefd26053304abe5799"}, + {file = "Pillow-9.5.0.tar.gz", hash = "sha256:bf548479d336726d7a0eceb6e767e179fbde37833ae42794602631a070d630f1"}, +] + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"] +tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] + [[package]] name = "pillow" version = "10.3.0" @@ -2715,6 +2865,9 @@ files = [ {file = "platformdirs-3.11.0.tar.gz", hash = "sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3"}, ] +[package.dependencies] +typing-extensions = {version = ">=4.7.1", markers = "python_version < \"3.8\""} + [package.extras] docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"] test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"] @@ -2730,10 +2883,45 @@ files = [ {file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, ] +[package.dependencies] +importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} + [package.extras] dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "polars" +version = "0.18.4" +description = "Blazingly fast DataFrame library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "polars-0.18.4-cp37-abi3-macosx_10_7_x86_64.whl", hash = "sha256:3adfd39f84387f8589735e5c57f466c7ba19812140bc64248b9602755915c52f"}, + {file = "polars-0.18.4-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:5658f9751d93451549ecf429eb6486b203a86130132310c520cd1336d15ca258"}, + {file = "polars-0.18.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4bbc04db1d765f7cad287204a014e8e10bb2245f1910e26cd99964333e3682c6"}, + {file = "polars-0.18.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9117544d86542954588e295127f3892c15e09db04c474a0d8d830735154a54c"}, + {file = "polars-0.18.4-cp37-abi3-win_amd64.whl", hash = "sha256:a033ee71d8fde63ac71c7579230d31372cdaddf1df4227a537d96b91a58abd29"}, + {file = "polars-0.18.4.tar.gz", hash = "sha256:136d8cdbf3c1ec33ab577536ac35a10701ec3dfd21b54cb757ee9b0e0f525a85"}, +] + +[package.dependencies] +typing_extensions = {version = ">=4.0.1", markers = "python_version < \"3.8\""} + +[package.extras] +all = ["polars[connectorx,deltalake,fsspec,matplotlib,numpy,pandas,pyarrow,sqlalchemy,timezone,xlsx2csv,xlsxwriter]"] +connectorx = ["connectorx"] +deltalake = ["deltalake (>=0.8.0)"] +fsspec = ["fsspec"] +matplotlib = ["matplotlib"] +numpy = ["numpy (>=1.16.0)"] +pandas = ["pandas", "pyarrow (>=7.0.0)"] +pyarrow = ["pyarrow (>=7.0.0)"] +sqlalchemy = ["pandas", "sqlalchemy"] +timezone = ["backports.zoneinfo", "tzdata"] +xlsx2csv = ["xlsx2csv (>=0.8.0)"] +xlsxwriter = ["xlsxwriter"] + [[package]] name = "polars" version = "1.8.2" @@ -2789,6 +2977,7 @@ files = [ [package.dependencies] cfgv = ">=2.0.0" identify = ">=1.0.0" +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} nodeenv = ">=0.11.1" pyyaml = ">=5.1" toml = "*" @@ -3120,6 +3309,7 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} iniconfig = "*" packaging = "*" pluggy = ">=0.12,<2.0" @@ -3507,6 +3697,7 @@ files = [ pyyaml = "*" requests = ">=2.30.0,<3.0" types-PyYAML = "*" +typing-extensions = {version = "*", markers = "python_version < \"3.8\""} urllib3 = ">=1.25.10,<3.0" [package.extras] @@ -4160,6 +4351,17 @@ files = [ {file = "textwrap3-0.9.2.zip", hash = "sha256:5008eeebdb236f6303dcd68f18b856d355f6197511d952ba74bc75e40e0c3414"}, ] +[[package]] +name = "threadpoolctl" +version = "3.1.0" +description = "threadpoolctl" +optional = true +python-versions = ">=3.6" +files = [ + {file = "threadpoolctl-3.1.0-py3-none-any.whl", hash = "sha256:8b99adda265feb6773280df41eece7b2e6561b772d21ffd52e372f999024907b"}, + {file = "threadpoolctl-3.1.0.tar.gz", hash = "sha256:a335baacfaa4400ae1f0d8e3a58d6674d2f8828e3716bb2802c44955ad391380"}, +] + [[package]] name = "threadpoolctl" version = "3.5.0" @@ -4280,6 +4482,7 @@ files = [ [package.dependencies] fs = "*" fsspec = "*" +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} numpy = "*" pandas = ">=1.2.0" pyarrow = "*" @@ -4310,6 +4513,56 @@ rfc3986 = ">=1.4.0" rich = ">=12.0.0" urllib3 = ">=1.26.0" +[[package]] +name = "typed-ast" +version = "1.5.5" +description = "a fork of Python 2 and 3 ast modules with type comment support" +optional = false +python-versions = ">=3.6" +files = [ + {file = "typed_ast-1.5.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4bc1efe0ce3ffb74784e06460f01a223ac1f6ab31c6bc0376a21184bf5aabe3b"}, + {file = "typed_ast-1.5.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5f7a8c46a8b333f71abd61d7ab9255440d4a588f34a21f126bbfc95f6049e686"}, + {file = "typed_ast-1.5.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:597fc66b4162f959ee6a96b978c0435bd63791e31e4f410622d19f1686d5e769"}, + {file = "typed_ast-1.5.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d41b7a686ce653e06c2609075d397ebd5b969d821b9797d029fccd71fdec8e04"}, + {file = "typed_ast-1.5.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5fe83a9a44c4ce67c796a1b466c270c1272e176603d5e06f6afbc101a572859d"}, + {file = "typed_ast-1.5.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d5c0c112a74c0e5db2c75882a0adf3133adedcdbfd8cf7c9d6ed77365ab90a1d"}, + {file = "typed_ast-1.5.5-cp310-cp310-win_amd64.whl", hash = "sha256:e1a976ed4cc2d71bb073e1b2a250892a6e968ff02aa14c1f40eba4f365ffec02"}, + {file = "typed_ast-1.5.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c631da9710271cb67b08bd3f3813b7af7f4c69c319b75475436fcab8c3d21bee"}, + {file = "typed_ast-1.5.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b445c2abfecab89a932b20bd8261488d574591173d07827c1eda32c457358b18"}, + {file = "typed_ast-1.5.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc95ffaaab2be3b25eb938779e43f513e0e538a84dd14a5d844b8f2932593d88"}, + {file = "typed_ast-1.5.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61443214d9b4c660dcf4b5307f15c12cb30bdfe9588ce6158f4a005baeb167b2"}, + {file = "typed_ast-1.5.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6eb936d107e4d474940469e8ec5b380c9b329b5f08b78282d46baeebd3692dc9"}, + {file = "typed_ast-1.5.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e48bf27022897577d8479eaed64701ecaf0467182448bd95759883300ca818c8"}, + {file = "typed_ast-1.5.5-cp311-cp311-win_amd64.whl", hash = "sha256:83509f9324011c9a39faaef0922c6f720f9623afe3fe220b6d0b15638247206b"}, + {file = "typed_ast-1.5.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:44f214394fc1af23ca6d4e9e744804d890045d1643dd7e8229951e0ef39429b5"}, + {file = "typed_ast-1.5.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:118c1ce46ce58fda78503eae14b7664163aa735b620b64b5b725453696f2a35c"}, + {file = "typed_ast-1.5.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be4919b808efa61101456e87f2d4c75b228f4e52618621c77f1ddcaae15904fa"}, + {file = "typed_ast-1.5.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:fc2b8c4e1bc5cd96c1a823a885e6b158f8451cf6f5530e1829390b4d27d0807f"}, + {file = "typed_ast-1.5.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:16f7313e0a08c7de57f2998c85e2a69a642e97cb32f87eb65fbfe88381a5e44d"}, + {file = "typed_ast-1.5.5-cp36-cp36m-win_amd64.whl", hash = "sha256:2b946ef8c04f77230489f75b4b5a4a6f24c078be4aed241cfabe9cbf4156e7e5"}, + {file = "typed_ast-1.5.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2188bc33d85951ea4ddad55d2b35598b2709d122c11c75cffd529fbc9965508e"}, + {file = "typed_ast-1.5.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0635900d16ae133cab3b26c607586131269f88266954eb04ec31535c9a12ef1e"}, + {file = "typed_ast-1.5.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57bfc3cf35a0f2fdf0a88a3044aafaec1d2f24d8ae8cd87c4f58d615fb5b6311"}, + {file = "typed_ast-1.5.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:fe58ef6a764de7b4b36edfc8592641f56e69b7163bba9f9c8089838ee596bfb2"}, + {file = "typed_ast-1.5.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d09d930c2d1d621f717bb217bf1fe2584616febb5138d9b3e8cdd26506c3f6d4"}, + {file = "typed_ast-1.5.5-cp37-cp37m-win_amd64.whl", hash = "sha256:d40c10326893ecab8a80a53039164a224984339b2c32a6baf55ecbd5b1df6431"}, + {file = "typed_ast-1.5.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fd946abf3c31fb50eee07451a6aedbfff912fcd13cf357363f5b4e834cc5e71a"}, + {file = "typed_ast-1.5.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ed4a1a42df8a3dfb6b40c3d2de109e935949f2f66b19703eafade03173f8f437"}, + {file = "typed_ast-1.5.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:045f9930a1550d9352464e5149710d56a2aed23a2ffe78946478f7b5416f1ede"}, + {file = "typed_ast-1.5.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:381eed9c95484ceef5ced626355fdc0765ab51d8553fec08661dce654a935db4"}, + {file = "typed_ast-1.5.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:bfd39a41c0ef6f31684daff53befddae608f9daf6957140228a08e51f312d7e6"}, + {file = "typed_ast-1.5.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8c524eb3024edcc04e288db9541fe1f438f82d281e591c548903d5b77ad1ddd4"}, + {file = "typed_ast-1.5.5-cp38-cp38-win_amd64.whl", hash = "sha256:7f58fabdde8dcbe764cef5e1a7fcb440f2463c1bbbec1cf2a86ca7bc1f95184b"}, + {file = "typed_ast-1.5.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:042eb665ff6bf020dd2243307d11ed626306b82812aba21836096d229fdc6a10"}, + {file = "typed_ast-1.5.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:622e4a006472b05cf6ef7f9f2636edc51bda670b7bbffa18d26b255269d3d814"}, + {file = "typed_ast-1.5.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1efebbbf4604ad1283e963e8915daa240cb4bf5067053cf2f0baadc4d4fb51b8"}, + {file = "typed_ast-1.5.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f0aefdd66f1784c58f65b502b6cf8b121544680456d1cebbd300c2c813899274"}, + {file = "typed_ast-1.5.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:48074261a842acf825af1968cd912f6f21357316080ebaca5f19abbb11690c8a"}, + {file = "typed_ast-1.5.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:429ae404f69dc94b9361bb62291885894b7c6fb4640d561179548c849f8492ba"}, + {file = "typed_ast-1.5.5-cp39-cp39-win_amd64.whl", hash = "sha256:335f22ccb244da2b5c296e6f96b06ee9bed46526db0de38d2f0e5a6597b81155"}, + {file = "typed_ast-1.5.5.tar.gz", hash = "sha256:94282f7a354f36ef5dbce0ef3467ebf6a258e370ab33d5b40c249fa996e590dd"}, +] + [[package]] name = "types-protobuf" version = "4.24.0.4" @@ -4421,6 +4674,7 @@ files = [ appdirs = ">=1.4.3,<2" distlib = ">=0.3.1,<1" filelock = ">=3.0.0,<4" +importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} six = ">=1.9.0,<2" [package.extras] @@ -4499,9 +4753,6 @@ files = [ {file = "whylogs_sketching-3.4.1.dev3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ba536fca5f9578fa34d106c243fdccfef7d75b9d1fffb9d93df0debfe8e3ebc"}, {file = "whylogs_sketching-3.4.1.dev3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:afa843c68cafa08e82624e6a33d13ab7f00ad0301101960872fe152d5af5ab53"}, {file = "whylogs_sketching-3.4.1.dev3-cp311-cp311-win_amd64.whl", hash = "sha256:303d55c37565340c2d21c268c64a712fad612504cc4b98b1d1df848cac6d934f"}, - {file = "whylogs_sketching-3.4.1.dev3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b636cebf5f4d7724437616368199c8e7b153f89dfd396f9e8279a95bf55d817"}, - {file = "whylogs_sketching-3.4.1.dev3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba4519780defebb35c4718ecc13d1b8c38894be722147a047e67b953cd2430ab"}, - {file = "whylogs_sketching-3.4.1.dev3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b4606e5360ce922e6ad770e845c75038d873300fd8a54ea856e99003b3254fc9"}, {file = "whylogs_sketching-3.4.1.dev3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:9d65fcf8dade1affe50181582b8894929993e37d7daa922d973a811790cd0208"}, {file = "whylogs_sketching-3.4.1.dev3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4845e77c208ae64ada9170e1b92ed0abe28fe311c0fc35f9d8efa6926211ca2"}, {file = "whylogs_sketching-3.4.1.dev3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:02cac1c87ac42d7fc7e6597862ac50bc035825988d21e8a2d763b416e83e845f"}, @@ -4647,5 +4898,5 @@ viz = ["Pillow", "Pillow", "ipython", "numpy", "numpy", "pybars3", "scipy", "sci [metadata] lock-version = "2.0" -python-versions = ">=3.8, <4" -content-hash = "b24a566dfd5a4d794ca6cf690e643e5999e49270861d09c0df149aa7b9f0c5e4" +python-versions = ">=3.7.1, <4" +content-hash = "0cd70d40783143b19e150ae4419e0e7957ea67e27288ce88538bbf42c2253900" diff --git a/python/pyproject.toml b/python/pyproject.toml index 0c1c154f04..624f9cdc02 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -11,7 +11,7 @@ include = ["whylogs/core/proto/v0/*.py*", "whylogs/core/proto/*.py*"] [tool.poetry.dependencies] # core dependencies. Be REALLY mindful when touching this list -python = ">=3.8, <4" +python = ">=3.7.1, <4" whylogs-sketching = ">=3.4.1.dev3" protobuf = ">=3.19.4" importlib-metadata = { version = "<4.3", python = "<3.8" } From 1aad5ee006b430a38e3d5d53d5daac80b6b7f30f Mon Sep 17 00:00:00 2001 From: Richard Rogers Date: Fri, 1 Nov 2024 16:36:56 +0000 Subject: [PATCH 07/41] CI --- .github/workflows/whylogs-ci.yml | 2 +- python/poetry.lock | 258 +------------------------------ python/pyproject.toml | 4 +- 3 files changed, 5 insertions(+), 259 deletions(-) diff --git a/.github/workflows/whylogs-ci.yml b/.github/workflows/whylogs-ci.yml index 5ccc86d680..c148fc7de0 100644 --- a/.github/workflows/whylogs-ci.yml +++ b/.github/workflows/whylogs-ci.yml @@ -24,7 +24,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macOS-latest] - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v4 diff --git a/python/poetry.lock b/python/poetry.lock index 6579c64a64..8a2af47b33 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -97,7 +97,6 @@ files = [ [package.dependencies] lazy-object-proxy = ">=1.4.0" setuptools = ">=20.0" -typed-ast = {version = ">=1.4.0,<2.0", markers = "implementation_name == \"cpython\" and python_version < \"3.8\""} typing-extensions = {version = ">=3.10", markers = "python_version < \"3.10\""} wrapt = ">=1.11,<2" @@ -112,9 +111,6 @@ files = [ {file = "attrs-23.2.0.tar.gz", hash = "sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30"}, ] -[package.dependencies] -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} - [package.extras] cov = ["attrs[tests]", "coverage[toml] (>=5.3)"] dev = ["attrs[tests]", "pre-commit"] @@ -225,7 +221,6 @@ mypy-extensions = ">=0.4.3" pathspec = ">=0.9.0" platformdirs = ">=2" tomli = {version = ">=1.1.0", markers = "python_full_version < \"3.11.0a7\""} -typed-ast = {version = ">=1.4.2", markers = "python_version < \"3.8\" and implementation_name == \"cpython\""} typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} [package.extras] @@ -525,18 +520,6 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} - -[[package]] -name = "cloudpickle" -version = "2.2.1" -description = "Extended pickling support for Python objects" -optional = true -python-versions = ">=3.6" -files = [ - {file = "cloudpickle-2.2.1-py3-none-any.whl", hash = "sha256:61f594d1f4c295fa5cd9014ceb3a1fc4a70b0de1164b94fbc2d854ccba056f9f"}, - {file = "cloudpickle-2.2.1.tar.gz", hash = "sha256:d89684b8de9e34a2a43b3460fbca07d09d6e25ce858df4d5a44240403b6178f5"}, -] [[package]] name = "cloudpickle" @@ -989,7 +972,6 @@ files = [ ] [package.dependencies] -importlib-metadata = {version = ">=1.1.0,<4.3", markers = "python_version < \"3.8\""} mccabe = ">=0.7.0,<0.8.0" pycodestyle = ">=2.9.0,<2.10.0" pyflakes = ">=2.5.0,<2.6.0" @@ -1159,7 +1141,6 @@ files = [ [package.dependencies] gitdb = ">=4.0.1,<5" -typing-extensions = {version = ">=3.7.4.3", markers = "python_version < \"3.8\""} [package.extras] doc = ["sphinx (==4.3.2)", "sphinx-autodoc-typehints", "sphinx-rtd-theme", "sphinxcontrib-applehelp (>=1.0.2,<=1.0.4)", "sphinxcontrib-devhelp (==1.0.2)", "sphinxcontrib-htmlhelp (>=2.0.0,<=2.0.1)", "sphinxcontrib-qthelp (==1.0.3)", "sphinxcontrib-serializinghtml (==1.1.5)"] @@ -1225,7 +1206,6 @@ files = [ [package.dependencies] google-api-core = ">=1.31.6,<2.0.dev0 || >2.3.0,<3.0.0dev" google-auth = ">=1.25.0,<3.0dev" -importlib-metadata = {version = ">1.0.0", markers = "python_version < \"3.8\""} [package.extras] grpc = ["grpcio (>=1.38.0,<2.0dev)", "grpcio-status (>=1.38.0,<2.0.dev0)"] @@ -1400,7 +1380,6 @@ files = [ ] [package.dependencies] -typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} zipp = ">=0.5" [package.extras] @@ -1608,17 +1587,6 @@ files = [ {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] -[[package]] -name = "joblib" -version = "1.3.2" -description = "Lightweight pipelining with Python functions" -optional = true -python-versions = ">=3.7" -files = [ - {file = "joblib-1.3.2-py3-none-any.whl", hash = "sha256:ef4331c65f239985f3f2220ecc87db222f08fd22097a3dd5698f693875f8cbb9"}, - {file = "joblib-1.3.2.tar.gz", hash = "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1"}, -] - [[package]] name = "joblib" version = "1.4.2" @@ -1643,11 +1611,9 @@ files = [ [package.dependencies] attrs = ">=17.4.0" -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} importlib-resources = {version = ">=1.4.0", markers = "python_version < \"3.9\""} pkgutil-resolve-name = {version = ">=1.3.10", markers = "python_version < \"3.9\""} pyrsistent = ">=0.14.0,<0.17.0 || >0.17.0,<0.17.1 || >0.17.1,<0.17.2 || >0.17.2" -typing-extensions = {version = "*", markers = "python_version < \"3.8\""} [package.extras] format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] @@ -1801,7 +1767,6 @@ files = [ [package.dependencies] mdurl = ">=0.1,<1.0" -typing_extensions = {version = ">=3.7.4", markers = "python_version < \"3.8\""} [package.extras] benchmarking = ["psutil", "pytest", "pytest-benchmark"] @@ -1948,36 +1913,6 @@ files = [ {file = "mistune-3.0.2.tar.gz", hash = "sha256:fc7f93ded930c92394ef2cb6f04a8aabab4117a91449e72dcc8dfa646a508be8"}, ] -[[package]] -name = "mlflow-skinny" -version = "1.27.0" -description = "MLflow: A Platform for ML Development and Productionization" -optional = true -python-versions = ">=3.7" -files = [ - {file = "mlflow-skinny-1.27.0.tar.gz", hash = "sha256:77240a1bee2d5bbc2bf8eb9609b22fa43a2381067c3728d8c6539702962a268a"}, - {file = "mlflow_skinny-1.27.0-py3-none-any.whl", hash = "sha256:48462b9675a8365d8bd44357db29886098ee2723dbc925e35b16517e1ca43983"}, -] - -[package.dependencies] -click = ">=7.0" -cloudpickle = "*" -databricks-cli = ">=0.8.7" -entrypoints = "*" -gitpython = ">=2.1.0" -importlib-metadata = ">=3.7.0,<4.7.0 || >4.7.0" -packaging = "*" -protobuf = ">=3.12.0" -pytz = "*" -pyyaml = ">=5.1" -requests = ">=2.17.3" - -[package.extras] -aliyun-oss = ["aliyunstoreplugin"] -extras = ["azureml-core (>=1.2.0)", "boto3", "google-cloud-storage (>=1.30.0)", "kubernetes", "mlserver (>=0.5.3)", "mlserver-mlflow (>=0.5.3)", "pyarrow", "pysftp", "scikit-learn", "virtualenv"] -pipelines = ["Jinja2 (>=3.0)", "ipython (>=7.0)", "markdown (>=3.3)", "pandas-profiling (>=3.1)", "pyarrow (>=7.0)", "scikit-learn (>=1.0)", "shap (>=0.40)"] -sqlserver = ["mlflow-dbstore"] - [[package]] name = "mlflow-skinny" version = "2.13.0" @@ -2041,7 +1976,6 @@ files = [ boto3 = ">=1.9.201" botocore = ">=1.12.201" cryptography = ">=3.3.1" -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} Jinja2 = ">=2.10.1" python-dateutil = ">=2.1,<3.0.0" requests = ">=2.5" @@ -2113,7 +2047,6 @@ files = [ [package.dependencies] mypy-extensions = ">=0.4.3" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typed-ast = {version = ">=1.4.0,<2", markers = "python_version < \"3.8\""} typing-extensions = ">=3.10" [package.extras] @@ -2247,7 +2180,6 @@ files = [ [package.dependencies] fastjsonschema = "*" -importlib-metadata = {version = ">=3.6", markers = "python_version < \"3.8\""} jsonschema = ">=2.6" jupyter-core = "*" traitlets = ">=5.1" @@ -2566,9 +2498,6 @@ files = [ {file = "pandas_stubs-1.2.0.62-py3-none-any.whl", hash = "sha256:32a9e04582173104d42c090135efacc64d70e08c003405455b7dfb1540bd7e6c"}, ] -[package.dependencies] -typing-extensions = {version = ">=3.7.4.3", markers = "python_version < \"3.8\""} - [[package]] name = "pandocfilters" version = "1.5.1" @@ -2664,85 +2593,6 @@ files = [ {file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"}, ] -[[package]] -name = "pillow" -version = "9.5.0" -description = "Python Imaging Library (Fork)" -optional = true -python-versions = ">=3.7" -files = [ - {file = "Pillow-9.5.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:ace6ca218308447b9077c14ea4ef381ba0b67ee78d64046b3f19cf4e1139ad16"}, - {file = "Pillow-9.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d3d403753c9d5adc04d4694d35cf0391f0f3d57c8e0030aac09d7678fa8030aa"}, - {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ba1b81ee69573fe7124881762bb4cd2e4b6ed9dd28c9c60a632902fe8db8b38"}, - {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe7e1c262d3392afcf5071df9afa574544f28eac825284596ac6db56e6d11062"}, - {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f36397bf3f7d7c6a3abdea815ecf6fd14e7fcd4418ab24bae01008d8d8ca15e"}, - {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:252a03f1bdddce077eff2354c3861bf437c892fb1832f75ce813ee94347aa9b5"}, - {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:85ec677246533e27770b0de5cf0f9d6e4ec0c212a1f89dfc941b64b21226009d"}, - {file = "Pillow-9.5.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b416f03d37d27290cb93597335a2f85ed446731200705b22bb927405320de903"}, - {file = "Pillow-9.5.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1781a624c229cb35a2ac31cc4a77e28cafc8900733a864870c49bfeedacd106a"}, - {file = "Pillow-9.5.0-cp310-cp310-win32.whl", hash = "sha256:8507eda3cd0608a1f94f58c64817e83ec12fa93a9436938b191b80d9e4c0fc44"}, - {file = "Pillow-9.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:d3c6b54e304c60c4181da1c9dadf83e4a54fd266a99c70ba646a9baa626819eb"}, - {file = "Pillow-9.5.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:7ec6f6ce99dab90b52da21cf0dc519e21095e332ff3b399a357c187b1a5eee32"}, - {file = "Pillow-9.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:560737e70cb9c6255d6dcba3de6578a9e2ec4b573659943a5e7e4af13f298f5c"}, - {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e88745a55b88a7c64fa49bceff363a1a27d9a64e04019c2281049444a571e3"}, - {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d9c206c29b46cfd343ea7cdfe1232443072bbb270d6a46f59c259460db76779a"}, - {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cfcc2c53c06f2ccb8976fb5c71d448bdd0a07d26d8e07e321c103416444c7ad1"}, - {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:a0f9bb6c80e6efcde93ffc51256d5cfb2155ff8f78292f074f60f9e70b942d99"}, - {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:8d935f924bbab8f0a9a28404422da8af4904e36d5c33fc6f677e4c4485515625"}, - {file = "Pillow-9.5.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fed1e1cf6a42577953abbe8e6cf2fe2f566daebde7c34724ec8803c4c0cda579"}, - {file = "Pillow-9.5.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c1170d6b195555644f0616fd6ed929dfcf6333b8675fcca044ae5ab110ded296"}, - {file = "Pillow-9.5.0-cp311-cp311-win32.whl", hash = "sha256:54f7102ad31a3de5666827526e248c3530b3a33539dbda27c6843d19d72644ec"}, - {file = "Pillow-9.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfa4561277f677ecf651e2b22dc43e8f5368b74a25a8f7d1d4a3a243e573f2d4"}, - {file = "Pillow-9.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:965e4a05ef364e7b973dd17fc765f42233415974d773e82144c9bbaaaea5d089"}, - {file = "Pillow-9.5.0-cp312-cp312-win32.whl", hash = "sha256:22baf0c3cf0c7f26e82d6e1adf118027afb325e703922c8dfc1d5d0156bb2eeb"}, - {file = "Pillow-9.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:432b975c009cf649420615388561c0ce7cc31ce9b2e374db659ee4f7d57a1f8b"}, - {file = "Pillow-9.5.0-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:5d4ebf8e1db4441a55c509c4baa7a0587a0210f7cd25fcfe74dbbce7a4bd1906"}, - {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:375f6e5ee9620a271acb6820b3d1e94ffa8e741c0601db4c0c4d3cb0a9c224bf"}, - {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:99eb6cafb6ba90e436684e08dad8be1637efb71c4f2180ee6b8f940739406e78"}, - {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dfaaf10b6172697b9bceb9a3bd7b951819d1ca339a5ef294d1f1ac6d7f63270"}, - {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:763782b2e03e45e2c77d7779875f4432e25121ef002a41829d8868700d119392"}, - {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:35f6e77122a0c0762268216315bf239cf52b88865bba522999dc38f1c52b9b47"}, - {file = "Pillow-9.5.0-cp37-cp37m-win32.whl", hash = "sha256:aca1c196f407ec7cf04dcbb15d19a43c507a81f7ffc45b690899d6a76ac9fda7"}, - {file = "Pillow-9.5.0-cp37-cp37m-win_amd64.whl", hash = "sha256:322724c0032af6692456cd6ed554bb85f8149214d97398bb80613b04e33769f6"}, - {file = "Pillow-9.5.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:a0aa9417994d91301056f3d0038af1199eb7adc86e646a36b9e050b06f526597"}, - {file = "Pillow-9.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f8286396b351785801a976b1e85ea88e937712ee2c3ac653710a4a57a8da5d9c"}, - {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c830a02caeb789633863b466b9de10c015bded434deb3ec87c768e53752ad22a"}, - {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fbd359831c1657d69bb81f0db962905ee05e5e9451913b18b831febfe0519082"}, - {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8fc330c3370a81bbf3f88557097d1ea26cd8b019d6433aa59f71195f5ddebbf"}, - {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:7002d0797a3e4193c7cdee3198d7c14f92c0836d6b4a3f3046a64bd1ce8df2bf"}, - {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:229e2c79c00e85989a34b5981a2b67aa079fd08c903f0aaead522a1d68d79e51"}, - {file = "Pillow-9.5.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9adf58f5d64e474bed00d69bcd86ec4bcaa4123bfa70a65ce72e424bfb88ed96"}, - {file = "Pillow-9.5.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:662da1f3f89a302cc22faa9f14a262c2e3951f9dbc9617609a47521c69dd9f8f"}, - {file = "Pillow-9.5.0-cp38-cp38-win32.whl", hash = "sha256:6608ff3bf781eee0cd14d0901a2b9cc3d3834516532e3bd673a0a204dc8615fc"}, - {file = "Pillow-9.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:e49eb4e95ff6fd7c0c402508894b1ef0e01b99a44320ba7d8ecbabefddcc5569"}, - {file = "Pillow-9.5.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:482877592e927fd263028c105b36272398e3e1be3269efda09f6ba21fd83ec66"}, - {file = "Pillow-9.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3ded42b9ad70e5f1754fb7c2e2d6465a9c842e41d178f262e08b8c85ed8a1d8e"}, - {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c446d2245ba29820d405315083d55299a796695d747efceb5717a8b450324115"}, - {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8aca1152d93dcc27dc55395604dcfc55bed5f25ef4c98716a928bacba90d33a3"}, - {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:608488bdcbdb4ba7837461442b90ea6f3079397ddc968c31265c1e056964f1ef"}, - {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:60037a8db8750e474af7ffc9faa9b5859e6c6d0a50e55c45576bf28be7419705"}, - {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:07999f5834bdc404c442146942a2ecadd1cb6292f5229f4ed3b31e0a108746b1"}, - {file = "Pillow-9.5.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a127ae76092974abfbfa38ca2d12cbeddcdeac0fb71f9627cc1135bedaf9d51a"}, - {file = "Pillow-9.5.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:489f8389261e5ed43ac8ff7b453162af39c3e8abd730af8363587ba64bb2e865"}, - {file = "Pillow-9.5.0-cp39-cp39-win32.whl", hash = "sha256:9b1af95c3a967bf1da94f253e56b6286b50af23392a886720f563c547e48e964"}, - {file = "Pillow-9.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:77165c4a5e7d5a284f10a6efaa39a0ae8ba839da344f20b111d62cc932fa4e5d"}, - {file = "Pillow-9.5.0-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:833b86a98e0ede388fa29363159c9b1a294b0905b5128baf01db683672f230f5"}, - {file = "Pillow-9.5.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aaf305d6d40bd9632198c766fb64f0c1a83ca5b667f16c1e79e1661ab5060140"}, - {file = "Pillow-9.5.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0852ddb76d85f127c135b6dd1f0bb88dbb9ee990d2cd9aa9e28526c93e794fba"}, - {file = "Pillow-9.5.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:91ec6fe47b5eb5a9968c79ad9ed78c342b1f97a091677ba0e012701add857829"}, - {file = "Pillow-9.5.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:cb841572862f629b99725ebaec3287fc6d275be9b14443ea746c1dd325053cbd"}, - {file = "Pillow-9.5.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:c380b27d041209b849ed246b111b7c166ba36d7933ec6e41175fd15ab9eb1572"}, - {file = "Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c9af5a3b406a50e313467e3565fc99929717f780164fe6fbb7704edba0cebbe"}, - {file = "Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5671583eab84af046a397d6d0ba25343c00cd50bce03787948e0fff01d4fd9b1"}, - {file = "Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:84a6f19ce086c1bf894644b43cd129702f781ba5751ca8572f08aa40ef0ab7b7"}, - {file = "Pillow-9.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:1e7723bd90ef94eda669a3c2c19d549874dd5badaeefabefd26053304abe5799"}, - {file = "Pillow-9.5.0.tar.gz", hash = "sha256:bf548479d336726d7a0eceb6e767e179fbde37833ae42794602631a070d630f1"}, -] - -[package.extras] -docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"] -tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] - [[package]] name = "pillow" version = "10.3.0" @@ -2865,9 +2715,6 @@ files = [ {file = "platformdirs-3.11.0.tar.gz", hash = "sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3"}, ] -[package.dependencies] -typing-extensions = {version = ">=4.7.1", markers = "python_version < \"3.8\""} - [package.extras] docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"] test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"] @@ -2883,45 +2730,10 @@ files = [ {file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, ] -[package.dependencies] -importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} - [package.extras] dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] -[[package]] -name = "polars" -version = "0.18.4" -description = "Blazingly fast DataFrame library" -optional = false -python-versions = ">=3.7" -files = [ - {file = "polars-0.18.4-cp37-abi3-macosx_10_7_x86_64.whl", hash = "sha256:3adfd39f84387f8589735e5c57f466c7ba19812140bc64248b9602755915c52f"}, - {file = "polars-0.18.4-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:5658f9751d93451549ecf429eb6486b203a86130132310c520cd1336d15ca258"}, - {file = "polars-0.18.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4bbc04db1d765f7cad287204a014e8e10bb2245f1910e26cd99964333e3682c6"}, - {file = "polars-0.18.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9117544d86542954588e295127f3892c15e09db04c474a0d8d830735154a54c"}, - {file = "polars-0.18.4-cp37-abi3-win_amd64.whl", hash = "sha256:a033ee71d8fde63ac71c7579230d31372cdaddf1df4227a537d96b91a58abd29"}, - {file = "polars-0.18.4.tar.gz", hash = "sha256:136d8cdbf3c1ec33ab577536ac35a10701ec3dfd21b54cb757ee9b0e0f525a85"}, -] - -[package.dependencies] -typing_extensions = {version = ">=4.0.1", markers = "python_version < \"3.8\""} - -[package.extras] -all = ["polars[connectorx,deltalake,fsspec,matplotlib,numpy,pandas,pyarrow,sqlalchemy,timezone,xlsx2csv,xlsxwriter]"] -connectorx = ["connectorx"] -deltalake = ["deltalake (>=0.8.0)"] -fsspec = ["fsspec"] -matplotlib = ["matplotlib"] -numpy = ["numpy (>=1.16.0)"] -pandas = ["pandas", "pyarrow (>=7.0.0)"] -pyarrow = ["pyarrow (>=7.0.0)"] -sqlalchemy = ["pandas", "sqlalchemy"] -timezone = ["backports.zoneinfo", "tzdata"] -xlsx2csv = ["xlsx2csv (>=0.8.0)"] -xlsxwriter = ["xlsxwriter"] - [[package]] name = "polars" version = "1.8.2" @@ -2977,7 +2789,6 @@ files = [ [package.dependencies] cfgv = ">=2.0.0" identify = ">=1.0.0" -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} nodeenv = ">=0.11.1" pyyaml = ">=5.1" toml = "*" @@ -3309,7 +3120,6 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} -importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} iniconfig = "*" packaging = "*" pluggy = ">=0.12,<2.0" @@ -3697,7 +3507,6 @@ files = [ pyyaml = "*" requests = ">=2.30.0,<3.0" types-PyYAML = "*" -typing-extensions = {version = "*", markers = "python_version < \"3.8\""} urllib3 = ">=1.25.10,<3.0" [package.extras] @@ -4351,17 +4160,6 @@ files = [ {file = "textwrap3-0.9.2.zip", hash = "sha256:5008eeebdb236f6303dcd68f18b856d355f6197511d952ba74bc75e40e0c3414"}, ] -[[package]] -name = "threadpoolctl" -version = "3.1.0" -description = "threadpoolctl" -optional = true -python-versions = ">=3.6" -files = [ - {file = "threadpoolctl-3.1.0-py3-none-any.whl", hash = "sha256:8b99adda265feb6773280df41eece7b2e6561b772d21ffd52e372f999024907b"}, - {file = "threadpoolctl-3.1.0.tar.gz", hash = "sha256:a335baacfaa4400ae1f0d8e3a58d6674d2f8828e3716bb2802c44955ad391380"}, -] - [[package]] name = "threadpoolctl" version = "3.5.0" @@ -4482,7 +4280,6 @@ files = [ [package.dependencies] fs = "*" fsspec = "*" -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} numpy = "*" pandas = ">=1.2.0" pyarrow = "*" @@ -4513,56 +4310,6 @@ rfc3986 = ">=1.4.0" rich = ">=12.0.0" urllib3 = ">=1.26.0" -[[package]] -name = "typed-ast" -version = "1.5.5" -description = "a fork of Python 2 and 3 ast modules with type comment support" -optional = false -python-versions = ">=3.6" -files = [ - {file = "typed_ast-1.5.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4bc1efe0ce3ffb74784e06460f01a223ac1f6ab31c6bc0376a21184bf5aabe3b"}, - {file = "typed_ast-1.5.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5f7a8c46a8b333f71abd61d7ab9255440d4a588f34a21f126bbfc95f6049e686"}, - {file = "typed_ast-1.5.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:597fc66b4162f959ee6a96b978c0435bd63791e31e4f410622d19f1686d5e769"}, - {file = "typed_ast-1.5.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d41b7a686ce653e06c2609075d397ebd5b969d821b9797d029fccd71fdec8e04"}, - {file = "typed_ast-1.5.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5fe83a9a44c4ce67c796a1b466c270c1272e176603d5e06f6afbc101a572859d"}, - {file = "typed_ast-1.5.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d5c0c112a74c0e5db2c75882a0adf3133adedcdbfd8cf7c9d6ed77365ab90a1d"}, - {file = "typed_ast-1.5.5-cp310-cp310-win_amd64.whl", hash = "sha256:e1a976ed4cc2d71bb073e1b2a250892a6e968ff02aa14c1f40eba4f365ffec02"}, - {file = "typed_ast-1.5.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c631da9710271cb67b08bd3f3813b7af7f4c69c319b75475436fcab8c3d21bee"}, - {file = "typed_ast-1.5.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b445c2abfecab89a932b20bd8261488d574591173d07827c1eda32c457358b18"}, - {file = "typed_ast-1.5.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc95ffaaab2be3b25eb938779e43f513e0e538a84dd14a5d844b8f2932593d88"}, - {file = "typed_ast-1.5.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61443214d9b4c660dcf4b5307f15c12cb30bdfe9588ce6158f4a005baeb167b2"}, - {file = "typed_ast-1.5.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6eb936d107e4d474940469e8ec5b380c9b329b5f08b78282d46baeebd3692dc9"}, - {file = "typed_ast-1.5.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e48bf27022897577d8479eaed64701ecaf0467182448bd95759883300ca818c8"}, - {file = "typed_ast-1.5.5-cp311-cp311-win_amd64.whl", hash = "sha256:83509f9324011c9a39faaef0922c6f720f9623afe3fe220b6d0b15638247206b"}, - {file = "typed_ast-1.5.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:44f214394fc1af23ca6d4e9e744804d890045d1643dd7e8229951e0ef39429b5"}, - {file = "typed_ast-1.5.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:118c1ce46ce58fda78503eae14b7664163aa735b620b64b5b725453696f2a35c"}, - {file = "typed_ast-1.5.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be4919b808efa61101456e87f2d4c75b228f4e52618621c77f1ddcaae15904fa"}, - {file = "typed_ast-1.5.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:fc2b8c4e1bc5cd96c1a823a885e6b158f8451cf6f5530e1829390b4d27d0807f"}, - {file = "typed_ast-1.5.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:16f7313e0a08c7de57f2998c85e2a69a642e97cb32f87eb65fbfe88381a5e44d"}, - {file = "typed_ast-1.5.5-cp36-cp36m-win_amd64.whl", hash = "sha256:2b946ef8c04f77230489f75b4b5a4a6f24c078be4aed241cfabe9cbf4156e7e5"}, - {file = "typed_ast-1.5.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2188bc33d85951ea4ddad55d2b35598b2709d122c11c75cffd529fbc9965508e"}, - {file = "typed_ast-1.5.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0635900d16ae133cab3b26c607586131269f88266954eb04ec31535c9a12ef1e"}, - {file = "typed_ast-1.5.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57bfc3cf35a0f2fdf0a88a3044aafaec1d2f24d8ae8cd87c4f58d615fb5b6311"}, - {file = "typed_ast-1.5.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:fe58ef6a764de7b4b36edfc8592641f56e69b7163bba9f9c8089838ee596bfb2"}, - {file = "typed_ast-1.5.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d09d930c2d1d621f717bb217bf1fe2584616febb5138d9b3e8cdd26506c3f6d4"}, - {file = "typed_ast-1.5.5-cp37-cp37m-win_amd64.whl", hash = "sha256:d40c10326893ecab8a80a53039164a224984339b2c32a6baf55ecbd5b1df6431"}, - {file = "typed_ast-1.5.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fd946abf3c31fb50eee07451a6aedbfff912fcd13cf357363f5b4e834cc5e71a"}, - {file = "typed_ast-1.5.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ed4a1a42df8a3dfb6b40c3d2de109e935949f2f66b19703eafade03173f8f437"}, - {file = "typed_ast-1.5.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:045f9930a1550d9352464e5149710d56a2aed23a2ffe78946478f7b5416f1ede"}, - {file = "typed_ast-1.5.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:381eed9c95484ceef5ced626355fdc0765ab51d8553fec08661dce654a935db4"}, - {file = "typed_ast-1.5.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:bfd39a41c0ef6f31684daff53befddae608f9daf6957140228a08e51f312d7e6"}, - {file = "typed_ast-1.5.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8c524eb3024edcc04e288db9541fe1f438f82d281e591c548903d5b77ad1ddd4"}, - {file = "typed_ast-1.5.5-cp38-cp38-win_amd64.whl", hash = "sha256:7f58fabdde8dcbe764cef5e1a7fcb440f2463c1bbbec1cf2a86ca7bc1f95184b"}, - {file = "typed_ast-1.5.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:042eb665ff6bf020dd2243307d11ed626306b82812aba21836096d229fdc6a10"}, - {file = "typed_ast-1.5.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:622e4a006472b05cf6ef7f9f2636edc51bda670b7bbffa18d26b255269d3d814"}, - {file = "typed_ast-1.5.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1efebbbf4604ad1283e963e8915daa240cb4bf5067053cf2f0baadc4d4fb51b8"}, - {file = "typed_ast-1.5.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f0aefdd66f1784c58f65b502b6cf8b121544680456d1cebbd300c2c813899274"}, - {file = "typed_ast-1.5.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:48074261a842acf825af1968cd912f6f21357316080ebaca5f19abbb11690c8a"}, - {file = "typed_ast-1.5.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:429ae404f69dc94b9361bb62291885894b7c6fb4640d561179548c849f8492ba"}, - {file = "typed_ast-1.5.5-cp39-cp39-win_amd64.whl", hash = "sha256:335f22ccb244da2b5c296e6f96b06ee9bed46526db0de38d2f0e5a6597b81155"}, - {file = "typed_ast-1.5.5.tar.gz", hash = "sha256:94282f7a354f36ef5dbce0ef3467ebf6a258e370ab33d5b40c249fa996e590dd"}, -] - [[package]] name = "types-protobuf" version = "4.24.0.4" @@ -4674,7 +4421,6 @@ files = [ appdirs = ">=1.4.3,<2" distlib = ">=0.3.1,<1" filelock = ">=3.0.0,<4" -importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} six = ">=1.9.0,<2" [package.extras] @@ -4898,5 +4644,5 @@ viz = ["Pillow", "Pillow", "ipython", "numpy", "numpy", "pybars3", "scipy", "sci [metadata] lock-version = "2.0" -python-versions = ">=3.7.1, <4" -content-hash = "0cd70d40783143b19e150ae4419e0e7957ea67e27288ce88538bbf42c2253900" +python-versions = ">=3.8, <4" +content-hash = "a34330d8cc3c6dc09eaaf1ca88cc11dc37c25f05afceab5693ed3ff6020631b4" diff --git a/python/pyproject.toml b/python/pyproject.toml index 624f9cdc02..7583af4263 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -11,7 +11,7 @@ include = ["whylogs/core/proto/v0/*.py*", "whylogs/core/proto/*.py*"] [tool.poetry.dependencies] # core dependencies. Be REALLY mindful when touching this list -python = ">=3.7.1, <4" +python = ">=3.8, <4" whylogs-sketching = ">=3.4.1.dev3" protobuf = ">=3.19.4" importlib-metadata = { version = "<4.3", python = "<3.8" } @@ -37,7 +37,7 @@ numpy = [ pandas = { version = "*", optional = true } # Polars module. -polars = { version = ">=1.8.2", optional = true } +polars = { version = ">=1.8.2", python = ">=3.8", optional = true } # TODO: do we want polars-u64-idx ? # Doc dependencies From 9243c6d7c012065a820da11b73983a112253d455 Mon Sep 17 00:00:00 2001 From: Richard Rogers Date: Mon, 4 Nov 2024 14:29:47 +0000 Subject: [PATCH 08/41] allow 3.7 --- python/poetry.lock | 261 +++++++++++++++++++++++++++++++++++++++++- python/pyproject.toml | 2 +- 2 files changed, 260 insertions(+), 3 deletions(-) diff --git a/python/poetry.lock b/python/poetry.lock index 8a2af47b33..af558bf4fd 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -97,6 +97,7 @@ files = [ [package.dependencies] lazy-object-proxy = ">=1.4.0" setuptools = ">=20.0" +typed-ast = {version = ">=1.4.0,<2.0", markers = "implementation_name == \"cpython\" and python_version < \"3.8\""} typing-extensions = {version = ">=3.10", markers = "python_version < \"3.10\""} wrapt = ">=1.11,<2" @@ -111,6 +112,9 @@ files = [ {file = "attrs-23.2.0.tar.gz", hash = "sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30"}, ] +[package.dependencies] +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} + [package.extras] cov = ["attrs[tests]", "coverage[toml] (>=5.3)"] dev = ["attrs[tests]", "pre-commit"] @@ -221,6 +225,7 @@ mypy-extensions = ">=0.4.3" pathspec = ">=0.9.0" platformdirs = ">=2" tomli = {version = ">=1.1.0", markers = "python_full_version < \"3.11.0a7\""} +typed-ast = {version = ">=1.4.2", markers = "python_version < \"3.8\" and implementation_name == \"cpython\""} typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} [package.extras] @@ -520,6 +525,18 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} + +[[package]] +name = "cloudpickle" +version = "2.2.1" +description = "Extended pickling support for Python objects" +optional = true +python-versions = ">=3.6" +files = [ + {file = "cloudpickle-2.2.1-py3-none-any.whl", hash = "sha256:61f594d1f4c295fa5cd9014ceb3a1fc4a70b0de1164b94fbc2d854ccba056f9f"}, + {file = "cloudpickle-2.2.1.tar.gz", hash = "sha256:d89684b8de9e34a2a43b3460fbca07d09d6e25ce858df4d5a44240403b6178f5"}, +] [[package]] name = "cloudpickle" @@ -972,6 +989,7 @@ files = [ ] [package.dependencies] +importlib-metadata = {version = ">=1.1.0,<4.3", markers = "python_version < \"3.8\""} mccabe = ">=0.7.0,<0.8.0" pycodestyle = ">=2.9.0,<2.10.0" pyflakes = ">=2.5.0,<2.6.0" @@ -1141,6 +1159,7 @@ files = [ [package.dependencies] gitdb = ">=4.0.1,<5" +typing-extensions = {version = ">=3.7.4.3", markers = "python_version < \"3.8\""} [package.extras] doc = ["sphinx (==4.3.2)", "sphinx-autodoc-typehints", "sphinx-rtd-theme", "sphinxcontrib-applehelp (>=1.0.2,<=1.0.4)", "sphinxcontrib-devhelp (==1.0.2)", "sphinxcontrib-htmlhelp (>=2.0.0,<=2.0.1)", "sphinxcontrib-qthelp (==1.0.3)", "sphinxcontrib-serializinghtml (==1.1.5)"] @@ -1206,6 +1225,7 @@ files = [ [package.dependencies] google-api-core = ">=1.31.6,<2.0.dev0 || >2.3.0,<3.0.0dev" google-auth = ">=1.25.0,<3.0dev" +importlib-metadata = {version = ">1.0.0", markers = "python_version < \"3.8\""} [package.extras] grpc = ["grpcio (>=1.38.0,<2.0dev)", "grpcio-status (>=1.38.0,<2.0.dev0)"] @@ -1380,6 +1400,7 @@ files = [ ] [package.dependencies] +typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} zipp = ">=0.5" [package.extras] @@ -1587,6 +1608,17 @@ files = [ {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] +[[package]] +name = "joblib" +version = "1.3.2" +description = "Lightweight pipelining with Python functions" +optional = true +python-versions = ">=3.7" +files = [ + {file = "joblib-1.3.2-py3-none-any.whl", hash = "sha256:ef4331c65f239985f3f2220ecc87db222f08fd22097a3dd5698f693875f8cbb9"}, + {file = "joblib-1.3.2.tar.gz", hash = "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1"}, +] + [[package]] name = "joblib" version = "1.4.2" @@ -1611,9 +1643,11 @@ files = [ [package.dependencies] attrs = ">=17.4.0" +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} importlib-resources = {version = ">=1.4.0", markers = "python_version < \"3.9\""} pkgutil-resolve-name = {version = ">=1.3.10", markers = "python_version < \"3.9\""} pyrsistent = ">=0.14.0,<0.17.0 || >0.17.0,<0.17.1 || >0.17.1,<0.17.2 || >0.17.2" +typing-extensions = {version = "*", markers = "python_version < \"3.8\""} [package.extras] format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] @@ -1767,6 +1801,7 @@ files = [ [package.dependencies] mdurl = ">=0.1,<1.0" +typing_extensions = {version = ">=3.7.4", markers = "python_version < \"3.8\""} [package.extras] benchmarking = ["psutil", "pytest", "pytest-benchmark"] @@ -1913,6 +1948,36 @@ files = [ {file = "mistune-3.0.2.tar.gz", hash = "sha256:fc7f93ded930c92394ef2cb6f04a8aabab4117a91449e72dcc8dfa646a508be8"}, ] +[[package]] +name = "mlflow-skinny" +version = "1.27.0" +description = "MLflow: A Platform for ML Development and Productionization" +optional = true +python-versions = ">=3.7" +files = [ + {file = "mlflow-skinny-1.27.0.tar.gz", hash = "sha256:77240a1bee2d5bbc2bf8eb9609b22fa43a2381067c3728d8c6539702962a268a"}, + {file = "mlflow_skinny-1.27.0-py3-none-any.whl", hash = "sha256:48462b9675a8365d8bd44357db29886098ee2723dbc925e35b16517e1ca43983"}, +] + +[package.dependencies] +click = ">=7.0" +cloudpickle = "*" +databricks-cli = ">=0.8.7" +entrypoints = "*" +gitpython = ">=2.1.0" +importlib-metadata = ">=3.7.0,<4.7.0 || >4.7.0" +packaging = "*" +protobuf = ">=3.12.0" +pytz = "*" +pyyaml = ">=5.1" +requests = ">=2.17.3" + +[package.extras] +aliyun-oss = ["aliyunstoreplugin"] +extras = ["azureml-core (>=1.2.0)", "boto3", "google-cloud-storage (>=1.30.0)", "kubernetes", "mlserver (>=0.5.3)", "mlserver-mlflow (>=0.5.3)", "pyarrow", "pysftp", "scikit-learn", "virtualenv"] +pipelines = ["Jinja2 (>=3.0)", "ipython (>=7.0)", "markdown (>=3.3)", "pandas-profiling (>=3.1)", "pyarrow (>=7.0)", "scikit-learn (>=1.0)", "shap (>=0.40)"] +sqlserver = ["mlflow-dbstore"] + [[package]] name = "mlflow-skinny" version = "2.13.0" @@ -1976,6 +2041,7 @@ files = [ boto3 = ">=1.9.201" botocore = ">=1.12.201" cryptography = ">=3.3.1" +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} Jinja2 = ">=2.10.1" python-dateutil = ">=2.1,<3.0.0" requests = ">=2.5" @@ -2047,6 +2113,7 @@ files = [ [package.dependencies] mypy-extensions = ">=0.4.3" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typed-ast = {version = ">=1.4.0,<2", markers = "python_version < \"3.8\""} typing-extensions = ">=3.10" [package.extras] @@ -2180,6 +2247,7 @@ files = [ [package.dependencies] fastjsonschema = "*" +importlib-metadata = {version = ">=3.6", markers = "python_version < \"3.8\""} jsonschema = ">=2.6" jupyter-core = "*" traitlets = ">=5.1" @@ -2498,6 +2566,9 @@ files = [ {file = "pandas_stubs-1.2.0.62-py3-none-any.whl", hash = "sha256:32a9e04582173104d42c090135efacc64d70e08c003405455b7dfb1540bd7e6c"}, ] +[package.dependencies] +typing-extensions = {version = ">=3.7.4.3", markers = "python_version < \"3.8\""} + [[package]] name = "pandocfilters" version = "1.5.1" @@ -2593,6 +2664,85 @@ files = [ {file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"}, ] +[[package]] +name = "pillow" +version = "9.5.0" +description = "Python Imaging Library (Fork)" +optional = true +python-versions = ">=3.7" +files = [ + {file = "Pillow-9.5.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:ace6ca218308447b9077c14ea4ef381ba0b67ee78d64046b3f19cf4e1139ad16"}, + {file = "Pillow-9.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d3d403753c9d5adc04d4694d35cf0391f0f3d57c8e0030aac09d7678fa8030aa"}, + {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ba1b81ee69573fe7124881762bb4cd2e4b6ed9dd28c9c60a632902fe8db8b38"}, + {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe7e1c262d3392afcf5071df9afa574544f28eac825284596ac6db56e6d11062"}, + {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f36397bf3f7d7c6a3abdea815ecf6fd14e7fcd4418ab24bae01008d8d8ca15e"}, + {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:252a03f1bdddce077eff2354c3861bf437c892fb1832f75ce813ee94347aa9b5"}, + {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:85ec677246533e27770b0de5cf0f9d6e4ec0c212a1f89dfc941b64b21226009d"}, + {file = "Pillow-9.5.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b416f03d37d27290cb93597335a2f85ed446731200705b22bb927405320de903"}, + {file = "Pillow-9.5.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1781a624c229cb35a2ac31cc4a77e28cafc8900733a864870c49bfeedacd106a"}, + {file = "Pillow-9.5.0-cp310-cp310-win32.whl", hash = "sha256:8507eda3cd0608a1f94f58c64817e83ec12fa93a9436938b191b80d9e4c0fc44"}, + {file = "Pillow-9.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:d3c6b54e304c60c4181da1c9dadf83e4a54fd266a99c70ba646a9baa626819eb"}, + {file = "Pillow-9.5.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:7ec6f6ce99dab90b52da21cf0dc519e21095e332ff3b399a357c187b1a5eee32"}, + {file = "Pillow-9.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:560737e70cb9c6255d6dcba3de6578a9e2ec4b573659943a5e7e4af13f298f5c"}, + {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e88745a55b88a7c64fa49bceff363a1a27d9a64e04019c2281049444a571e3"}, + {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d9c206c29b46cfd343ea7cdfe1232443072bbb270d6a46f59c259460db76779a"}, + {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cfcc2c53c06f2ccb8976fb5c71d448bdd0a07d26d8e07e321c103416444c7ad1"}, + {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:a0f9bb6c80e6efcde93ffc51256d5cfb2155ff8f78292f074f60f9e70b942d99"}, + {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:8d935f924bbab8f0a9a28404422da8af4904e36d5c33fc6f677e4c4485515625"}, + {file = "Pillow-9.5.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fed1e1cf6a42577953abbe8e6cf2fe2f566daebde7c34724ec8803c4c0cda579"}, + {file = "Pillow-9.5.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c1170d6b195555644f0616fd6ed929dfcf6333b8675fcca044ae5ab110ded296"}, + {file = "Pillow-9.5.0-cp311-cp311-win32.whl", hash = "sha256:54f7102ad31a3de5666827526e248c3530b3a33539dbda27c6843d19d72644ec"}, + {file = "Pillow-9.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfa4561277f677ecf651e2b22dc43e8f5368b74a25a8f7d1d4a3a243e573f2d4"}, + {file = "Pillow-9.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:965e4a05ef364e7b973dd17fc765f42233415974d773e82144c9bbaaaea5d089"}, + {file = "Pillow-9.5.0-cp312-cp312-win32.whl", hash = "sha256:22baf0c3cf0c7f26e82d6e1adf118027afb325e703922c8dfc1d5d0156bb2eeb"}, + {file = "Pillow-9.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:432b975c009cf649420615388561c0ce7cc31ce9b2e374db659ee4f7d57a1f8b"}, + {file = "Pillow-9.5.0-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:5d4ebf8e1db4441a55c509c4baa7a0587a0210f7cd25fcfe74dbbce7a4bd1906"}, + {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:375f6e5ee9620a271acb6820b3d1e94ffa8e741c0601db4c0c4d3cb0a9c224bf"}, + {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:99eb6cafb6ba90e436684e08dad8be1637efb71c4f2180ee6b8f940739406e78"}, + {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dfaaf10b6172697b9bceb9a3bd7b951819d1ca339a5ef294d1f1ac6d7f63270"}, + {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:763782b2e03e45e2c77d7779875f4432e25121ef002a41829d8868700d119392"}, + {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:35f6e77122a0c0762268216315bf239cf52b88865bba522999dc38f1c52b9b47"}, + {file = "Pillow-9.5.0-cp37-cp37m-win32.whl", hash = "sha256:aca1c196f407ec7cf04dcbb15d19a43c507a81f7ffc45b690899d6a76ac9fda7"}, + {file = "Pillow-9.5.0-cp37-cp37m-win_amd64.whl", hash = "sha256:322724c0032af6692456cd6ed554bb85f8149214d97398bb80613b04e33769f6"}, + {file = "Pillow-9.5.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:a0aa9417994d91301056f3d0038af1199eb7adc86e646a36b9e050b06f526597"}, + {file = "Pillow-9.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f8286396b351785801a976b1e85ea88e937712ee2c3ac653710a4a57a8da5d9c"}, + {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c830a02caeb789633863b466b9de10c015bded434deb3ec87c768e53752ad22a"}, + {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fbd359831c1657d69bb81f0db962905ee05e5e9451913b18b831febfe0519082"}, + {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8fc330c3370a81bbf3f88557097d1ea26cd8b019d6433aa59f71195f5ddebbf"}, + {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:7002d0797a3e4193c7cdee3198d7c14f92c0836d6b4a3f3046a64bd1ce8df2bf"}, + {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:229e2c79c00e85989a34b5981a2b67aa079fd08c903f0aaead522a1d68d79e51"}, + {file = "Pillow-9.5.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9adf58f5d64e474bed00d69bcd86ec4bcaa4123bfa70a65ce72e424bfb88ed96"}, + {file = "Pillow-9.5.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:662da1f3f89a302cc22faa9f14a262c2e3951f9dbc9617609a47521c69dd9f8f"}, + {file = "Pillow-9.5.0-cp38-cp38-win32.whl", hash = "sha256:6608ff3bf781eee0cd14d0901a2b9cc3d3834516532e3bd673a0a204dc8615fc"}, + {file = "Pillow-9.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:e49eb4e95ff6fd7c0c402508894b1ef0e01b99a44320ba7d8ecbabefddcc5569"}, + {file = "Pillow-9.5.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:482877592e927fd263028c105b36272398e3e1be3269efda09f6ba21fd83ec66"}, + {file = "Pillow-9.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3ded42b9ad70e5f1754fb7c2e2d6465a9c842e41d178f262e08b8c85ed8a1d8e"}, + {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c446d2245ba29820d405315083d55299a796695d747efceb5717a8b450324115"}, + {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8aca1152d93dcc27dc55395604dcfc55bed5f25ef4c98716a928bacba90d33a3"}, + {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:608488bdcbdb4ba7837461442b90ea6f3079397ddc968c31265c1e056964f1ef"}, + {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:60037a8db8750e474af7ffc9faa9b5859e6c6d0a50e55c45576bf28be7419705"}, + {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:07999f5834bdc404c442146942a2ecadd1cb6292f5229f4ed3b31e0a108746b1"}, + {file = "Pillow-9.5.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a127ae76092974abfbfa38ca2d12cbeddcdeac0fb71f9627cc1135bedaf9d51a"}, + {file = "Pillow-9.5.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:489f8389261e5ed43ac8ff7b453162af39c3e8abd730af8363587ba64bb2e865"}, + {file = "Pillow-9.5.0-cp39-cp39-win32.whl", hash = "sha256:9b1af95c3a967bf1da94f253e56b6286b50af23392a886720f563c547e48e964"}, + {file = "Pillow-9.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:77165c4a5e7d5a284f10a6efaa39a0ae8ba839da344f20b111d62cc932fa4e5d"}, + {file = "Pillow-9.5.0-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:833b86a98e0ede388fa29363159c9b1a294b0905b5128baf01db683672f230f5"}, + {file = "Pillow-9.5.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aaf305d6d40bd9632198c766fb64f0c1a83ca5b667f16c1e79e1661ab5060140"}, + {file = "Pillow-9.5.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0852ddb76d85f127c135b6dd1f0bb88dbb9ee990d2cd9aa9e28526c93e794fba"}, + {file = "Pillow-9.5.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:91ec6fe47b5eb5a9968c79ad9ed78c342b1f97a091677ba0e012701add857829"}, + {file = "Pillow-9.5.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:cb841572862f629b99725ebaec3287fc6d275be9b14443ea746c1dd325053cbd"}, + {file = "Pillow-9.5.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:c380b27d041209b849ed246b111b7c166ba36d7933ec6e41175fd15ab9eb1572"}, + {file = "Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c9af5a3b406a50e313467e3565fc99929717f780164fe6fbb7704edba0cebbe"}, + {file = "Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5671583eab84af046a397d6d0ba25343c00cd50bce03787948e0fff01d4fd9b1"}, + {file = "Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:84a6f19ce086c1bf894644b43cd129702f781ba5751ca8572f08aa40ef0ab7b7"}, + {file = "Pillow-9.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:1e7723bd90ef94eda669a3c2c19d549874dd5badaeefabefd26053304abe5799"}, + {file = "Pillow-9.5.0.tar.gz", hash = "sha256:bf548479d336726d7a0eceb6e767e179fbde37833ae42794602631a070d630f1"}, +] + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"] +tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] + [[package]] name = "pillow" version = "10.3.0" @@ -2715,6 +2865,9 @@ files = [ {file = "platformdirs-3.11.0.tar.gz", hash = "sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3"}, ] +[package.dependencies] +typing-extensions = {version = ">=4.7.1", markers = "python_version < \"3.8\""} + [package.extras] docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"] test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"] @@ -2730,10 +2883,45 @@ files = [ {file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, ] +[package.dependencies] +importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} + [package.extras] dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "polars" +version = "0.18.4" +description = "Blazingly fast DataFrame library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "polars-0.18.4-cp37-abi3-macosx_10_7_x86_64.whl", hash = "sha256:3adfd39f84387f8589735e5c57f466c7ba19812140bc64248b9602755915c52f"}, + {file = "polars-0.18.4-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:5658f9751d93451549ecf429eb6486b203a86130132310c520cd1336d15ca258"}, + {file = "polars-0.18.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4bbc04db1d765f7cad287204a014e8e10bb2245f1910e26cd99964333e3682c6"}, + {file = "polars-0.18.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9117544d86542954588e295127f3892c15e09db04c474a0d8d830735154a54c"}, + {file = "polars-0.18.4-cp37-abi3-win_amd64.whl", hash = "sha256:a033ee71d8fde63ac71c7579230d31372cdaddf1df4227a537d96b91a58abd29"}, + {file = "polars-0.18.4.tar.gz", hash = "sha256:136d8cdbf3c1ec33ab577536ac35a10701ec3dfd21b54cb757ee9b0e0f525a85"}, +] + +[package.dependencies] +typing_extensions = {version = ">=4.0.1", markers = "python_version < \"3.8\""} + +[package.extras] +all = ["polars[connectorx,deltalake,fsspec,matplotlib,numpy,pandas,pyarrow,sqlalchemy,timezone,xlsx2csv,xlsxwriter]"] +connectorx = ["connectorx"] +deltalake = ["deltalake (>=0.8.0)"] +fsspec = ["fsspec"] +matplotlib = ["matplotlib"] +numpy = ["numpy (>=1.16.0)"] +pandas = ["pandas", "pyarrow (>=7.0.0)"] +pyarrow = ["pyarrow (>=7.0.0)"] +sqlalchemy = ["pandas", "sqlalchemy"] +timezone = ["backports.zoneinfo", "tzdata"] +xlsx2csv = ["xlsx2csv (>=0.8.0)"] +xlsxwriter = ["xlsxwriter"] + [[package]] name = "polars" version = "1.8.2" @@ -2789,6 +2977,7 @@ files = [ [package.dependencies] cfgv = ">=2.0.0" identify = ">=1.0.0" +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} nodeenv = ">=0.11.1" pyyaml = ">=5.1" toml = "*" @@ -3120,6 +3309,7 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} iniconfig = "*" packaging = "*" pluggy = ">=0.12,<2.0" @@ -3507,6 +3697,7 @@ files = [ pyyaml = "*" requests = ">=2.30.0,<3.0" types-PyYAML = "*" +typing-extensions = {version = "*", markers = "python_version < \"3.8\""} urllib3 = ">=1.25.10,<3.0" [package.extras] @@ -4160,6 +4351,17 @@ files = [ {file = "textwrap3-0.9.2.zip", hash = "sha256:5008eeebdb236f6303dcd68f18b856d355f6197511d952ba74bc75e40e0c3414"}, ] +[[package]] +name = "threadpoolctl" +version = "3.1.0" +description = "threadpoolctl" +optional = true +python-versions = ">=3.6" +files = [ + {file = "threadpoolctl-3.1.0-py3-none-any.whl", hash = "sha256:8b99adda265feb6773280df41eece7b2e6561b772d21ffd52e372f999024907b"}, + {file = "threadpoolctl-3.1.0.tar.gz", hash = "sha256:a335baacfaa4400ae1f0d8e3a58d6674d2f8828e3716bb2802c44955ad391380"}, +] + [[package]] name = "threadpoolctl" version = "3.5.0" @@ -4280,6 +4482,7 @@ files = [ [package.dependencies] fs = "*" fsspec = "*" +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} numpy = "*" pandas = ">=1.2.0" pyarrow = "*" @@ -4310,6 +4513,56 @@ rfc3986 = ">=1.4.0" rich = ">=12.0.0" urllib3 = ">=1.26.0" +[[package]] +name = "typed-ast" +version = "1.5.5" +description = "a fork of Python 2 and 3 ast modules with type comment support" +optional = false +python-versions = ">=3.6" +files = [ + {file = "typed_ast-1.5.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4bc1efe0ce3ffb74784e06460f01a223ac1f6ab31c6bc0376a21184bf5aabe3b"}, + {file = "typed_ast-1.5.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5f7a8c46a8b333f71abd61d7ab9255440d4a588f34a21f126bbfc95f6049e686"}, + {file = "typed_ast-1.5.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:597fc66b4162f959ee6a96b978c0435bd63791e31e4f410622d19f1686d5e769"}, + {file = "typed_ast-1.5.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d41b7a686ce653e06c2609075d397ebd5b969d821b9797d029fccd71fdec8e04"}, + {file = "typed_ast-1.5.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5fe83a9a44c4ce67c796a1b466c270c1272e176603d5e06f6afbc101a572859d"}, + {file = "typed_ast-1.5.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d5c0c112a74c0e5db2c75882a0adf3133adedcdbfd8cf7c9d6ed77365ab90a1d"}, + {file = "typed_ast-1.5.5-cp310-cp310-win_amd64.whl", hash = "sha256:e1a976ed4cc2d71bb073e1b2a250892a6e968ff02aa14c1f40eba4f365ffec02"}, + {file = "typed_ast-1.5.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c631da9710271cb67b08bd3f3813b7af7f4c69c319b75475436fcab8c3d21bee"}, + {file = "typed_ast-1.5.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b445c2abfecab89a932b20bd8261488d574591173d07827c1eda32c457358b18"}, + {file = "typed_ast-1.5.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc95ffaaab2be3b25eb938779e43f513e0e538a84dd14a5d844b8f2932593d88"}, + {file = "typed_ast-1.5.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61443214d9b4c660dcf4b5307f15c12cb30bdfe9588ce6158f4a005baeb167b2"}, + {file = "typed_ast-1.5.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6eb936d107e4d474940469e8ec5b380c9b329b5f08b78282d46baeebd3692dc9"}, + {file = "typed_ast-1.5.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e48bf27022897577d8479eaed64701ecaf0467182448bd95759883300ca818c8"}, + {file = "typed_ast-1.5.5-cp311-cp311-win_amd64.whl", hash = "sha256:83509f9324011c9a39faaef0922c6f720f9623afe3fe220b6d0b15638247206b"}, + {file = "typed_ast-1.5.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:44f214394fc1af23ca6d4e9e744804d890045d1643dd7e8229951e0ef39429b5"}, + {file = "typed_ast-1.5.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:118c1ce46ce58fda78503eae14b7664163aa735b620b64b5b725453696f2a35c"}, + {file = "typed_ast-1.5.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be4919b808efa61101456e87f2d4c75b228f4e52618621c77f1ddcaae15904fa"}, + {file = "typed_ast-1.5.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:fc2b8c4e1bc5cd96c1a823a885e6b158f8451cf6f5530e1829390b4d27d0807f"}, + {file = "typed_ast-1.5.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:16f7313e0a08c7de57f2998c85e2a69a642e97cb32f87eb65fbfe88381a5e44d"}, + {file = "typed_ast-1.5.5-cp36-cp36m-win_amd64.whl", hash = "sha256:2b946ef8c04f77230489f75b4b5a4a6f24c078be4aed241cfabe9cbf4156e7e5"}, + {file = "typed_ast-1.5.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2188bc33d85951ea4ddad55d2b35598b2709d122c11c75cffd529fbc9965508e"}, + {file = "typed_ast-1.5.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0635900d16ae133cab3b26c607586131269f88266954eb04ec31535c9a12ef1e"}, + {file = "typed_ast-1.5.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57bfc3cf35a0f2fdf0a88a3044aafaec1d2f24d8ae8cd87c4f58d615fb5b6311"}, + {file = "typed_ast-1.5.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:fe58ef6a764de7b4b36edfc8592641f56e69b7163bba9f9c8089838ee596bfb2"}, + {file = "typed_ast-1.5.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d09d930c2d1d621f717bb217bf1fe2584616febb5138d9b3e8cdd26506c3f6d4"}, + {file = "typed_ast-1.5.5-cp37-cp37m-win_amd64.whl", hash = "sha256:d40c10326893ecab8a80a53039164a224984339b2c32a6baf55ecbd5b1df6431"}, + {file = "typed_ast-1.5.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fd946abf3c31fb50eee07451a6aedbfff912fcd13cf357363f5b4e834cc5e71a"}, + {file = "typed_ast-1.5.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ed4a1a42df8a3dfb6b40c3d2de109e935949f2f66b19703eafade03173f8f437"}, + {file = "typed_ast-1.5.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:045f9930a1550d9352464e5149710d56a2aed23a2ffe78946478f7b5416f1ede"}, + {file = "typed_ast-1.5.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:381eed9c95484ceef5ced626355fdc0765ab51d8553fec08661dce654a935db4"}, + {file = "typed_ast-1.5.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:bfd39a41c0ef6f31684daff53befddae608f9daf6957140228a08e51f312d7e6"}, + {file = "typed_ast-1.5.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8c524eb3024edcc04e288db9541fe1f438f82d281e591c548903d5b77ad1ddd4"}, + {file = "typed_ast-1.5.5-cp38-cp38-win_amd64.whl", hash = "sha256:7f58fabdde8dcbe764cef5e1a7fcb440f2463c1bbbec1cf2a86ca7bc1f95184b"}, + {file = "typed_ast-1.5.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:042eb665ff6bf020dd2243307d11ed626306b82812aba21836096d229fdc6a10"}, + {file = "typed_ast-1.5.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:622e4a006472b05cf6ef7f9f2636edc51bda670b7bbffa18d26b255269d3d814"}, + {file = "typed_ast-1.5.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1efebbbf4604ad1283e963e8915daa240cb4bf5067053cf2f0baadc4d4fb51b8"}, + {file = "typed_ast-1.5.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f0aefdd66f1784c58f65b502b6cf8b121544680456d1cebbd300c2c813899274"}, + {file = "typed_ast-1.5.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:48074261a842acf825af1968cd912f6f21357316080ebaca5f19abbb11690c8a"}, + {file = "typed_ast-1.5.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:429ae404f69dc94b9361bb62291885894b7c6fb4640d561179548c849f8492ba"}, + {file = "typed_ast-1.5.5-cp39-cp39-win_amd64.whl", hash = "sha256:335f22ccb244da2b5c296e6f96b06ee9bed46526db0de38d2f0e5a6597b81155"}, + {file = "typed_ast-1.5.5.tar.gz", hash = "sha256:94282f7a354f36ef5dbce0ef3467ebf6a258e370ab33d5b40c249fa996e590dd"}, +] + [[package]] name = "types-protobuf" version = "4.24.0.4" @@ -4421,6 +4674,7 @@ files = [ appdirs = ">=1.4.3,<2" distlib = ">=0.3.1,<1" filelock = ">=3.0.0,<4" +importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} six = ">=1.9.0,<2" [package.extras] @@ -4499,6 +4753,9 @@ files = [ {file = "whylogs_sketching-3.4.1.dev3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ba536fca5f9578fa34d106c243fdccfef7d75b9d1fffb9d93df0debfe8e3ebc"}, {file = "whylogs_sketching-3.4.1.dev3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:afa843c68cafa08e82624e6a33d13ab7f00ad0301101960872fe152d5af5ab53"}, {file = "whylogs_sketching-3.4.1.dev3-cp311-cp311-win_amd64.whl", hash = "sha256:303d55c37565340c2d21c268c64a712fad612504cc4b98b1d1df848cac6d934f"}, + {file = "whylogs_sketching-3.4.1.dev3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b636cebf5f4d7724437616368199c8e7b153f89dfd396f9e8279a95bf55d817"}, + {file = "whylogs_sketching-3.4.1.dev3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba4519780defebb35c4718ecc13d1b8c38894be722147a047e67b953cd2430ab"}, + {file = "whylogs_sketching-3.4.1.dev3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b4606e5360ce922e6ad770e845c75038d873300fd8a54ea856e99003b3254fc9"}, {file = "whylogs_sketching-3.4.1.dev3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:9d65fcf8dade1affe50181582b8894929993e37d7daa922d973a811790cd0208"}, {file = "whylogs_sketching-3.4.1.dev3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4845e77c208ae64ada9170e1b92ed0abe28fe311c0fc35f9d8efa6926211ca2"}, {file = "whylogs_sketching-3.4.1.dev3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:02cac1c87ac42d7fc7e6597862ac50bc035825988d21e8a2d763b416e83e845f"}, @@ -4644,5 +4901,5 @@ viz = ["Pillow", "Pillow", "ipython", "numpy", "numpy", "pybars3", "scipy", "sci [metadata] lock-version = "2.0" -python-versions = ">=3.8, <4" -content-hash = "a34330d8cc3c6dc09eaaf1ca88cc11dc37c25f05afceab5693ed3ff6020631b4" +python-versions = ">=3.7.1, <4" +content-hash = "c9ea508c59e9c49f99a557feb2da3d7654d5d8f9c0d678ee0e51c6994ce57b25" diff --git a/python/pyproject.toml b/python/pyproject.toml index 7583af4263..a2fdfb71bc 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -11,7 +11,7 @@ include = ["whylogs/core/proto/v0/*.py*", "whylogs/core/proto/*.py*"] [tool.poetry.dependencies] # core dependencies. Be REALLY mindful when touching this list -python = ">=3.8, <4" +python = ">=3.7.1, <4" whylogs-sketching = ">=3.4.1.dev3" protobuf = ">=3.19.4" importlib-metadata = { version = "<4.3", python = "<3.8" } From dac3ecdcf9d3b1f56d2685db7aedaeb1f50ecaf0 Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Mon, 4 Nov 2024 15:37:45 +0000 Subject: [PATCH 09/41] Update .github/workflows/whylogs-ci.yml --- .github/workflows/whylogs-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/whylogs-ci.yml b/.github/workflows/whylogs-ci.yml index c148fc7de0..5ccc86d680 100644 --- a/.github/workflows/whylogs-ci.yml +++ b/.github/workflows/whylogs-ci.yml @@ -24,7 +24,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macOS-latest] - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v4 From 1ecb94fa1ba2e22b27a05ac929afe1935052600f Mon Sep 17 00:00:00 2001 From: Richard Rogers Date: Mon, 4 Nov 2024 18:24:26 +0000 Subject: [PATCH 10/41] skip polars in python 3.7 --- python/poetry.lock | 19 +++++++++++++++++-- python/pyproject.toml | 4 ++++ python/tests/api/logger/test_logger_polars.py | 8 +++++++- .../tests/api/logger/test_segments_polars.py | 8 +++++++- python/tests/core/test_performance_polars.py | 8 +++++++- .../core/test_udf_schema_polars.py | 8 +++++++- 6 files changed, 49 insertions(+), 6 deletions(-) diff --git a/python/poetry.lock b/python/poetry.lock index af558bf4fd..86ab24f304 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "2to3" @@ -3989,6 +3989,21 @@ files = [ cryptography = ">=2.0" jeepney = ">=0.6" +[[package]] +name = "setuptools" +version = "57.5.0" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +optional = false +python-versions = ">=3.6" +files = [ + {file = "setuptools-57.5.0-py3-none-any.whl", hash = "sha256:60d78588f15b048f86e35cdab73003d8b21dd45108ee61a6693881a427f22073"}, + {file = "setuptools-57.5.0.tar.gz", hash = "sha256:d9d3266d50f59c6967b9312844470babbdb26304fe740833a5f8d89829ba3a24"}, +] + +[package.extras] +docs = ["furo", "jaraco.packaging (>=8.2)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx", "sphinx-inline-tabs", "sphinxcontrib-towncrier"] +testing = ["flake8-2020", "jaraco.envs", "jaraco.path (>=3.2.0)", "mock", "paver", "pip (>=19.1)", "pytest (>=4.6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-flake8", "pytest-mypy", "pytest-virtualenv (>=1.2.7)", "pytest-xdist", "sphinx", "virtualenv (>=13.0.0)", "wheel"] + [[package]] name = "setuptools" version = "68.0.0" @@ -4902,4 +4917,4 @@ viz = ["Pillow", "Pillow", "ipython", "numpy", "numpy", "pybars3", "scipy", "sci [metadata] lock-version = "2.0" python-versions = ">=3.7.1, <4" -content-hash = "c9ea508c59e9c49f99a557feb2da3d7654d5d8f9c0d678ee0e51c6994ce57b25" +content-hash = "7232f71fb9c9bac43eac68c1e624181f3cfc5af4ce3283662d8d7c8d840a61ed" diff --git a/python/pyproject.toml b/python/pyproject.toml index a2fdfb71bc..0981f0059b 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -20,6 +20,10 @@ whylabs-client = "^0.6.5" requests = "^2.27" backoff = "^2.2.1" platformdirs = "^3.5.0" +setuptools = [ + { version = "<58", python = "<3.8", optional = false }, + { version = "*", python = ">=3.8", optional = false } +] # viz module. Everything after this should be optional pybars3 = { version = "^0.9", optional = true } diff --git a/python/tests/api/logger/test_logger_polars.py b/python/tests/api/logger/test_logger_polars.py index f8c56a8dda..d77ec4bc79 100644 --- a/python/tests/api/logger/test_logger_polars.py +++ b/python/tests/api/logger/test_logger_polars.py @@ -3,8 +3,8 @@ from typing import Any import numpy as np -import polars as pl import pytest +import sys import whylogs as why from whylogs.api.logger import write @@ -14,6 +14,12 @@ from whylogs.core.metrics import StandardMetric from whylogs.core.resolvers import Resolver from whylogs.core.schema import DatasetSchema +from whylogs.core.stubs import pl + + +if sys.version_info < (3, 8): + pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.7") + FLOAT_TYPES = [float, np.float32, np.float64, np.float_] INTEGER_TYPES = [int, np.intc, np.uintc, np.int_, np.uint, np.longlong, np.ulonglong] diff --git a/python/tests/api/logger/test_segments_polars.py b/python/tests/api/logger/test_segments_polars.py index ff0e51bc5d..795be2c68c 100644 --- a/python/tests/api/logger/test_segments_polars.py +++ b/python/tests/api/logger/test_segments_polars.py @@ -7,8 +7,8 @@ from typing import Any import numpy as np -import polars as pl import pytest +import sys import whylogs as why from whylogs.api.logger.result_set import ( @@ -25,9 +25,15 @@ SegmentFilter, segment_on_column, ) +from whylogs.core.stubs import pl from whylogs.core.view.dataset_profile_view import DatasetProfileView from whylogs.migration.converters import read_v0_to_view + +if sys.version_info < (3, 8): + pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.7") + + TEST_LOGGER = getLogger(__name__) diff --git a/python/tests/core/test_performance_polars.py b/python/tests/core/test_performance_polars.py index 0f7e49eab5..250d69ff38 100644 --- a/python/tests/core/test_performance_polars.py +++ b/python/tests/core/test_performance_polars.py @@ -8,14 +8,15 @@ import numpy as np import pandas as pd -import polars as pl import pytest +import sys import whylogs_sketching as ds # type: ignore import whylogs from whylogs.core import ColumnProfile, ColumnSchema from whylogs.core.dataset_profile import DatasetProfile from whylogs.core.metrics.metrics import MetricConfig +from whylogs.core.stubs import pl from whylogs.core.resolvers import ( HistogramCountingTrackingResolver, LimitedTrackingResolver, @@ -23,6 +24,11 @@ StandardResolver, ) + +if sys.version_info < (3, 8): + pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.7") + + TEST_LOGGER = getLogger(__name__) _TEST_RESOLVERS = [HistogramCountingTrackingResolver(), LimitedTrackingResolver(), StandardResolver()] diff --git a/python/tests/experimental/core/test_udf_schema_polars.py b/python/tests/experimental/core/test_udf_schema_polars.py index 45f79bc1af..77cfa10ad3 100644 --- a/python/tests/experimental/core/test_udf_schema_polars.py +++ b/python/tests/experimental/core/test_udf_schema_polars.py @@ -1,6 +1,7 @@ from typing import Any, Tuple -import polars as pl +import pytest +import sys import whylogs as why from whylogs.core.dataset_profile import DatasetProfile @@ -8,6 +9,7 @@ from whylogs.core.metrics import CardinalityMetric, DistributionMetric, StandardMetric from whylogs.core.resolvers import STANDARD_RESOLVER, MetricSpec, ResolverSpec from whylogs.core.segmentation_partition import segment_on_column +from whylogs.core.stubs import pl from whylogs.experimental.core.metrics.udf_metric import register_metric_udf from whylogs.experimental.core.udf_schema import ( UdfSchema, @@ -20,6 +22,10 @@ from whylogs.experimental.core.validators import condition_validator +if sys.version_info < (3, 8): + pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.7") + + def test_udf_polars() -> None: schema = UdfSchema( STANDARD_RESOLVER, From ea0cce21af3f37e07293917985d95d5e5132d96d Mon Sep 17 00:00:00 2001 From: Richard Rogers Date: Tue, 5 Nov 2024 06:59:29 +0000 Subject: [PATCH 11/41] polars metric tests --- python/pyproject.toml | 1 - .../tests/core/metrics/test_metrics_polars.py | 105 ++++++------------ 2 files changed, 36 insertions(+), 70 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 0981f0059b..6174b34b7d 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -190,7 +190,6 @@ mypy-protobuf = ">=3.2.0" types-protobuf = ">=0.1.14" pandas = "*" pandas-stubs = "*" -polars = "*" ipykernel = ">=6.11" # for developing in Jupyter notebook types-python-dateutil = "^2.8.12" moto = ">4.2" diff --git a/python/tests/core/metrics/test_metrics_polars.py b/python/tests/core/metrics/test_metrics_polars.py index 6f2007d6f5..2aab6bb77a 100644 --- a/python/tests/core/metrics/test_metrics_polars.py +++ b/python/tests/core/metrics/test_metrics_polars.py @@ -1,47 +1,36 @@ from logging import getLogger import numpy as np -import pandas as pd import pytest +import sys import whylogs as why import whylogs.core.configs as cfg from whylogs.core import ColumnProfileView, DatasetSchema -from whylogs.core.datatypes import Integral +from whylogs.core.datatypes import AnyType, Integral from whylogs.core.metrics.maths import VarianceM2Result, parallel_variance_m2 +from whylogs.core.metrics import StandardMetric from whylogs.core.metrics.metrics import ( CardinalityMetric, DistributionMetric, MetricConfig, ) from whylogs.core.preprocessing import PreprocessedColumn -from whylogs.core.resolvers import StandardResolver -from whylogs.core.schema import ColumnSchema - -TEST_LOGGER = getLogger(__name__) +from whylogs.core.resolvers import MetricSpec, ResolverSpec, StandardResolver +from whylogs.core.schema import ColumnSchema, DeclarativeSchema +from whylogs.core.stubs import pd, pl -def test_distribution_metrics_numpy() -> None: - dist = DistributionMetric.zero(MetricConfig()) - data = list(range(0, 100)) - arr = np.array(data) - col = PreprocessedColumn.apply(arr) - dist.columnar_update(col) +if sys.version_info < (3, 8): + pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.7") - assert dist.kll.value.get_n() == 100 - assert dist.mean.value == arr.mean() - assert dist.variance == arr.var(ddof=1) - distribution_summary = dist.to_summary_dict() - assert distribution_summary["q_01"] == 1.0 - assert distribution_summary["q_05"] == 5.0 - assert distribution_summary["q_95"] == 95.0 - assert distribution_summary["q_99"] == 99.0 +TEST_LOGGER = getLogger(__name__) def test_distribution_metrics_series() -> None: dist = DistributionMetric.zero(MetricConfig()) - data = pd.Series(list(range(100))) + data = pl.Series(list(range(100))) col = PreprocessedColumn.apply(data) dist.columnar_update(col) @@ -55,6 +44,7 @@ def test_distribution_variance_m2() -> None: dist_list = DistributionMetric.zero(MetricConfig()) dist_pandas = DistributionMetric.zero(MetricConfig()) + dist_polars = DistributionMetric.zero(MetricConfig()) dist_numpy = DistributionMetric.zero(MetricConfig()) test_input = [1, 2, 3, 4] @@ -66,28 +56,34 @@ def test_distribution_variance_m2() -> None: m2 = (n - 1) * variance TEST_LOGGER.info(f"statistic package using input {test_input} has variance={variance}, m2={m2}, n={n}") pandas_test_input = PreprocessedColumn.apply(pd.Series(test_input)) + polars_test_input = PreprocessedColumn.apply(pl.Series(test_input)) numpy_test_input = PreprocessedColumn.apply(np.array(test_input)) dist_list.columnar_update(list_test_input) dist_pandas.columnar_update(pandas_test_input) + dist_polars.columnar_update(polars_test_input) dist_numpy.columnar_update(numpy_test_input) TEST_LOGGER.info(f"dist_list={dist_list.to_summary_dict()}") TEST_LOGGER.info(f"dist_pandas={dist_pandas.to_summary_dict()}") + TEST_LOGGER.info(f"dist_polars={dist_polars.to_summary_dict()}") TEST_LOGGER.info(f"dist_numpy={dist_numpy.to_summary_dict()}") assert dist_list.m2.value == m2 assert dist_pandas.m2.value == m2 + assert dist_polars.m2.value == m2 assert dist_numpy.m2.value == m2 assert dist_list.variance == variance assert dist_pandas.variance == variance + assert dist_polars.variance == variance assert dist_numpy.variance == variance assert dist_list.avg == mean assert dist_pandas.avg == mean + assert dist_polars.avg == mean assert dist_numpy.avg == mean def test_distribution_metrics_indexed_series_single_row() -> None: dist = DistributionMetric.zero(MetricConfig()) - data = pd.Series(list(range(1)), index=[284]) + data = pl.Series(list(range(1))) col = PreprocessedColumn.apply(data) dist.columnar_update(col) @@ -95,39 +91,6 @@ def test_distribution_metrics_indexed_series_single_row() -> None: assert dist.mean.value == data.mean() -def test_distribution_metrics_list() -> None: - dist = DistributionMetric.zero(MetricConfig()) - col = PreprocessedColumn() - data = list(range(0, 100)) - col.list.ints = data - dist.columnar_update(col) - - assert dist.kll.value.get_n() == 100 - assert dist.mean.value == np.array(data).mean() - assert dist.variance == np.array(data).var(ddof=1) - - -def test_distribution_metrics_mixed_np_and_list() -> None: - dist = DistributionMetric.zero(MetricConfig()) - col = PreprocessedColumn() - col.list.ints = list(range(0, 50)) - col.numpy.ints = np.array(range(50, 100)) - dist.columnar_update(col) - - assert dist.kll.value.get_n() == 100 - a = np.array(col.list.ints) - b = col.numpy.ints - - assert dist.mean.value == np.array(np.concatenate([a, b])).mean() - - m2_a = a.var(ddof=1) * (len(a) - 1) - m2_b = b.var(ddof=1) * (len(b) - 1) - a_var = VarianceM2Result(n=len(a), mean=a.mean(), m2=m2_a) - b_var = VarianceM2Result(n=len(b), mean=b.mean(), m2=m2_b) - overall = parallel_variance_m2(first=a_var, second=b_var) - assert dist.variance == overall.m2 / (overall.n - 1) - - def test_distribution_metrics_bool() -> None: import whylogs.core.metrics.metrics as met @@ -155,7 +118,7 @@ def test_distribution_metrics_bool_mixed() -> None: def test_track_single_values_profile_mean() -> None: data = list(range(30)) - df = pd.DataFrame(data, columns=["col1"]) + df = pl.DataFrame({"col1": data}) actual_mean = df["col1"].mean() actual_stddev = df["col1"].std() prof_view_df = why.log(df).profile().view() @@ -177,7 +140,7 @@ def test_track_single_values_profile_mean() -> None: def test_merge_single_values_profile_mean() -> None: data = list(range(30)) - df = pd.DataFrame(data, columns=["col1"]) + df = pl.DataFrame({"col1": data}) actual_mean = df["col1"].mean() actual_stddev = df["col1"].std() prof_view_df = why.log(df).profile().view() @@ -220,7 +183,7 @@ def test_merge_two_profiles_mean(lending_club_df: pd.DataFrame) -> None: def test_frequent_items_handling_int_as_string() -> None: - df = pd.DataFrame({"int": [1, 1, 1]}) + df = pl.DataFrame({"int": [1, 1, 1]}) res = why.log(df).view().to_pandas()["frequent_items/frequent_strings"] assert res.array[0][0].value == "1" # type: ignore @@ -230,23 +193,25 @@ def test_frequent_items_handling_bool_as_string() -> None: import whylogs.core.metrics.metrics as met met._BOOL_LIST_CHUNK_SIZE = 2 - df = pd.DataFrame({"bool": [True, True, True, True, False]}) + df = pl.DataFrame({"bool": [True, True, True, True, False]}) - res = why.log(df).view().to_pandas()["frequent_items/frequent_strings"] + schema = DeclarativeSchema([ResolverSpec(column_type=AnyType, metrics=[MetricSpec(StandardMetric.frequent_items.value)])]) + res = why.log(df, schema=schema).view().to_pandas()["frequent_items/frequent_strings"] assert res.array[0][0].value == "True" # type: ignore assert res.array[0][1].value == "False" # type: ignore def test_frequent_items_bounds_order() -> None: - df_gamma = pd.DataFrame({"feature1": np.random.gamma(1, 2, 1000).astype(int)}) - df_rand = pd.DataFrame({"feature1": np.random.randint(10000, size=9000)}) - df = df_gamma.append(df_rand) + df_gamma = pl.DataFrame({"feature1": np.random.gamma(1, 2, 1000).astype(int)}) + df_rand = pl.DataFrame({"feature1": np.random.randint(10000, size=9000)}) + df = pl.concat([df_gamma, df_rand]) res = why.log(df).view().to_pandas()["frequent_items/frequent_strings"] fi_tuple = res.array[0][0] assert fi_tuple.lower <= fi_tuple.est <= fi_tuple.upper +@pytest.mark.skip("frequent item length is only enforced on pandas view strings") @pytest.mark.parametrize( "config, limit", [ @@ -255,7 +220,7 @@ def test_frequent_items_bounds_order() -> None: ], ) def test_frequent_item_max_size(config: MetricConfig, limit: int) -> None: - df = pd.DataFrame({"str": ["X" * 200]}) + df = pl.DataFrame({"str": ["X" * 200]}) schema = DatasetSchema(default_configs=config) res = why.log(df, schema=schema).view().to_pandas()["frequent_items/frequent_strings"] assert len(res.array[0][0].value) <= limit @@ -263,7 +228,7 @@ def test_frequent_item_max_size(config: MetricConfig, limit: int) -> None: def test_cardinality_metric_booleans() -> None: cardinality = CardinalityMetric.zero(MetricConfig()) - data = pd.Series([True, False, True, True]) + data = pl.Series([True, False, True, True]) col = PreprocessedColumn.apply(data) cardinality.columnar_update(col) @@ -288,9 +253,10 @@ def test_cardinality_metric_booleans_top_level_api() -> None: input_rows = 5 col_name = "p" d = {col_name: [bool(i % 2) for i in range(input_rows)]} - df = pd.DataFrame(data=d) + df = pl.DataFrame(d) - results = why.log(df) + schema = DeclarativeSchema([ResolverSpec(column_type=AnyType, metrics=[MetricSpec(StandardMetric.cardinality.value)])]) + results = why.log(df, schema=schema) col_prof = results.view().get_column(col_name) cardinality: CardinalityMetric = col_prof.get_metric("cardinality") assert cardinality is not None @@ -298,8 +264,9 @@ def test_cardinality_metric_booleans_top_level_api() -> None: def test_cardinality_metric_booleans_all_false() -> None: - df = pd.DataFrame({"b": [False for i in range(3)]}) - col_prof = why.log(df).view().get_column("b") + df = pl.DataFrame({"b": [False for i in range(3)]}) + schema = DeclarativeSchema([ResolverSpec(column_type=AnyType, metrics=[MetricSpec(StandardMetric.cardinality.value)])]) + col_prof = why.log(df, schema=schema).view().get_column("b") cardinality: CardinalityMetric = col_prof.get_metric("cardinality") assert cardinality.estimate == pytest.approx(1, 0.1) From 2e26e4e34daaa479175819d279dd3be1aa570dc8 Mon Sep 17 00:00:00 2001 From: Richard Rogers Date: Tue, 5 Nov 2024 08:00:02 +0000 Subject: [PATCH 12/41] fix docs --- python/poetry.lock | 56 +++++++++++-------------------------------- python/pyproject.toml | 2 +- 2 files changed, 15 insertions(+), 43 deletions(-) diff --git a/python/poetry.lock b/python/poetry.lock index 86ab24f304..709de63bc4 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. [[package]] name = "2to3" @@ -2890,43 +2890,11 @@ importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] -[[package]] -name = "polars" -version = "0.18.4" -description = "Blazingly fast DataFrame library" -optional = false -python-versions = ">=3.7" -files = [ - {file = "polars-0.18.4-cp37-abi3-macosx_10_7_x86_64.whl", hash = "sha256:3adfd39f84387f8589735e5c57f466c7ba19812140bc64248b9602755915c52f"}, - {file = "polars-0.18.4-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:5658f9751d93451549ecf429eb6486b203a86130132310c520cd1336d15ca258"}, - {file = "polars-0.18.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4bbc04db1d765f7cad287204a014e8e10bb2245f1910e26cd99964333e3682c6"}, - {file = "polars-0.18.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9117544d86542954588e295127f3892c15e09db04c474a0d8d830735154a54c"}, - {file = "polars-0.18.4-cp37-abi3-win_amd64.whl", hash = "sha256:a033ee71d8fde63ac71c7579230d31372cdaddf1df4227a537d96b91a58abd29"}, - {file = "polars-0.18.4.tar.gz", hash = "sha256:136d8cdbf3c1ec33ab577536ac35a10701ec3dfd21b54cb757ee9b0e0f525a85"}, -] - -[package.dependencies] -typing_extensions = {version = ">=4.0.1", markers = "python_version < \"3.8\""} - -[package.extras] -all = ["polars[connectorx,deltalake,fsspec,matplotlib,numpy,pandas,pyarrow,sqlalchemy,timezone,xlsx2csv,xlsxwriter]"] -connectorx = ["connectorx"] -deltalake = ["deltalake (>=0.8.0)"] -fsspec = ["fsspec"] -matplotlib = ["matplotlib"] -numpy = ["numpy (>=1.16.0)"] -pandas = ["pandas", "pyarrow (>=7.0.0)"] -pyarrow = ["pyarrow (>=7.0.0)"] -sqlalchemy = ["pandas", "sqlalchemy"] -timezone = ["backports.zoneinfo", "tzdata"] -xlsx2csv = ["xlsx2csv (>=0.8.0)"] -xlsxwriter = ["xlsxwriter"] - [[package]] name = "polars" version = "1.8.2" description = "Blazingly fast DataFrame library" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "polars-1.8.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:114be1ebfb051b794fb9e1f15999430c79cc0824595e237d3f45632be3e56d73"}, @@ -4006,19 +3974,23 @@ testing = ["flake8-2020", "jaraco.envs", "jaraco.path (>=3.2.0)", "mock", "paver [[package]] name = "setuptools" -version = "68.0.0" +version = "75.3.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "setuptools-68.0.0-py3-none-any.whl", hash = "sha256:11e52c67415a381d10d6b462ced9cfb97066179f0e871399e006c4ab101fc85f"}, - {file = "setuptools-68.0.0.tar.gz", hash = "sha256:baf1fdb41c6da4cd2eae722e135500da913332ab3f2f5c7d33af9b492acb5235"}, + {file = "setuptools-75.3.0-py3-none-any.whl", hash = "sha256:f2504966861356aa38616760c0f66568e535562374995367b4e69c7143cf6bcd"}, + {file = "setuptools-75.3.0.tar.gz", hash = "sha256:fba5dd4d766e97be1b1681d98712680ae8f2f26d7881245f2ce9e40714f1a686"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] -testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.5.2)"] +core = ["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.collections", "jaraco.functools", "jaraco.text (>=3.7)", "more-itertools", "more-itertools (>=8.8)", "packaging", "packaging (>=24)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test (>=5.5)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.12.*)", "pytest-mypy"] [[package]] name = "six" @@ -4917,4 +4889,4 @@ viz = ["Pillow", "Pillow", "ipython", "numpy", "numpy", "pybars3", "scipy", "sci [metadata] lock-version = "2.0" python-versions = ">=3.7.1, <4" -content-hash = "7232f71fb9c9bac43eac68c1e624181f3cfc5af4ce3283662d8d7c8d840a61ed" +content-hash = "505c98c178ad13ef33d72f4b07f79e62c533226082925b7347f3c3682005e511" diff --git a/python/pyproject.toml b/python/pyproject.toml index 6174b34b7d..2edfde8319 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -22,7 +22,7 @@ backoff = "^2.2.1" platformdirs = "^3.5.0" setuptools = [ { version = "<58", python = "<3.8", optional = false }, - { version = "*", python = ">=3.8", optional = false } + { version = ">=75", python = ">=3.8", optional = false } ] # viz module. Everything after this should be optional From 89a649d2fedb6e86043f878ec907bf2d3656f0f3 Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 02:35:02 +0000 Subject: [PATCH 13/41] Update python/tests/api/logger/test_logger_polars.py --- python/tests/api/logger/test_logger_polars.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/api/logger/test_logger_polars.py b/python/tests/api/logger/test_logger_polars.py index d77ec4bc79..94426ef072 100644 --- a/python/tests/api/logger/test_logger_polars.py +++ b/python/tests/api/logger/test_logger_polars.py @@ -18,7 +18,7 @@ if sys.version_info < (3, 8): - pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.7") + pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.8") FLOAT_TYPES = [float, np.float32, np.float64, np.float_] From 9e66091572501492255470543b0755ddf4f78375 Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 02:36:39 +0000 Subject: [PATCH 14/41] Update python/tests/api/logger/test_logger_polars.py --- python/tests/api/logger/test_logger_polars.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/api/logger/test_logger_polars.py b/python/tests/api/logger/test_logger_polars.py index 94426ef072..d7b5c7d958 100644 --- a/python/tests/api/logger/test_logger_polars.py +++ b/python/tests/api/logger/test_logger_polars.py @@ -27,7 +27,7 @@ def test_basic_log_schema() -> None: d = {"col1": [1, 2]} - df = pl.DataFrame(data=d) + df = pl.DataFrame(d) logger = why.logger() results = logger.log(df, schema=DatasetSchema()) profile = results.profile() From 6209bfdc2016e548ea4c8342b8ef2ce7cb8a7968 Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 02:56:52 +0000 Subject: [PATCH 15/41] Update python/tests/api/logger/test_logger_polars.py --- python/tests/api/logger/test_logger_polars.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/tests/api/logger/test_logger_polars.py b/python/tests/api/logger/test_logger_polars.py index d7b5c7d958..27b7515c51 100644 --- a/python/tests/api/logger/test_logger_polars.py +++ b/python/tests/api/logger/test_logger_polars.py @@ -56,11 +56,6 @@ def test_basic_log() -> None: assert profile._columns["col3"]._schema.dtype == pl.Utf8 -def test_log_nothing_raises_error() -> None: - with pytest.raises(LoggingError): - why.log() - - def test_basic_log_row() -> None: d = {"col1": [1, 2], "col2": [3.0, 4.0], "col3": ["a", "b"]} From 91df56caafa6521de959cf3f2df2b0dc543ccf0f Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 02:57:28 +0000 Subject: [PATCH 16/41] Update python/tests/api/logger/test_logger_polars.py --- python/tests/api/logger/test_logger_polars.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/python/tests/api/logger/test_logger_polars.py b/python/tests/api/logger/test_logger_polars.py index 27b7515c51..9439f3759b 100644 --- a/python/tests/api/logger/test_logger_polars.py +++ b/python/tests/api/logger/test_logger_polars.py @@ -56,18 +56,6 @@ def test_basic_log() -> None: assert profile._columns["col3"]._schema.dtype == pl.Utf8 -def test_basic_log_row() -> None: - d = {"col1": [1, 2], "col2": [3.0, 4.0], "col3": ["a", "b"]} - - results = why.log(row=d) - - profile = results.profile() - - assert profile._columns["col1"]._schema.dtype == list - assert profile._columns["col2"]._schema.dtype == list - assert profile._columns["col3"]._schema.dtype == list - - def test_basic_log_dict_of_lists() -> None: d = {"col1": [np.int64(1), np.int64(2)], "col2": [3.0, 4.0], "col3": ["a", "b"]} From e45edbe38d78cda4c185eb5031ad911a844a0803 Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 02:58:09 +0000 Subject: [PATCH 17/41] Update python/tests/api/logger/test_logger_polars.py --- python/tests/api/logger/test_logger_polars.py | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/python/tests/api/logger/test_logger_polars.py b/python/tests/api/logger/test_logger_polars.py index 9439f3759b..0ca15be6f6 100644 --- a/python/tests/api/logger/test_logger_polars.py +++ b/python/tests/api/logger/test_logger_polars.py @@ -56,29 +56,6 @@ def test_basic_log() -> None: assert profile._columns["col3"]._schema.dtype == pl.Utf8 -def test_basic_log_dict_of_lists() -> None: - d = {"col1": [np.int64(1), np.int64(2)], "col2": [3.0, 4.0], "col3": ["a", "b"]} - - results = why.log(d) - - profile = results.profile() - - assert profile._columns["col1"]._schema.dtype == list - assert profile._columns["col2"]._schema.dtype == list - assert profile._columns["col3"]._schema.dtype == list - - -def test_basic_log_dictionary() -> None: - d = {"a": 1.0, "b": 2.0} - - results = why.log(d) - - profile = results.profile() - - assert profile._columns["a"]._schema.dtype == float - assert profile._columns["b"]._schema.dtype == float - - def test_lending_club(lending_club_df: pl.DataFrame) -> None: res = why.log(lending_club_df) view = res.view() From 09d1ad15b0317372a69c776299d8ae8c94b38ff4 Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 02:58:52 +0000 Subject: [PATCH 18/41] Update python/tests/api/logger/test_logger_polars.py --- python/tests/api/logger/test_logger_polars.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/tests/api/logger/test_logger_polars.py b/python/tests/api/logger/test_logger_polars.py index 0ca15be6f6..6a74ea2692 100644 --- a/python/tests/api/logger/test_logger_polars.py +++ b/python/tests/api/logger/test_logger_polars.py @@ -56,8 +56,9 @@ def test_basic_log() -> None: assert profile._columns["col3"]._schema.dtype == pl.Utf8 -def test_lending_club(lending_club_df: pl.DataFrame) -> None: - res = why.log(lending_club_df) +def test_lending_club(lending_club_df: pd.DataFrame) -> None: + df = pl.from_pandas(lending_club_df) + res = why.log(df) view = res.view() df = view.to_pandas() assert len(df) == 151 From 107625549c155418dfb148f6dee2b368882a922d Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 02:59:31 +0000 Subject: [PATCH 19/41] Update python/tests/api/logger/test_logger_polars.py --- python/tests/api/logger/test_logger_polars.py | 43 ------------------- 1 file changed, 43 deletions(-) diff --git a/python/tests/api/logger/test_logger_polars.py b/python/tests/api/logger/test_logger_polars.py index 6a74ea2692..2bae0e6233 100644 --- a/python/tests/api/logger/test_logger_polars.py +++ b/python/tests/api/logger/test_logger_polars.py @@ -147,49 +147,6 @@ def test_object_count_dict(input) -> None: assert row_view._columns.get("a")._metrics.get("types").object.value == 1 -@pytest.mark.parametrize( - "input,stub_np,ints,reals,bools,strs,tensors,objs", - [ - ({"a": 1}, False, 1, 0, 0, 0, 0, 0), - ({"a": 1.0}, False, 0, 1, 0, 0, 0, 0), - ({"a": True}, False, 0, 0, 1, 0, 0, 0), - ({"a": "foo"}, False, 0, 0, 0, 1, 0, 0), - ({"a": [1, 2]}, False, 0, 0, 0, 0, 1, 0), - ({"a": [[1, 2], [3, 4]]}, False, 0, 0, 0, 0, 1, 0), - ({"a": [[1, 2.5], [3.14, 4]]}, False, 0, 0, 0, 0, 1, 0), - ({"a": [[1, 2], ["x", "y"]]}, False, 0, 0, 0, 0, 0, 1), - ({"a": np.asarray([1, 2])}, False, 0, 0, 0, 0, 1, 0), - ({"a": np.asarray([[1, 2], [3, 4]])}, False, 0, 0, 0, 0, 1, 0), - ({"a": np.asarray([[1, 2.5], [3.14, 4]])}, False, 0, 0, 0, 0, 1, 0), - ({"a": np.asarray([[1, 2], ["x", "y"]])}, False, 0, 0, 0, 0, 0, 1), - ({"a": []}, False, 0, 0, 0, 0, 0, 1), - ({"a": 1}, True, 1, 0, 0, 0, 0, 0), - ({"a": 1.0}, True, 0, 1, 0, 0, 0, 0), - ({"a": True}, True, 0, 0, 1, 0, 0, 0), - ({"a": "foo"}, True, 0, 0, 0, 1, 0, 0), - ({"a": [1, 2]}, True, 0, 0, 0, 0, 0, 1), - ({"a": [[1, 2], [3, 4]]}, True, 0, 0, 0, 0, 0, 1), - ({"a": [[1, 2.5], [3.14, 4]]}, True, 0, 0, 0, 0, 0, 1), - ({"a": [[1, 2], ["x", "y"]]}, True, 0, 0, 0, 0, 0, 1), - ({"a": np.asarray([1, 2])}, True, 0, 0, 0, 0, 0, 1), - ({"a": np.asarray([[1, 2], [3, 4]])}, True, 0, 0, 0, 0, 0, 1), - ({"a": np.asarray([[1, 2.5], [3.14, 4]])}, True, 0, 0, 0, 0, 0, 1), - ({"a": np.asarray([[1, 2], ["x", "y"]])}, True, 0, 0, 0, 0, 0, 1), - ({"a": []}, True, 0, 0, 0, 0, 0, 1), - ], -) -def test_type_count_dict(input, stub_np, ints, reals, bools, strs, tensors, objs, monkeypatch) -> None: - monkeypatch.setattr("whylogs.core.preprocessing.is_not_stub", lambda x: (not stub_np)) - row_results = why.log(input) - row_view = row_results.view() - assert row_view._columns.get("a")._metrics.get("types").integral.value == ints - assert row_view._columns.get("a")._metrics.get("types").fractional.value == reals - assert row_view._columns.get("a")._metrics.get("types").boolean.value == bools - assert row_view._columns.get("a")._metrics.get("types").string.value == strs - assert row_view._columns.get("a")._metrics.get("types").tensor.value == tensors - assert row_view._columns.get("a")._metrics.get("types").object.value == objs - - def test_bool_count(): data = { "animal": ["cat", "hawk", "snake", "cat"], From b3e39c6f5f8fcac4dbdfaa2642cf115038163010 Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 03:00:20 +0000 Subject: [PATCH 20/41] Update python/tests/api/logger/test_logger_polars.py --- python/tests/api/logger/test_logger_polars.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/api/logger/test_logger_polars.py b/python/tests/api/logger/test_logger_polars.py index 2bae0e6233..9552caeb96 100644 --- a/python/tests/api/logger/test_logger_polars.py +++ b/python/tests/api/logger/test_logger_polars.py @@ -34,7 +34,7 @@ def test_basic_log_schema() -> None: assert profile._columns["col1"]._schema.dtype == pl.Int64 -def test_basic_log_schem_constructor() -> None: +def test_basic_log_schema_constructor() -> None: d = {"col1": [1, 2]} df = pl.DataFrame(data=d) logger = why.logger(schema=DatasetSchema()) From dffe8675ea9d2163953970855cd6527ec61a1ca5 Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 03:01:10 +0000 Subject: [PATCH 21/41] Update python/tests/api/logger/test_segments_polars.py --- python/tests/api/logger/test_segments_polars.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/api/logger/test_segments_polars.py b/python/tests/api/logger/test_segments_polars.py index 795be2c68c..054d0e2a51 100644 --- a/python/tests/api/logger/test_segments_polars.py +++ b/python/tests/api/logger/test_segments_polars.py @@ -31,7 +31,7 @@ if sys.version_info < (3, 8): - pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.7") + pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.8") TEST_LOGGER = getLogger(__name__) From d840894a72fba4bcaed88bf2141104629b1c81b4 Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 03:07:22 +0000 Subject: [PATCH 22/41] Update python/tests/api/logger/test_segments_polars.py --- .../tests/api/logger/test_segments_polars.py | 30 ------------------- 1 file changed, 30 deletions(-) diff --git a/python/tests/api/logger/test_segments_polars.py b/python/tests/api/logger/test_segments_polars.py index 054d0e2a51..7ffcae7550 100644 --- a/python/tests/api/logger/test_segments_polars.py +++ b/python/tests/api/logger/test_segments_polars.py @@ -37,36 +37,6 @@ TEST_LOGGER = getLogger(__name__) -def test_single_row_segment() -> None: - segment_column = "col3" - number_of_segments = 1 - - test_segments = segment_on_column("col3") - results: SegmentedResultSet = why.log( - {"col1": 1, "col2": 1.1, "col3": "x0"}, schema=DatasetSchema(segments=test_segments) - ) - assert results.count == number_of_segments - partitions = results.partitions - assert len(partitions) == 1 - partition = partitions[0] - segments = results.segments_in_partition(partition) - assert len(segments) == number_of_segments - - first_segment = next(iter(segments)) - assert first_segment.key == ("x0",) - first_segment_profile = results.profile(first_segment) - assert first_segment_profile is not None - assert first_segment_profile._columns["col1"]._schema.dtype == int - assert first_segment_profile._columns["col2"]._schema.dtype == float - assert first_segment_profile._columns["col3"]._schema.dtype == str - segment_cardinality: CardinalityMetric = ( - first_segment_profile.view().get_column(segment_column).get_metric("cardinality") - ) - cardinality = segment_cardinality.estimate - assert cardinality is not None - assert cardinality == 1.0 - - def test_single_column_segment() -> None: input_rows = 100 segment_column = "col3" From f895fcf790bbb4e61f6a1542eb32621985d06df4 Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 03:08:10 +0000 Subject: [PATCH 23/41] Update python/tests/core/metrics/test_metrics_polars.py --- python/tests/core/metrics/test_metrics_polars.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/core/metrics/test_metrics_polars.py b/python/tests/core/metrics/test_metrics_polars.py index 2aab6bb77a..bee0b4c7c6 100644 --- a/python/tests/core/metrics/test_metrics_polars.py +++ b/python/tests/core/metrics/test_metrics_polars.py @@ -22,7 +22,7 @@ if sys.version_info < (3, 8): - pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.7") + pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.8") TEST_LOGGER = getLogger(__name__) From 670418e7ee07bad517385bff3562de0852301574 Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 03:11:25 +0000 Subject: [PATCH 24/41] Update python/tests/core/metrics/test_metrics_polars.py --- python/tests/core/metrics/test_metrics_polars.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/python/tests/core/metrics/test_metrics_polars.py b/python/tests/core/metrics/test_metrics_polars.py index bee0b4c7c6..2d42940d94 100644 --- a/python/tests/core/metrics/test_metrics_polars.py +++ b/python/tests/core/metrics/test_metrics_polars.py @@ -91,18 +91,6 @@ def test_distribution_metrics_indexed_series_single_row() -> None: assert dist.mean.value == data.mean() -def test_distribution_metrics_bool() -> None: - import whylogs.core.metrics.metrics as met - - met._BOOL_LIST_CHUNK_SIZE = 2 - - dist = DistributionMetric.zero() - p_col = PreprocessedColumn.apply([True, True, True, True, False, "foo", "bar"]) - operation_result = dist.columnar_update(p_col) - assert operation_result.ok - assert round(dist.mean.value, 3) == 0.8 - - def test_distribution_metrics_bool_mixed() -> None: import whylogs.core.metrics.metrics as met From a52449f1860a3db9dd33d56ecd28b8bcd0dda29e Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 03:13:23 +0000 Subject: [PATCH 25/41] Update python/tests/core/metrics/test_metrics_polars.py --- python/tests/core/metrics/test_metrics_polars.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/python/tests/core/metrics/test_metrics_polars.py b/python/tests/core/metrics/test_metrics_polars.py index 2d42940d94..6aa455fe57 100644 --- a/python/tests/core/metrics/test_metrics_polars.py +++ b/python/tests/core/metrics/test_metrics_polars.py @@ -91,19 +91,6 @@ def test_distribution_metrics_indexed_series_single_row() -> None: assert dist.mean.value == data.mean() -def test_distribution_metrics_bool_mixed() -> None: - import whylogs.core.metrics.metrics as met - - met._BOOL_LIST_CHUNK_SIZE = 2 - - dist = DistributionMetric.zero() - p_col = PreprocessedColumn.apply([True, False, 42]) - operation_result = dist.columnar_update(p_col) - assert operation_result.ok - assert dist.kll.value.get_n() == 3 - assert round(dist.avg, 3) == round(43 / 3, 3) - - def test_track_single_values_profile_mean() -> None: data = list(range(30)) df = pl.DataFrame({"col1": data}) From 45bedc8cf868d74cc5a0ced442ec41337fe25589 Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 03:18:40 +0000 Subject: [PATCH 26/41] Update python/tests/core/metrics/test_metrics_polars.py --- python/tests/core/metrics/test_metrics_polars.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/python/tests/core/metrics/test_metrics_polars.py b/python/tests/core/metrics/test_metrics_polars.py index 6aa455fe57..e1d3a1aeb3 100644 --- a/python/tests/core/metrics/test_metrics_polars.py +++ b/python/tests/core/metrics/test_metrics_polars.py @@ -244,16 +244,3 @@ def test_cardinality_metric_booleans_all_false() -> None: col_prof = why.log(df, schema=schema).view().get_column("b") cardinality: CardinalityMetric = col_prof.get_metric("cardinality") assert cardinality.estimate == pytest.approx(1, 0.1) - - -def test_configure_MetricConfig_defaults(): - c0 = MetricConfig() - assert c0.kll_k == cfg.kll_k - assert not c0.fi_disabled - assert "frequent_items" in StandardResolver().resolve("", Integral(), ColumnSchema(Integral, c0)) - cfg.fi_disabled = True - c1 = MetricConfig() - assert c1.fi_disabled - assert not c0.fi_disabled - assert "frequent_items" not in StandardResolver().resolve("", Integral(), ColumnSchema(Integral, c1)) - cfg.fi_disabled = False From 5881b7c75722ef6fad94146c5f14e6487c5240af Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 03:19:19 +0000 Subject: [PATCH 27/41] Update python/tests/core/metrics/test_metrics_polars.py --- python/tests/core/metrics/test_metrics_polars.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/core/metrics/test_metrics_polars.py b/python/tests/core/metrics/test_metrics_polars.py index e1d3a1aeb3..6ea114ad5a 100644 --- a/python/tests/core/metrics/test_metrics_polars.py +++ b/python/tests/core/metrics/test_metrics_polars.py @@ -144,7 +144,7 @@ def test_merge_two_profiles_mean(lending_club_df: pd.DataFrame) -> None: actual_mean_1 = first_df["loan_amnt"].mean() actual_mean_2 = second_df["loan_amnt"].mean() - first_profile: ColumnProfileView = why.log(first_df).view().get_column("loan_amnt") + first_profile: ColumnProfileView = why.log(pl.from_pandas(first_df)).view().get_column("loan_amnt") first_profile_mean = first_profile.get_metric("distribution").mean.value second_profile = why.log(second_df).view().get_column("loan_amnt") second_profile_mean = second_profile.get_metric("distribution").mean.value From a17707614787b8e0fed6f7822f41e1770a35f1d6 Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 03:19:47 +0000 Subject: [PATCH 28/41] Update python/tests/core/metrics/test_metrics_polars.py --- python/tests/core/metrics/test_metrics_polars.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/core/metrics/test_metrics_polars.py b/python/tests/core/metrics/test_metrics_polars.py index 6ea114ad5a..a1fdd79456 100644 --- a/python/tests/core/metrics/test_metrics_polars.py +++ b/python/tests/core/metrics/test_metrics_polars.py @@ -146,7 +146,7 @@ def test_merge_two_profiles_mean(lending_club_df: pd.DataFrame) -> None: first_profile: ColumnProfileView = why.log(pl.from_pandas(first_df)).view().get_column("loan_amnt") first_profile_mean = first_profile.get_metric("distribution").mean.value - second_profile = why.log(second_df).view().get_column("loan_amnt") + second_profile = why.log(pl.from_pandas(second_df)).view().get_column("loan_amnt") second_profile_mean = second_profile.get_metric("distribution").mean.value merged_profile = first_profile.merge(second_profile) From 15c807c8ad3ea2e2356b1ada9961e3505fe1101f Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 03:20:15 +0000 Subject: [PATCH 29/41] Update python/tests/core/metrics/test_metrics_polars.py --- python/tests/core/metrics/test_metrics_polars.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/python/tests/core/metrics/test_metrics_polars.py b/python/tests/core/metrics/test_metrics_polars.py index a1fdd79456..bea371cb42 100644 --- a/python/tests/core/metrics/test_metrics_polars.py +++ b/python/tests/core/metrics/test_metrics_polars.py @@ -210,20 +210,6 @@ def test_cardinality_metric_booleans() -> None: assert cardinality.estimate == pytest.approx(2, 0.1) -def test_cardinality_metric_row_booleans() -> None: - column_name = "col1" - data = {column_name: True} - profile = why.log(data).profile() - view = profile.view() - cardinality = view.get_column(column_name).get_metric("cardinality") - - assert cardinality is not None - assert cardinality.estimate == pytest.approx(1, 0.1) - # track a bool value of false in the same column and check that cardinality increased to near 2. - profile.track(row={column_name: False}) - assert cardinality.estimate == pytest.approx(2, 0.1) - - def test_cardinality_metric_booleans_top_level_api() -> None: input_rows = 5 col_name = "p" From 78e4a716289f0669c7732e3dde612232c017c0a2 Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 03:21:21 +0000 Subject: [PATCH 30/41] Update python/tests/core/test_performance_polars.py --- python/tests/core/test_performance_polars.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/core/test_performance_polars.py b/python/tests/core/test_performance_polars.py index 250d69ff38..d2eacf580a 100644 --- a/python/tests/core/test_performance_polars.py +++ b/python/tests/core/test_performance_polars.py @@ -26,7 +26,7 @@ if sys.version_info < (3, 8): - pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.7") + pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.8") TEST_LOGGER = getLogger(__name__) From bb4ec847f96930bb4074a14c1790b7574c1d8dbc Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 03:28:21 +0000 Subject: [PATCH 31/41] Update python/tests/experimental/core/test_udf_schema_polars.py --- python/tests/experimental/core/test_udf_schema_polars.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/experimental/core/test_udf_schema_polars.py b/python/tests/experimental/core/test_udf_schema_polars.py index 77cfa10ad3..2765cd8657 100644 --- a/python/tests/experimental/core/test_udf_schema_polars.py +++ b/python/tests/experimental/core/test_udf_schema_polars.py @@ -23,7 +23,7 @@ if sys.version_info < (3, 8): - pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.7") + pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.8") def test_udf_polars() -> None: From 82e96462dd6e86f8c0c870f795b993fd7a0a9330 Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Thu, 7 Nov 2024 03:34:31 +0000 Subject: [PATCH 32/41] Update python/tests/experimental/core/test_udf_schema_polars.py --- python/tests/experimental/core/test_udf_schema_polars.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/experimental/core/test_udf_schema_polars.py b/python/tests/experimental/core/test_udf_schema_polars.py index 2765cd8657..90e2668f01 100644 --- a/python/tests/experimental/core/test_udf_schema_polars.py +++ b/python/tests/experimental/core/test_udf_schema_polars.py @@ -277,7 +277,7 @@ def test_udf_metric_resolving() -> None: assert "udf/bar:counts/n" in foo_summary -def test_udf_segmentation_pandas() -> None: +def test_udf_segmentation_polars() -> None: column_segments = segment_on_column("product") segmented_schema = udf_schema(segments=column_segments, schema_name="polars-unit-tests") data = pl.DataFrame({"col1": [42, 12, 7], "col2": [2, 3, 4], "col3": [2, 3, 4]}) From 290cc62472826575b29b3ab0d17a9d1406c4a350 Mon Sep 17 00:00:00 2001 From: Richard Rogers Date: Thu, 7 Nov 2024 05:56:33 +0000 Subject: [PATCH 33/41] test fix --- python/tests/api/logger/test_logger_polars.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/api/logger/test_logger_polars.py b/python/tests/api/logger/test_logger_polars.py index 9552caeb96..7c63cddd82 100644 --- a/python/tests/api/logger/test_logger_polars.py +++ b/python/tests/api/logger/test_logger_polars.py @@ -14,7 +14,7 @@ from whylogs.core.metrics import StandardMetric from whylogs.core.resolvers import Resolver from whylogs.core.schema import DatasetSchema -from whylogs.core.stubs import pl +from whylogs.core.stubs import pd, pl if sys.version_info < (3, 8): From e5831a09d0c6fc053c494bb5b8208e080c99f090 Mon Sep 17 00:00:00 2001 From: Richard Rogers Date: Tue, 12 Nov 2024 22:33:30 +0000 Subject: [PATCH 34/41] refactor --- python/tests/api/logger/test_logger_polars.py | 2 +- .../core/test_udf_schema_polars.py | 2 +- python/whylogs/api/logger/__init__.py | 9 ++-- .../logger/actor/thread_rolling_logger.py | 2 +- python/whylogs/api/logger/logger.py | 21 ++++----- python/whylogs/api/logger/rolling.py | 4 +- .../whylogs/api/logger/segment_processing.py | 12 ++--- python/whylogs/api/logger/transient.py | 4 +- .../api/whylabs/session/notebook_logger.py | 16 +++++-- python/whylogs/core/dataframe_wrapper.py | 46 +++++++++++-------- python/whylogs/core/dataset_profile.py | 13 ++---- python/whylogs/core/input_resolver.py | 30 +++++------- python/whylogs/core/schema.py | 15 +++--- .../whylogs/experimental/core/udf_schema.py | 18 ++++---- 14 files changed, 100 insertions(+), 94 deletions(-) diff --git a/python/tests/api/logger/test_logger_polars.py b/python/tests/api/logger/test_logger_polars.py index 7c63cddd82..ce5c842d4c 100644 --- a/python/tests/api/logger/test_logger_polars.py +++ b/python/tests/api/logger/test_logger_polars.py @@ -156,7 +156,7 @@ def test_bool_count(): df = pl.DataFrame(data) - results = why.log(polars=df) + results = why.log(dataframe=df) prof_view = results.profile().view() assert prof_view._columns.get("fly")._metrics.get("types").boolean.value == 4 assert prof_view._columns.get("fly")._metrics.get("types").integral.value == 0 diff --git a/python/tests/experimental/core/test_udf_schema_polars.py b/python/tests/experimental/core/test_udf_schema_polars.py index 90e2668f01..c7e7c0e2fe 100644 --- a/python/tests/experimental/core/test_udf_schema_polars.py +++ b/python/tests/experimental/core/test_udf_schema_polars.py @@ -331,7 +331,7 @@ def fob(x: pl.DataFrame) -> pl.Series: def test_direct_udfs() -> None: schema = udf_schema(schema_name=["polars", "polars-bob"]) data = pl.DataFrame({"col1": [42, 12, 7]}) - more_data, _ = schema.apply_udfs(polars=data) + more_data, _ = schema.apply_udfs(dataframe=data) udf_columns = set(more_data.columns) result = why.log(data, schema=schema).view() diff --git a/python/whylogs/api/logger/__init__.py b/python/whylogs/api/logger/__init__.py index a56ebbb97c..1376b278ac 100644 --- a/python/whylogs/api/logger/__init__.py +++ b/python/whylogs/api/logger/__init__.py @@ -26,6 +26,7 @@ notebook_session_log_comparison, ) from whylogs.core import DatasetProfile, DatasetSchema +from whylogs.core.dataframe_wrapper import DataFrame, DataFrameWrapper from whylogs.core.metadata import WHYLABS_TRACE_ID_KEY from whylogs.core.model_performance_metrics.model_performance_metrics import ( ModelPerformanceMetrics, @@ -35,7 +36,7 @@ diagnostic_logger = logging.getLogger(__name__) -Loggable = Union["pd.DataFrame", List[Dict[str, Any]]] +Loggable = Union["pd.DataFrame", "pl.DataFrame", DataFrameWrapper, List[Dict[str, Any]]] @deprecated_argument("debug_event") @@ -43,7 +44,7 @@ def log( obj: Any = None, *, pandas: Optional[pd.DataFrame] = None, - polars: Optional[pl.DataFrame] = None, + dataframe: Optional[DataFrame] = None, row: Optional[Dict[str, Any]] = None, schema: Optional[DatasetSchema] = None, name: Optional[str] = None, @@ -71,7 +72,7 @@ def log( result_set = TransientLogger(schema=schema).log( obj, pandas=pandas, - polars=polars, + dataframe=dataframe, row=row, name=name, trace_id=trace_id, @@ -80,7 +81,7 @@ def log( ) if dataset_timestamp is not None: result_set.set_dataset_timestamp(dataset_timestamp) - notebook_session_log(result_set, obj, pandas=pandas, polars=polars, row=row, name=name) + notebook_session_log(result_set, obj, pandas=pandas, dataframe=dataframe, row=row, name=name) if debug_event is not None: if trace_id is None and WHYLABS_TRACE_ID_KEY in result_set.metadata: diff --git a/python/whylogs/api/logger/experimental/logger/actor/thread_rolling_logger.py b/python/whylogs/api/logger/experimental/logger/actor/thread_rolling_logger.py index 9d5c264ef5..4565172874 100644 --- a/python/whylogs/api/logger/experimental/logger/actor/thread_rolling_logger.py +++ b/python/whylogs/api/logger/experimental/logger/actor/thread_rolling_logger.py @@ -78,7 +78,7 @@ def _track_segments(self, data: TrackData) -> None: else: df = data if isinstance(data, pd.DataFrame) else None row = data if isinstance(data, dict) else None # pyright: ignore[reportUnknownVariableType] - df, row = _dataframe_or_dict(df, None, None, row) # pyright: ignore[reportUnknownArgumentType] + df, row = _dataframe_or_dict(df, None, row) # pyright: ignore[reportUnknownArgumentType] df, row = self._schema._run_udfs(df, row) # pyright: ignore[reportUnknownVariableType, reportUnknownMemberType, reportPrivateUsage] input_data: TrackData = cast(TrackData, df if df is not None else row) # type: ignore[no-redef] else: diff --git a/python/whylogs/api/logger/logger.py b/python/whylogs/api/logger/logger.py index d1d80979ba..b008a695f2 100644 --- a/python/whylogs/api/logger/logger.py +++ b/python/whylogs/api/logger/logger.py @@ -15,14 +15,14 @@ from whylogs.api.store import ProfileStore from whylogs.api.writer import Writer, Writers from whylogs.core import DatasetProfile, DatasetSchema -from whylogs.core.dataframe_wrapper import DataFrameWrapper +from whylogs.core.dataframe_wrapper import DataFrame from whylogs.core.errors import LoggingError from whylogs.core.input_resolver import _dataframe_or_dict from whylogs.core.metadata import ( _populate_common_profile_metadata, _safe_merge_metadata, ) -from whylogs.core.stubs import pd, pl +from whylogs.core.stubs import pd logger = logging.getLogger(__name__) @@ -71,7 +71,7 @@ def _get_matching_profiles( self, obj: Any = None, *, - dataframe: Optional[DataFrameWrapper] = None, + dataframe: Optional[DataFrame] = None, row: Optional[Dict[str, Any]] = None, schema: Optional[DatasetSchema] = None, ) -> List[DatasetProfile]: @@ -82,7 +82,7 @@ def log( obj: Any = None, *, pandas: Optional[pd.DataFrame] = None, - polars: Optional[pl.DataFrame] = None, + dataframe: Optional[DataFrame] = None, row: Optional[Dict[str, Any]] = None, schema: Optional[DatasetSchema] = None, timestamp_ms: Optional[int] = None, # Not the dataset timestamp, but the timestamp of the data @@ -99,7 +99,7 @@ def log( """ if self._is_closed: raise LoggingError("Cannot log to a closed logger") - if obj is None and pandas is None and polars is None and row is None: + if obj is None and pandas is None and dataframe is None and row is None: # TODO: check for shell environment and emit more verbose error string to let user know how to correct. raise LoggingError("log() was called without passing in any input!") @@ -108,18 +108,17 @@ def log( self._metadata = dict() self._metadata["name"] = name active_schema = schema or self._schema - dataframe, row = _dataframe_or_dict(obj, pandas, polars, row) + dataframe, row = _dataframe_or_dict(obj, dataframe if dataframe is not None else pandas, row) if active_schema: dataframe, row = active_schema._run_udfs(dataframe, row) - obj = None # If segments are defined use segment_processing to return a SegmentedResultSet if active_schema and active_schema.segments: segmented_results: SegmentedResultSet = segment_processing( schema=active_schema, - obj=obj, - pandas=dataframe.pd_df if dataframe else None, - polars=dataframe.pl_df if dataframe else None, + obj=None, + pandas=None, + dataframe=dataframe, row=row, segment_cache=self._segment_cache, segment_key_values=segment_key_values, @@ -132,7 +131,7 @@ def log( profiles = self._get_matching_profiles(obj, dataframe=dataframe, row=row, schema=active_schema) for prof in profiles: - prof.track(obj, dataframe=dataframe, row=row, execute_udfs=False) + prof.track(None, dataframe=dataframe, row=row, execute_udfs=False) prof._metadata = _populate_common_profile_metadata(prof._metadata, trace_id=trace_id, tags=tags) if active_schema: _safe_merge_metadata(prof._metadata, active_schema.metadata) diff --git a/python/whylogs/api/logger/rolling.py b/python/whylogs/api/logger/rolling.py index 2e2541c309..4d8f83398e 100644 --- a/python/whylogs/api/logger/rolling.py +++ b/python/whylogs/api/logger/rolling.py @@ -14,7 +14,7 @@ from whylogs.api.logger.segment_cache import SegmentCache from whylogs.api.writer import Writer from whylogs.core import DatasetProfile, DatasetProfileView, DatasetSchema -from whylogs.core.dataframe_wrapper import DataFrameWrapper +from whylogs.core.dataframe_wrapper import DataFrame from whylogs.core.view.segmented_dataset_profile_view import SegmentedDatasetProfileView logger = logging.getLogger(__name__) @@ -147,7 +147,7 @@ def _get_matching_profiles( self, obj: Any = None, *, - dataframe: Optional[DataFrameWrapper] = None, + dataframe: Optional[DataFrame] = None, row: Optional[Dict[str, Any]] = None, schema: Optional[DatasetSchema] = None, ) -> List[DatasetProfile]: diff --git a/python/whylogs/api/logger/segment_processing.py b/python/whylogs/api/logger/segment_processing.py index 2e99175164..5288eaea6e 100644 --- a/python/whylogs/api/logger/segment_processing.py +++ b/python/whylogs/api/logger/segment_processing.py @@ -6,7 +6,7 @@ from whylogs.api.logger.result_set import SegmentedResultSet from whylogs.api.logger.segment_cache import SegmentCache from whylogs.core import DatasetSchema -from whylogs.core.dataframe_wrapper import DataFrameWrapper +from whylogs.core.dataframe_wrapper import DataFrame, DataFrameWrapper from whylogs.core.dataset_profile import DatasetProfile from whylogs.core.input_resolver import _dataframe_or_dict from whylogs.core.segment import Segment @@ -15,7 +15,7 @@ SegmentationPartition, SegmentFilter, ) -from whylogs.core.stubs import pd, pl +from whylogs.core.stubs import pd logger = logging.getLogger(__name__) @@ -138,10 +138,10 @@ def _log_segment( row: Optional[Mapping[str, Any]] = None, segment_cache: Optional[SegmentCache] = None, segment_key_values: Optional[Dict[str, str]] = None, - polars: Optional[pl.DataFrame] = None, + dataframe: Optional[DataFrame] = None, ) -> Dict[Segment, Any]: segments: Dict[Segment, Any] = {} - dataframe, row = _dataframe_or_dict(obj, pandas, polars, row) + dataframe, row = _dataframe_or_dict(obj, dataframe if dataframe is not None else pandas, row) if partition.filter: dataframe, row = _filter_inputs(partition.filter, dataframe, row) if partition.simple: @@ -166,7 +166,7 @@ def segment_processing( row: Optional[Dict[str, Any]] = None, segment_cache: Optional[SegmentCache] = None, segment_key_values: Optional[Dict[str, str]] = None, - polars: Optional[pl.DataFrame] = None, + dataframe: Optional[DataFrame] = None, ) -> SegmentedResultSet: number_of_partitions = len(schema.segments) logger.info(f"The specified schema defines segments with {number_of_partitions} partitions.") @@ -195,7 +195,7 @@ def segment_processing( schema=schema, obj=obj, pandas=pandas, - polars=polars, + dataframe=dataframe, row=row, segment_cache=segment_cache, segment_key_values=segment_key_values, diff --git a/python/whylogs/api/logger/transient.py b/python/whylogs/api/logger/transient.py index 9321d48691..218b8e1a08 100644 --- a/python/whylogs/api/logger/transient.py +++ b/python/whylogs/api/logger/transient.py @@ -2,7 +2,7 @@ from whylogs.api.logger.logger import Logger from whylogs.core import DatasetProfile, DatasetSchema -from whylogs.core.dataframe_wrapper import DataFrameWrapper +from whylogs.core.dataframe_wrapper import DataFrame class TransientLogger(Logger): @@ -13,7 +13,7 @@ def _get_matching_profiles( self, obj: Any = None, *, - dataframe: Optional[DataFrameWrapper] = None, + dataframe: Optional[DataFrame] = None, row: Optional[Dict[str, Any]] = None, schema: Optional[DatasetSchema] = None, ) -> List[DatasetProfile]: diff --git a/python/whylogs/api/whylabs/session/notebook_logger.py b/python/whylogs/api/whylabs/session/notebook_logger.py index 61a49f236e..c32faf4cbf 100644 --- a/python/whylogs/api/whylabs/session/notebook_logger.py +++ b/python/whylogs/api/whylabs/session/notebook_logger.py @@ -6,11 +6,12 @@ from whylogs.api.whylabs.session.session_manager import get_current_session from whylogs.api.whylabs.session.session_types import InteractiveLogger as il from whylogs.api.whylabs.session.session_types import SessionType +from whylogs.core.dataframe_wrapper import DataFrame, DataFrameWrapper from whylogs.core.stubs import pd, pl def notebook_session_log_comparison( - data: Dict[str, Union["pd.DataFrame", List[Dict[str, Any]]]], result_sets: Dict[str, ResultSet] + data: Dict[str, Union[pd.DataFrame, List[Dict[str, Any]]]], result_sets: Dict[str, ResultSet] ) -> None: session = get_current_session() @@ -47,8 +48,8 @@ def notebook_session_log_comparison( traceback.print_exc() -def _get_loggable_length(loggable: Optional[Union["pd.DataFrame", Dict[str, Any]]]) -> Optional[int]: - if isinstance(loggable, pd.DataFrame): +def _get_loggable_length(loggable: Optional[Union[DataFrame, Dict[str, Any]]]) -> Optional[int]: + if isinstance(loggable, (pd.DataFrame, pl.DataFrame, DataFrameWrapper)): return len(loggable) elif isinstance(loggable, dict): return 1 @@ -61,7 +62,7 @@ def notebook_session_log( obj: Any = None, *, pandas: Optional[pd.DataFrame] = None, - polars: Optional[pl.DataFrame] = None, + dataframe: Optional[DataFrame] = None, row: Optional[Dict[str, Any]] = None, name: Optional[str] = None, ) -> None: @@ -75,7 +76,12 @@ def notebook_session_log( return # Get the length of whatever was just logged - rows = _get_loggable_length(pandas) or _get_loggable_length(obj) or _get_loggable_length(row) + rows = ( + _get_loggable_length(pandas) + or _get_loggable_length(obj) + or _get_loggable_length(row) + or _get_loggable_length(dataframe) + ) il.message() if rows is not None: diff --git a/python/whylogs/core/dataframe_wrapper.py b/python/whylogs/core/dataframe_wrapper.py index 0a41c7ff89..214e2b1691 100644 --- a/python/whylogs/core/dataframe_wrapper.py +++ b/python/whylogs/core/dataframe_wrapper.py @@ -2,15 +2,19 @@ from whylogs.core.stubs import pd, pl +DataFrame = Union[pd.DataFrame, pl.DataFrame, "DataFrameWrapper"] +Series = Union[pd.Series, pl.Series] + class DataFrameWrapper: - def __init__(self, pandas: Optional[pd.DataFrame] = None, polars: Optional[pl.DataFrame] = None): - # TODO: __init__(self, df: Union[pd.DataFrame, pl.DataFrame]): with isinstance - # TODO: maybe PandasDataFrame, PolarsDataFrame <: DataFrameWrapper - if pandas is not None and polars is not None: - raise ValueError("Cannot pass both pandas and polars params") - if pandas is None and polars is None: - raise ValueError("Must pass either pandas or polars") + def __init__(self, dataframe: DataFrame): + # TODO: PandasDataFrame, PolarsDataFrame <: DataFrameWrapper + if isinstance(dataframe, DataFrameWrapper): + pandas, polars = dataframe.pd_df, dataframe.pl_df + elif isinstance(dataframe, pd.DataFrame): + pandas, polars = dataframe, None + else: + pandas, polars = None, dataframe self.pd_df = pandas self.pl_df = polars @@ -19,29 +23,35 @@ def __init__(self, pandas: Optional[pd.DataFrame] = None, polars: Optional[pl.Da self.dtypes = pandas.dtypes if pandas is not None else polars.schema # type: ignore self.empty = pandas.empty if pandas is not None else len(polars) == 0 # type: ignore + def len(self) -> int: + return len(self.pd_df if self.pd_df is not None else self.pl_df) + + def __len__(self) -> int: + return len(self.pd_df if self.pd_df is not None else self.pl_df) + def _update(self) -> None: self.column_names = list(self.pd_df.columns) if self.pd_df is not None else self.pl_df.columns # type: ignore self.dtypes = self.pd_df.dtypes if self.pd_df is not None else self.pl_df.schema # type: ignore self.empty = self.pd_df.empty if self.pd_df is not None else len(self.pl_df) == 0 # type: ignore - def get(self, column: str) -> Optional[Union[pd.Series, pl.Series]]: + def get(self, column: str) -> Optional[Series]: if self.pd_df is not None: return self.pd_df.get(column) return self.pl_df[column] if column in self.pl_df.schema else None # type: ignore def filter(self, filter: Any) -> Optional["DataFrameWrapper"]: if self.pd_df is not None: - return DataFrameWrapper(pandas=self.pd_df[filter]) + return DataFrameWrapper(self.pd_df[filter]) if self.pl_df is not None: - return DataFrameWrapper(polars=self.pl_df.filter(filter)) + return DataFrameWrapper(self.pl_df.filter(filter)) return None def query(self, query: str) -> Optional["DataFrameWrapper"]: if self.pd_df is not None: - return DataFrameWrapper(pandas=self.pd_df.query(query)) + return DataFrameWrapper(self.pd_df.query(query)) if self.pl_df is not None: ctx = pl.SQLContext(population=self.pl_df, eager=True) - return ctx.execute(query) + return DataFrameWrapper(ctx.execute(query)) return None def group_keys(self, columns: List[str]) -> List[Tuple[Any]]: @@ -109,9 +119,9 @@ def drop_columns(self, columns: List[str]) -> None: def __getitem__(self, key: str) -> "DataFrameWrapper": if self.pd_df is not None: - return DataFrameWrapper(pandas=pd.DataFrame(self.pd_df[key])) + return DataFrameWrapper(pd.DataFrame(self.pd_df[key])) elif self.pl_df is not None: - return DataFrameWrapper(polars=pl.DataFrame(self.pl_df[key])) + return DataFrameWrapper(pl.DataFrame(self.pl_df[key])) raise ValueError("Cannot index empty DataFrame") def __setitem__(self, key: str, value: Union[pd.Series, pl.Series]) -> None: @@ -125,14 +135,14 @@ def __setitem__(self, key: str, value: Union[pd.Series, pl.Series]) -> None: return raise ValueError("Cannot index empty DataFrame") - def apply_udf(self, udf: Callable) -> Union[pd.Series, pl.Series]: + def apply_udf(self, udf: Callable) -> Series: if self.pd_df is not None: return pd.Series(udf(self.pd_df)) elif self.pl_df is not None: return self.pl_df.map_rows(udf)["map"] raise ValueError("Cannot apply UDFs to empty DataFrame") - def apply_type_udf(self, udf: Callable) -> Union[pd.Series, pl.Series]: + def apply_type_udf(self, udf: Callable) -> Series: if self.pd_df is not None: return pd.Series(udf(self.pd_df[self.pd_df.columns[0]])) elif self.pl_df is not None: @@ -141,9 +151,9 @@ def apply_type_udf(self, udf: Callable) -> Union[pd.Series, pl.Series]: def apply_multicolumn_udf(self, udf: Callable) -> "DataFrameWrapper": if self.pd_df is not None: - return DataFrameWrapper(pandas=udf(self.pd_df)) + return DataFrameWrapper(udf(self.pd_df)) elif self.pl_df is not None: - return DataFrameWrapper(polars=udf(self.pl_df)) + return DataFrameWrapper(udf(self.pl_df)) raise ValueError("Cannot apply UDFs to empty DataFrame") def rename(self, columns: Dict[str, str]) -> None: diff --git a/python/whylogs/core/dataset_profile.py b/python/whylogs/core/dataset_profile.py index 85ba7e1d7b..ec9d1d2ed9 100644 --- a/python/whylogs/core/dataset_profile.py +++ b/python/whylogs/core/dataset_profile.py @@ -12,10 +12,10 @@ from whylogs.core.utils.utils import deprecated, deprecated_alias, ensure_timezone from .column_profile import ColumnProfile -from .dataframe_wrapper import DataFrameWrapper +from .dataframe_wrapper import DataFrame, DataFrameWrapper from .input_resolver import _dataframe_or_dict from .schema import DatasetSchema -from .stubs import pd, pl +from .stubs import pd from .view import DatasetProfileView logger = logging.getLogger(__name__) @@ -110,24 +110,21 @@ def track( obj: Any = None, *, pandas: Optional[pd.DataFrame] = None, - polars: Optional[pl.DataFrame] = None, - dataframe: Optional[DataFrameWrapper] = None, + dataframe: Optional[DataFrame] = None, row: Optional[Mapping[str, Any]] = None, execute_udfs: bool = True, ) -> None: - if dataframe is None: - dataframe, row = _dataframe_or_dict(obj, pandas, polars, row) + dataframe, row = _dataframe_or_dict(obj, dataframe if dataframe is not None else pandas, row) try: self._is_active = True self._track_count += 1 - self._do_track(obj, dataframe=dataframe, row=row, execute_udfs=execute_udfs) + self._do_track(dataframe=dataframe, row=row, execute_udfs=execute_udfs) finally: self._is_active = False def _do_track( self, - obj: Any = None, *, dataframe: Optional[DataFrameWrapper] = None, row: Optional[Mapping[str, Any]] = None, diff --git a/python/whylogs/core/input_resolver.py b/python/whylogs/core/input_resolver.py index 1a5be55887..b5c22a0270 100644 --- a/python/whylogs/core/input_resolver.py +++ b/python/whylogs/core/input_resolver.py @@ -1,35 +1,29 @@ from typing import Any, Dict, Mapping, Optional, Tuple -from whylogs.core.dataframe_wrapper import DataFrameWrapper +from whylogs.core.dataframe_wrapper import DataFrame, DataFrameWrapper from whylogs.core.stubs import pd, pl def _dataframe_or_dict( obj: Any, - pandas: Optional[pd.DataFrame] = None, - polars: Optional[pl.DataFrame] = None, + dataframe: Optional[DataFrame] = None, row: Optional[Mapping[str, Any]] = None, ) -> Tuple[Optional[DataFrameWrapper], Optional[Mapping[str, Any]]]: if obj is not None: - if pandas is not None: - raise ValueError("Cannot pass both obj and pandas params") - if polars is not None: - raise ValueError("Cannot pass both obj and polars params") + if dataframe is not None: + raise ValueError("Cannot pass both obj and dataframe params") if row is not None: raise ValueError("Cannot pass both obj and row params") if isinstance(obj, (dict, Dict, Mapping)): - row = obj - elif pd.DataFrame is not None and isinstance(obj, pd.DataFrame): - pandas = obj - elif pl.DataFrame is not None and isinstance(obj, pl.DataFrame): - polars = obj + return (None, obj) + elif isinstance(obj, DataFrameWrapper): + return (obj, None) + elif isinstance(obj, (pd.DataFrame, pl.DataFrame)): + return (DataFrameWrapper(obj), None) - if pandas is not None and row is not None: - raise ValueError("Cannot pass both pandas and row params") + if dataframe is not None and row is not None: + raise ValueError("Cannot pass both dataframe and row params") - if polars is not None and row is not None: - raise ValueError("Cannot pass both polars and row params") - - df = DataFrameWrapper(pandas, polars) if (pandas is not None or polars is not None) else None + df = DataFrameWrapper(dataframe) if dataframe is not None else None return (df, row) diff --git a/python/whylogs/core/schema.py b/python/whylogs/core/schema.py index cbb43a3bcc..1e3bd199db 100644 --- a/python/whylogs/core/schema.py +++ b/python/whylogs/core/schema.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List, Mapping, Optional, Set, Tuple, TypeVar, Union import whylogs.core.resolvers as res -from whylogs.core.dataframe_wrapper import DataFrameWrapper +from whylogs.core.dataframe_wrapper import DataFrame, DataFrameWrapper from whylogs.core.datatypes import StandardTypeMapper, TypeMapper from whylogs.core.metrics.metrics import Metric, MetricConfig from whylogs.core.resolvers import ( @@ -14,7 +14,7 @@ ResolverSpec, ) from whylogs.core.segmentation_partition import SegmentationPartition -from whylogs.core.stubs import pd, pl +from whylogs.core.stubs import pd from whylogs.core.validators.validator import Validator, deepcopy_validators logger = logging.getLogger(__name__) @@ -132,14 +132,13 @@ def resolve( self, *, pandas: Optional[pd.DataFrame] = None, - polars: Optional[pl.DataFrame] = None, - dataframe: Optional[DataFrameWrapper] = None, + dataframe: Optional[DataFrame] = None, row: Optional[Mapping[str, Any]] = None, ) -> bool: - if dataframe: - return self._resolve_dataframe(dataframe) - if pandas is not None or polars is not None: - return self._resolve_dataframe(DataFrameWrapper(pandas, polars)) + if dataframe is not None: + return self._resolve_dataframe(DataFrameWrapper(dataframe)) + if pandas is not None: + return self._resolve_dataframe(DataFrameWrapper(pandas)) if row is not None: for k, v in row.items(): diff --git a/python/whylogs/experimental/core/udf_schema.py b/python/whylogs/experimental/core/udf_schema.py index 06fdcdce27..7992b19696 100644 --- a/python/whylogs/experimental/core/udf_schema.py +++ b/python/whylogs/experimental/core/udf_schema.py @@ -16,7 +16,7 @@ Union, ) -from whylogs.core.dataframe_wrapper import DataFrameWrapper +from whylogs.core.dataframe_wrapper import DataFrame, DataFrameWrapper from whylogs.core.datatypes import DataType, StandardTypeMapper, TypeMapper from whylogs.core.metrics.metrics import Metric, MetricConfig from whylogs.core.resolvers import NO_FI_RESOLVER, MetricSpec, ResolverSpec @@ -246,11 +246,7 @@ def _run_udfs( ) -> Tuple[Optional[DataFrameWrapper], Optional[Dict[str, Any]]]: new_columns = deepcopy(row) if row else None if df: - new_df = ( - DataFrameWrapper(pandas=pd.DataFrame()) - if df.pd_df is not None - else DataFrameWrapper(polars=pl.DataFrame()) - ) + new_df = DataFrameWrapper(pd.DataFrame()) if df.pd_df is not None else DataFrameWrapper(pl.DataFrame()) else: new_df = None @@ -271,9 +267,13 @@ def apply_udfs( self, pandas: Optional[pd.DataFrame] = None, row: Optional[Dict[str, Any]] = None, - polars: Optional[pl.DataFrame] = None, - ) -> Tuple[Optional[Union[pd.DataFrame, pl.DataFrame]], Optional[Mapping[str, Any]]]: - df = DataFrameWrapper(pandas, polars) if (pandas is not None or polars is not None) else None + dataframe: Optional[DataFrame] = None, + ) -> Tuple[Optional[DataFrame], Optional[Mapping[str, Any]]]: + df = ( + DataFrameWrapper(dataframe if dataframe is not None else pandas) + if (pandas is not None or dataframe is not None) + else None + ) df, row = self._run_udfs(df, row) if df is not None: df = df.pd_df if df.pd_df is not None else df.pl_df From 5009755960b6e1fc61aa4f18da0be103665202fa Mon Sep 17 00:00:00 2001 From: Richard Rogers Date: Thu, 14 Nov 2024 00:31:52 +0000 Subject: [PATCH 35/41] pre-commit --- python/.pre-commit-config.yaml | 11 +------- python/tests/api/logger/test_logger_polars.py | 9 +++--- .../tests/api/logger/test_segments_polars.py | 8 ++++-- .../tests/core/metrics/test_metrics_polars.py | 28 +++++++++++-------- python/tests/core/test_performance_polars.py | 5 ++-- .../core/test_udf_schema_polars.py | 8 ++++-- python/whylogs/core/dataframe_wrapper.py | 2 +- python/whylogs/core/stubs.py | 4 +++ 8 files changed, 40 insertions(+), 35 deletions(-) diff --git a/python/.pre-commit-config.yaml b/python/.pre-commit-config.yaml index f3e7903452..c267119a8b 100644 --- a/python/.pre-commit-config.yaml +++ b/python/.pre-commit-config.yaml @@ -6,12 +6,10 @@ repos: hooks: - id: black exclude: python/whylogs/core/proto/|python/docs/|python/whylogs/viz/html/|java|python/whylogs/api/logger/experimental/logger - files: ^(python/whylogs/) - repo: https://github.com/pycqa/flake8 rev: 4.0.1 hooks: - id: flake8 - files: ^(python/whylogs/) args: - --max-line-length=160 - --exclude="""\.tox | @@ -32,24 +30,17 @@ repos: hooks: - id: isort args: [--filter-files] - files: ^(python/whylogs/) exclude: python/whylogs/core/proto/|python/docs/|python/whylogs/viz/html/|java|python/whylogs/api/logger/experimental/logger - repo: https://github.com/pre-commit/mirrors-mypy rev: v0.942 hooks: - id: mypy language: system - files: ^(python/whylogs/) - exclude: ^(python/.venv/lib/python3.8/site-packages/polars/ml/torch.py|python/tests/|python/examples/|python/examples/integration/|python/whylogs/core/proto/|python/docs/|python/whylogs/viz/html/|java|python/whylogs/api/logger/experimental/logger) - verbose: true - args: - - --exclude python/.venv - - -V + exclude: ^(python/tests/|python/examples/|python/examples/integration/|python/whylogs/core/proto/|python/docs/|python/whylogs/viz/html/|java|python/whylogs/api/logger/experimental/logger) - repo: https://github.com/pre-commit/mirrors-prettier rev: v2.5.1 hooks: - id: prettier - files: ^(python/whylogs/) exclude: python/tests/|python/whylogs/core/proto/|python/docs/|python/whylogs/viz/html/|java - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.1.0 diff --git a/python/tests/api/logger/test_logger_polars.py b/python/tests/api/logger/test_logger_polars.py index ce5c842d4c..2b64ee315d 100644 --- a/python/tests/api/logger/test_logger_polars.py +++ b/python/tests/api/logger/test_logger_polars.py @@ -1,25 +1,26 @@ import os +import sys import tempfile from typing import Any import numpy as np import pytest -import sys import whylogs as why from whylogs.api.logger import write from whylogs.api.logger.result_set import ResultSet, ResultSetReader from whylogs.core import ColumnProfileView, MetricConfig -from whylogs.core.errors import LoggingError from whylogs.core.metrics import StandardMetric from whylogs.core.resolvers import Resolver from whylogs.core.schema import DatasetSchema -from whylogs.core.stubs import pd, pl - +from whylogs.core.stubs import is_stub, pd, pl if sys.version_info < (3, 8): pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.8") +if is_stub(pl.DataFrame): + pytest.skip(allow_module_level=True, reason="Requires Polars") + FLOAT_TYPES = [float, np.float32, np.float64, np.float_] INTEGER_TYPES = [int, np.intc, np.uintc, np.int_, np.uint, np.longlong, np.ulonglong] diff --git a/python/tests/api/logger/test_segments_polars.py b/python/tests/api/logger/test_segments_polars.py index 7ffcae7550..52ad2279a0 100644 --- a/python/tests/api/logger/test_segments_polars.py +++ b/python/tests/api/logger/test_segments_polars.py @@ -1,6 +1,7 @@ import math import os import pickle +import sys import tempfile from glob import glob from logging import getLogger @@ -8,7 +9,6 @@ import numpy as np import pytest -import sys import whylogs as why from whylogs.api.logger.result_set import ( @@ -25,14 +25,16 @@ SegmentFilter, segment_on_column, ) -from whylogs.core.stubs import pl +from whylogs.core.stubs import is_stub, pl from whylogs.core.view.dataset_profile_view import DatasetProfileView from whylogs.migration.converters import read_v0_to_view - if sys.version_info < (3, 8): pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.8") +if is_stub(pl.DataFrame): + pytest.skip(allow_module_level=True, reason="Requires Polars") + TEST_LOGGER = getLogger(__name__) diff --git a/python/tests/core/metrics/test_metrics_polars.py b/python/tests/core/metrics/test_metrics_polars.py index bea371cb42..feb10bbcfd 100644 --- a/python/tests/core/metrics/test_metrics_polars.py +++ b/python/tests/core/metrics/test_metrics_polars.py @@ -1,14 +1,12 @@ +import sys from logging import getLogger import numpy as np import pytest -import sys import whylogs as why -import whylogs.core.configs as cfg from whylogs.core import ColumnProfileView, DatasetSchema -from whylogs.core.datatypes import AnyType, Integral -from whylogs.core.metrics.maths import VarianceM2Result, parallel_variance_m2 +from whylogs.core.datatypes import AnyType from whylogs.core.metrics import StandardMetric from whylogs.core.metrics.metrics import ( CardinalityMetric, @@ -16,14 +14,16 @@ MetricConfig, ) from whylogs.core.preprocessing import PreprocessedColumn -from whylogs.core.resolvers import MetricSpec, ResolverSpec, StandardResolver -from whylogs.core.schema import ColumnSchema, DeclarativeSchema -from whylogs.core.stubs import pd, pl - +from whylogs.core.resolvers import MetricSpec, ResolverSpec +from whylogs.core.schema import DeclarativeSchema +from whylogs.core.stubs import is_stub, pd, pl if sys.version_info < (3, 8): pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.8") +if is_stub(pl.DataFrame): + pytest.skip(allow_module_level=True, reason="Requires Polars") + TEST_LOGGER = getLogger(__name__) @@ -170,7 +170,9 @@ def test_frequent_items_handling_bool_as_string() -> None: met._BOOL_LIST_CHUNK_SIZE = 2 df = pl.DataFrame({"bool": [True, True, True, True, False]}) - schema = DeclarativeSchema([ResolverSpec(column_type=AnyType, metrics=[MetricSpec(StandardMetric.frequent_items.value)])]) + schema = DeclarativeSchema( + [ResolverSpec(column_type=AnyType, metrics=[MetricSpec(StandardMetric.frequent_items.value)])] + ) res = why.log(df, schema=schema).view().to_pandas()["frequent_items/frequent_strings"] assert res.array[0][0].value == "True" # type: ignore assert res.array[0][1].value == "False" # type: ignore @@ -216,7 +218,9 @@ def test_cardinality_metric_booleans_top_level_api() -> None: d = {col_name: [bool(i % 2) for i in range(input_rows)]} df = pl.DataFrame(d) - schema = DeclarativeSchema([ResolverSpec(column_type=AnyType, metrics=[MetricSpec(StandardMetric.cardinality.value)])]) + schema = DeclarativeSchema( + [ResolverSpec(column_type=AnyType, metrics=[MetricSpec(StandardMetric.cardinality.value)])] + ) results = why.log(df, schema=schema) col_prof = results.view().get_column(col_name) cardinality: CardinalityMetric = col_prof.get_metric("cardinality") @@ -226,7 +230,9 @@ def test_cardinality_metric_booleans_top_level_api() -> None: def test_cardinality_metric_booleans_all_false() -> None: df = pl.DataFrame({"b": [False for i in range(3)]}) - schema = DeclarativeSchema([ResolverSpec(column_type=AnyType, metrics=[MetricSpec(StandardMetric.cardinality.value)])]) + schema = DeclarativeSchema( + [ResolverSpec(column_type=AnyType, metrics=[MetricSpec(StandardMetric.cardinality.value)])] + ) col_prof = why.log(df, schema=schema).view().get_column("b") cardinality: CardinalityMetric = col_prof.get_metric("cardinality") assert cardinality.estimate == pytest.approx(1, 0.1) diff --git a/python/tests/core/test_performance_polars.py b/python/tests/core/test_performance_polars.py index d2eacf580a..b0b7715bee 100644 --- a/python/tests/core/test_performance_polars.py +++ b/python/tests/core/test_performance_polars.py @@ -1,6 +1,7 @@ import cProfile import pstats import random +import sys from dataclasses import dataclass, field from io import StringIO from logging import getLogger @@ -9,21 +10,19 @@ import numpy as np import pandas as pd import pytest -import sys import whylogs_sketching as ds # type: ignore import whylogs from whylogs.core import ColumnProfile, ColumnSchema from whylogs.core.dataset_profile import DatasetProfile from whylogs.core.metrics.metrics import MetricConfig -from whylogs.core.stubs import pl from whylogs.core.resolvers import ( HistogramCountingTrackingResolver, LimitedTrackingResolver, Resolver, StandardResolver, ) - +from whylogs.core.stubs import pl if sys.version_info < (3, 8): pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.8") diff --git a/python/tests/experimental/core/test_udf_schema_polars.py b/python/tests/experimental/core/test_udf_schema_polars.py index c7e7c0e2fe..3d89fe43b1 100644 --- a/python/tests/experimental/core/test_udf_schema_polars.py +++ b/python/tests/experimental/core/test_udf_schema_polars.py @@ -1,7 +1,7 @@ +import sys from typing import Any, Tuple import pytest -import sys import whylogs as why from whylogs.core.dataset_profile import DatasetProfile @@ -9,7 +9,7 @@ from whylogs.core.metrics import CardinalityMetric, DistributionMetric, StandardMetric from whylogs.core.resolvers import STANDARD_RESOLVER, MetricSpec, ResolverSpec from whylogs.core.segmentation_partition import segment_on_column -from whylogs.core.stubs import pl +from whylogs.core.stubs import is_stub, pl from whylogs.experimental.core.metrics.udf_metric import register_metric_udf from whylogs.experimental.core.udf_schema import ( UdfSchema, @@ -21,10 +21,12 @@ ) from whylogs.experimental.core.validators import condition_validator - if sys.version_info < (3, 8): pytest.skip(allow_module_level=True, reason="Polars requires Python >= 3.8") +if is_stub(pl.DataFrame): + pytest.skip(allow_module_level=True, reason="Requires Polars") + def test_udf_polars() -> None: schema = UdfSchema( diff --git a/python/whylogs/core/dataframe_wrapper.py b/python/whylogs/core/dataframe_wrapper.py index 214e2b1691..68a0e79e80 100644 --- a/python/whylogs/core/dataframe_wrapper.py +++ b/python/whylogs/core/dataframe_wrapper.py @@ -10,7 +10,7 @@ class DataFrameWrapper: def __init__(self, dataframe: DataFrame): # TODO: PandasDataFrame, PolarsDataFrame <: DataFrameWrapper if isinstance(dataframe, DataFrameWrapper): - pandas, polars = dataframe.pd_df, dataframe.pl_df + pandas, polars = dataframe.pd_df, dataframe.pl_df # type: ignore elif isinstance(dataframe, pd.DataFrame): pandas, polars = dataframe, None else: diff --git a/python/whylogs/core/stubs.py b/python/whylogs/core/stubs.py index f7949c844c..5ae3f34e63 100644 --- a/python/whylogs/core/stubs.py +++ b/python/whylogs/core/stubs.py @@ -102,6 +102,10 @@ def is_not_stub(stubbed_class: Any) -> bool: return False +def is_stub(stubbed_class: Any) -> bool: + return not is_not_stub(stubbed_class) + + if _np is None: _np = NumpyStub() From 2866ede9a78847fb04d11c3b4ef74b50aa6923ab Mon Sep 17 00:00:00 2001 From: Richard Rogers Date: Mon, 18 Nov 2024 04:59:22 +0000 Subject: [PATCH 36/41] pre-commit --- python/.pre-commit-config.yaml | 2 +- python/whylogs/api/whylabs/session/session.py | 2 +- .../core/constraints/metric_constraints.py | 20 +++++++++++-------- .../core/metrics/condition_count_metric.py | 2 +- python/whylogs/core/metrics/metrics.py | 2 +- python/whylogs/core/utils/protobuf_utils.py | 2 +- python/whylogs/core/validators/validator.py | 2 +- .../whylogs/core/view/dataset_profile_view.py | 2 +- 8 files changed, 19 insertions(+), 15 deletions(-) diff --git a/python/.pre-commit-config.yaml b/python/.pre-commit-config.yaml index c267119a8b..8294658557 100644 --- a/python/.pre-commit-config.yaml +++ b/python/.pre-commit-config.yaml @@ -35,7 +35,7 @@ repos: rev: v0.942 hooks: - id: mypy - language: system + language: python exclude: ^(python/tests/|python/examples/|python/examples/integration/|python/whylogs/core/proto/|python/docs/|python/whylogs/viz/html/|java|python/whylogs/api/logger/experimental/logger) - repo: https://github.com/pre-commit/mirrors-prettier rev: v2.5.1 diff --git a/python/whylogs/api/whylabs/session/session.py b/python/whylogs/api/whylabs/session/session.py index 5ff8bdb8ca..566e7d7e5a 100644 --- a/python/whylogs/api/whylabs/session/session.py +++ b/python/whylogs/api/whylabs/session/session.py @@ -7,7 +7,7 @@ from functools import partial from typing import Any, Dict, List, Optional, Set, Union, cast -import requests as web_requests +import requests as web_requests # type: ignore from whylabs_client import ApiException # type: ignore from whylabs_client.api.log_api import ( # type: ignore GetProfileObservatoryLinkRequest, diff --git a/python/whylogs/core/constraints/metric_constraints.py b/python/whylogs/core/constraints/metric_constraints.py index 9baa12c193..cd41d9621d 100644 --- a/python/whylogs/core/constraints/metric_constraints.py +++ b/python/whylogs/core/constraints/metric_constraints.py @@ -217,7 +217,7 @@ class DatasetComparisonConstraint: def validate_profile( self, dataset_profile: DatasetProfileView, reference_profile: DatasetProfileView ) -> Tuple[bool, Optional[Dict[str, Any]]]: - validate_result, summary = self.condition(dataset_profile, reference_profile) + (validate_result, summary) = self.condition(dataset_profile, reference_profile) # type: ignore return (validate_result, summary) @@ -245,7 +245,7 @@ def _get_metric_summary(self, metrics: Dict[str, Metric]) -> Optional[Dict[str, def validate_profile(self, dataset_profile: DatasetProfileView) -> Tuple[bool, Optional[Dict[str, Any]]]: try: - validate_result, metrics = self.condition(dataset_profile) + (validate_result, metrics) = self.condition(dataset_profile) # type: ignore except MissingMetric as e: if self.require_column_existence: logger.info(f"validate_profile could not get metric {str(e)} so returning False.") @@ -522,11 +522,15 @@ def validate(self, profile_view: Optional[DatasetProfileView] = None) -> bool: logger.info(f"{constraint_name} failed on column {column_name}") return False - for constraint in self.dataset_constraints + self.dataset_comparison_constraints: - if isinstance(constraint, DatasetConstraint): - (result, _) = constraint.validate_profile(profile) - elif isinstance(constraint, DatasetComparisonConstraint): - (result, _) = constraint.validate_profile(profile, self.reference_profile_view) + constraint: Union[DatasetConstraint, DatasetComparisonConstraint] + for constraint in self.dataset_constraints: + (result, _) = constraint.validate_profile(profile) + if not result: + logger.info(f"{constraint.name} failed on dataset") + return False + + for constraint in self.dataset_comparison_constraints: + (result, _) = constraint.validate_profile(profile, self.reference_profile_view) if not result: logger.info(f"{constraint.name} failed on dataset") return False @@ -605,7 +609,7 @@ def generate_constraints_report( results.append(metric_report) - for constraint in self.dataset_constraints + self.dataset_comparison_constraints: + for constraint in self.dataset_constraints + self.dataset_comparison_constraints: # type: ignore metric_report = self._generate_dataset_report( profile_view=profile, constraint=constraint, diff --git a/python/whylogs/core/metrics/condition_count_metric.py b/python/whylogs/core/metrics/condition_count_metric.py index 8d3bf49a12..61f4c8d684 100644 --- a/python/whylogs/core/metrics/condition_count_metric.py +++ b/python/whylogs/core/metrics/condition_count_metric.py @@ -200,7 +200,7 @@ def columnar_update(self, data: PreprocessedColumn) -> OperationResult: count += 1 for cond_name, condition in self.conditions.items(): try: - if condition.relation(datum): + if condition.relation(datum): # type: ignore self.matches[cond_name].set(self.matches[cond_name].value + 1) else: if condition.log_on_failure: diff --git a/python/whylogs/core/metrics/metrics.py b/python/whylogs/core/metrics/metrics.py index 2d103c53d3..b4343bde0e 100644 --- a/python/whylogs/core/metrics/metrics.py +++ b/python/whylogs/core/metrics/metrics.py @@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union import whylogs_sketching as ds # type: ignore -from google.protobuf.struct_pb2 import Struct +from google.protobuf.struct_pb2 import Struct # type: ignore import whylogs.core.configs as conf from whylogs.core.metrics.maths import ( diff --git a/python/whylogs/core/utils/protobuf_utils.py b/python/whylogs/core/utils/protobuf_utils.py index a3745c52da..7f92498e23 100644 --- a/python/whylogs/core/utils/protobuf_utils.py +++ b/python/whylogs/core/utils/protobuf_utils.py @@ -6,7 +6,7 @@ from logging import getLogger from typing import IO, Type, TypeVar -from google.protobuf.message import Message +from google.protobuf.message import Message # type: ignore from whylogs.core.errors import DeserializationError diff --git a/python/whylogs/core/validators/validator.py b/python/whylogs/core/validators/validator.py index 06a71cd7c9..87a33408b5 100644 --- a/python/whylogs/core/validators/validator.py +++ b/python/whylogs/core/validators/validator.py @@ -6,7 +6,7 @@ from whylogs.core.metrics.condition_count_metric import Condition -@dataclass +@dataclass # type: ignore class Validator(ABC): name: str conditions: Dict[str, Union[Condition, Callable[[Any], bool]]] diff --git a/python/whylogs/core/view/dataset_profile_view.py b/python/whylogs/core/view/dataset_profile_view.py index 686e6f75f0..f3f1a358cc 100644 --- a/python/whylogs/core/view/dataset_profile_view.py +++ b/python/whylogs/core/view/dataset_profile_view.py @@ -6,7 +6,7 @@ from enum import Enum from typing import Any, BinaryIO, Dict, List, Optional, Tuple, Union -from google.protobuf.message import DecodeError +from google.protobuf.message import DecodeError # type: ignore from whylogs.api.writer.writer import _Writable from whylogs.core.configs import SummaryConfig From f2900b1175c3797b04970fc2b2330e21a320f140 Mon Sep 17 00:00:00 2001 From: Richard Rogers Date: Tue, 10 Dec 2024 22:53:23 +0000 Subject: [PATCH 37/41] move setup dependency --- python/pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 67477f49df..de3fbce206 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -20,10 +20,6 @@ whylabs-client = "^0.6.15" requests = "^2.27" backoff = "^2.2.1" platformdirs = "^3.5.0" -setuptools = [ - { version = "<58", python = "<3.8", optional = false }, - { version = ">=75", python = ">=3.8", optional = false } -] # viz module. Everything after this should be optional pybars3 = { version = "^0.9", optional = true } @@ -201,6 +197,10 @@ pyright = "^1.1.383" httpretty = { version = "^1.1.4", python = ">=3.6,<3.10"} ruff = "^0.4.4" pytest-xdist = {version = "^3.6.1", python = ">=3.8,<4"} +setuptools = [ + { version = "<58", python = "<3.8", optional = false }, + { version = ">=75", python = ">=3.8", optional = false } +] [build-system] requires = ["poetry-core>=1.0.0"] From 6cf62bbc72c833fb032f609eb9d9df751cd5ae37 Mon Sep 17 00:00:00 2001 From: Richard Rogers Date: Wed, 11 Dec 2024 02:46:37 +0000 Subject: [PATCH 38/41] use pandas stubs --- .../api/logger/experimental/logger/actor/data_logger.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/whylogs/api/logger/experimental/logger/actor/data_logger.py b/python/whylogs/api/logger/experimental/logger/actor/data_logger.py index dc8b923816..65ee6db63c 100644 --- a/python/whylogs/api/logger/experimental/logger/actor/data_logger.py +++ b/python/whylogs/api/logger/experimental/logger/actor/data_logger.py @@ -1,11 +1,7 @@ from abc import abstractmethod from typing import Any, Dict, Generic, List, Optional, TypeVar, Union -# TODO: stubs? -try: - import pandas as pd # type: ignore -except ImportError: - pd: Any = None # type: ignore +from whylogs.core.stubs import pd # TODO add strong typing here. whylogs takes pretty much anything in log() so we don't actually From 190b098eaa0dc4b28b5b8aba41adb322f77d678b Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Wed, 11 Dec 2024 02:51:56 +0000 Subject: [PATCH 39/41] Update python/whylogs/core/dataframe_wrapper.py --- python/whylogs/core/dataframe_wrapper.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/whylogs/core/dataframe_wrapper.py b/python/whylogs/core/dataframe_wrapper.py index 68a0e79e80..342a63e782 100644 --- a/python/whylogs/core/dataframe_wrapper.py +++ b/python/whylogs/core/dataframe_wrapper.py @@ -67,8 +67,6 @@ def groupby( if self.pd_df is not None: grouped = self.pd_df.groupby(columns) return grouped - d = {g: grouped.get_group(g) for g in grouped.groups.keys()} - return d elif self.pl_df is not None: return self.pl_df.group_by(columns) From 3ec799e0d325633cc0b0794864f21525efe11bb9 Mon Sep 17 00:00:00 2001 From: richard-rogers <93153899+richard-rogers@users.noreply.github.com> Date: Wed, 11 Dec 2024 03:03:41 +0000 Subject: [PATCH 40/41] Update python/whylogs/core/datatypes.py --- python/whylogs/core/datatypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/whylogs/core/datatypes.py b/python/whylogs/core/datatypes.py index 5755be771a..643f9e390b 100644 --- a/python/whylogs/core/datatypes.py +++ b/python/whylogs/core/datatypes.py @@ -57,7 +57,7 @@ def _do_match(cls, dtype_or_type: Any, maybe_type: Optional[Any]) -> bool: if not isinstance(dtype_or_type, type): return False - if issubclass(dtype_or_type, (bool, int, np.number, np.bool_, pl.datatypes.IntegerType)): + if issubclass(dtype_or_type, (bool, int, np.number, np.bool_)): if is_not_stub(np.issubdtype) and np.issubdtype(dtype_or_type, np.floating): return False if issubclass(dtype_or_type, (np.datetime64, np.timedelta64)): From dad1d17959e28b8ca4aa3f5d2a0b450435b6635e Mon Sep 17 00:00:00 2001 From: Richard Rogers Date: Wed, 11 Dec 2024 07:38:52 +0000 Subject: [PATCH 41/41] dont use stubs --- .../api/logger/experimental/logger/actor/data_logger.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/whylogs/api/logger/experimental/logger/actor/data_logger.py b/python/whylogs/api/logger/experimental/logger/actor/data_logger.py index 65ee6db63c..10ccf20f39 100644 --- a/python/whylogs/api/logger/experimental/logger/actor/data_logger.py +++ b/python/whylogs/api/logger/experimental/logger/actor/data_logger.py @@ -1,7 +1,10 @@ from abc import abstractmethod from typing import Any, Dict, Generic, List, Optional, TypeVar, Union -from whylogs.core.stubs import pd +try: + import pandas as pd +except ImportError: + pd: Any = None # type: ignore # TODO add strong typing here. whylogs takes pretty much anything in log() so we don't actually