From 6ab5a909cecd139c96dad81254c98b623aeed92e Mon Sep 17 00:00:00 2001
From: James Weakley
Date: Wed, 13 Jan 2021 22:05:39 +1100
Subject: [PATCH] QuantileTransformer working on BigQuery, clarify that it's untested on Redshift

---
Reviewer notes (this section is ignored when the patch is applied):
  bigquery__quantile_transformer brackets every non-null value x between the
  nearest quantile value strictly below it (x1, at quantile y1) and the nearest
  quantile value at or above it (x2, at quantile y2), then interpolates linearly
  between the two. A row with no quantile value below it gets NULL for x1/y1,
  so its interpolation is NULL and the final coalesce maps it to 0.

 README.md                                     |  2 +-
 dbt_project.yml                               |  2 +-
 .../quantile_transformer_model_macro.sql      | 12 +++++-
 macros/quantile_transformer.sql               | 37 +++++++++++++++++--
 4 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 45e4b1d..2ea3d21 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ The macros are:
 | [MinMaxScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html#sklearn.preprocessing.MinMaxScaler) | min_max_scaler | Y | Y | Y | [![example](images/min_max_scaler.png)](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#minmaxscaler) |
 | [Normalizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html#sklearn.preprocessing.Normalizer) | normalizer | Y | Y | Y | [![example](images/normalizer.png)](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#normalizer) |
 | [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder) | one_hot_encoder | Y | Y | Y | ![example](images/one_hot_encoder.gif) |
-| [QuantileTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html#sklearn.preprocessing.QuantileTransformer) | quantile_transformer | Y | N | Y | [![example](images/quantile_transformer.png)](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#quantiletransformer-uniform-output) |
+| [QuantileTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html#sklearn.preprocessing.QuantileTransformer) | quantile_transformer | Y | Y | N | [![example](images/quantile_transformer.png)](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#quantiletransformer-uniform-output) |
 | [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler) | robust_scaler | Y | Y | Y | [![example](images/robust_scaler.png)](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#robustscaler) |
 | [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler) | standard_scaler | Y | Y | Y | [![example](images/standard_scaler.png)](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#standardscaler) |
 
diff --git a/dbt_project.yml b/dbt_project.yml
index 1a55b5d..91b6017 100644
--- a/dbt_project.yml
+++ b/dbt_project.yml
@@ -1,5 +1,5 @@
 name: 'dbt_ml_preprocessing'
-version: '0.3.0'
+version: '0.4.0'
 
 require-dbt-version: ">=0.15.1"
 
diff --git a/integration_tests/macros/quantile_transformer_model_macro.sql b/integration_tests/macros/quantile_transformer_model_macro.sql
index b846545..19e5220 100644
--- a/integration_tests/macros/quantile_transformer_model_macro.sql
+++ b/integration_tests/macros/quantile_transformer_model_macro.sql
@@ -8,7 +8,17 @@ with data as (
 select * from data
 {% endmacro %}
 
+-- BigQuery is supported, so run the real transformer against the test data
+{% macro bigquery__quantile_transformer_model_macro() %}
+with data as (
+
+    {{ dbt_ml_preprocessing.quantile_transformer( ref('data_quantile_transformer') ,'col_to_transform') }}
+
+)
+select * from data
+{% endmacro %}
+
 -- other adapters we generate an empty test result to force a test pass
 {% macro default__quantile_transformer_model_macro() %}
 select 1 as empty_result from (select 1) where 1=2
-{% endmacro %}
\ No newline at end of file
+{% endmacro %}
diff --git a/macros/quantile_transformer.sql b/macros/quantile_transformer.sql
index cb4074c..f1b33ae 100644
--- a/macros/quantile_transformer.sql
+++ b/macros/quantile_transformer.sql
@@ -33,11 +33,42 @@ coalesce(y1 + ((x-x1)/(x2-x1)) * (y2-y1),0) as {{ source_column }}_transformed
 from linear_interpolation_variables
 {% endmacro %}
 
+{% macro bigquery__quantile_transformer(source_table,source_column,n_quantiles,output_distribution,subsample,include_columns) %}
+with quantile_values as(
+  {% for quartile_index in range(n_quantiles) %}
+    {% set quartile = quartile_index / (n_quantiles-1) %}
+    select distinct {{ quartile }} as quantile,percentile_cont({{ source_column }},{{ quartile }}) OVER() as quantile_value from {{ source_table }}
+    {% if not loop.last %} union all {% endif %}
+  {% endfor %}
+),
+-- fold all quantiles and quantile values into a single row, an array of structs that we can safely cross join on
+quantile_values_array as(
+select ARRAY_AGG(struct (quantile, quantile_value)) as quantile_values from quantile_values
+),
+-- prepare to apply linear interpolation formula
+linear_interpolation_variables as(
+  select
+    {{include_columns}},
+    {{ source_column }} as x,
+    (select max(b.quantile) from UNNEST(quantile_values) b where b.quantile_value<a.{{ source_column }}) as y1,
+    (select min(b.quantile) from UNNEST(quantile_values) b where b.quantile_value>=a.{{ source_column }}) as y2,
+    (select max(b.quantile_value) from UNNEST(quantile_values) b where b.quantile_value<a.{{ source_column }}) as x1,
+    (select min(b.quantile_value) from UNNEST(quantile_values) b where b.quantile_value>=a.{{ source_column }}) as x2
+  from {{ source_table }} a,
+  quantile_values_array
+  where {{ source_column }} is not null
+  order by {{ source_column }}
+)
+select
+{{include_columns}},
+coalesce(y1 + ((x-x1)/(x2-x1)) * (y2-y1),0) as {{ source_column }}_transformed
+from linear_interpolation_variables
+{% endmacro %}
+
 {% macro default__quantile_transformer(source_table,source_column,n_quantiles,output_distribution,subsample,include_columns) %}
 
 {% set error_message %}
-The `quantile_transformer` macro is only supported on Snowflake at this time. It should work on other DBs, it just requires some rework.
+The `quantile_transformer` macro is only supported on Snowflake and BigQuery at this time. It should work on other DBs, it just requires some rework.
 {% endset %}
 {%- do exceptions.raise_compiler_error(error_message) -%}
-
-{% endmacro %}
+{% endmacro %}
\ No newline at end of file