From eb80c723b521ae3b06ccb4889b39b0f85d314ede Mon Sep 17 00:00:00 2001 From: Alex Barros Date: Mon, 27 Mar 2023 08:28:21 -0300 Subject: [PATCH] fix: remove computation from missing data plots (#1294) --- .../model/pandas/missing_pandas.py | 28 ++++++++- .../model/spark/missing_spark.py | 28 +++++++-- src/ydata_profiling/visualisation/missing.py | 63 ++++++++++++------- src/ydata_profiling/visualisation/plot.py | 51 +++++++-------- 4 files changed, 109 insertions(+), 61 deletions(-) diff --git a/src/ydata_profiling/model/pandas/missing_pandas.py b/src/ydata_profiling/model/pandas/missing_pandas.py index e9c3a40e0..d2a1303f9 100644 --- a/src/ydata_profiling/model/pandas/missing_pandas.py +++ b/src/ydata_profiling/model/pandas/missing_pandas.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd from ydata_profiling.config import Settings @@ -11,14 +12,35 @@ @missing_bar.register def pandas_missing_bar(config: Settings, df: pd.DataFrame) -> str: - return plot_missing_bar(config, df) + notnull_counts = len(df) - df.isnull().sum() + return plot_missing_bar( + config, + notnull_counts=notnull_counts, + nrows=len(df), + columns=list(df.columns), + ) @missing_matrix.register def pandas_missing_matrix(config: Settings, df: pd.DataFrame) -> str: - return plot_missing_matrix(config, df) + return plot_missing_matrix( + config, + columns=list(df.columns), + notnull=df.notnull().values, + nrows=len(df), + ) @missing_heatmap.register def pandas_missing_heatmap(config: Settings, df: pd.DataFrame) -> str: - return plot_missing_heatmap(config, df) + # Remove completely filled or completely empty variables. + columns = [i for i, n in enumerate(np.var(df.isnull(), axis="rows")) if n > 0] + df = df.iloc[:, columns] + + # Create and mask the correlation matrix. Construct the base heatmap. 
+ corr_mat = df.isnull().corr() + mask = np.zeros_like(corr_mat) + mask[np.triu_indices_from(mask)] = True + return plot_missing_heatmap( + config, corr_mat=corr_mat, mask=mask, columns=list(df.columns) + ) diff --git a/src/ydata_profiling/model/spark/missing_spark.py b/src/ydata_profiling/model/spark/missing_spark.py index 4c268e6a3..a8c46a5a6 100644 --- a/src/ydata_profiling/model/spark/missing_spark.py +++ b/src/ydata_profiling/model/spark/missing_spark.py @@ -1,5 +1,6 @@ from typing import Any, List, Optional +import numpy as np from pyspark.sql import DataFrame from ydata_profiling.config import Settings @@ -67,18 +68,33 @@ def spark_missing_bar(config: Settings, df: DataFrame) -> str: ) return plot_missing_bar( - config, - MissingnoBarSparkPatch( - df=data_nan_counts, columns=df.columns, original_df_size=df.count() - ), + config, notnull_counts=data_nan_counts, columns=df.columns, nrows=df.count() ) @missing_matrix.register def spark_missing_matrix(config: Settings, df: DataFrame) -> str: - return plot_missing_matrix(config, MissingnoBarSparkPatch(df)) + df = MissingnoBarSparkPatch(df, columns=df.columns, original_df_size=df.count()) + return plot_missing_matrix( + config, + columns=df.columns, + notnull=df.notnull().values, + nrows=len(df), + ) @missing_heatmap.register def spark_missing_heatmap(config: Settings, df: DataFrame) -> str: - return plot_missing_heatmap(config, MissingnoBarSparkPatch(df)) + df = MissingnoBarSparkPatch(df, columns=df.columns, original_df_size=df.count()) + + # Remove completely filled or completely empty variables. + columns = [i for i, n in enumerate(np.var(df.isnull(), axis="rows")) if n > 0] + df = df.iloc[:, columns] + + # Create and mask the correlation matrix. Construct the base heatmap. 
+ corr_mat = df.isnull().corr() + mask = np.zeros_like(corr_mat) + mask[np.triu_indices_from(mask)] = True + return plot_missing_heatmap( + config, corr_mat=corr_mat, mask=mask, columns=list(df.columns) + ) diff --git a/src/ydata_profiling/visualisation/missing.py b/src/ydata_profiling/visualisation/missing.py index 7632b376d..9dd1068d0 100644 --- a/src/ydata_profiling/visualisation/missing.py +++ b/src/ydata_profiling/visualisation/missing.py @@ -1,5 +1,6 @@ """Plotting functions for the missing values diagrams""" -import pandas as pd +from typing import Any, List + from matplotlib import pyplot as plt from ydata_profiling.config import Settings @@ -12,22 +13,22 @@ from ydata_profiling.visualisation.utils import hex_to_rgb, plot_360_n0sc0pe -def get_font_size(data: pd.DataFrame) -> float: +def get_font_size(columns: List[str]) -> float: """Calculate font size based on number of columns Args: - data: DataFrame + columns: List of column names. Returns: Font size for missing values plots. """ - max_label_length = max(len(label) for label in data.columns) + max_label_length = max(len(label) for label in columns) - if len(data.columns) < 20: + if len(columns) < 20: font_size = 13.0 - elif 20 <= len(data.columns) < 40: + elif 20 <= len(columns) < 40: font_size = 12.0 - elif 40 <= len(data.columns) < 60: + elif 40 <= len(columns) < 60: font_size = 10.0 else: font_size = 8.0 @@ -37,21 +38,27 @@ def get_font_size(data: pd.DataFrame) -> float: @manage_matplotlib_context() -def plot_missing_matrix(config: Settings, data: pd.DataFrame) -> str: +def plot_missing_matrix( + config: Settings, notnull: Any, columns: List[str], nrows: int +) -> str: """Generate missing values matrix plot Args: config: report Settings object - data: Pandas DataFrame to generate missing values matrix from. + notnull: Missing data indicator matrix. + columns: List of column names. + nrows: Number of rows in the dataframe. Returns: The resulting missing values matrix encoded as a string. 
""" missing_matrix( - data, + notnull=notnull, + height=nrows, + columns=columns, figsize=(10, 4), - fontsize=get_font_size(data) / 20 * 16, + fontsize=get_font_size(columns) / 20 * 16, color=hex_to_rgb(config.html.style.primary_colors[0]), labels=config.plot.missing.force_labels, ) @@ -60,20 +67,25 @@ def plot_missing_matrix(config: Settings, data: pd.DataFrame) -> str: @manage_matplotlib_context() -def plot_missing_bar(config: Settings, data: pd.DataFrame) -> str: +def plot_missing_bar( + config: Settings, notnull_counts: list, nrows: int, columns: List[str] +) -> str: """Generate missing values bar plot. Args: config: report Settings object - data: Pandas DataFrame to generate missing values bar plot from. + notnull_counts: Number of nonnull values per column. + nrows: Number of rows in the dataframe. + columns: List of column names. Returns: The resulting missing values bar plot encoded as a string. """ missing_bar( - data, + notnull_counts=notnull_counts, + nrows=nrows, figsize=(10, 5), - fontsize=get_font_size(data), + fontsize=get_font_size(columns), color=hex_to_rgb(config.html.style.primary_colors[0]), labels=config.plot.missing.force_labels, ) @@ -85,35 +97,40 @@ def plot_missing_bar(config: Settings, data: pd.DataFrame) -> str: @manage_matplotlib_context() -def plot_missing_heatmap(config: Settings, data: pd.DataFrame) -> str: +def plot_missing_heatmap( + config: Settings, corr_mat: Any, mask: Any, columns: List[str] +) -> str: """Generate missing values heatmap plot. Args: config: report Settings object - data: Pandas DataFrame to generate missing values heatmap plot from. + corr_mat: Correlation matrix. + mask: Upper-triangle mask. + columns: List of column names. Returns: The resulting missing values heatmap plot encoded as a string. 
""" height = 4 - if len(data.columns) > 10: - height += int((len(data.columns) - 10) / 5) + if len(columns) > 10: + height += int((len(columns) - 10) / 5) height = min(height, 10) - font_size = get_font_size(data) - if len(data.columns) > 40: + font_size = get_font_size(columns) + if len(columns) > 40: font_size /= 1.4 missing_heatmap( - data, + corr_mat=corr_mat, + mask=mask, figsize=(10, height), fontsize=font_size, cmap=config.plot.missing.cmap, labels=config.plot.missing.force_labels, ) - if len(data.columns) > 40: + if len(columns) > 40: plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.3) else: plt.subplots_adjust(left=0.2, right=0.9, top=0.8, bottom=0.3) diff --git a/src/ydata_profiling/visualisation/plot.py b/src/ydata_profiling/visualisation/plot.py index d93f4436f..fd8764b38 100644 --- a/src/ydata_profiling/visualisation/plot.py +++ b/src/ydata_profiling/visualisation/plot.py @@ -761,7 +761,8 @@ def _set_visibility( def missing_bar( - data: pd.DataFrame, + notnull_counts: pd.Series, + nrows: int, figsize: Tuple[float, float] = (25, 10), fontsize: float = 16, labels: bool = True, @@ -774,7 +775,8 @@ def missing_bar( Inspired by https://github.com/ResidentMario/missingno Args: - data: The input DataFrame. + notnull_counts: Number of nonnull values per column. + nrows: Number of rows in the dataframe. figsize: The size of the figure to display. fontsize: The figure's font size. This default to 16. labels: Whether or not to display the column names. Would need to be turned off on particularly large @@ -784,12 +786,10 @@ def missing_bar( Returns: The plot axis. 
""" - null_counts = len(data) - data.isnull().sum() - values = null_counts.values - null_counts = null_counts / len(data) + percentage = notnull_counts / nrows - if len(values) <= 50: - ax0 = null_counts.plot.bar(figsize=figsize, fontsize=fontsize, color=color) + if len(notnull_counts) <= 50: + ax0 = percentage.plot.bar(figsize=figsize, fontsize=fontsize, color=color) ax0.set_xticklabels( ax0.get_xticklabels(), ha="right", @@ -801,17 +801,17 @@ def missing_bar( ax1.set_xticks(ax0.get_xticks()) ax1.set_xlim(ax0.get_xlim()) ax1.set_xticklabels( - values, ha="left", fontsize=fontsize, rotation=label_rotation + notnull_counts, ha="left", fontsize=fontsize, rotation=label_rotation ) else: - ax0 = null_counts.plot.barh(figsize=figsize, fontsize=fontsize, color=color) + ax0 = percentage.plot.barh(figsize=figsize, fontsize=fontsize, color=color) ylabels = ax0.get_yticklabels() if labels else [] ax0.set_yticklabels(ylabels, fontsize=fontsize) ax1 = ax0.twinx() ax1.set_yticks(ax0.get_yticks()) ax1.set_ylim(ax0.get_ylim()) - ax1.set_yticklabels(values, fontsize=fontsize) + ax1.set_yticklabels(notnull_counts, fontsize=fontsize) for ax in [ax0, ax1]: ax = _set_visibility(ax) @@ -820,7 +820,9 @@ def missing_bar( def missing_matrix( - data: pd.DataFrame, + notnull: Any, + columns: List[str], + height: int, figsize: Tuple[float, float] = (25, 10), color: Tuple[float, ...] = (0.41, 0.41, 0.41), fontsize: float = 16, @@ -833,7 +835,9 @@ def missing_matrix( Inspired by https://github.com/ResidentMario/missingno Args: - data: The input DataFrame. + notnull: Missing data indicator matrix. + columns: List of column names. + height: Number of rows in the dataframe. figsize: The size of the figure to display. fontsize: The figure's font size. Default to 16. labels: Whether or not to display the column names when there is more than 50 columns. @@ -842,9 +846,7 @@ def missing_matrix( Returns: The plot axis. 
""" - height, width = data.shape - - notnull = data.notnull().values + width = len(columns) missing_grid = np.zeros((height, width, 3), dtype=np.float32) missing_grid[notnull] = color @@ -860,9 +862,7 @@ def missing_matrix( ha = "left" ax.set_xticks(list(range(0, width))) - ax.set_xticklabels( - list(data.columns), rotation=label_rotation, ha=ha, fontsize=fontsize - ) + ax.set_xticklabels(columns, rotation=label_rotation, ha=ha, fontsize=fontsize) ax.set_yticks([0, height - 1]) ax.set_yticklabels([1, height], fontsize=fontsize) @@ -878,7 +878,8 @@ def missing_matrix( def missing_heatmap( - data: pd.DataFrame, + corr_mat: Any, + mask: Any, figsize: Tuple[float, float] = (20, 12), fontsize: float = 16, labels: bool = True, @@ -895,7 +896,8 @@ def missing_heatmap( Inspired by https://github.com/ResidentMario/missingno Args: - data: The input DataFrame. + corr_mat: correlation matrix. + mask: Upper-triangle mask. figsize: The size of the figure to display. Defaults to (20, 12). fontsize: The figure's font size. labels: Whether or not to label each matrix entry with its correlation (default is True). @@ -906,15 +908,6 @@ def missing_heatmap( The plot axis. """ _, ax = plt.subplots(1, 1, figsize=figsize) - - # Remove completely filled or completely empty variables. - columns = [i for i, n in enumerate(np.var(data.isnull(), axis="rows")) if n > 0] - data = data.iloc[:, columns] - - # Create and mask the correlation matrix. Construct the base heatmap. - corr_mat = data.isnull().corr() - mask = np.zeros_like(corr_mat) - mask[np.triu_indices_from(mask)] = True norm_args = {"vmin": -1, "vmax": 1} if normalized_cmap else {} if labels: