From 7ce9068abc6a245b6ed9a94b87859a8075e1510c Mon Sep 17 00:00:00 2001 From: Adam <41971533+jcadam14@users.noreply.github.com> Date: Fri, 30 Aug 2024 11:09:03 -0600 Subject: [PATCH] Removed memory resource testing code --- poetry.lock | 31 +------------------ pyproject.toml | 1 - src/regtech_data_validator/cli.py | 16 ++-------- src/regtech_data_validator/data_formatters.py | 26 +++++----------- 4 files changed, 12 insertions(+), 62 deletions(-) diff --git a/poetry.lock b/poetry.lock index a6c0629..bdd0b3c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1691,35 +1691,6 @@ timezone = ["backports-zoneinfo", "tzdata"] xlsx2csv = ["xlsx2csv (>=0.8.0)"] xlsxwriter = ["xlsxwriter"] -[[package]] -name = "psutil" -version = "6.0.0" -description = "Cross-platform lib for process and system monitoring in Python." -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" -files = [ - {file = "psutil-6.0.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a021da3e881cd935e64a3d0a20983bda0bb4cf80e4f74fa9bfcb1bc5785360c6"}, - {file = "psutil-6.0.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:1287c2b95f1c0a364d23bc6f2ea2365a8d4d9b726a3be7294296ff7ba97c17f0"}, - {file = "psutil-6.0.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:a9a3dbfb4de4f18174528d87cc352d1f788b7496991cca33c6996f40c9e3c92c"}, - {file = "psutil-6.0.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:6ec7588fb3ddaec7344a825afe298db83fe01bfaaab39155fa84cf1c0d6b13c3"}, - {file = "psutil-6.0.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:1e7c870afcb7d91fdea2b37c24aeb08f98b6d67257a5cb0a8bc3ac68d0f1a68c"}, - {file = "psutil-6.0.0-cp27-none-win32.whl", hash = "sha256:02b69001f44cc73c1c5279d02b30a817e339ceb258ad75997325e0e6169d8b35"}, - {file = "psutil-6.0.0-cp27-none-win_amd64.whl", hash = "sha256:21f1fb635deccd510f69f485b87433460a603919b45e2a324ad65b0cc74f8fb1"}, - {file = "psutil-6.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:c588a7e9b1173b6e866756dde596fd4cad94f9399daf99ad8c3258b3cb2b47a0"}, - {file = "psutil-6.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ed2440ada7ef7d0d608f20ad89a04ec47d2d3ab7190896cd62ca5fc4fe08bf0"}, - {file = "psutil-6.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fd9a97c8e94059b0ef54a7d4baf13b405011176c3b6ff257c247cae0d560ecd"}, - {file = "psutil-6.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2e8d0054fc88153ca0544f5c4d554d42e33df2e009c4ff42284ac9ebdef4132"}, - {file = "psutil-6.0.0-cp36-cp36m-win32.whl", hash = "sha256:fc8c9510cde0146432bbdb433322861ee8c3efbf8589865c8bf8d21cb30c4d14"}, - {file = "psutil-6.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:34859b8d8f423b86e4385ff3665d3f4d94be3cdf48221fbe476e883514fdb71c"}, - {file = "psutil-6.0.0-cp37-abi3-win32.whl", hash = "sha256:a495580d6bae27291324fe60cea0b5a7c23fa36a7cd35035a16d93bdcf076b9d"}, - {file = "psutil-6.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:33ea5e1c975250a720b3a6609c490db40dae5d83a4eb315170c4fe0d8b1f34b3"}, - {file = "psutil-6.0.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:ffe7fc9b6b36beadc8c322f84e1caff51e8703b88eee1da46d1e3a6ae11b4fd0"}, - {file = "psutil-6.0.0.tar.gz", hash = "sha256:8faae4f310b6d969fa26ca0545338b21f73c6b15db7c4a8d934a5482faa818f2"}, -] - -[package.extras] -test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] - [[package]] name = "pyarrow" version = "17.0.0" @@ -2479,4 +2450,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = ">=3.12,<4" -content-hash = "7ed32854123b4d9f53db949030062fc4dc27c81ffb621a93a8c377571f0809d6" +content-hash = "ceb8e75e8d9dfe9d1ba3a72e650cb551c9d471048b5ecf975b1f69e5a19f70bb" diff --git a/pyproject.toml b/pyproject.toml index 997ca91..acbe581 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,6 @@ matplotlib = "^3.9.0" fsspec = "^2024.6.1" s3fs = "^2024.6.1" polars-lts-cpu = "^1.6.0" -psutil = "^6.0.0" pyarrow = "^17.0.0" [tool.poetry.group.dev.dependencies] diff --git a/src/regtech_data_validator/cli.py b/src/regtech_data_validator/cli.py index 49e9c4d..3b80dff 100644 --- a/src/regtech_data_validator/cli.py +++ b/src/regtech_data_validator/cli.py @@ -4,13 +4,10 @@ from regtech_data_validator.data_formatters import df_to_csv, df_to_str, df_to_json, df_to_table, df_to_download from typing import Annotated, Optional -import fsspec -from fsspec import AbstractFileSystem, filesystem import polars as pl import typer import typer.core -from regtech_data_validator.checks import Severity from regtech_data_validator.validator import validate_batch_csv from regtech_data_validator.validation_results import ValidationPhase @@ -90,7 +87,7 @@ def validate( final_phase = ValidationPhase.LOGICAL all_findings = [] final_df = pl.DataFrame() - #path = "s3://cfpb-devpub-regtech-sbl-filing-main/upload/2024/1234364890REGTECH006/156.csv" + # path = "s3://cfpb-devpub-regtech-sbl-filing-main/upload/2024/1234364890REGTECH006/156.csv" for findings, phase in validate_batch_csv(path, context_dict, batch_size=50000, batch_count=5): total_findings += findings.height final_phase = phase @@ -102,11 +99,7 @@ def validate( if all_findings: final_df = pl.concat(all_findings, how="diagonal") - - print(f"Single Errors: {final_df.filter(pl.col('validation_type') == Severity.ERROR, pl.col('scope') == 'single-field').height}") - print(f"Multi Errors: {final_df.filter(pl.col('validation_type') == Severity.ERROR, pl.col('scope') == 'multi-field').height}") - print(f"Single Warns: {final_df.filter(pl.col('validation_type') == Severity.WARNING, pl.col('scope') == 'single-field').height}") - print(f"Multi Warns: {final_df.filter(pl.col('validation_type') == Severity.WARNING, pl.col('scope') == 'multi-field').height}") + status = "SUCCESS" if total_findings == 0 else "FAILURE" match output: @@ -119,10 +112,7 @@ def validate( case OutputFormat.TABLE: print(df_to_table(final_df)) case OutputFormat.DOWNLOAD: - # uses streaming sink_csv, which doesn't print out - # to a string to save memory - df_to_download(final_df, "s3://cfpb-devpub-regtech-sbl-filing-main/upload/2024/1234364890REGTECH006/156_report.csv") - #df_to_download(final_df) + df_to_download(final_df) case _: raise ValueError(f'output format "{output}" not supported') diff --git a/src/regtech_data_validator/data_formatters.py b/src/regtech_data_validator/data_formatters.py index 331daff..60b7b9c 100644 --- a/src/regtech_data_validator/data_formatters.py +++ b/src/regtech_data_validator/data_formatters.py @@ -1,16 +1,11 @@ import ujson -import pandas as pd import polars as pl import fsspec -import threading -import s3fs -import psutil -import csv from tabulate import tabulate from functools import partial -from fsspec import AbstractFileSystem, filesystem + def find_check(group_name, checks): gen = (check for check in checks if check.title == group_name) @@ -22,7 +17,7 @@ def find_check(group_name, checks): # which corresponds to severity, error/warning code, name of error/warning, row number in sblar, UID, fig link, # error/warning description (markdown formatted), single/multi/register, and the fields and values associated with the error/warning. # Each row in the final dataframe represents all data for that one finding. -def format_findings(df: pd.DataFrame, checks): +def format_findings(df: pl.DataFrame, checks): final_df = pl.DataFrame() sorted_df = df.with_columns(pl.col('validation_id').cast(pl.Categorical(ordering='lexical'))).sort('validation_id') @@ -99,10 +94,10 @@ def format_findings(df: pd.DataFrame, checks): return final_df -def df_to_download(df: pd.DataFrame, path: str = "download_report.csv"): +def df_to_download(df: pl.DataFrame, path: str = "download_report.csv"): if df.is_empty(): # return headers of csv for 'emtpy' report - pl.DataFrame( + empty_df = pl.DataFrame( { "validation_type": [], "validation_id": [], @@ -112,7 +107,9 @@ def df_to_download(df: pd.DataFrame, path: str = "download_report.csv"): "fig_link": [], "validation_description": [], } - ).lazy().sink_csv(report_name, quote_style='non_numeric') + ) + with fsspec.open(path, mode='wb') as f: + empty_df.write_csv(f, quote_style='non_numeric') return sorted_df = ( @@ -122,14 +119,7 @@ def df_to_download(df: pd.DataFrame, path: str = "download_report.csv"): ) with fsspec.open(path, mode='wb') as f: - df.write_csv(f, quote_style='non_numeric') - - -def monitor_memory(writing_thread, interval=1): - import time - while writing_thread.is_alive(): - get_memory_usage() - time.sleep(interval) + sorted_df.write_csv(f, quote_style='non_numeric') def df_to_csv(df: pl.DataFrame) -> str: