From 7ce9068abc6a245b6ed9a94b87859a8075e1510c Mon Sep 17 00:00:00 2001
From: Adam <41971533+jcadam14@users.noreply.github.com>
Date: Fri, 30 Aug 2024 11:09:03 -0600
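Subject: [PATCH] Removed memory resource testing code

Drop the psutil dependency and the leftover memory-profiling/debug code:
the per-severity/scope finding counts printed from the CLI, the
hard-coded S3 report path passed to df_to_download, the monitor_memory
helper, and the imports that are no longer used.

While cleaning up df_to_download, also correct two defects:
- the empty-report branch wrote to `report_name` instead of the `path`
  parameter; it now writes through fsspec.open(path), like the
  non-empty branch
- the non-empty branch wrote the unsorted `df` instead of `sorted_df`

The `pd.DataFrame` annotations on format_findings and df_to_download
are changed to `pl.DataFrame`, since pandas is no longer imported.
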
---
 poetry.lock                                   | 31 +------------------
 pyproject.toml                                |  1 -
 src/regtech_data_validator/cli.py             | 16 ++--------
 src/regtech_data_validator/data_formatters.py | 26 +++++-----------
 4 files changed, 12 insertions(+), 62 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index a6c0629..bdd0b3c 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1691,35 +1691,6 @@ timezone = ["backports-zoneinfo", "tzdata"]
 xlsx2csv = ["xlsx2csv (>=0.8.0)"]
 xlsxwriter = ["xlsxwriter"]
 
-[[package]]
-name = "psutil"
-version = "6.0.0"
-description = "Cross-platform lib for process and system monitoring in Python."
-optional = false
-python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7"
-files = [
-    {file = "psutil-6.0.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a021da3e881cd935e64a3d0a20983bda0bb4cf80e4f74fa9bfcb1bc5785360c6"},
-    {file = "psutil-6.0.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:1287c2b95f1c0a364d23bc6f2ea2365a8d4d9b726a3be7294296ff7ba97c17f0"},
-    {file = "psutil-6.0.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:a9a3dbfb4de4f18174528d87cc352d1f788b7496991cca33c6996f40c9e3c92c"},
-    {file = "psutil-6.0.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:6ec7588fb3ddaec7344a825afe298db83fe01bfaaab39155fa84cf1c0d6b13c3"},
-    {file = "psutil-6.0.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:1e7c870afcb7d91fdea2b37c24aeb08f98b6d67257a5cb0a8bc3ac68d0f1a68c"},
-    {file = "psutil-6.0.0-cp27-none-win32.whl", hash = "sha256:02b69001f44cc73c1c5279d02b30a817e339ceb258ad75997325e0e6169d8b35"},
-    {file = "psutil-6.0.0-cp27-none-win_amd64.whl", hash = "sha256:21f1fb635deccd510f69f485b87433460a603919b45e2a324ad65b0cc74f8fb1"},
-    {file = "psutil-6.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:c588a7e9b1173b6e866756dde596fd4cad94f9399daf99ad8c3258b3cb2b47a0"},
-    {file = "psutil-6.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ed2440ada7ef7d0d608f20ad89a04ec47d2d3ab7190896cd62ca5fc4fe08bf0"},
-    {file = "psutil-6.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fd9a97c8e94059b0ef54a7d4baf13b405011176c3b6ff257c247cae0d560ecd"},
-    {file = "psutil-6.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2e8d0054fc88153ca0544f5c4d554d42e33df2e009c4ff42284ac9ebdef4132"},
-    {file = "psutil-6.0.0-cp36-cp36m-win32.whl", hash = "sha256:fc8c9510cde0146432bbdb433322861ee8c3efbf8589865c8bf8d21cb30c4d14"},
-    {file = "psutil-6.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:34859b8d8f423b86e4385ff3665d3f4d94be3cdf48221fbe476e883514fdb71c"},
-    {file = "psutil-6.0.0-cp37-abi3-win32.whl", hash = "sha256:a495580d6bae27291324fe60cea0b5a7c23fa36a7cd35035a16d93bdcf076b9d"},
-    {file = "psutil-6.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:33ea5e1c975250a720b3a6609c490db40dae5d83a4eb315170c4fe0d8b1f34b3"},
-    {file = "psutil-6.0.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:ffe7fc9b6b36beadc8c322f84e1caff51e8703b88eee1da46d1e3a6ae11b4fd0"},
-    {file = "psutil-6.0.0.tar.gz", hash = "sha256:8faae4f310b6d969fa26ca0545338b21f73c6b15db7c4a8d934a5482faa818f2"},
-]
-
-[package.extras]
-test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]
-
 [[package]]
 name = "pyarrow"
 version = "17.0.0"
@@ -2479,4 +2450,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.12,<4"
-content-hash = "7ed32854123b4d9f53db949030062fc4dc27c81ffb621a93a8c377571f0809d6"
+content-hash = "ceb8e75e8d9dfe9d1ba3a72e650cb551c9d471048b5ecf975b1f69e5a19f70bb"
diff --git a/pyproject.toml b/pyproject.toml
index 997ca91..acbe581 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,6 @@ matplotlib = "^3.9.0"
 fsspec = "^2024.6.1"
 s3fs = "^2024.6.1"
 polars-lts-cpu = "^1.6.0"
-psutil = "^6.0.0"
 pyarrow = "^17.0.0"
 
 [tool.poetry.group.dev.dependencies]
diff --git a/src/regtech_data_validator/cli.py b/src/regtech_data_validator/cli.py
index 49e9c4d..3b80dff 100644
--- a/src/regtech_data_validator/cli.py
+++ b/src/regtech_data_validator/cli.py
@@ -4,13 +4,10 @@
 from regtech_data_validator.data_formatters import df_to_csv, df_to_str, df_to_json, df_to_table, df_to_download
 from typing import Annotated, Optional
 
-import fsspec
-from fsspec import AbstractFileSystem, filesystem
 import polars as pl
 import typer
 import typer.core
 
-from regtech_data_validator.checks import Severity
 from regtech_data_validator.validator import validate_batch_csv
 from regtech_data_validator.validation_results import ValidationPhase
 
@@ -90,7 +87,7 @@ def validate(
     final_phase = ValidationPhase.LOGICAL
     all_findings = []
     final_df = pl.DataFrame()
-    #path = "s3://cfpb-devpub-regtech-sbl-filing-main/upload/2024/1234364890REGTECH006/156.csv"
+    # path = "s3://cfpb-devpub-regtech-sbl-filing-main/upload/2024/1234364890REGTECH006/156.csv"
     for findings, phase in validate_batch_csv(path, context_dict, batch_size=50000, batch_count=5):
         total_findings += findings.height
         final_phase = phase
@@ -102,11 +99,7 @@ def validate(
 
     if all_findings:
         final_df = pl.concat(all_findings, how="diagonal")
-    
-    print(f"Single Errors: {final_df.filter(pl.col('validation_type') == Severity.ERROR, pl.col('scope') == 'single-field').height}")
-    print(f"Multi Errors: {final_df.filter(pl.col('validation_type') == Severity.ERROR, pl.col('scope') == 'multi-field').height}")
-    print(f"Single Warns: {final_df.filter(pl.col('validation_type') == Severity.WARNING, pl.col('scope') == 'single-field').height}")
-    print(f"Multi Warns: {final_df.filter(pl.col('validation_type') == Severity.WARNING, pl.col('scope') == 'multi-field').height}")
+
     status = "SUCCESS" if total_findings == 0 else "FAILURE"
 
     match output:
@@ -119,10 +112,7 @@ def validate(
         case OutputFormat.TABLE:
             print(df_to_table(final_df))
         case OutputFormat.DOWNLOAD:
-            # uses streaming sink_csv, which doesn't print out
-            # to a string to save memory
-           df_to_download(final_df, "s3://cfpb-devpub-regtech-sbl-filing-main/upload/2024/1234364890REGTECH006/156_report.csv")
-           #df_to_download(final_df)
+            df_to_download(final_df)
         case _:
             raise ValueError(f'output format "{output}" not supported')
 
diff --git a/src/regtech_data_validator/data_formatters.py b/src/regtech_data_validator/data_formatters.py
index 331daff..60b7b9c 100644
--- a/src/regtech_data_validator/data_formatters.py
+++ b/src/regtech_data_validator/data_formatters.py
@@ -1,16 +1,11 @@
 import ujson
-import pandas as pd
 import polars as pl
 import fsspec
-import threading
-import s3fs
-import psutil
-import csv
 
 from tabulate import tabulate
 
 from functools import partial
-from fsspec import AbstractFileSystem, filesystem
+
 
 def find_check(group_name, checks):
     gen = (check for check in checks if check.title == group_name)
@@ -22,7 +17,7 @@ def find_check(group_name, checks):
 # which corresponds to severity, error/warning code, name of error/warning, row number in sblar, UID, fig link,
 # error/warning description (markdown formatted), single/multi/register, and the fields and values associated with the error/warning.
 # Each row in the final dataframe represents all data for that one finding.
-def format_findings(df: pd.DataFrame, checks):
+def format_findings(df: pl.DataFrame, checks):
     final_df = pl.DataFrame()
 
     sorted_df = df.with_columns(pl.col('validation_id').cast(pl.Categorical(ordering='lexical'))).sort('validation_id')
@@ -99,10 +94,10 @@ def format_findings(df: pd.DataFrame, checks):
     return final_df
 
 
-def df_to_download(df: pd.DataFrame, path: str = "download_report.csv"):
+def df_to_download(df: pl.DataFrame, path: str = "download_report.csv"):
     if df.is_empty():
         # return headers of csv for 'emtpy' report
-        pl.DataFrame(
+        empty_df = pl.DataFrame(
             {
                 "validation_type": [],
                 "validation_id": [],
@@ -112,7 +107,9 @@ def df_to_download(df: pd.DataFrame, path: str = "download_report.csv"):
                 "fig_link": [],
                 "validation_description": [],
             }
-        ).lazy().sink_csv(report_name, quote_style='non_numeric')
+        )
+        with fsspec.open(path, mode='wb') as f:
+            empty_df.write_csv(f, quote_style='non_numeric')
         return
 
     sorted_df = (
@@ -122,14 +119,7 @@ def df_to_download(df: pd.DataFrame, path: str = "download_report.csv"):
     )
 
     with fsspec.open(path, mode='wb') as f:
-        df.write_csv(f, quote_style='non_numeric')
-
-
-def monitor_memory(writing_thread, interval=1):
-    import time
-    while writing_thread.is_alive():
-        get_memory_usage()
-        time.sleep(interval)
+        sorted_df.write_csv(f, quote_style='non_numeric')
 
 
 def df_to_csv(df: pl.DataFrame) -> str: