Removed memory resource testing code
jcadam14 committed Aug 30, 2024
1 parent 186a7e9 commit 7ce9068
Showing 4 changed files with 12 additions and 62 deletions.
31 changes: 1 addition & 30 deletions poetry.lock

Some generated files are not rendered by default; the poetry.lock diff is not shown.

1 change: 0 additions & 1 deletion pyproject.toml
@@ -21,7 +21,6 @@ matplotlib = "^3.9.0"
 fsspec = "^2024.6.1"
 s3fs = "^2024.6.1"
 polars-lts-cpu = "^1.6.0"
-psutil = "^6.0.0"
 pyarrow = "^17.0.0"
 
 [tool.poetry.group.dev.dependencies]
16 changes: 3 additions & 13 deletions src/regtech_data_validator/cli.py
@@ -4,13 +4,10 @@
 from regtech_data_validator.data_formatters import df_to_csv, df_to_str, df_to_json, df_to_table, df_to_download
 from typing import Annotated, Optional
 
-import fsspec
-from fsspec import AbstractFileSystem, filesystem
 import polars as pl
 import typer
 import typer.core
 
-from regtech_data_validator.checks import Severity
 from regtech_data_validator.validator import validate_batch_csv
 from regtech_data_validator.validation_results import ValidationPhase

@@ -90,7 +87,7 @@ def validate(
     final_phase = ValidationPhase.LOGICAL
     all_findings = []
     final_df = pl.DataFrame()
-    #path = "s3://cfpb-devpub-regtech-sbl-filing-main/upload/2024/1234364890REGTECH006/156.csv"
+    # path = "s3://cfpb-devpub-regtech-sbl-filing-main/upload/2024/1234364890REGTECH006/156.csv"
     for findings, phase in validate_batch_csv(path, context_dict, batch_size=50000, batch_count=5):
         total_findings += findings.height
         final_phase = phase
@@ -102,11 +99,7 @@
 
     if all_findings:
         final_df = pl.concat(all_findings, how="diagonal")
 
-        print(f"Single Errors: {final_df.filter(pl.col('validation_type') == Severity.ERROR, pl.col('scope') == 'single-field').height}")
-        print(f"Multi Errors: {final_df.filter(pl.col('validation_type') == Severity.ERROR, pl.col('scope') == 'multi-field').height}")
-        print(f"Single Warns: {final_df.filter(pl.col('validation_type') == Severity.WARNING, pl.col('scope') == 'single-field').height}")
-        print(f"Multi Warns: {final_df.filter(pl.col('validation_type') == Severity.WARNING, pl.col('scope') == 'multi-field').height}")
 
     status = "SUCCESS" if total_findings == 0 else "FAILURE"
 
     match output:
@@ -119,10 +112,7 @@ def validate(
         case OutputFormat.TABLE:
             print(df_to_table(final_df))
         case OutputFormat.DOWNLOAD:
-            # uses streaming sink_csv, which doesn't print out
-            # to a string to save memory
-            df_to_download(final_df, "s3://cfpb-devpub-regtech-sbl-filing-main/upload/2024/1234364890REGTECH006/156_report.csv")
-            #df_to_download(final_df)
+            df_to_download(final_df)
         case _:
             raise ValueError(f'output format "{output}" not supported')

26 changes: 8 additions & 18 deletions src/regtech_data_validator/data_formatters.py
@@ -1,16 +1,11 @@
 import ujson
-import pandas as pd
 import polars as pl
 import fsspec
-import threading
-import s3fs
-import psutil
 import csv
 
 from tabulate import tabulate
 
 from functools import partial
-from fsspec import AbstractFileSystem, filesystem
 
 
 def find_check(group_name, checks):
     gen = (check for check in checks if check.title == group_name)
@@ -22,7 +17,7 @@ def find_check(group_name, checks):
 # which corresponds to severity, error/warning code, name of error/warning, row number in sblar, UID, fig link,
 # error/warning description (markdown formatted), single/multi/register, and the fields and values associated with the error/warning.
 # Each row in the final dataframe represents all data for that one finding.
-def format_findings(df: pd.DataFrame, checks):
+def format_findings(df: pl.DataFrame, checks):
     final_df = pl.DataFrame()
 
     sorted_df = df.with_columns(pl.col('validation_id').cast(pl.Categorical(ordering='lexical'))).sort('validation_id')
@@ -99,10 +94,10 @@ def format_findings(df: pd.DataFrame, checks):
     return final_df
 
 
-def df_to_download(df: pd.DataFrame, path: str = "download_report.csv"):
+def df_to_download(df: pl.DataFrame, path: str = "download_report.csv"):
     if df.is_empty():
         # return headers of csv for 'emtpy' report
-        pl.DataFrame(
+        empty_df = pl.DataFrame(
            {
                "validation_type": [],
                "validation_id": [],
@@ -112,7 +107,9 @@ def df_to_download(df: pd.DataFrame, path: str = "download_report.csv"):
                "fig_link": [],
                "validation_description": [],
            }
-        ).lazy().sink_csv(report_name, quote_style='non_numeric')
+        )
+        with fsspec.open(path, mode='wb') as f:
+            empty_df.write_csv(f, quote_style='non_numeric')
         return
 
     sorted_df = (
@@ -122,14 +119,7 @@ def df_to_download(df: pd.DataFrame, path: str = "download_report.csv"):
     )
 
     with fsspec.open(path, mode='wb') as f:
-        df.write_csv(f, quote_style='non_numeric')
-
-
-def monitor_memory(writing_thread, interval=1):
-    import time
-    while writing_thread.is_alive():
-        get_memory_usage()
-        time.sleep(interval)
+        sorted_df.write_csv(f, quote_style='non_numeric')
 
 
 def df_to_csv(df: pl.DataFrame) -> str:
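
For reference, the write path df_to_download keeps after this commit is plain fsspec plus polars: open the destination with fsspec and stream the frame out with write_csv. A minimal sketch under the dependency versions pinned above; the file name, the example row, and the "E3000" id are illustrative, not from the commit.

import fsspec
import polars as pl

# Illustrative findings frame using a subset of the report columns shown in the diff above.
report = pl.DataFrame(
    {
        "validation_type": ["Error"],
        "validation_id": ["E3000"],
        "validation_description": ["example finding"],
    }
)

# fsspec.open resolves the scheme, so a local file name or an s3:// URI both work here.
with fsspec.open("findings_report.csv", mode="wb") as f:
    report.write_csv(f, quote_style="non_numeric")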
