Removed memory resource testing code
jcadam14 committed Aug 30, 2024
1 parent 186a7e9 commit 7ce9068
Showing 4 changed files with 12 additions and 62 deletions.
31 changes: 1 addition & 30 deletions poetry.lock

Some generated files are not rendered by default; the poetry.lock diff is not shown.

1 change: 0 additions & 1 deletion pyproject.toml
@@ -21,7 +21,6 @@ matplotlib = "^3.9.0"
 fsspec = "^2024.6.1"
 s3fs = "^2024.6.1"
 polars-lts-cpu = "^1.6.0"
-psutil = "^6.0.0"
 pyarrow = "^17.0.0"
 
 [tool.poetry.group.dev.dependencies]
16 changes: 3 additions & 13 deletions src/regtech_data_validator/cli.py
@@ -4,13 +4,10 @@
 from regtech_data_validator.data_formatters import df_to_csv, df_to_str, df_to_json, df_to_table, df_to_download
 from typing import Annotated, Optional
 
-import fsspec
-from fsspec import AbstractFileSystem, filesystem
 import polars as pl
 import typer
 import typer.core
 
-from regtech_data_validator.checks import Severity
 from regtech_data_validator.validator import validate_batch_csv
 from regtech_data_validator.validation_results import ValidationPhase

@@ -90,7 +87,7 @@ def validate(
     final_phase = ValidationPhase.LOGICAL
     all_findings = []
     final_df = pl.DataFrame()
-    #path = "s3://cfpb-devpub-regtech-sbl-filing-main/upload/2024/1234364890REGTECH006/156.csv"
+    # path = "s3://cfpb-devpub-regtech-sbl-filing-main/upload/2024/1234364890REGTECH006/156.csv"
     for findings, phase in validate_batch_csv(path, context_dict, batch_size=50000, batch_count=5):
         total_findings += findings.height
         final_phase = phase
@@ -102,11 +99,7 @@
 
     if all_findings:
         final_df = pl.concat(all_findings, how="diagonal")
 
-        print(f"Single Errors: {final_df.filter(pl.col('validation_type') == Severity.ERROR, pl.col('scope') == 'single-field').height}")
-        print(f"Multi Errors: {final_df.filter(pl.col('validation_type') == Severity.ERROR, pl.col('scope') == 'multi-field').height}")
-        print(f"Single Warns: {final_df.filter(pl.col('validation_type') == Severity.WARNING, pl.col('scope') == 'single-field').height}")
-        print(f"Multi Warns: {final_df.filter(pl.col('validation_type') == Severity.WARNING, pl.col('scope') == 'multi-field').height}")
 
     status = "SUCCESS" if total_findings == 0 else "FAILURE"
 
     match output:
@@ -119,10 +112,7 @@ def validate(
         case OutputFormat.TABLE:
             print(df_to_table(final_df))
         case OutputFormat.DOWNLOAD:
-            # uses streaming sink_csv, which doesn't print out
-            # to a string to save memory
-            df_to_download(final_df, "s3://cfpb-devpub-regtech-sbl-filing-main/upload/2024/1234364890REGTECH006/156_report.csv")
-            #df_to_download(final_df)
+            df_to_download(final_df)
         case _:
             raise ValueError(f'output format "{output}" not supported')

26 changes: 8 additions & 18 deletions src/regtech_data_validator/data_formatters.py
@@ -1,16 +1,11 @@
 import ujson
-import pandas as pd
 import polars as pl
 import fsspec
-import threading
-import s3fs
-import psutil
 import csv
 
 from tabulate import tabulate
 
 from functools import partial
-from fsspec import AbstractFileSystem, filesystem
 
 
 def find_check(group_name, checks):
     gen = (check for check in checks if check.title == group_name)
@@ -22,7 +17,7 @@ def find_check(group_name, checks):
 # which corresponds to severity, error/warning code, name of error/warning, row number in sblar, UID, fig link,
 # error/warning description (markdown formatted), single/multi/register, and the fields and values associated with the error/warning.
 # Each row in the final dataframe represents all data for that one finding.
-def format_findings(df: pd.DataFrame, checks):
+def format_findings(df: pl.DataFrame, checks):
     final_df = pl.DataFrame()
 
     sorted_df = df.with_columns(pl.col('validation_id').cast(pl.Categorical(ordering='lexical'))).sort('validation_id')
@@ -99,10 +94,10 @@ def format_findings(df: pd.DataFrame, checks):
     return final_df
 
 
-def df_to_download(df: pd.DataFrame, path: str = "download_report.csv"):
+def df_to_download(df: pl.DataFrame, path: str = "download_report.csv"):
     if df.is_empty():
         # return headers of csv for 'emtpy' report
-        pl.DataFrame(
+        empty_df = pl.DataFrame(
            {
                "validation_type": [],
                "validation_id": [],
@@ -112,7 +107,9 @@ def df_to_download(df: pd.DataFrame, path: str = "download_report.csv"):
                "fig_link": [],
                "validation_description": [],
            }
-        ).lazy().sink_csv(report_name, quote_style='non_numeric')
+        )
+        with fsspec.open(path, mode='wb') as f:
+            empty_df.write_csv(f, quote_style='non_numeric')
         return
 
     sorted_df = (
@@ -122,14 +119,7 @@ def df_to_download(df: pd.DataFrame, path: str = "download_report.csv"):
     )
 
     with fsspec.open(path, mode='wb') as f:
-        df.write_csv(f, quote_style='non_numeric')
-
-
-def monitor_memory(writing_thread, interval=1):
-    import time
-    while writing_thread.is_alive():
-        get_memory_usage()
-        time.sleep(interval)
+        sorted_df.write_csv(f, quote_style='non_numeric')
 
 
 def df_to_csv(df: pl.DataFrame) -> str:
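
For reference, the write path df_to_download keeps after this commit is plain fsspec plus polars: open the destination with fsspec and stream the frame out with write_csv. A minimal sketch under the dependency versions pinned above; the file name, the example row, and the "E3000" id are illustrative, not from the commit.

import fsspec
import polars as pl

# Illustrative findings frame using a subset of the report columns shown in the diff above.
report = pl.DataFrame(
    {
        "validation_type": ["Error"],
        "validation_id": ["E3000"],
        "validation_description": ["example finding"],
    }
)

# fsspec.open resolves the scheme, so a local file name or an s3:// URI both work here.
with fsspec.open("findings_report.csv", mode="wb") as f:
    report.write_csv(f, quote_style="non_numeric")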
