122 add fig anchor metadata to validations (#124)

Closes #122. I created a script that reads through fig_source.html, which I saved from the browser, and finds each of the elements containing the validation ID for each SBLCheck. It then grabs the href and stores it in the fig_anchor in the phase_validations.py file. I created a pytest that uses the same HTML to loop through both the phase validation fig_anchors and the hrefs, comparing them to ensure each validation ID has the correct fig anchor. I also updated the existing test_cli.py formats to include the new fig_anchor field. It might be a good idea to store this tool somewhere; it's all local currently. In case they change something, it makes it easy to loop through the py file and insert the fig_anchor instead of manually copying and pasting. Updated the schema to include the fig_anchor in the check, which automatically carries over into the validation results. Updated df_to_json to include the fig_anchor in the JSON we send to clients. The other df_to functions automatically get the fig_anchor. I don't think we need another story in the filing-api; the fig_anchor for each result will be in the JSON blob. I would LOVE to figure out how to automatically pull down the FIG HTML in an actually usable way for the pytests. However, because the page is rendered with JavaScript, the hrefs don't come across as full links if you do a requests.get, curl, or wget. The only way I've found to have both the full href link and the Validation ID associated with it is saving the page from a browser.
cfpb · Apr 2, 2024 · c0727bc · c0727bc
1 parent 32c0074
commit c0727bc
Show file tree

Hide file tree

Showing 7 changed files with 717 additions and 156 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -14,11 +14,14 @@ packages = [{ include = "regtech_data_validator", from = "src" }]
 python = ">=3.12,<4"
 pandas = "^2.2.1"
 pandera = "^0.18.3"
+requests = "^2.31.0"
 tabulate = "^0.9.0"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "8.1.1"
 pytest-cov = "5.0.0"
+beautifulsoup4 = "^4.12.3"
+lxml = "^5.1.1"
 
 [tool.poetry.group.data.dependencies]
 openpyxl = "^3.1.2"

diff --git a/src/regtech_data_validator/checks.py b/src/regtech_data_validator/checks.py
@@ -25,7 +25,16 @@ class SBLCheck(Check):
     SBLWarningCheck subclasses below.
     """
 
-    def __init__(self, check_fn: Callable, id: str, name: str, description: str, severity: Severity, **check_kwargs):
+    def __init__(
+        self,
+        check_fn: Callable,
+        id: str,
+        name: str,
+        description: str,
+        severity: Severity,
+        fig_link: str,
+        **check_kwargs
+    ):
         """
         Subclass of Pandera's `Check`, with special handling for severity level
         Args:
@@ -38,6 +47,7 @@ def __init__(self, check_fn: Callable, id: str, name: str, description: str, sev
         """
 
         self.severity = severity
+        self.fig_link = fig_link
 
         super().__init__(check_fn, title=id, name=name, description=description, **check_kwargs)
 

diff --git a/src/regtech_data_validator/create_schemas.py b/src/regtech_data_validator/create_schemas.py
@@ -85,6 +85,7 @@ def _add_validation_metadata(failed_check_fields_df: pd.DataFrame, check: SBLChe
 
     validation_fields_df = (
         failed_check_fields_df.assign(validation_severity=check.severity)
+        .assign(fig_link=check.fig_link)
         .assign(validation_id=check.title)
         .assign(validation_name=check.name)
         .assign(validation_desc=check.description)

diff --git a/src/regtech_data_validator/global_data.py b/src/regtech_data_validator/global_data.py
@@ -1,6 +1,9 @@
 import csv
 from importlib.resources import files
 
+fig_base_url = (
+    "https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/"
+)
 
 # global variable for NAICS codes
 naics_codes: dict[str, str] = {}

diff --git a/src/regtech_data_validator/phase_validations.py b/src/regtech_data_validator/phase_validations.py
diff --git a/tests/test_fig_links.py b/tests/test_fig_links.py
@@ -0,0 +1,39 @@
+import requests
+
+from regtech_data_validator.phase_validations import get_phase_1_and_2_validations_for_lei
+from regtech_data_validator.global_data import fig_base_url
+from bs4 import BeautifulSoup
+
+
+class TestFigAnchors:
+    """Checks that each validation's `fig_link` agrees with the published FIG page."""
+
+    def test_fig_links(self):
+        """Compare every check's `fig_link` with the anchor scraped from the FIG.
+
+        The FIG page is downloaded, every `<a>` element whose text contains
+        "Validation ID:" is collected, and the resulting (id, anchor) pairs
+        must equal the (title, fig_link) pairs of all phase 1 and 2 checks.
+        """
+        # A URL fragment such as "#4" is not part of the HTTP request, so the
+        # shared base URL fetches the same document. The timeout keeps a stalled
+        # connection from hanging the test run, and raise_for_status() stops an
+        # error page from being parsed as if it were the guide.
+        response = requests.get(fig_base_url, timeout=30)
+        response.raise_for_status()
+        source_links = BeautifulSoup(response.text, 'html.parser')
+
+        # The validations are nested two levels deep; flatten the innermost
+        # lists into a single list of checks.
+        validators = get_phase_1_and_2_validations_for_lei()
+        checks = []
+        for phases in validators.values():
+            for phase_checks in phases.values():
+                checks.extend(phase_checks)
+
+        validator_anchors = [{"id": check.title, "anchor": check.fig_link} for check in checks]
+
+        fig_links = []
+        elements = source_links.find_all(lambda tag: tag.name == "a" and "Validation ID:" in tag.text)
+        for element in elements:
+            validation_id = element.text.split("Validation ID:")[1].strip()
+            # A missing href becomes "" so it surfaces as a mismatch in the
+            # assertion below rather than as a TypeError here.
+            fig_links.append({"id": validation_id, "anchor": fig_base_url + element.get('href', '')})
+
+        def by_id_and_anchor(entry):
+            return (entry["id"], entry["anchor"])
+
+        validator_anchors = sorted(validator_anchors, key=by_id_and_anchor)
+        fig_links = sorted(fig_links, key=by_id_and_anchor)
+
+        # Compare the full lists: zip() would stop at the shorter one and let
+        # the test pass when the scrape finds fewer links than there are checks
+        # (including none at all).
+        assert validator_anchors == fig_links