
Commit

Merge pull request #133 from DSACMS/sort-and-design-multi-field-comparison

Patch Address Parsing and Print Comparison
IsaacMilarky authored Sep 10, 2024
2 parents 1f90279 + e2b8945 commit 8da4c12
Showing 4 changed files with 30 additions and 13 deletions.
.github/workflows/checks.yml (2 changes: 1 addition & 1 deletion)

@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.10", "3.11"]
+        python-version: ["3.10", "3.11","3.12"]
     steps:
       - uses: actions/checkout@v4
       - name: Install poetry
cli/deduplifhirLib/settings.py (22 changes: 14 additions & 8 deletions)

@@ -48,10 +48,21 @@ def get_additional_comparison_rules(parsed_data_df):

     for col in parsed_data_columns:
         if 'street_address' in col:
-            yield cl.ExactMatch(col).configure(term_frequency_adjustments=True)
+            yield cl.ExactMatch(col)
         elif 'postal_code' in col:
             yield cl.PostcodeComparison(col)
 
+def create_blocking_rules():
+    blocking_rules = []
+    for rule in BLOCKING_RULE_STRINGS:
+        if isinstance(rule, list):
+            blocking_rules.append(block_on(*rule))
+        else:
+            blocking_rules.append(block_on(rule))
+
+    return blocking_rules
+
+
 def create_settings(parsed_data_df):
     """
     This function generates a Splink SettingsCreator object based on the parsed

@@ -65,12 +76,7 @@
     A splink SettingsCreator object to be used with a splink linker object
     """

-    blocking_rules = []
-    for rule in BLOCKING_RULE_STRINGS:
-        if isinstance(rule, list):
-            blocking_rules.append(block_on(*rule))
-        else:
-            blocking_rules.append(block_on(rule))
+    blocking_rules = create_blocking_rules()
 
     comparison_rules = [item for item in get_additional_comparison_rules(parsed_data_df)]
     comparison_rules.extend([
@@ -109,7 +115,7 @@ def parse_fhir_dates(fhir_json_obj):
     """
     addresses = fhir_json_obj['entry'][0]['resource']['address']
 
-    for addr,n in enumerate(addresses):
+    for addr,n in enumerate(sorted(addresses)):
         yield {
             f"street_address{n}": [normalize_addr_text(''.join(addr['line']))],
             f"city{n}": [normalize_addr_text(addr['city'])],
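Note on the sorted() change above: its purpose is to give multi-address records a stable street_address0, street_address1, ... column order across runs. Below is a minimal standalone sketch of that idea, not code from the commit: since Python dicts are not orderable, the sketch supplies an explicit sort key on the joined street line, and the sample address entries are hypothetical.

# Hypothetical FHIR-style address entries, not taken from the repository.
addresses = [
    {"line": ["9 Elm St"], "city": "Salem", "state": "MA", "postalCode": "01970"},
    {"line": ["1 Main St"], "city": "Salem", "state": "MA", "postalCode": "01970"},
]

# Sorting on the joined street line makes enumerate() assign the same index
# to the same address every run, keeping the street_address{n} columns stable.
for n, addr in enumerate(sorted(addresses, key=lambda a: "".join(a["line"]))):
    print(f"street_address{n}:", "".join(addr["line"]))
# street_address0: 1 Main St
# street_address1: 9 Elm St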
cli/deduplifhirLib/splink_settings.json (1 change: 0 additions & 1 deletion)

@@ -3,7 +3,6 @@
"blocking_rules_to_generate_predictions": [
"birth_date",
["ssn", "birth_date"],
["ssn", "street_address0"],
"phone"
],
"max_iterations": 20,
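For context, here is a minimal sketch (illustrative, not part of the commit) of how the rule entries in this JSON flow through the new create_blocking_rules helper in settings.py: a bare string blocks on a single column, while a list is splatted into block_on so candidate pairs must match on every listed column. The rule values below mirror the JSON after this change.

from splink import block_on

# Mirrors blocking_rules_to_generate_predictions in splink_settings.json.
BLOCKING_RULE_STRINGS = ["birth_date", ["ssn", "birth_date"], "phone"]

blocking_rules = []
for rule in BLOCKING_RULE_STRINGS:
    if isinstance(rule, list):
        blocking_rules.append(block_on(*rule))  # pairs must agree on all listed columns
    else:
        blocking_rules.append(block_on(rule))   # pairs must agree on one column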
cli/deduplifhirLib/utils.py (18 changes: 15 additions & 3 deletions)

@@ -13,9 +13,10 @@
 from functools import wraps
 import pandas as pd
 from splink import DuckDBAPI, Linker
+from splink.blocking_analysis import cumulative_comparisons_to_be_scored_from_blocking_rules_data
 
 from deduplifhirLib.settings import (
-    create_settings, BLOCKING_RULE_STRINGS, read_fhir_data
+    create_settings, BLOCKING_RULE_STRINGS, read_fhir_data, create_blocking_rules
 )
 from deduplifhirLib.normalization import (
     normalize_addr_text, normalize_name_text, normalize_date_text

@@ -92,7 +93,7 @@ def parse_csv_dict_row_addresses(row):
"""
parsed = row

address_keys = ["street_address","city","state","postal_code"]
address_keys = ["address","city","state","postal_code"]

for k,v in row.items():
if any(match in k.lower() for match in address_keys):
Expand Down Expand Up @@ -147,7 +148,7 @@ def parse_test_data(path,marked=False):
             normal_row = parse_csv_dict_row_names(normal_row)
             normal_row["birth_date"] = normalize_date_text(normal_row["birth_date"])
 
-            patient_dict.update({k.lower():[v] for k,v in row.items()})
+            patient_dict.update({k.lower():[v] for k,v in normal_row.items()})
             #print(len(row))
 
             #print(patient_dict)

@@ -222,6 +223,17 @@ def wrapper(*args,**kwargs):
                 raise e
 
             #lnkr = DuckDBLinker(train_frame, SPLINK_LINKER_SETTINGS_PATIENT_DEDUPE)
 
+            preprocessing_metadata = cumulative_comparisons_to_be_scored_from_blocking_rules_data(
+                table_or_tables=train_frame,
+                blocking_rules=create_blocking_rules(),
+                link_type="dedupe_only",
+                db_api=DuckDBAPI()
+            )
+
+            print("Stats for nerds:")
+            print(preprocessing_metadata.to_string())
+
             lnkr = Linker(train_frame,create_settings(train_frame),db_api=DuckDBAPI())
             lnkr.training.estimate_u_using_random_sampling(max_pairs=5e6)
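The new "Stats for nerds" block prints, for each blocking rule, how many pairwise comparisons it will generate and the cumulative total before training begins. Below is a standalone sketch of the same Splink call on a toy frame; the three-row DataFrame and its column values are made up for illustration, and unique_id is included because Splink expects that column by default.

import pandas as pd
from splink import DuckDBAPI, block_on
from splink.blocking_analysis import (
    cumulative_comparisons_to_be_scored_from_blocking_rules_data,
)

# Toy input: two records share a birth_date and phone, one does not.
df = pd.DataFrame({
    "unique_id": [1, 2, 3],
    "birth_date": ["1970-01-01", "1970-01-01", "1980-05-05"],
    "phone": ["555-0100", "555-0100", "555-0199"],
})

# Returns one row per blocking rule: comparisons the rule adds and the running
# total; this is the same table the decorator now prints before training.
stats = cumulative_comparisons_to_be_scored_from_blocking_rules_data(
    table_or_tables=df,
    blocking_rules=[block_on("birth_date"), block_on("phone")],
    link_type="dedupe_only",
    db_api=DuckDBAPI(),
)
print(stats.to_string())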
