
Commit

Merge pull request #133 from DSACMS/sort-and-design-multi-field-comparison

Patch Address Parsing and Print Comparison
IsaacMilarky authored Sep 10, 2024
2 parents 1f90279 + e2b8945 commit 8da4c12
Showing 4 changed files with 30 additions and 13 deletions.
.github/workflows/checks.yml (2 changes: 1 addition & 1 deletion)

@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.10", "3.11"]
+        python-version: ["3.10", "3.11","3.12"]
     steps:
       - uses: actions/checkout@v4
       - name: Install poetry
cli/deduplifhirLib/settings.py (22 changes: 14 additions & 8 deletions)

@@ -48,10 +48,21 @@ def get_additional_comparison_rules(parsed_data_df):

     for col in parsed_data_columns:
         if 'street_address' in col:
-            yield cl.ExactMatch(col).configure(term_frequency_adjustments=True)
+            yield cl.ExactMatch(col)
         elif 'postal_code' in col:
             yield cl.PostcodeComparison(col)
 
+def create_blocking_rules():
+    blocking_rules = []
+    for rule in BLOCKING_RULE_STRINGS:
+        if isinstance(rule, list):
+            blocking_rules.append(block_on(*rule))
+        else:
+            blocking_rules.append(block_on(rule))
+
+    return blocking_rules
+
+
 def create_settings(parsed_data_df):
     """
     This function generates a Splink SettingsCreator object based on the parsed

@@ -65,12 +76,7 @@
     A splink SettingsCreator object to be used with a splink linker object
     """

-    blocking_rules = []
-    for rule in BLOCKING_RULE_STRINGS:
-        if isinstance(rule, list):
-            blocking_rules.append(block_on(*rule))
-        else:
-            blocking_rules.append(block_on(rule))
+    blocking_rules = create_blocking_rules()
 
     comparison_rules = [item for item in get_additional_comparison_rules(parsed_data_df)]
     comparison_rules.extend([
@@ -109,7 +115,7 @@ def parse_fhir_dates(fhir_json_obj):
     """
     addresses = fhir_json_obj['entry'][0]['resource']['address']
 
-    for addr,n in enumerate(addresses):
+    for addr,n in enumerate(sorted(addresses)):
         yield {
             f"street_address{n}": [normalize_addr_text(''.join(addr['line']))],
             f"city{n}": [normalize_addr_text(addr['city'])],
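Note on the sorted() change above: its purpose is to give multi-address records a stable street_address0, street_address1, ... column order across runs. Below is a minimal standalone sketch of that idea, not code from the commit: since Python dicts are not orderable, the sketch supplies an explicit sort key on the joined street line, and the sample address entries are hypothetical.

# Hypothetical FHIR-style address entries, not taken from the repository.
addresses = [
    {"line": ["9 Elm St"], "city": "Salem", "state": "MA", "postalCode": "01970"},
    {"line": ["1 Main St"], "city": "Salem", "state": "MA", "postalCode": "01970"},
]

# Sorting on the joined street line makes enumerate() assign the same index
# to the same address every run, keeping the street_address{n} columns stable.
for n, addr in enumerate(sorted(addresses, key=lambda a: "".join(a["line"]))):
    print(f"street_address{n}:", "".join(addr["line"]))
# street_address0: 1 Main St
# street_address1: 9 Elm St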
cli/deduplifhirLib/splink_settings.json (1 change: 0 additions & 1 deletion)

@@ -3,7 +3,6 @@
"blocking_rules_to_generate_predictions": [
"birth_date",
["ssn", "birth_date"],
["ssn", "street_address0"],
"phone"
],
"max_iterations": 20,
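For context, here is a minimal sketch (illustrative, not part of the commit) of how the rule entries in this JSON flow through the new create_blocking_rules helper in settings.py: a bare string blocks on a single column, while a list is splatted into block_on so candidate pairs must match on every listed column. The rule values below mirror the JSON after this change.

from splink import block_on

# Mirrors blocking_rules_to_generate_predictions in splink_settings.json.
BLOCKING_RULE_STRINGS = ["birth_date", ["ssn", "birth_date"], "phone"]

blocking_rules = []
for rule in BLOCKING_RULE_STRINGS:
    if isinstance(rule, list):
        blocking_rules.append(block_on(*rule))  # pairs must agree on all listed columns
    else:
        blocking_rules.append(block_on(rule))   # pairs must agree on one column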
cli/deduplifhirLib/utils.py (18 changes: 15 additions & 3 deletions)

@@ -13,9 +13,10 @@
 from functools import wraps
 import pandas as pd
 from splink import DuckDBAPI, Linker
+from splink.blocking_analysis import cumulative_comparisons_to_be_scored_from_blocking_rules_data
 
 from deduplifhirLib.settings import (
-    create_settings, BLOCKING_RULE_STRINGS, read_fhir_data
+    create_settings, BLOCKING_RULE_STRINGS, read_fhir_data, create_blocking_rules
 )
 from deduplifhirLib.normalization import (
     normalize_addr_text, normalize_name_text, normalize_date_text

@@ -92,7 +93,7 @@ def parse_csv_dict_row_addresses(row):
"""
parsed = row

address_keys = ["street_address","city","state","postal_code"]
address_keys = ["address","city","state","postal_code"]

for k,v in row.items():
if any(match in k.lower() for match in address_keys):
Expand Down Expand Up @@ -147,7 +148,7 @@ def parse_test_data(path,marked=False):
             normal_row = parse_csv_dict_row_names(normal_row)
             normal_row["birth_date"] = normalize_date_text(normal_row["birth_date"])
 
-            patient_dict.update({k.lower():[v] for k,v in row.items()})
+            patient_dict.update({k.lower():[v] for k,v in normal_row.items()})
             #print(len(row))
 
             #print(patient_dict)

@@ -222,6 +223,17 @@ def wrapper(*args,**kwargs):
                 raise e
 
             #lnkr = DuckDBLinker(train_frame, SPLINK_LINKER_SETTINGS_PATIENT_DEDUPE)
 
+            preprocessing_metadata = cumulative_comparisons_to_be_scored_from_blocking_rules_data(
+                table_or_tables=train_frame,
+                blocking_rules=create_blocking_rules(),
+                link_type="dedupe_only",
+                db_api=DuckDBAPI()
+            )
+
+            print("Stats for nerds:")
+            print(preprocessing_metadata.to_string())
+
             lnkr = Linker(train_frame,create_settings(train_frame),db_api=DuckDBAPI())
             lnkr.training.estimate_u_using_random_sampling(max_pairs=5e6)
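The new "Stats for nerds" block prints, for each blocking rule, how many pairwise comparisons it will generate and the cumulative total before training begins. Below is a standalone sketch of the same Splink call on a toy frame; the three-row DataFrame and its column values are made up for illustration, and unique_id is included because Splink expects that column by default.

import pandas as pd
from splink import DuckDBAPI, block_on
from splink.blocking_analysis import (
    cumulative_comparisons_to_be_scored_from_blocking_rules_data,
)

# Toy input: two records share a birth_date and phone, one does not.
df = pd.DataFrame({
    "unique_id": [1, 2, 3],
    "birth_date": ["1970-01-01", "1970-01-01", "1980-05-05"],
    "phone": ["555-0100", "555-0100", "555-0199"],
})

# Returns one row per blocking rule: comparisons the rule adds and the running
# total; this is the same table the decorator now prints before training.
stats = cumulative_comparisons_to_be_scored_from_blocking_rules_data(
    table_or_tables=df,
    blocking_rules=[block_on("birth_date"), block_on("phone")],
    link_type="dedupe_only",
    db_api=DuckDBAPI(),
)
print(stats.to_string())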
