diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 40ff740..2bec556 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.10", "3.11"] + python-version: ["3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v4 - name: Install poetry diff --git a/cli/deduplifhirLib/settings.py b/cli/deduplifhirLib/settings.py index 5e29c18..1614623 100644 --- a/cli/deduplifhirLib/settings.py +++ b/cli/deduplifhirLib/settings.py @@ -48,10 +48,21 @@ def get_additional_comparison_rules(parsed_data_df): for col in parsed_data_columns: if 'street_address' in col: - yield cl.ExactMatch(col).configure(term_frequency_adjustments=True) + yield cl.ExactMatch(col) elif 'postal_code' in col: yield cl.PostcodeComparison(col) +def create_blocking_rules(): + blocking_rules = [] + for rule in BLOCKING_RULE_STRINGS: + if isinstance(rule, list): + blocking_rules.append(block_on(*rule)) + else: + blocking_rules.append(block_on(rule)) + + return blocking_rules + + def create_settings(parsed_data_df): """ This function generates a Splink SettingsCreator object based on the parsed @@ -65,12 +76,7 @@ def create_settings(parsed_data_df): A splink SettingsCreator object to be used with a splink linker object """ - blocking_rules = [] - for rule in BLOCKING_RULE_STRINGS: - if isinstance(rule, list): - blocking_rules.append(block_on(*rule)) - else: - blocking_rules.append(block_on(rule)) + blocking_rules = create_blocking_rules() comparison_rules = [item for item in get_additional_comparison_rules(parsed_data_df)] comparison_rules.extend([ @@ -109,7 +115,7 @@ def parse_fhir_dates(fhir_json_obj): """ addresses = fhir_json_obj['entry'][0]['resource']['address'] - for addr,n in enumerate(addresses): + for n,addr in enumerate(sorted(addresses, key=str)): yield { f"street_address{n}": [normalize_addr_text(''.join(addr['line']))], f"city{n}": [normalize_addr_text(addr['city'])], diff 
--git a/cli/deduplifhirLib/splink_settings.json b/cli/deduplifhirLib/splink_settings.json index 8a04f84..e0a63fc 100644 --- a/cli/deduplifhirLib/splink_settings.json +++ b/cli/deduplifhirLib/splink_settings.json @@ -3,7 +3,6 @@ "blocking_rules_to_generate_predictions": [ "birth_date", ["ssn", "birth_date"], - ["ssn", "street_address0"], "phone" ], "max_iterations": 20, diff --git a/cli/deduplifhirLib/utils.py b/cli/deduplifhirLib/utils.py index 233fd1f..f55ce0a 100644 --- a/cli/deduplifhirLib/utils.py +++ b/cli/deduplifhirLib/utils.py @@ -13,9 +13,10 @@ from functools import wraps import pandas as pd from splink import DuckDBAPI, Linker +from splink.blocking_analysis import cumulative_comparisons_to_be_scored_from_blocking_rules_data from deduplifhirLib.settings import ( - create_settings, BLOCKING_RULE_STRINGS, read_fhir_data + create_settings, BLOCKING_RULE_STRINGS, read_fhir_data, create_blocking_rules ) from deduplifhirLib.normalization import ( normalize_addr_text, normalize_name_text, normalize_date_text @@ -92,7 +93,7 @@ def parse_csv_dict_row_addresses(row): """ parsed = row - address_keys = ["street_address","city","state","postal_code"] + address_keys = ["address","city","state","postal_code"] for k,v in row.items(): if any(match in k.lower() for match in address_keys): @@ -147,7 +148,7 @@ def parse_test_data(path,marked=False): normal_row = parse_csv_dict_row_names(normal_row) normal_row["birth_date"] = normalize_date_text(normal_row["birth_date"]) - patient_dict.update({k.lower():[v] for k,v in row.items()}) + patient_dict.update({k.lower():[v] for k,v in normal_row.items()}) #print(len(row)) #print(patient_dict) @@ -222,6 +223,17 @@ def wrapper(*args,**kwargs): raise e #lnkr = DuckDBLinker(train_frame, SPLINK_LINKER_SETTINGS_PATIENT_DEDUPE) + + preprocessing_metadata = cumulative_comparisons_to_be_scored_from_blocking_rules_data( + table_or_tables=train_frame, + blocking_rules=create_blocking_rules(), + link_type="dedupe_only", + db_api=DuckDBAPI() + ) 
+ + print("Stats for nerds:") + print(preprocessing_metadata.to_string()) + lnkr = Linker(train_frame,create_settings(train_frame),db_api=DuckDBAPI()) lnkr.training.estimate_u_using_random_sampling(max_pairs=5e6)