From 4c0d48b9a88023b89716237c0aabda81d7e4a1d7 Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Mon, 6 Jan 2025 20:18:42 +0000 Subject: [PATCH 1/6] add handling for missing disease results --- .../analyse/disease_prioritisation_analysis.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/pheval/analyse/disease_prioritisation_analysis.py b/src/pheval/analyse/disease_prioritisation_analysis.py index 88bcbeb6..9ffebdb5 100644 --- a/src/pheval/analyse/disease_prioritisation_analysis.py +++ b/src/pheval/analyse/disease_prioritisation_analysis.py @@ -38,11 +38,16 @@ def assess_disease_prioritisation( for _i, row in df.iterrows(): result = ( self.conn.execute( - f"SELECT * FROM '{standardised_disease_result_path}' " - f"WHERE contains_entity_function(CAST(COALESCE(disease_identifier, '') AS VARCHAR)," - f" '{row['disease_identifier']}') " - f"OR contains_entity_function(CAST(COALESCE(disease_name, '') AS VARCHAR), " - f"'{row['disease_name']}')" + ( + f"SELECT * FROM '{standardised_disease_result_path}' " + f"WHERE contains_entity_function(CAST(COALESCE(disease_identifier, '') AS VARCHAR)," + f" '{row['disease_identifier']}') " + f"OR contains_entity_function(CAST(COALESCE(disease_name, '') AS VARCHAR), " + f"'{row['disease_name']}')" + ) + if standardised_disease_result_path.exists() + and standardised_disease_result_path.stat().st_size > 0 + else "SELECT NULL WHERE FALSE" ) .fetchdf() .to_dict(orient="records") From f9d97c8d34d06c38ecde9b535f1cc1aaf27c5d1f Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Mon, 6 Jan 2025 20:18:52 +0000 Subject: [PATCH 2/6] add handling for missing gene results --- .../analyse/gene_prioritisation_analysis.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/pheval/analyse/gene_prioritisation_analysis.py b/src/pheval/analyse/gene_prioritisation_analysis.py index 407ed82f..05c465ad 100644 --- a/src/pheval/analyse/gene_prioritisation_analysis.py +++ b/src/pheval/analyse/gene_prioritisation_analysis.py @@ -36,11 +36,16 @@ def assess_gene_prioritisation( for _i, row in df.iterrows(): result = ( self.conn.execute( - f"SELECT * FROM '{standardised_gene_result_path}' " - f"WHERE contains_entity_function(CAST(COALESCE(gene_identifier, '') AS VARCHAR)," - f" '{row['gene_identifier']}') " - f"OR contains_entity_function(CAST(COALESCE(gene_symbol, '') AS VARCHAR), " - f"'{row['gene_symbol']}')" + ( + f"SELECT * FROM '{standardised_gene_result_path}' " + f"WHERE contains_entity_function(CAST(COALESCE(gene_identifier, '') AS VARCHAR), " + f"'{row['gene_identifier']}') " + f"OR contains_entity_function(CAST(COALESCE(gene_symbol, '') AS VARCHAR), " + f"'{row['gene_symbol']}')" + ) + if standardised_gene_result_path.exists() + and standardised_gene_result_path.stat().st_size > 0 + else "SELECT NULL WHERE FALSE" ) .fetchdf() .to_dict(orient="records") From c7434185f6d8e2d20231630eadc0f852407c6746 Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Mon, 6 Jan 2025 20:19:40 +0000 Subject: [PATCH 3/6] add handling for missing variant results --- .../variant_prioritisation_analysis.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/pheval/analyse/variant_prioritisation_analysis.py b/src/pheval/analyse/variant_prioritisation_analysis.py index 39a6f639..3e384e89 100644 --- a/src/pheval/analyse/variant_prioritisation_analysis.py +++ b/src/pheval/analyse/variant_prioritisation_analysis.py @@ -44,12 +44,16 @@ def assess_variant_prioritisation( ) result = ( self.conn.execute( - f"SELECT * FROM '{standardised_variant_result_path}' " - f"WHERE " - f"chromosome == '{causative_variant.chrom}' AND " - f"start == {causative_variant.pos} AND " - f"ref == '{causative_variant.ref}' AND " - f"alt == '{causative_variant.alt}'" + ( + f"SELECT * FROM '{standardised_variant_result_path}' " + f"WHERE " + f"chromosome == '{causative_variant.chrom}' AND " + f"start == {causative_variant.pos} AND " + f"ref == '{causative_variant.ref}' AND " + f"alt == '{causative_variant.alt}'" + ) + if standardised_variant_result_path.exists() + else "SELECT NULL WHERE FALSE" ) .fetchdf() .to_dict(orient="records") @@ -66,7 +70,8 @@ def assess_variant_prioritisation( f'UPDATE {self.table_name} SET "{self.column}" = ? WHERE identifier = ?', (variant_match, primary_key), ) - + elif len(result) == 0: + relevant_ranks.append(0) binary_classification_stats.add_classification( self.db_connection.parse_table_into_dataclass( str(standardised_variant_result_path), RankedPhEvalVariantResult From 88343bfa7c871b93a90dad72e27740df9c45cd41 Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Tue, 7 Jan 2025 09:53:02 +0000 Subject: [PATCH 4/6] add handling for missing results --- .../disease_prioritisation_analysis.py | 4 ++- .../analyse/gene_prioritisation_analysis.py | 4 ++- .../variant_prioritisation_analysis.py | 26 +++++++++---------- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/pheval/analyse/disease_prioritisation_analysis.py b/src/pheval/analyse/disease_prioritisation_analysis.py index 9ffebdb5..c3c42536 100644 --- a/src/pheval/analyse/disease_prioritisation_analysis.py +++ b/src/pheval/analyse/disease_prioritisation_analysis.py @@ -61,10 +61,12 @@ def assess_disease_prioritisation( f'UPDATE {self.table_name} SET "{self.column}" = ? WHERE identifier = ?', (disease_match, primary_key), ) + elif len(result) == 0: + relevant_ranks.append(0) binary_classification_stats.add_classification( self.db_connection.parse_table_into_dataclass( str(standardised_disease_result_path), RankedPhEvalDiseaseResult - ), + ) if standardised_disease_result_path.exists() else [], relevant_ranks, ) diff --git a/src/pheval/analyse/gene_prioritisation_analysis.py b/src/pheval/analyse/gene_prioritisation_analysis.py index 05c465ad..4be04a72 100644 --- a/src/pheval/analyse/gene_prioritisation_analysis.py +++ b/src/pheval/analyse/gene_prioritisation_analysis.py @@ -58,10 +58,12 @@ def assess_gene_prioritisation( f'UPDATE {self.table_name} SET "{self.column}" = ? WHERE identifier = ?', (gene_match, primary_key), ) + if not result: + relevant_ranks.append(0) binary_classification_stats.add_classification( self.db_connection.parse_table_into_dataclass( str(standardised_gene_result_path), RankedPhEvalGeneResult - ), + ) if standardised_gene_result_path.exists() else [], relevant_ranks, ) diff --git a/src/pheval/analyse/variant_prioritisation_analysis.py b/src/pheval/analyse/variant_prioritisation_analysis.py index 3e384e89..6835bb4e 100644 --- a/src/pheval/analyse/variant_prioritisation_analysis.py +++ b/src/pheval/analyse/variant_prioritisation_analysis.py @@ -15,10 +15,10 @@ class AssessVariantPrioritisation(AssessPrioritisationBase): """Class for assessing variant prioritisation based on thresholds and scoring orders.""" def assess_variant_prioritisation( - self, - standardised_variant_result_path: Path, - phenopacket_path: Path, - binary_classification_stats: BinaryClassificationStats, + self, + standardised_variant_result_path: Path, + phenopacket_path: Path, + binary_classification_stats: BinaryClassificationStats, ) -> None: """ Assess variant prioritisation. @@ -75,16 +75,16 @@ def assess_variant_prioritisation( binary_classification_stats.add_classification( self.db_connection.parse_table_into_dataclass( str(standardised_variant_result_path), RankedPhEvalVariantResult - ), + ) if standardised_variant_result_path.exists() else [], relevant_ranks, ) def assess_phenopacket_variant_prioritisation( - phenopacket_path: Path, - run: RunConfig, - variant_binary_classification_stats: BinaryClassificationStats, - variant_benchmarker: AssessVariantPrioritisation, + phenopacket_path: Path, + run: RunConfig, + variant_binary_classification_stats: BinaryClassificationStats, + variant_benchmarker: AssessVariantPrioritisation, ) -> None: """ Assess variant prioritisation for a Phenopacket by comparing PhEval standardised variant results @@ -107,10 +107,10 @@ def assess_phenopacket_variant_prioritisation( def benchmark_variant_prioritisation( - benchmark_name: str, - run: RunConfig, - score_order: str, - threshold: float, + benchmark_name: str, + run: RunConfig, + score_order: str, + threshold: float, ): """ Benchmark a directory based on variant prioritisation results. From c6604ffaf170b9f45eb20e808b0590b66bce6f86 Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Tue, 7 Jan 2025 09:53:12 +0000 Subject: [PATCH 5/6] modify tests to mock path --- tests/test_analysis.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tests/test_analysis.py b/tests/test_analysis.py index 28f8b6dd..12811019 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -1,7 +1,7 @@ import unittest from copy import copy from pathlib import Path -from unittest.mock import patch +from unittest.mock import patch, MagicMock import duckdb @@ -130,9 +130,13 @@ def test_assess_gene_with_threshold_meets_cutoff(self): ) def test_assess_gene_prioritisation_no_threshold(self): + mock_path = MagicMock(spec=Path) + mock_path.exists.return_value = True + mock_path.stat.return_value.st_size = 100 + mock_path.__str__.return_value = "result" self.db_connector.add_contains_function() self.assess_gene_prioritisation.assess_gene_prioritisation( - "result", + mock_path, Path("/path/to/phenopacket_1.json"), self.binary_classification_stats, ) @@ -150,7 +154,7 @@ def test_assess_gene_prioritisation_no_threshold(self): true_positives=1, true_negatives=3, false_positives=0, - false_negatives=0, + false_negatives=1, labels=[1, 0, 0, 0], scores=[0.8764, 0.5777, 0.5777, 0.3765], ), @@ -282,9 +286,13 @@ def test_assess_variant_with_threshold_meets_cutoff(self): ) def test_assess_variant_prioritisation(self): + mock_path = MagicMock(spec=Path) + mock_path.exists.return_value = True + mock_path.stat.return_value.st_size = 100 + mock_path.__str__.return_value = "result" self.db_connector.add_contains_function() self.assess_variant_prioritisation.assess_variant_prioritisation( - "result", + mock_path, Path("/path/to/phenopacket_1.json"), self.binary_classification_stats, ) @@ -318,7 +326,7 @@ def test_assess_variant_prioritisation(self): true_positives=0, true_negatives=0, false_positives=2, - false_negatives=1, + false_negatives=2, labels=[0, 0, 1], scores=[0.0484, 0.0484, 0.0484], ), @@ -439,9 +447,13 @@ def test_assess_disease_with_threshold_meets_cutoff(self): ) def test_assess_disease_prioritisation(self): + mock_path = MagicMock(spec=Path) + mock_path.exists.return_value = True + mock_path.stat.return_value.st_size = 100 + mock_path.__str__.return_value = "result" self.db_connector.add_contains_function() self.assess_disease_prioritisation.assess_disease_prioritisation( - "result", + mock_path, Path("/path/to/phenopacket_1.json"), self.binary_classification_stats, ) From 8930cd2bffa6bdf82a0a5d258bcfe08ddb698d7b Mon Sep 17 00:00:00 2001 From: Yasemin Bridges Date: Tue, 7 Jan 2025 09:54:41 +0000 Subject: [PATCH 6/6] tox lint --- .../disease_prioritisation_analysis.py | 10 ++++-- .../analyse/gene_prioritisation_analysis.py | 10 ++++-- .../variant_prioritisation_analysis.py | 34 +++++++++++-------- tests/test_analysis.py | 2 +- 4 files changed, 34 insertions(+), 22 deletions(-) diff --git a/src/pheval/analyse/disease_prioritisation_analysis.py b/src/pheval/analyse/disease_prioritisation_analysis.py index c3c42536..0b6d5c47 100644 --- a/src/pheval/analyse/disease_prioritisation_analysis.py +++ b/src/pheval/analyse/disease_prioritisation_analysis.py @@ -64,9 +64,13 @@ def assess_disease_prioritisation( elif len(result) == 0: relevant_ranks.append(0) binary_classification_stats.add_classification( - self.db_connection.parse_table_into_dataclass( - str(standardised_disease_result_path), RankedPhEvalDiseaseResult - ) if standardised_disease_result_path.exists() else [], + ( + self.db_connection.parse_table_into_dataclass( + str(standardised_disease_result_path), RankedPhEvalDiseaseResult + ) + if standardised_disease_result_path.exists() + else [] + ), relevant_ranks, ) diff --git a/src/pheval/analyse/gene_prioritisation_analysis.py b/src/pheval/analyse/gene_prioritisation_analysis.py index 4be04a72..55b47ff9 100644 --- a/src/pheval/analyse/gene_prioritisation_analysis.py +++ b/src/pheval/analyse/gene_prioritisation_analysis.py @@ -61,9 +61,13 @@ def assess_gene_prioritisation( if not result: relevant_ranks.append(0) binary_classification_stats.add_classification( - self.db_connection.parse_table_into_dataclass( - str(standardised_gene_result_path), RankedPhEvalGeneResult - ) if standardised_gene_result_path.exists() else [], + ( + self.db_connection.parse_table_into_dataclass( + str(standardised_gene_result_path), RankedPhEvalGeneResult + ) + if standardised_gene_result_path.exists() + else [] + ), relevant_ranks, ) diff --git a/src/pheval/analyse/variant_prioritisation_analysis.py b/src/pheval/analyse/variant_prioritisation_analysis.py index 6835bb4e..0f26bf72 100644 --- a/src/pheval/analyse/variant_prioritisation_analysis.py +++ b/src/pheval/analyse/variant_prioritisation_analysis.py @@ -15,10 +15,10 @@ class AssessVariantPrioritisation(AssessPrioritisationBase): """Class for assessing variant prioritisation based on thresholds and scoring orders.""" def assess_variant_prioritisation( - self, - standardised_variant_result_path: Path, - phenopacket_path: Path, - binary_classification_stats: BinaryClassificationStats, + self, + standardised_variant_result_path: Path, + phenopacket_path: Path, + binary_classification_stats: BinaryClassificationStats, ) -> None: """ Assess variant prioritisation. @@ -73,18 +73,22 @@ def assess_variant_prioritisation( elif len(result) == 0: relevant_ranks.append(0) binary_classification_stats.add_classification( - self.db_connection.parse_table_into_dataclass( - str(standardised_variant_result_path), RankedPhEvalVariantResult - ) if standardised_variant_result_path.exists() else [], + ( + self.db_connection.parse_table_into_dataclass( + str(standardised_variant_result_path), RankedPhEvalVariantResult + ) + if standardised_variant_result_path.exists() + else [] + ), relevant_ranks, ) def assess_phenopacket_variant_prioritisation( - phenopacket_path: Path, - run: RunConfig, - variant_binary_classification_stats: BinaryClassificationStats, - variant_benchmarker: AssessVariantPrioritisation, + phenopacket_path: Path, + run: RunConfig, + variant_binary_classification_stats: BinaryClassificationStats, + variant_benchmarker: AssessVariantPrioritisation, ) -> None: """ Assess variant prioritisation for a Phenopacket by comparing PhEval standardised variant results @@ -107,10 +111,10 @@ def assess_phenopacket_variant_prioritisation( def benchmark_variant_prioritisation( - benchmark_name: str, - run: RunConfig, - score_order: str, - threshold: float, + benchmark_name: str, + run: RunConfig, + score_order: str, + threshold: float, ): """ Benchmark a directory based on variant prioritisation results. diff --git a/tests/test_analysis.py b/tests/test_analysis.py index 12811019..7e6bbc8f 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -1,7 +1,7 @@ import unittest from copy import copy from pathlib import Path -from unittest.mock import patch, MagicMock +from unittest.mock import MagicMock, patch import duckdb