def prepare_corpus(
phenopacket_dir: Path,
variant_analysis: bool,
gene_analysis: bool,
@@ -1583,6 +1588,11 @@ f"Removed {phenopacket_path.name} from the corpus due to missing variant fields."
)
continue
+ elif phenopacket_util.check_variant_alleles():
+ info_log.warning(
+ f"Removed {phenopacket_path.name} from the corpus due to identical "
+ "reference and alternate allele fields."
+ )
if gene_analysis:
if phenopacket_util.check_incomplete_gene_record():
info_log.warning(
diff --git a/api/pheval/utils/phenopacket_utils/index.html b/api/pheval/utils/phenopacket_utils/index.html
index e0050faf..1a9a2db0 100644
--- a/api/pheval/utils/phenopacket_utils/index.html
+++ b/api/pheval/utils/phenopacket_utils/index.html
@@ -1161,6 +1161,13 @@
check_incomplete_variant_record
+
+
+
+
+ check_variant_alleles
+
+
@@ -1674,6 +1681,13 @@
check_incomplete_variant_record
+
+
+
+
+ check_variant_alleles
+
+
@@ -1879,20 +1893,7 @@
Source code in src/pheval/utils/phenopacket_utils.py
- 638
-639
-640
-641
-642
-643
-644
-645
-646
-647
-648
-649
-650
-651
+ 651
652
653
654
@@ -1986,7 +1987,20 @@ 742
743
744
-745 | class GeneIdentifierUpdater:
+745
+746
+747
+748
+749
+750
+751
+752
+753
+754
+755
+756
+757
+758
| class GeneIdentifierUpdater:
"""Class for updating gene identifiers within genomic interpretations."""
def __init__(self, gene_identifier: str, hgnc_data: dict = None, identifier_map: dict = None):
@@ -2184,19 +2198,19 @@
Source code in src/pheval/utils/phenopacket_utils.py
- 641
-642
-643
-644
-645
-646
-647
-648
-649
-650
-651
-652
-653 | def __init__(self, gene_identifier: str, hgnc_data: dict = None, identifier_map: dict = None):
+ 654
+655
+656
+657
+658
+659
+660
+661
+662
+663
+664
+665
+666 | def __init__(self, gene_identifier: str, hgnc_data: dict = None, identifier_map: dict = None):
"""
Initialise the GeneIdentifierUpdater.
@@ -2286,23 +2300,23 @@
Source code in src/pheval/utils/phenopacket_utils.py
- 655
-656
-657
-658
-659
-660
-661
-662
-663
-664
-665
-666
-667
-668
+ | def find_identifier(self, gene_symbol: str) -> str:
+671
+672
+673
+674
+675
+676
+677
+678
+679
+680
+681
+682
+683
+684
| def find_identifier(self, gene_symbol: str) -> str:
"""
Find the specified gene identifier for a gene symbol.
@@ -2396,17 +2410,17 @@
Source code in src/pheval/utils/phenopacket_utils.py
- 673
-674
-675
-676
-677
-678
-679
-680
-681
-682
-683 | def obtain_gene_symbol_from_identifier(self, query_gene_identifier: str) -> str:
+ 686
+687
+688
+689
+690
+691
+692
+693
+694
+695
+696 | def obtain_gene_symbol_from_identifier(self, query_gene_identifier: str) -> str:
"""
Obtain gene symbol from a gene identifier.
@@ -2494,20 +2508,7 @@
Source code in src/pheval/utils/phenopacket_utils.py
- 713
-714
-715
-716
-717
-718
-719
-720
-721
-722
-723
-724
-725
-726
+ 726
727
728
729
@@ -2526,7 +2527,20 @@ 742
743
744
-745 | def update_genomic_interpretations_gene_identifier(
+745
+746
+747
+748
+749
+750
+751
+752
+753
+754
+755
+756
+757
+758
| def update_genomic_interpretations_gene_identifier(
self, interpretations: List[Interpretation], phenopacket_path: Path
) -> List[Interpretation]:
"""
@@ -2922,20 +2936,7 @@
Source code in src/pheval/utils/phenopacket_utils.py
- 537
-538
-539
-540
-541
-542
-543
-544
-545
-546
-547
-548
-549
-550
+ 550
551
552
553
@@ -2990,7 +2991,20 @@ 602
603
604
-605 | class PhenopacketRebuilder:
+605
+606
+607
+608
+609
+610
+611
+612
+613
+614
+615
+616
+617
+618
| class PhenopacketRebuilder:
"""Class for rebuilding a Phenopacket"""
def __init__(self, phenopacket: Union[Phenopacket, Family]):
@@ -3117,13 +3131,13 @@
Source code in src/pheval/utils/phenopacket_utils.py
- 540
-541
-542
-543
-544
-545
-546 | def __init__(self, phenopacket: Union[Phenopacket, Family]):
+ 553
+554
+555
+556
+557
+558
+559 | def __init__(self, phenopacket: Union[Phenopacket, Family]):
"""Initialise PhenopacketUtil
Attributes:
@@ -3207,24 +3221,24 @@
Source code in src/pheval/utils/phenopacket_utils.py
- 569
-570
-571
-572
-573
-574
-575
-576
-577
-578
-579
-580
-581
-582
+ | def add_randomised_hpo(self, randomised_hpo: [PhenotypicFeature]) -> Union[Phenopacket, Family]:
+586
+587
+588
+589
+590
+591
+592
+593
+594
+595
+596
+597
+598
+599
| def add_randomised_hpo(self, randomised_hpo: [PhenotypicFeature]) -> Union[Phenopacket, Family]:
"""
Add randomised phenotypic profiles to a Phenopacket or Family.
@@ -3269,24 +3283,24 @@
Source code in src/pheval/utils/phenopacket_utils.py
- 588
-589
-590
-591
-592
-593
-594
-595
-596
-597
-598
-599
-600
-601
+ | def add_spiked_vcf_path(self, spiked_vcf_file_data: File) -> Union[Phenopacket, Family]:
+605
+606
+607
+608
+609
+610
+611
+612
+613
+614
+615
+616
+617
+618
| def add_spiked_vcf_path(self, spiked_vcf_file_data: File) -> Union[Phenopacket, Family]:
"""
Add a spiked VCF path to a Phenopacket or Family.
@@ -3381,26 +3395,26 @@
Source code in src/pheval/utils/phenopacket_utils.py
- 548
-549
-550
-551
-552
-553
-554
-555
-556
-557
-558
-559
-560
-561
+ 561
562
563
564
565
566
-567 | def update_interpretations(
+567
+568
+569
+570
+571
+572
+573
+574
+575
+576
+577
+578
+579
+580
| def update_interpretations(
self, interpretations: [Interpretation]
) -> Union[Phenopacket, Family]:
"""
@@ -3768,7 +3782,20 @@ 531
532
533
-534
| class PhenopacketUtil:
+534
+535
+536
+537
+538
+539
+540
+541
+542
+543
+544
+545
+546
+547
| class PhenopacketUtil:
"""Class for retrieving data from a Phenopacket or Family object"""
def __init__(self, phenopacket_contents: Union[Phenopacket, Family]):
@@ -4055,6 +4082,19 @@ return True
return False
+ def check_variant_alleles(self) -> bool:
+ """
+ Check if any variant record in the phenopacket has identical reference and alternate alleles.
+
+ Returns:
+ bool: True if the reference and alternate alleles are identical, False otherwise.
+ """
+ variants = self.diagnosed_variants()
+ for variant in variants:
+ if variant.ref == variant.alt:
+ return True
+ return False
+
def check_incomplete_gene_record(self) -> bool:
"""
Check if any gene record in the phenopacket has incomplete information.
@@ -4308,19 +4348,19 @@
Source code in src/pheval/utils/phenopacket_utils.py
- 522
-523
-524
-525
-526
-527
-528
-529
-530
-531
-532
-533
-534 | def check_incomplete_disease_record(self) -> bool:
+ 535
+536
+537
+538
+539
+540
+541
+542
+543
+544
+545
+546
+547 | def check_incomplete_disease_record(self) -> bool:
"""
Check if any disease record in the phenopacket has incomplete information.
@@ -4382,21 +4422,21 @@
Source code in src/pheval/utils/phenopacket_utils.py
- 506
-507
-508
-509
-510
-511
-512
-513
-514
-515
-516
-517
-518
-519
-520 | def check_incomplete_gene_record(self) -> bool:
+ 519
+520
+521
+522
+523
+524
+525
+526
+527
+528
+529
+530
+531
+532
+533 | def check_incomplete_gene_record(self) -> bool:
"""
Check if any gene record in the phenopacket has incomplete information.
@@ -4515,6 +4555,76 @@
+ check_variant_alleles()
+
+
+
+
+
+
+ Check if any variant record in the phenopacket has identical reference and alternate alleles.
+
+
+
+ Returns:
+
+
+
+Name | Type |
+ Description |
+
+
+
+
+bool |
+ bool
+ |
+
+
+ True if the reference and alternate alleles are identical, False otherwise.
+
+ |
+
+
+
+
+
+ Source code in src/pheval/utils/phenopacket_utils.py
+ 506
+507
+508
+509
+510
+511
+512
+513
+514
+515
+516
+517 | def check_variant_alleles(self) -> bool:
+ """
+ Check if any variant record in the phenopacket has identical reference and alternate alleles.
+
+ Returns:
+ bool: True if the reference and alternate alleles are identical, False otherwise.
+ """
+ variants = self.diagnosed_variants()
+ for variant in variants:
+ if variant.ref == variant.alt:
+ return True
+ return False
+
|
+
+
+
+
+
+
+
+
+
+
diagnosed_genes()
@@ -6044,17 +6154,17 @@
Source code in src/pheval/utils/phenopacket_utils.py
- 608
-609
-610
-611
-612
-613
-614
-615
-616
-617
-618 | def create_json_message(phenopacket: Union[Phenopacket, Family]) -> str:
+ 621
+622
+623
+624
+625
+626
+627
+628
+629
+630
+631 | def create_json_message(phenopacket: Union[Phenopacket, Family]) -> str:
"""
Create a JSON message for writing to a file.
@@ -6336,21 +6446,21 @@
Source code in src/pheval/utils/phenopacket_utils.py
- 621
-622
-623
-624
-625
-626
-627
-628
-629
-630
-631
-632
-633
-634
-635 | def write_phenopacket(phenopacket: Union[Phenopacket, Family], output_file: Path) -> None:
+ 634
+635
+636
+637
+638
+639
+640
+641
+642
+643
+644
+645
+646
+647
+648 | def write_phenopacket(phenopacket: Union[Phenopacket, Family], output_file: Path) -> None:
"""
Write a Phenopacket or Family object to a file in JSON format.
diff --git a/developing_a_pheval_plugin/index.html b/developing_a_pheval_plugin/index.html
index 3530ddec..5dbf069e 100644
--- a/developing_a_pheval_plugin/index.html
+++ b/developing_a_pheval_plugin/index.html
@@ -2265,7 +2265,20 @@ Utility methods
531
532
533
-534
| class PhenopacketUtil:
+534
+535
+536
+537
+538
+539
+540
+541
+542
+543
+544
+545
+546
+547
| class PhenopacketUtil:
"""Class for retrieving data from a Phenopacket or Family object"""
def __init__(self, phenopacket_contents: Union[Phenopacket, Family]):
@@ -2552,6 +2565,19 @@ Utility methods
return True
return False
+ def check_variant_alleles(self) -> bool:
+ """
+ Check if any variant record in the phenopacket has identical reference and alternate alleles.
+
+ Returns:
+ bool: True if the reference and alternate alleles are identical, False otherwise.
+ """
+ variants = self.diagnosed_variants()
+ for variant in variants:
+ if variant.ref == variant.alt:
+ return True
+ return False
+
def check_incomplete_gene_record(self) -> bool:
"""
Check if any gene record in the phenopacket has incomplete information.
diff --git a/objects.inv b/objects.inv
index 8498aff7..d589b25d 100644
Binary files a/objects.inv and b/objects.inv differ
diff --git a/search/search_index.json b/search/search_index.json
index c5f01e8b..fc06ec0b 100644
--- a/search/search_index.json
+++ b/search/search_index.json
@@ -1 +1 @@
-{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Home Introduction PhEval - Phenotypic Inference Evaluation Framework PhEval: Tool-specific processing (VP pipeline) flowchart LR PC-->DP PC[(Phenopackets Corpus)] SSSOM[Semantic Similarity Profiles Mapping Commons]-->|OAK-SEMSIM|DP[Data Prepare] KG[Source data KG - Monarch KG]-->|KGX-BIOLINK|DP[Data Prepare] ONT[Ontologies - Phenio]-->|OAK-ONTO|DP[Data Prepare] DP-->RP[Run Prepare] RP-->PR[PhEval Runner] PR-->DP2[Data Process] ER[Exomiser Runner]-->PR EDP[Exomiser Data Prepare]-->DP ERP[Exomiser Run Prepare]-->RP PPP[Disease-profile similarity prediction Post-process]-->DP2 PV[Phenotype/Variant]-->DP2 GVP[Gene VP Post-process]-->DP2 EPP[Exomiser Post Process]-->GVP GVP-->VPR[VP Report] Quick links: GitHub page","title":"Home"},{"location":"#home","text":"","title":"Home"},{"location":"#introduction","text":"PhEval - Phenotypic Inference Evaluation Framework","title":"Introduction"},{"location":"#pheval-tool-specific-processing-vp-pipeline","text":"flowchart LR PC-->DP PC[(Phenopackets Corpus)] SSSOM[Semantic Similarity Profiles Mapping Commons]-->|OAK-SEMSIM|DP[Data Prepare] KG[Source data KG - Monarch KG]-->|KGX-BIOLINK|DP[Data Prepare] ONT[Ontologies - Phenio]-->|OAK-ONTO|DP[Data Prepare] DP-->RP[Run Prepare] RP-->PR[PhEval Runner] PR-->DP2[Data Process] ER[Exomiser Runner]-->PR EDP[Exomiser Data Prepare]-->DP ERP[Exomiser Run Prepare]-->RP PPP[Disease-profile similarity prediction Post-process]-->DP2 PV[Phenotype/Variant]-->DP2 GVP[Gene VP Post-process]-->DP2 EPP[Exomiser Post Process]-->GVP GVP-->VPR[VP Report] Quick links: GitHub page","title":"PhEval: Tool-specific processing (VP pipeline)"},{"location":"CODE_OF_CONDUCT/","text":"Contributor Covenant Code of Conduct Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project 
and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. Our Standards Examples of behavior that contributes to creating a positive environment include: Using welcoming and inclusive language Being respectful of differing viewpoints and experiences Gracefully accepting constructive criticism Focusing on what is best for the community Showing empathy towards other community members Examples of unacceptable behavior by participants include: The use of sexualized language or imagery and unwelcome sexual attention or advances Trolling, insulting/derogatory comments, and personal or political attacks Public or private harassment Publishing others' private information, such as a physical or electronic address, without explicit permission Other conduct which could reasonably be considered inappropriate in a professional setting Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. Scope This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 
Representation of a project may be further defined and clarified by project maintainers. Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. Attribution This code of conduct has been derived from the excellent code of conduct of the ATOM project which in turn is adapted from the Contributor Covenant , version 1.4, available at https://contributor-covenant.org/version/1/4","title":"Contributor Covenant Code of Conduct"},{"location":"CODE_OF_CONDUCT/#contributor-covenant-code-of-conduct","text":"","title":"Contributor Covenant Code of Conduct"},{"location":"CODE_OF_CONDUCT/#our-pledge","text":"In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.","title":"Our Pledge"},{"location":"CODE_OF_CONDUCT/#our-standards","text":"Examples of behavior that contributes to creating a positive environment include: Using welcoming and inclusive language Being respectful of differing viewpoints and experiences Gracefully accepting constructive criticism Focusing on what is best for the community Showing empathy towards other community members Examples of unacceptable behavior by participants include: The use of 
sexualized language or imagery and unwelcome sexual attention or advances Trolling, insulting/derogatory comments, and personal or political attacks Public or private harassment Publishing others' private information, such as a physical or electronic address, without explicit permission Other conduct which could reasonably be considered inappropriate in a professional setting","title":"Our Standards"},{"location":"CODE_OF_CONDUCT/#our-responsibilities","text":"Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.","title":"Our Responsibilities"},{"location":"CODE_OF_CONDUCT/#scope","text":"This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.","title":"Scope"},{"location":"CODE_OF_CONDUCT/#enforcement","text":"Instances of abusive, harassing, or otherwise unacceptable behavior. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 
Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.","title":"Enforcement"},{"location":"CODE_OF_CONDUCT/#attribution","text":"This code of conduct has been derived from the excellent code of conduct of the ATOM project which in turn is adapted from the Contributor Covenant , version 1.4, available at https://contributor-covenant.org/version/1/4","title":"Attribution"},{"location":"about/","text":"PhEval - Phenotypic Inference Evaluation Framework Many variant prioritization tools (such as Exomiser and other computational approaches) rely on ontologies and phenotype matching, sometimes involving complex processes such as cross-species inference. The performance of such tools is exceedingly hard to evaluate because of the many factors involved: changes to the structure of the ontology, cross-species mappings, and semantic similarity algorithms can have significant consequences. Furthermore, the lack of suitable real-world problems/corpora leads to the situation that many algorithms are evaluated using simulations, which may fail to capture real-world scenarios. The lack of an evaluation framework that enables studying effects on data and knowledge inputs on real-world problems makes it difficult to optimize algorithms. To this end, we are developing a modular Phenotypic Inference Evaluation Framework (PhEval), which is delivered as a community resource.","title":"About"},{"location":"about/#pheval-phenotypic-inference-evaluation-framework","text":"Many variant prioritization tools (such as Exomiser and other computational approaches) rely on ontologies and phenotype matching, sometimes involving complex processes such as cross-species inference. 
The performance of such tools is exceedingly hard to evaluate because of the many factors involved: changes to the structure of the ontology, cross-species mappings, and semantic similarity algorithms can have significant consequences. Furthermore, the lack of suitable real-world problems/corpora leads to the situation that many algorithms are evaluated using simulations, which may fail to capture real-world scenarios. The lack of an evaluation framework that enables studying effects on data and knowledge inputs on real-world problems makes it difficult to optimize algorithms. To this end, we are developing a modular Phenotypic Inference Evaluation Framework (PhEval), which is delivered as a community resource.","title":"PhEval - Phenotypic Inference Evaluation Framework"},{"location":"contact/","text":"Contact The preferred way to contact the PhEval team is through the issue tracker (for problems with PhEval) or the GitHub discussions (for general questions). You can find any of the members of the PhEval core team on GitHub: https://github.com/orgs/monarch-initiative/teams/pheval-team Their GitHub profiles usually also provide email addresses.","title":"Contact Us"},{"location":"contact/#contact","text":"The preferred way to contact the PhEval team is through the issue tracker (for problems with PhEval) or the GitHub discussions (for general questions). You can find any of the members of the PhEval core team on GitHub: https://github.com/orgs/monarch-initiative/teams/pheval-team Their GitHub profiles usually also provide email addresses.","title":"Contact"},{"location":"contributing/","text":"Contributions First of all: Thank you for taking the time to contribute! The following is a set of guidelines for contributing to the PhEval framework. These guidelines are not strict rules. Use your best judgment, and feel free to propose changes to this document in a pull request. 
Table Of Contents Contributions Table Of Contents Code of Conduct Guidelines for Contributions and Requests Reporting problems with the data model Code of Conduct The monarch-technical-documentation team strives to create a welcoming environment for editors, users and other contributors. Please carefully read our Code of Conduct . Guidelines for Contributions and Requests Reporting problems with the data model Please use our Issue Tracker for reporting problems with the ontology.","title":"Contributions"},{"location":"contributing/#contributions","text":"First of all: Thank you for taking the time to contribute! The following is a set of guidelines for contributing to the PhEval framework. These guidelines are not strict rules. Use your best judgment, and feel free to propose changes to this document in a pull request.","title":"Contributions"},{"location":"contributing/#table-of-contents","text":"Contributions Table Of Contents Code of Conduct Guidelines for Contributions and Requests Reporting problems with the data model","title":"Table Of Contents"},{"location":"contributing/#code-of-conduct","text":"The monarch-technical-documentation team strives to create a welcoming environment for editors, users and other contributors. Please carefully read our Code of Conduct .","title":"Code of Conduct"},{"location":"contributing/#guidelines-for-contributions-and-requests","text":"","title":"Guidelines for Contributions and Requests"},{"location":"contributing/#reporting-problems-with-the-data-model","text":"Please use our Issue Tracker for reporting problems with the ontology.","title":"Reporting problems with the data model"},{"location":"developing_a_pheval_plugin/","text":"Developing a PhEval Plugin Description Plugin development allows PhEval to be extensible, as we have designed it. The plugin goal is to be flexible through custom runner implementations. This plugin development enhances the PhEval functionality. 
You can build one quickly using this step-by-step process. All custom Runners implementations must implement all PhevalRunner methods Bases: ABC PhEvalRunner Class Source code in src/pheval/runners/runner.py 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 @dataclass class PhEvalRunner ( ABC ): \"\"\"PhEvalRunner Class\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str directory_path = None input_dir_config = None _meta_data = None __raw_results_dir = \"raw_results/\" __pheval_gene_results_dir = \"pheval_gene_results/\" __pheval_variant_results_dir = \"pheval_variant_results/\" __pheval_disease_results_dir = \"pheval_disease_results/\" __tool_input_commands_dir = \"tool_input_commands/\" __run_meta_data_file = \"results.yml\" def __post_init__ ( self ): self . input_dir_config = parse_input_dir_config ( self . input_dir ) def _get_tool ( self ): return self . input_dir_config . tool def _get_variant_analysis ( self ): return self . input_dir_config . variant_analysis def _get_gene_analysis ( self ): return self . input_dir_config . gene_analysis def _get_disease_analysis ( self ): return self . input_dir_config . disease_analysis @property def tool_input_commands_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __tool_input_commands_dir ) @tool_input_commands_dir . setter def tool_input_commands_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def raw_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __raw_results_dir ) @raw_results_dir . setter def raw_results_dir ( self , directory_path ): self . 
directory_path = Path ( directory_path ) @property def pheval_gene_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_gene_results_dir ) @pheval_gene_results_dir . setter def pheval_gene_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_variant_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_variant_results_dir ) @pheval_variant_results_dir . setter def pheval_variant_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_disease_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_disease_results_dir ) @pheval_disease_results_dir . setter def pheval_disease_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) def build_output_directory_structure ( self ): \"\"\"build output directory structure\"\"\" self . tool_input_commands_dir . mkdir ( exist_ok = True ) self . raw_results_dir . mkdir ( exist_ok = True ) if self . _get_variant_analysis (): self . pheval_variant_results_dir . mkdir ( exist_ok = True ) if self . _get_gene_analysis (): self . pheval_gene_results_dir . mkdir ( exist_ok = True ) if self . _get_disease_analysis (): self . pheval_disease_results_dir . mkdir ( exist_ok = True ) @property def meta_data ( self ): self . _meta_data = BasicOutputRunMetaData ( tool = self . input_dir_config . tool , tool_version = self . version , config = f \" { Path ( self . input_dir ) . parent . name } / { Path ( self . input_dir ) . name } \" , run_timestamp = datetime . now () . timestamp (), corpus = f \" { Path ( self . testdata_dir ) . parent . name } / { Path ( self . testdata_dir ) . name } \" , ) return self . _meta_data @meta_data . setter def meta_data ( self , meta_data ): self . 
_meta_data = meta_data @abstractmethod def prepare ( self ) -> str : \"\"\"prepare\"\"\" @abstractmethod def run ( self ): \"\"\"run\"\"\" @abstractmethod def post_process ( self ): \"\"\"post_process\"\"\" def construct_meta_data ( self ): \"\"\"Construct run output meta data\"\"\" return self . meta_data Step-by-Step Plugin Development Process The plugin structure is derived from a cookiecutter template, Sphintoxetry-cookiecutter , and it uses Sphinx , tox and poetry as core dependencies. This allows PhEval extensibility to be standardized in terms of documentation and dependency management. 1. Sphintoxetry-cookiecutter scaffold First, install the cruft package. Cruft enables keeping projects up-to-date with future updates made to this original template. Install the latest release of cruft from pip pip install cruft NOTE: You may encounter an error with the naming of the project layout if using an older release of cruft. To avoid this, make sure you have installed the latest release version. Next, create a project using the sphintoxetry-cookiecutter template. cruft create https://github.com/monarch-initiative/monarch-project-template 2. Further setup Install poetry if you haven't already. pip install poetry Install dependencies poetry install Add PhEval dependency poetry add pheval Run tox to see if the setup works poetry run tox 3. 
Implement PhEval Custom Runner The runner name is arbitrary and custom Runner name was chose by demonstrative purposes Create a runner file inside the plugin project, e.g: \"\"\"Custom Pheval Runner.\"\"\" from dataclasses import dataclass from pathlib import Path from pheval.runners.runner import PhEvalRunner @dataclass class CustomPhevalRunner ( PhEvalRunner ): \"\"\"CustomPhevalRunner Class.\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str def prepare ( self ): \"\"\"prepare method.\"\"\" print ( \"preparing\" ) def run ( self ): \"\"\"run method.\"\"\" print ( \"running with custom pheval runner\" ) def post_process ( self ): \"\"\"post_process method.\"\"\" print ( \"post processing\" ) 4. Add PhEval Plugins section to the pyproject.toml file [tool.poetry.plugins. \"pheval.plugins\" ] customrunner = \"pheval_plugin_example.runner:CustomPhevalRunner\" Replace the value above with the path to your custom runner plugin 5. Implementing PhEval helper methods Streamlining the creation of your custom PhEval runner can be facilitated by leveraging PhEval's versatile helper methods, where applicable. Within PhEval, numerous public methods have been designed to assist in your runner methods. The utilisation of these helper methods is optional, yet they are crafted to enhance the overall implementation process. Utility methods The PhenopacketUtil class is designed to aid in the collection of specific data from a Phenopacket. 
Class for retrieving data from a Phenopacket or Family object Source code in src/pheval/utils/phenopacket_utils.py 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 class PhenopacketUtil : \"\"\"Class for retrieving data from a Phenopacket or Family object\"\"\" def __init__ ( self , phenopacket_contents : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Args: phenopacket_contents (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . phenopacket_contents = phenopacket_contents def sample_id ( self ) -> str : \"\"\" Retrieve the sample ID from a Phenopacket or proband of a Family Returns: str: Sample ID \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . subject . 
id else : return self . phenopacket_contents . subject . id def phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all HPO terms Returns: List[PhenotypicFeature]: List of HPO terms \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . phenotypic_features else : return self . phenopacket_contents . phenotypic_features def observed_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all observed HPO terms Returns: List[PhenotypicFeature]: List of observed HPO terms \"\"\" phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : continue phenotypic_features . append ( p ) return phenotypic_features def negated_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all negated HPO terms Returns: List[PhenotypicFeature]: List of negated HPO terms \"\"\" negated_phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : negated_phenotypic_features . append ( p ) return negated_phenotypic_features def diseases ( self ) -> List [ Disease ]: \"\"\" Retrieve a list of Diseases associated with the proband Returns: List[Disease]: List of diseases \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . diseases else : return self . phenopacket_contents . diseases def _diagnosis_from_interpretations ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a list of disease diagnoses associated with the proband from the interpretations object Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" diagnoses = [] interpretation = self . interpretations () for i in interpretation : ( diagnoses . append ( ProbandDisease ( disease_name = i . diagnosis . disease . label , disease_identifier = i . diagnosis . disease . 
id , ) ) if i . diagnosis . disease . label != \"\" and i . diagnosis . disease . id != \"\" else None ) return diagnoses def _diagnosis_from_disease ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a list of disease diagnoses associated with the proband from the diseases object Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" diagnoses = [] for disease in self . diseases (): diagnoses . append ( ProbandDisease ( disease_name = disease . term . label , disease_identifier = disease . term . id ) ) return diagnoses def diagnoses ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" return list ( set ( self . _diagnosis_from_interpretations () + self . _diagnosis_from_disease ())) def interpretations ( self ) -> List [ Interpretation ]: \"\"\" Retrieve a list of interpretations from a Phenopacket Returns: List[Interpretation]: List of interpretations \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . interpretations else : return self . phenopacket_contents . interpretations def causative_variants ( self ) -> List [ ProbandCausativeVariant ]: \"\"\" Retrieve a list of causative variants listed in a Phenopacket Returns: List[ProbandCausativeVariant]: List of proband causative variants \"\"\" all_variants = [] interpretation = self . interpretations () for i in interpretation : for g in i . diagnosis . genomic_interpretations : vcf_record = g . variant_interpretation . variation_descriptor . vcf_record genotype = g . variant_interpretation . variation_descriptor . allelic_state variant_data = ProbandCausativeVariant ( self . phenopacket_contents . subject . id , vcf_record . genome_assembly , GenomicVariant ( vcf_record . chrom , vcf_record . pos , vcf_record . ref , vcf_record . alt , ), genotype . label , vcf_record . info , ) all_variants . 
append ( variant_data ) return all_variants def files ( self ) -> List [ File ]: \"\"\" Retrieve a list of files associated with a phenopacket Returns: List[File]: List of files associated with a phenopacket \"\"\" return self . phenopacket_contents . files def vcf_file_data ( self , phenopacket_path : Path , vcf_dir : Path ) -> File : \"\"\" Retrieve the genome assembly and VCF file name from a phenopacket. Args: phenopacket_path (Path): The path to the phenopacket file. vcf_dir (Path): The directory path where the VCF file is stored. Returns: File: The VCF file with updated URI pointing to the specified directory. Raises: IncorrectFileFormatError: If the provided file is not in .vcf or .vcf.gz format. IncompatibleGenomeAssemblyError: If the genome assembly of the VCF file is not compatible. Note: This function searches for a VCF file within the provided list of files, validates its format, and checks if the genome assembly is compatible. If the conditions are met, it updates the URI of the VCF file to the specified directory and returns the modified file object. \"\"\" compatible_genome_assembly = [ \"GRCh37\" , \"hg19\" , \"GRCh38\" , \"hg38\" ] vcf_data = [ file for file in self . files () if file . file_attributes [ \"fileFormat\" ] == \"vcf\" ][ 0 ] if not Path ( vcf_data . uri ) . name . endswith ( \".vcf\" ) and not Path ( vcf_data . uri ) . name . endswith ( \".vcf.gz\" ): raise IncorrectFileFormatError ( Path ( vcf_data . uri ), \".vcf or .vcf.gz file\" ) if vcf_data . file_attributes [ \"genomeAssembly\" ] not in compatible_genome_assembly : raise IncompatibleGenomeAssemblyError ( vcf_data . file_attributes [ \"genomeAssembly\" ], phenopacket_path ) vcf_data . uri = str ( vcf_dir . joinpath ( Path ( vcf_data . uri ) . 
name )) return vcf_data @staticmethod def _extract_diagnosed_gene ( genomic_interpretation : GenomicInterpretation , ) -> ProbandCausativeGene : \"\"\" Retrieve the disease causing genes from the variant descriptor field if not empty, otherwise, retrieves from the gene descriptor from a phenopacket. Args: genomic_interpretation (GenomicInterpretation): A genomic interpretation from a Phenopacket Returns: ProbandCausativeGene: The disease causing gene \"\"\" if genomic_interpretation . variant_interpretation . ByteSize () != 0 : return ProbandCausativeGene ( genomic_interpretation . variant_interpretation . variation_descriptor . gene_context . symbol , genomic_interpretation . variant_interpretation . variation_descriptor . gene_context . value_id , ) else : return ProbandCausativeGene ( gene_symbol = genomic_interpretation . gene . symbol , gene_identifier = genomic_interpretation . gene . value_id , ) def diagnosed_genes ( self ) -> List [ ProbandCausativeGene ]: \"\"\" Retrieve the disease causing genes from a phenopacket. Returns: List[ProbandCausativeGene]: List of causative genes \"\"\" pheno_interpretation = self . interpretations () genes = [] for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : genes . append ( self . _extract_diagnosed_gene ( g )) genes = list ({ gene . gene_symbol : gene for gene in genes } . values ()) return genes def diagnosed_variants ( self ) -> List [ GenomicVariant ]: \"\"\" Retrieve a list of all known causative variants from a phenopacket. Returns: List[GenomicVariant]: List of causative variants \"\"\" variants = [] pheno_interpretation = self . interpretations () for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : variant = GenomicVariant ( chrom = str ( g . variant_interpretation . variation_descriptor . vcf_record . chrom . replace ( \"chr\" , \"\" ) ), pos = int ( g . variant_interpretation . variation_descriptor . vcf_record . pos ), ref = g . 
variant_interpretation . variation_descriptor . vcf_record . ref , alt = g . variant_interpretation . variation_descriptor . vcf_record . alt , ) variants . append ( variant ) return variants def check_incomplete_variant_record ( self ) -> bool : \"\"\" Check if any variant record in the phenopacket has incomplete information. This method iterates through the diagnosed variant records and checks if any of them have missing or incomplete information such as empty chromosome, position, reference, or alternate allele. Returns: bool: True if any variant record is incomplete, False otherwise. \"\"\" variants = self . diagnosed_variants () for variant in variants : if ( variant . chrom == \"\" or variant . pos == 0 or variant . pos == \"\" or variant . ref == \"\" or variant . alt == \"\" ): return True return False def check_incomplete_gene_record ( self ) -> bool : \"\"\" Check if any gene record in the phenopacket has incomplete information. This method iterates through the diagnosed gene records and checks if any of them have missing or incomplete information such as gene name, or gene identifier. Returns: bool: True if any gene record is incomplete, False otherwise. \"\"\" genes = self . diagnosed_genes () for gene in genes : if gene . gene_symbol == \"\" or gene . gene_identifier == \"\" : return True return False def check_incomplete_disease_record ( self ) -> bool : \"\"\" Check if any disease record in the phenopacket has incomplete information. This method iterates through the diagnosed disease records and checks if any of them have missing or incomplete information such as empty disease name, or disease identifier. Returns: bool: True if any disease record is incomplete, False otherwise. \"\"\" if len ( self . diagnoses ()) == 0 : return True return False PhenopacketUtil proves particularly beneficial in scenarios where the tool for which you're crafting a runner implementation does not directly accept Phenopackets as inputs. 
Instead, it might require elements\u2014such as HPO IDs\u2014 via the command-line interface (CLI). In this context, leveraging PhenopacketUtil within the runner's preparation phase enables the extraction of observed phenotypic features from the Phenopacket input, facilitating seamless processing. An example of how this could be implemented is outlined here: from pheval.utils.phenopacket_utils import phenopacket_reader from pheval.utils.phenopacket_utils import PhenopacketUtil phenopacket = phenopacket_reader ( \"/path/to/phenopacket.json\" ) phenopacket_util = PhenopacketUtil ( phenopacket ) # To return a list of all observed phenotypes for a phenopacket observed_phenotypes = phenopacket_util . observed_phenotypic_features () # To extract just the HPO ID as a list observed_phenotypes_hpo_ids = [ observed_phenotype . id for observed_phenotype in observed_phenotypes ] Additional tool-specific configurations For the pheval run command to execute successfully, a config.yaml should be found within the input directory supplied on the CLI. tool : tool_version : variant_analysis : gene_analysis : disease_analysis : tool_specific_configuration_options : The tool_specific_configuration_options is an optional field that can be populated with any variables specific to your runner implementation that is required for the running of your tool. All other fields are required to be filled in. The variant_analysis , gene_analysis , and disease_analysis are set as booleans and are for specifying what type of analysis/prioritisation the tool outputs. To populate the tool_specific_configurations_options with customised data, we suggest using the pydantic package as it can easily parse the data from the yaml structure. 
e.g., Define a BaseModel class with the fields that will populate the tool_specific_configuration_options from pydantic import BaseModel , Field class CustomisedConfigurations ( BaseModel ): \"\"\" Class for defining the customised configurations in tool_specific_configurations field, within the input_dir config.yaml Args: environment (str): Environment to run \"\"\" environment : str = Field ( ... ) Within your runner parse the field into an object. from dataclasses import dataclass from pheval.runners.runner import PhEvalRunner from pathlib import Path @dataclass class CustomPhevalRunner ( PhEvalRunner ): \"\"\"CustomPhevalRunner Class.\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str def prepare ( self ): \"\"\"prepare method.\"\"\" print ( \"preparing\" ) config = CustomisedConfigurations . parse_obj ( self . input_dir_config . tool_specific_configuration_options ) environment = config . environment def run ( self ): \"\"\"run method.\"\"\" print ( \"running with custom pheval runner\" ) def post_process ( self ): \"\"\"post_process method.\"\"\" print ( \"post processing\" ) Post-processing methods PhEval currently supports the benchmarking of gene, variant, and disease prioritisation results. To benchmark these result types, PhEval TSV result files need to be generated. PhEval can deal with the ranking and generation of these files to the correct location. However, the runner implementation must handle the extraction of essential data from the tool-specific raw results. This involves transforming them into a list comprising PhEval data classes, with each instance representing a result entry. 
The dataclasses representing essential information extracted from tool-specific output for gene, variant, and disease prioritisation are defined as follows: Bases: PhEvalResult Minimal data required from tool-specific output for gene prioritisation result Args: gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry score (float): The score for the gene result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. Source code in src/pheval/post_processing/post_processing.py 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 @dataclass class PhEvalGeneResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for gene prioritisation result Args: gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry score (float): The score for the gene result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" gene_symbol : Union [ List [ str ], str ] gene_identifier : Union [ List [ str ], str ] score : float Bases: PhEvalResult Minimal data required from tool-specific output for variant prioritisation Args: chromosome (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. 
start (int): The start position of the variant end (int): The end position of the variant ref (str): The reference allele of the variant alt (str): The alternate allele of the variant score (float): The score for the variant result entry Notes: While we recommend providing the variant's chromosome in the specified format, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. Source code in src/pheval/post_processing/post_processing.py 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 @dataclass class PhEvalVariantResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for variant prioritisation Args: chromosome (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. start (int): The start position of the variant end (int): The end position of the variant ref (str): The reference allele of the variant alt (str): The alternate allele of the variant score (float): The score for the variant result entry Notes: While we recommend providing the variant's chromosome in the specified format, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" chromosome : str start : int end : int ref : str alt : str score : float Bases: PhEvalResult Minimal data required from tool-specific output for disease prioritisation Args: disease_name (str): Disease name for the result entry disease_identifier (str): Identifier for the disease result entry in the OMIM namespace score (str): Score for the disease result entry Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. 
Source code in src/pheval/post_processing/post_processing.py 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 @dataclass class PhEvalDiseaseResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for disease prioritisation Args: disease_name (str): Disease name for the result entry disease_identifier (str): Identifier for the disease result entry in the OMIM namespace score (str): Score for the disease result entry Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" disease_name : str disease_identifier : str score : float The generate_pheval_result() can be implemented in your runner to write out the PhEval TSV results. An example of how the method can be called is outlined here: from pheval.post_processing.post_processing import generate_pheval_result generate_pheval_result ( pheval_result = pheval_gene_result , # this is the list of extracted PhEval result requirements sort_order_str = \"descending\" , # or can be ascending - this determines in which order the scores will be ranked output_dir = output_directory , # this can be accessed from the runner instance e.g., self.output_dir tool_result_path = tool_result_json # this is the path to the tool-specific raw results file ) Adding metadata to the results.yml By default, PhEval will write a results.yml to the output directory supplied on the CLI. The results.yml contains basic metadata regarding the run configuration, however, there is also the option to add customised run metadata to the results.yml in the tool_specific_configuration_options field. To achieve this, you'll need to create a construct_meta_data() method within your runner implementation. This method is responsible for appending customised metadata to the metadata object in the form of a defined dataclass. 
It should return the entire metadata object once the addition is completed. e.g., Defined customised metadata dataclass: from dataclasses import dataclass @dataclass class CustomisedMetaData : customised_field : str Example of implementation in the runner. from dataclasses import dataclass from pheval.runners.runner import PhEvalRunner from pathlib import Path @dataclass class CustomPhevalRunner ( PhEvalRunner ): \"\"\"CustomPhevalRunner Class.\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str def prepare ( self ): \"\"\"prepare method.\"\"\" print ( \"preparing\" ) def run ( self ): \"\"\"run method.\"\"\" print ( \"running with custom pheval runner\" ) def post_process ( self ): \"\"\"post_process method.\"\"\" print ( \"post processing\" ) def construct_meta_data ( self ): \"\"\"Add metadata.\"\"\" self . meta_data . tool_specific_configuration_options = CustomisedMetaData ( customised_field = \"customised_value\" ) return self . meta_data 6. Test it. To update your custom pheval runner implementation, you must first install the package poetry install Now you have to be able to run PhEval passing your custom runner as parameter. e.g., pheval run -i ./input_dir -t ./test_data_dir -r 'customphevalrunner' -o output_dir The -r parameter stands for your plugin runner class name, and it must be entirely lowercase. Output: preparing running with custom pheval Runner post processing Pay attention to \" running with custom pheval Runner \" line, this is exactly what we had implemented in the CustomPhevalRunner Example","title":"Developing a PhEval Plugin"},{"location":"developing_a_pheval_plugin/#developing-a-pheval-plugin","text":"","title":"Developing a PhEval Plugin"},{"location":"developing_a_pheval_plugin/#description","text":"Plugin development allows PhEval to be extensible, as we have designed it. The plugin goal is to be flexible through custom runner implementations. 
This plugin development enhances the PhEval functionality. You can build one quickly using this step-by-step process. All custom Runners implementations must implement all PhevalRunner methods Bases: ABC PhEvalRunner Class Source code in src/pheval/runners/runner.py 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 @dataclass class PhEvalRunner ( ABC ): \"\"\"PhEvalRunner Class\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str directory_path = None input_dir_config = None _meta_data = None __raw_results_dir = \"raw_results/\" __pheval_gene_results_dir = \"pheval_gene_results/\" __pheval_variant_results_dir = \"pheval_variant_results/\" __pheval_disease_results_dir = \"pheval_disease_results/\" __tool_input_commands_dir = \"tool_input_commands/\" __run_meta_data_file = \"results.yml\" def __post_init__ ( self ): self . input_dir_config = parse_input_dir_config ( self . input_dir ) def _get_tool ( self ): return self . input_dir_config . tool def _get_variant_analysis ( self ): return self . input_dir_config . variant_analysis def _get_gene_analysis ( self ): return self . input_dir_config . gene_analysis def _get_disease_analysis ( self ): return self . input_dir_config . disease_analysis @property def tool_input_commands_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __tool_input_commands_dir ) @tool_input_commands_dir . setter def tool_input_commands_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def raw_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __raw_results_dir ) @raw_results_dir . 
setter def raw_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_gene_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_gene_results_dir ) @pheval_gene_results_dir . setter def pheval_gene_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_variant_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_variant_results_dir ) @pheval_variant_results_dir . setter def pheval_variant_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_disease_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_disease_results_dir ) @pheval_disease_results_dir . setter def pheval_disease_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) def build_output_directory_structure ( self ): \"\"\"build output directory structure\"\"\" self . tool_input_commands_dir . mkdir ( exist_ok = True ) self . raw_results_dir . mkdir ( exist_ok = True ) if self . _get_variant_analysis (): self . pheval_variant_results_dir . mkdir ( exist_ok = True ) if self . _get_gene_analysis (): self . pheval_gene_results_dir . mkdir ( exist_ok = True ) if self . _get_disease_analysis (): self . pheval_disease_results_dir . mkdir ( exist_ok = True ) @property def meta_data ( self ): self . _meta_data = BasicOutputRunMetaData ( tool = self . input_dir_config . tool , tool_version = self . version , config = f \" { Path ( self . input_dir ) . parent . name } / { Path ( self . input_dir ) . name } \" , run_timestamp = datetime . now () . timestamp (), corpus = f \" { Path ( self . testdata_dir ) . parent . name } / { Path ( self . testdata_dir ) . name } \" , ) return self . _meta_data @meta_data . setter def meta_data ( self , meta_data ): self . 
_meta_data = meta_data @abstractmethod def prepare ( self ) -> str : \"\"\"prepare\"\"\" @abstractmethod def run ( self ): \"\"\"run\"\"\" @abstractmethod def post_process ( self ): \"\"\"post_process\"\"\" def construct_meta_data ( self ): \"\"\"Construct run output meta data\"\"\" return self . meta_data","title":"Description"},{"location":"developing_a_pheval_plugin/#step-by-step-plugin-development-process","text":"The plugin structure is derived from a cookiecutter template, Sphintoxetry-cookiecutter , and it uses Sphinx , tox and poetry as core dependencies. This allows PhEval extensibility to be standardized in terms of documentation and dependency management.","title":"Step-by-Step Plugin Development Process"},{"location":"developing_a_pheval_plugin/#1-sphintoxetry-cookiecutter-scaffold","text":"First, install the cruft package. Cruft enables keeping projects up-to-date with future updates made to this original template. Install the latest release of cruft from pip pip install cruft NOTE: You may encounter an error with the naming of the project layout if using an older release of cruft. To avoid this, make sure you have installed the latest release version. Next, create a project using the sphintoxetry-cookiecutter template. cruft create https://github.com/monarch-initiative/monarch-project-template","title":"1. Sphintoxetry-cookiecutter scaffold"},{"location":"developing_a_pheval_plugin/#2-further-setup","text":"","title":"2. 
Further setup"},{"location":"developing_a_pheval_plugin/#install-poetry-if-you-havent-already","text":"pip install poetry","title":"Install poetry if you haven't already."},{"location":"developing_a_pheval_plugin/#install-dependencies","text":"poetry install","title":"Install dependencies"},{"location":"developing_a_pheval_plugin/#add-pheval-dependency","text":"poetry add pheval","title":"Add PhEval dependency"},{"location":"developing_a_pheval_plugin/#run-tox-to-see-if-the-setup-works","text":"poetry run tox","title":"Run tox to see if the setup works"},{"location":"developing_a_pheval_plugin/#3-implement-pheval-custom-runner","text":"The runner name is arbitrary and custom Runner name was chose by demonstrative purposes Create a runner file inside the plugin project, e.g: \"\"\"Custom Pheval Runner.\"\"\" from dataclasses import dataclass from pathlib import Path from pheval.runners.runner import PhEvalRunner @dataclass class CustomPhevalRunner ( PhEvalRunner ): \"\"\"CustomPhevalRunner Class.\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str def prepare ( self ): \"\"\"prepare method.\"\"\" print ( \"preparing\" ) def run ( self ): \"\"\"run method.\"\"\" print ( \"running with custom pheval runner\" ) def post_process ( self ): \"\"\"post_process method.\"\"\" print ( \"post processing\" )","title":"3. Implement PhEval Custom Runner"},{"location":"developing_a_pheval_plugin/#4-add-pheval-plugins-section-to-the-pyprojecttoml-file","text":"[tool.poetry.plugins. \"pheval.plugins\" ] customrunner = \"pheval_plugin_example.runner:CustomPhevalRunner\" Replace the value above with the path to your custom runner plugin","title":"4. Add PhEval Plugins section to the pyproject.toml file"},{"location":"developing_a_pheval_plugin/#5-implementing-pheval-helper-methods","text":"Streamlining the creation of your custom PhEval runner can be facilitated by leveraging PhEval's versatile helper methods, where applicable. 
Within PhEval, numerous public methods have been designed to assist in your runner methods. The utilisation of these helper methods is optional, yet they are crafted to enhance the overall implementation process.","title":"5. Implementing PhEval helper methods"},{"location":"developing_a_pheval_plugin/#utility-methods","text":"The PhenopacketUtil class is designed to aid in the collection of specific data from a Phenopacket. Class for retrieving data from a Phenopacket or Family object Source code in src/pheval/utils/phenopacket_utils.py 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 class PhenopacketUtil : \"\"\"Class for retrieving data from a Phenopacket or Family object\"\"\" def __init__ ( self , phenopacket_contents : Union [ Phenopacket , Family ]): 
\"\"\"Initialise PhenopacketUtil Args: phenopacket_contents (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . phenopacket_contents = phenopacket_contents def sample_id ( self ) -> str : \"\"\" Retrieve the sample ID from a Phenopacket or proband of a Family Returns: str: Sample ID \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . subject . id else : return self . phenopacket_contents . subject . id def phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all HPO terms Returns: List[PhenotypicFeature]: List of HPO terms \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . phenotypic_features else : return self . phenopacket_contents . phenotypic_features def observed_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all observed HPO terms Returns: List[PhenotypicFeature]: List of observed HPO terms \"\"\" phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : continue phenotypic_features . append ( p ) return phenotypic_features def negated_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all negated HPO terms Returns: List[PhenotypicFeature]: List of negated HPO terms \"\"\" negated_phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : negated_phenotypic_features . append ( p ) return negated_phenotypic_features def diseases ( self ) -> List [ Disease ]: \"\"\" Retrieve a list of Diseases associated with the proband Returns: List[Disease]: List of diseases \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . diseases else : return self . phenopacket_contents . 
diseases def _diagnosis_from_interpretations ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a list of disease diagnoses associated with the proband from the interpretations object Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" diagnoses = [] interpretation = self . interpretations () for i in interpretation : ( diagnoses . append ( ProbandDisease ( disease_name = i . diagnosis . disease . label , disease_identifier = i . diagnosis . disease . id , ) ) if i . diagnosis . disease . label != \"\" and i . diagnosis . disease . id != \"\" else None ) return diagnoses def _diagnosis_from_disease ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a list of disease diagnoses associated with the proband from the diseases object Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" diagnoses = [] for disease in self . diseases (): diagnoses . append ( ProbandDisease ( disease_name = disease . term . label , disease_identifier = disease . term . id ) ) return diagnoses def diagnoses ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" return list ( set ( self . _diagnosis_from_interpretations () + self . _diagnosis_from_disease ())) def interpretations ( self ) -> List [ Interpretation ]: \"\"\" Retrieve a list of interpretations from a Phenopacket Returns: List[Interpretation]: List of interpretations \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . interpretations else : return self . phenopacket_contents . interpretations def causative_variants ( self ) -> List [ ProbandCausativeVariant ]: \"\"\" Retrieve a list of causative variants listed in a Phenopacket Returns: List[ProbandCausativeVariant]: List of proband causative variants \"\"\" all_variants = [] interpretation = self . 
interpretations () for i in interpretation : for g in i . diagnosis . genomic_interpretations : vcf_record = g . variant_interpretation . variation_descriptor . vcf_record genotype = g . variant_interpretation . variation_descriptor . allelic_state variant_data = ProbandCausativeVariant ( self . phenopacket_contents . subject . id , vcf_record . genome_assembly , GenomicVariant ( vcf_record . chrom , vcf_record . pos , vcf_record . ref , vcf_record . alt , ), genotype . label , vcf_record . info , ) all_variants . append ( variant_data ) return all_variants def files ( self ) -> List [ File ]: \"\"\" Retrieve a list of files associated with a phenopacket Returns: List[File]: List of files associated with a phenopacket \"\"\" return self . phenopacket_contents . files def vcf_file_data ( self , phenopacket_path : Path , vcf_dir : Path ) -> File : \"\"\" Retrieve the genome assembly and VCF file name from a phenopacket. Args: phenopacket_path (Path): The path to the phenopacket file. vcf_dir (Path): The directory path where the VCF file is stored. Returns: File: The VCF file with updated URI pointing to the specified directory. Raises: IncorrectFileFormatError: If the provided file is not in .vcf or .vcf.gz format. IncompatibleGenomeAssemblyError: If the genome assembly of the VCF file is not compatible. Note: This function searches for a VCF file within the provided list of files, validates its format, and checks if the genome assembly is compatible. If the conditions are met, it updates the URI of the VCF file to the specified directory and returns the modified file object. \"\"\" compatible_genome_assembly = [ \"GRCh37\" , \"hg19\" , \"GRCh38\" , \"hg38\" ] vcf_data = [ file for file in self . files () if file . file_attributes [ \"fileFormat\" ] == \"vcf\" ][ 0 ] if not Path ( vcf_data . uri ) . name . endswith ( \".vcf\" ) and not Path ( vcf_data . uri ) . name . endswith ( \".vcf.gz\" ): raise IncorrectFileFormatError ( Path ( vcf_data . 
uri ), \".vcf or .vcf.gz file\" ) if vcf_data . file_attributes [ \"genomeAssembly\" ] not in compatible_genome_assembly : raise IncompatibleGenomeAssemblyError ( vcf_data . file_attributes [ \"genomeAssembly\" ], phenopacket_path ) vcf_data . uri = str ( vcf_dir . joinpath ( Path ( vcf_data . uri ) . name )) return vcf_data @staticmethod def _extract_diagnosed_gene ( genomic_interpretation : GenomicInterpretation , ) -> ProbandCausativeGene : \"\"\" Retrieve the disease causing genes from the variant descriptor field if not empty, otherwise, retrieves from the gene descriptor from a phenopacket. Args: genomic_interpretation (GenomicInterpretation): A genomic interpretation from a Phenopacket Returns: ProbandCausativeGene: The disease causing gene \"\"\" if genomic_interpretation . variant_interpretation . ByteSize () != 0 : return ProbandCausativeGene ( genomic_interpretation . variant_interpretation . variation_descriptor . gene_context . symbol , genomic_interpretation . variant_interpretation . variation_descriptor . gene_context . value_id , ) else : return ProbandCausativeGene ( gene_symbol = genomic_interpretation . gene . symbol , gene_identifier = genomic_interpretation . gene . value_id , ) def diagnosed_genes ( self ) -> List [ ProbandCausativeGene ]: \"\"\" Retrieve the disease causing genes from a phenopacket. Returns: List[ProbandCausativeGene]: List of causative genes \"\"\" pheno_interpretation = self . interpretations () genes = [] for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : genes . append ( self . _extract_diagnosed_gene ( g )) genes = list ({ gene . gene_symbol : gene for gene in genes } . values ()) return genes def diagnosed_variants ( self ) -> List [ GenomicVariant ]: \"\"\" Retrieve a list of all known causative variants from a phenopacket. Returns: List[GenomicVariant]: List of causative variants \"\"\" variants = [] pheno_interpretation = self . 
interpretations () for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : variant = GenomicVariant ( chrom = str ( g . variant_interpretation . variation_descriptor . vcf_record . chrom . replace ( \"chr\" , \"\" ) ), pos = int ( g . variant_interpretation . variation_descriptor . vcf_record . pos ), ref = g . variant_interpretation . variation_descriptor . vcf_record . ref , alt = g . variant_interpretation . variation_descriptor . vcf_record . alt , ) variants . append ( variant ) return variants def check_incomplete_variant_record ( self ) -> bool : \"\"\" Check if any variant record in the phenopacket has incomplete information. This method iterates through the diagnosed variant records and checks if any of them have missing or incomplete information such as empty chromosome, position, reference, or alternate allele. Returns: bool: True if any variant record is incomplete, False otherwise. \"\"\" variants = self . diagnosed_variants () for variant in variants : if ( variant . chrom == \"\" or variant . pos == 0 or variant . pos == \"\" or variant . ref == \"\" or variant . alt == \"\" ): return True return False def check_incomplete_gene_record ( self ) -> bool : \"\"\" Check if any gene record in the phenopacket has incomplete information. This method iterates through the diagnosed gene records and checks if any of them have missing or incomplete information such as gene name, or gene identifier. Returns: bool: True if any gene record is incomplete, False otherwise. \"\"\" genes = self . diagnosed_genes () for gene in genes : if gene . gene_symbol == \"\" or gene . gene_identifier == \"\" : return True return False def check_incomplete_disease_record ( self ) -> bool : \"\"\" Check if any disease record in the phenopacket has incomplete information. This method iterates through the diagnosed disease records and checks if any of them have missing or incomplete information such as empty disease name, or disease identifier. 
Returns: bool: True if any disease record is incomplete, False otherwise. \"\"\" if len ( self . diagnoses ()) == 0 : return True return False PhenopacketUtil proves particularly beneficial in scenarios where the tool for which you're crafting a runner implementation does not directly accept Phenopackets as inputs. Instead, it might require elements\u2014such as HPO IDs\u2014 via the command-line interface (CLI). In this context, leveraging PhenopacketUtil within the runner's preparation phase enables the extraction of observed phenotypic features from the Phenopacket input, facilitating seamless processing. An example of how this could be implemented is outlined here: from pheval.utils.phenopacket_utils import phenopacket_reader from pheval.utils.phenopacket_utils import PhenopacketUtil phenopacket = phenopacket_reader ( \"/path/to/phenopacket.json\" ) phenopacket_util = PhenopacketUtil ( phenopacket ) # To return a list of all observed phenotypes for a phenopacket observed_phenotypes = phenopacket_util . observed_phenotypic_features () # To extract just the HPO ID as a list observed_phenotypes_hpo_ids = [ observed_phenotype . id for observed_phenotype in observed_phenotypes ]","title":"Utility methods"},{"location":"developing_a_pheval_plugin/#additional-tool-specific-configurations","text":"For the pheval run command to execute successfully, a config.yaml should be found within the input directory supplied on the CLI. tool : tool_version : variant_analysis : gene_analysis : disease_analysis : tool_specific_configuration_options : The tool_specific_configuration_options is an optional field that can be populated with any variables specific to your runner implementation that is required for the running of your tool. All other fields are required to be filled in. The variant_analysis , gene_analysis , and disease_analysis are set as booleans and are for specifying what type of analysis/prioritisation the tool outputs. 
To populate the tool_specific_configuration_options with customised data, we suggest using the pydantic package as it can easily parse the data from the yaml structure. e.g., Define a BaseModel class with the fields that will populate the tool_specific_configuration_options from pydantic import BaseModel , Field class CustomisedConfigurations ( BaseModel ): \"\"\" Class for defining the customised configurations in tool_specific_configurations field, within the input_dir config.yaml Args: environment (str): Environment to run \"\"\" environment : str = Field ( ... ) Within your runner parse the field into an object. from dataclasses import dataclass from pheval.runners.runner import PhEvalRunner from pathlib import Path @dataclass class CustomPhevalRunner ( PhEvalRunner ): \"\"\"CustomPhevalRunner Class.\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str def prepare ( self ): \"\"\"prepare method.\"\"\" print ( \"preparing\" ) config = CustomisedConfigurations . parse_obj ( self . input_dir_config . tool_specific_configuration_options ) environment = config . environment def run ( self ): \"\"\"run method.\"\"\" print ( \"running with custom pheval runner\" ) def post_process ( self ): \"\"\"post_process method.\"\"\" print ( \"post processing\" )","title":"Additional tool-specific configurations"},{"location":"developing_a_pheval_plugin/#post-processing-methods","text":"PhEval currently supports the benchmarking of gene, variant, and disease prioritisation results. To benchmark these result types, PhEval TSV result files need to be generated. PhEval can deal with the ranking and generation of these files to the correct location. However, the runner implementation must handle the extraction of essential data from the tool-specific raw results. This involves transforming them into a list comprising PhEval data classes, with each instance representing a result entry. 
The dataclasses representing essential information extracted from tool-specific output for gene, variant, and disease prioritisation are defined as follows: Bases: PhEvalResult Minimal data required from tool-specific output for gene prioritisation result Args: gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry score (float): The score for the gene result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. Source code in src/pheval/post_processing/post_processing.py 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 @dataclass class PhEvalGeneResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for gene prioritisation result Args: gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry score (float): The score for the gene result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" gene_symbol : Union [ List [ str ], str ] gene_identifier : Union [ List [ str ], str ] score : float Bases: PhEvalResult Minimal data required from tool-specific output for variant prioritisation Args: chromosome (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. 
start (int): The start position of the variant end (int): The end position of the variant ref (str): The reference allele of the variant alt (str): The alternate allele of the variant score (float): The score for the variant result entry Notes: While we recommend providing the variant's chromosome in the specified format, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. Source code in src/pheval/post_processing/post_processing.py 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 @dataclass class PhEvalVariantResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for variant prioritisation Args: chromosome (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. start (int): The start position of the variant end (int): The end position of the variant ref (str): The reference allele of the variant alt (str): The alternate allele of the variant score (float): The score for the variant result entry Notes: While we recommend providing the variant's chromosome in the specified format, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" chromosome : str start : int end : int ref : str alt : str score : float Bases: PhEvalResult Minimal data required from tool-specific output for disease prioritisation Args: disease_name (str): Disease name for the result entry disease_identifier (str): Identifier for the disease result entry in the OMIM namespace score (str): Score for the disease result entry Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. 
Source code in src/pheval/post_processing/post_processing.py 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 @dataclass class PhEvalDiseaseResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for disease prioritisation Args: disease_name (str): Disease name for the result entry disease_identifier (str): Identifier for the disease result entry in the OMIM namespace score (str): Score for the disease result entry Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" disease_name : str disease_identifier : str score : float The generate_pheval_result() can be implemented in your runner to write out the PhEval TSV results. An example of how the method can be called is outlined here: from pheval.post_processing.post_processing import generate_pheval_result generate_pheval_result ( pheval_result = pheval_gene_result , # this is the list of extracted PhEval result requirements sort_order_str = \"descending\" , # or can be ascending - this determines in which order the scores will be ranked output_dir = output_directory , # this can be accessed from the runner instance e.g., self.output_dir tool_result_path = tool_result_json # this is the path to the tool-specific raw results file )","title":"Post-processing methods"},{"location":"developing_a_pheval_plugin/#adding-metadata-to-the-resultsyml","text":"By default, PhEval will write a results.yml to the output directory supplied on the CLI. The results.yml contains basic metadata regarding the run configuration, however, there is also the option to add customised run metadata to the results.yml in the tool_specific_configuration_options field. To achieve this, you'll need to create a construct_meta_data() method within your runner implementation. 
This method is responsible for appending customised metadata to the metadata object in the form of a defined dataclass. It should return the entire metadata object once the addition is completed. e.g., Defined customised metadata dataclass: from dataclasses import dataclass @dataclass class CustomisedMetaData : customised_field : str Example of implementation in the runner. from dataclasses import dataclass from pheval.runners.runner import PhEvalRunner from pathlib import Path @dataclass class CustomPhevalRunner ( PhEvalRunner ): \"\"\"CustomPhevalRunner Class.\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str def prepare ( self ): \"\"\"prepare method.\"\"\" print ( \"preparing\" ) def run ( self ): \"\"\"run method.\"\"\" print ( \"running with custom pheval runner\" ) def post_process ( self ): \"\"\"post_process method.\"\"\" print ( \"post processing\" ) def construct_meta_data ( self ): \"\"\"Add metadata.\"\"\" self . meta_data . tool_specific_configuration_options = CustomisedMetaData ( customised_field = \"customised_value\" ) return self . meta_data","title":"Adding metadata to the results.yml"},{"location":"developing_a_pheval_plugin/#6-test-it","text":"To update your custom pheval runner implementation, you must first install the package: poetry install Now you should be able to run PhEval, passing your custom runner as a parameter. e.g., pheval run -i ./input_dir -t ./test_data_dir -r 'customphevalrunner' -o output_dir The -r parameter stands for your plugin runner class name, and it must be entirely lowercase. Output: preparing running with custom pheval runner post processing Pay attention to the \" running with custom pheval runner \" line; this is exactly what we implemented in the CustomPhevalRunner example","title":"6. Test it."},{"location":"executing_a_benchmark/","text":"Executing a Benchmark PhEval is designed for benchmarking algorithms across various datasets. 
To execute a benchmark using PhEval, you need to: Execute your runner; generating the PhEval standardised TSV outputs for gene/variant/disease prioritisation. Configure the benchmarking parameters. Run the benchmark. PhEval will generate various performance reports, allowing you to easily compare the effectiveness of different algorithms. After the Runner Execution After executing a run, you may be left with an output directory structure like so: . \u251c\u2500\u2500 pheval_disease_results \u2502 \u251c\u2500\u2500 patient_1-pheval_disease_result.tsv \u251c\u2500\u2500 pheval_gene_results \u2502 \u251c\u2500\u2500 patient_1-pheval_gene_result.tsv \u251c\u2500\u2500 pheval_variant_results \u2502 \u251c\u2500\u2500 patient_1-pheval_variant_result.tsv \u251c\u2500\u2500 raw_results \u2502 \u251c\u2500\u2500 patient_1.json \u251c\u2500\u2500 results.yml \u2514\u2500\u2500 tool_input_commands \u2514\u2500\u2500 tool_input_commands.txt Whether you have populated pheval_disease_results , pheval_gene_results , and pheval_variant_results directories will depend on what is specified in the config.yaml for the runner execution. It is the results in these directories that are consumed in the benchmarking to produce the statistical comparison reports. Benchmarking Configuration File To configure the benchmarking parameters, a YAML configuration file should be created and supplied to the CLI command. 
An outline of the configuration file structure follows below: benchmark_name : exomiser_14_benchmark runs : - run_identifier : run_identifier_1 results_dir : /path/to/results_dir_1 phenopacket_dir : /path/to/phenopacket_dir gene_analysis : True variant_analysis : False disease_analysis : True threshold : score_order : descending - run_identifier : run_identifier_2 results_dir : /path/to/results_dir_2 phenopacket_dir : /path/to/phenopacket_dir gene_analysis : True variant_analysis : True disease_analysis : True threshold : score_order : descending plot_customisation : gene_plots : plot_type : bar_cumulative rank_plot_title : roc_curve_title : precision_recall_title : disease_plots : plot_type : bar_cumulative rank_plot_title : roc_curve_title : precision_recall_title : variant_plots : plot_type : bar_cumulative rank_plot_title : roc_curve_title : precision_recall_title : The benchmark_name is what will be used to name the duckdb database that will contain all the ranking and binary statistics as well as comparisons between runs. The name provided should not have any whitespace or special characters. Runs section The runs section specifies which run configurations should be included in the benchmarking. For each run configuration you will need to populate the following parameters: run_identifier : The identifier associated with the run - this should be meaningful as it will be used in the naming in tables and plots. results_dir : The full path to the root directory where the directories pheval_gene_results / pheval_variant_results / pheval_disease_results can be found. phenopacket_dir : The full path to the phenopacket directory used during the runner execution. gene_analysis : Boolean specifying whether to perform benchmarking for gene prioritisation analysis. 
variant_analysis : Boolean specifying whether to perform benchmarking for variant prioritisation analysis. disease_analysis : Boolean specifying whether to perform benchmarking for disease prioritisation analysis. threshold : OPTIONAL score threshold to consider for inclusion of results. score_order : Ordering of results for ranking. Either ascending or descending. Plot customisation section The plot_customisation section specifies any additional customisation to the plots output from the benchmarking. Here you can specify title names for all the plots output, as well as the plot type for displaying the summary ranking stats. This section is split by the plots output from the gene, variant and disease prioritisation benchmarking. The parameters in this section do not need to be populated - however, if left blank it will default to generic titles. The parameters are as follows: plot_type : The plot type output for the summary rank stats plot. This can be one of bar_cumulative, bar_non_cumulative or bar_stacked. rank_plot_title : The customised title for the summary rank stats plot. roc_curve_title : The customised title for the ROC curve plot. precision_recall_title : The customised title for the precision-recall curve plot. Executing the benchmark After configuring the benchmarking YAML, executing the benchmark is relatively simple. pheval-utils generate-benchmark-stats --run-yaml benchmarking_config.yaml","title":"Executing a Benchmark"},{"location":"executing_a_benchmark/#executing-a-benchmark","text":"PhEval is designed for benchmarking algorithms across various datasets. To execute a benchmark using PhEval, you need to: Execute your runner; generating the PhEval standardised TSV outputs for gene/variant/disease prioritisation. Configure the benchmarking parameters. Run the benchmark. 
PhEval will generate various performance reports, allowing you to easily compare the effectiveness of different algorithms.","title":"Executing a Benchmark"},{"location":"executing_a_benchmark/#after-the-runner-execution","text":"After executing a run, you may be left with an output directory structure like so: . \u251c\u2500\u2500 pheval_disease_results \u2502 \u251c\u2500\u2500 patient_1-pheval_disease_result.tsv \u251c\u2500\u2500 pheval_gene_results \u2502 \u251c\u2500\u2500 patient_1-pheval_gene_result.tsv \u251c\u2500\u2500 pheval_variant_results \u2502 \u251c\u2500\u2500 patient_1-pheval_variant_result.tsv \u251c\u2500\u2500 raw_results \u2502 \u251c\u2500\u2500 patient_1.json \u251c\u2500\u2500 results.yml \u2514\u2500\u2500 tool_input_commands \u2514\u2500\u2500 tool_input_commands.txt Whether you have populated pheval_disease_results , pheval_gene_results , and pheval_variant_results directories will depend on what is specified in the config.yaml for the runner execution. It is the results in these directories that are consumed in the benchmarking to produce the statistical comparison reports.","title":"After the Runner Execution"},{"location":"executing_a_benchmark/#benchmarking-configuration-file","text":"To configure the benchmarking parameters, a YAML configuration file should be created and supplied to the CLI command. 
An outline of the configuration file structure follows below: benchmark_name : exomiser_14_benchmark runs : - run_identifier : run_identifier_1 results_dir : /path/to/results_dir_1 phenopacket_dir : /path/to/phenopacket_dir gene_analysis : True variant_analysis : False disease_analysis : True threshold : score_order : descending - run_identifier : run_identifier_2 results_dir : /path/to/results_dir_2 phenopacket_dir : /path/to/phenopacket_dir gene_analysis : True variant_analysis : True disease_analysis : True threshold : score_order : descending plot_customisation : gene_plots : plot_type : bar_cumulative rank_plot_title : roc_curve_title : precision_recall_title : disease_plots : plot_type : bar_cumulative rank_plot_title : roc_curve_title : precision_recall_title : variant_plots : plot_type : bar_cumulative rank_plot_title : roc_curve_title : precision_recall_title : The benchmark_name is what will be used to name the duckdb database that will contain all the ranking and binary statistics as well as comparisons between runs. The name provided should not have any whitespace or special characters.","title":"Benchmarking Configuration File"},{"location":"executing_a_benchmark/#runs-section","text":"The runs section specifies which run configurations should be included in the benchmarking. For each run configuration you will need to populate the following parameters: run_identifier : The identifier associated with the run - this should be meaningful as it will be used in the naming in tables and plots. results_dir : The full path to the root directory where the directories pheval_gene_results / pheval_variant_results / pheval_disease_results can be found. phenopacket_dir : The full path to the phenopacket directory used during the runner execution. gene_analysis : Boolean specifying whether to perform benchmarking for gene prioritisation analysis. 
variant_analysis : Boolean specifying whether to perform benchmarking for variant prioritisation analysis. disease_analysis : Boolean specifying whether to perform benchmarking for disease prioritisation analysis. threshold : OPTIONAL score threshold to consider for inclusion of results. score_order : Ordering of results for ranking. Either ascending or descending.","title":"Runs section"},{"location":"executing_a_benchmark/#plot-customisation-section","text":"The plot_customisation section specifies any additional customisation to the plots output from the benchmarking. Here you can specify title names for all the plots output, as well as the plot type for displaying the summary ranking stats. This section is split by the plots output from the gene, variant and disease prioritisation benchmarking. The parameters in this section do not need to be populated - however, if left blank it will default to generic titles. The parameters are as follows: plot_type : The plot type output for the summary rank stats plot. This can be one of bar_cumulative, bar_non_cumulative or bar_stacked. rank_plot_title : The customised title for the summary rank stats plot. roc_curve_title : The customised title for the ROC curve plot. precision_recall_title : The customised title for the precision-recall curve plot.","title":"Plot customisation section"},{"location":"executing_a_benchmark/#executing-the-benchmark","text":"After configuring the benchmarking YAML, executing the benchmark is relatively simple. pheval-utils generate-benchmark-stats --run-yaml benchmarking_config.yaml","title":"Executing the benchmark"},{"location":"exomiser_pipeline/","text":"PhEval Pipeline Exomiser Runner Step by Step to PhEval Run Pipeline (with ExomiserRunner) 1. Download Exomiser Software wget https://github.com/exomiser/Exomiser/releases/download/13.2.0/exomiser-cli-13.2.0-distribution.zip 2. 
Download Phenotype Data wget https://data.monarchinitiative.org/exomiser/latest/2302_hg19.zip wget https://data.monarchinitiative.org/exomiser/latest/2302_hg38.zip wget https://data.monarchinitiative.org/exomiser/latest/2302_phenotype.zip 3. Unzip data # unzip the distribution and data files - this will create a directory called 'exomiser-cli-13.2.0' in the current working directory unzip exomiser-cli-13.2.0-distribution.zip unzip 2302_hg19.zip -d exomiser-cli-13.2.0/data unzip 2302_hg38.zip -d exomiser-cli-13.2.0/data 4. Clone PhEval repo and follow steps described in Pipeline Documentation: git clone https://github.com/monarch-initiative/pheval.git cd pheval poetry shell poetry install pip install pheval.exomiser 5. Set PhEval Config YAML File directories : tmp : data/tmp exomiser : /path_where_exomiser_was_extracted phenotype : /path_where_phenotype_was_extracted workspace : /pheval's_path # path where pheval was cloned corpora : - id : small_test scrambled : - factor : 0.5 - factor : 0.7 custom_variants : - id : no_phenotype configs : - tool : exomiser version : 13.2.0 configuration : default exomiser_db : semsim1 runs : - tool : exomiser configuration : default corpus : small_test corpusvariant : scrambled-0.5 version : 13.2.0 6. Generate Makefile based on configuration bash ./resources/generatemakefile.sh 7. Exomiser Runner requires the following configuration The config.yaml file should be formatted like the example below and must be placed in exomiser: /path_where_exomiser_was_extracted declared in the pheval-config.yaml file. tool : exomiser tool_version : 13.2.0 variant_analysis : True gene_analysis : True disease_analysis : True tool_specific_configuration_options : environment : local exomiser_software_directory : . 
analysis_configuration_file : preset-exome-analysis.yml max_jobs : 0 application_properties : remm_version : cadd_version : hg19_data_version : 2302 hg19_local_frequency_path : hg38_data_version : 2302 phenotype_data_version : 2302 cache_type : cache_caffeine_spec : post_process : score_name : combinedScore sort_order : DESCENDING 8. Preset Exome Analysis File Exomiser requires a preset-exome-analysis.yml file saved at /path_where_exomiser_was_extracted/preset-exome-analysis.yml This is an example of preset-exome-analysis.yml file ## Exomiser Analysis Template. # These are all the possible options for running exomiser. Use this as a template for # your own set-up. --- analysisMode : PASS_ONLY inheritanceModes : { AUTOSOMAL_DOMINANT : 0.1 , AUTOSOMAL_RECESSIVE_HOM_ALT : 0.1 , AUTOSOMAL_RECESSIVE_COMP_HET : 2.0 , X_DOMINANT : 0.1 , X_RECESSIVE_HOM_ALT : 0.1 , X_RECESSIVE_COMP_HET : 2.0 , MITOCHONDRIAL : 0.2 } frequencySources : [ THOUSAND_GENOMES , TOPMED , UK10K , ESP_AFRICAN_AMERICAN , ESP_EUROPEAN_AMERICAN , ESP_ALL , EXAC_AFRICAN_INC_AFRICAN_AMERICAN , EXAC_AMERICAN , EXAC_SOUTH_ASIAN , EXAC_EAST_ASIAN , EXAC_FINNISH , EXAC_NON_FINNISH_EUROPEAN , EXAC_OTHER , GNOMAD_E_AFR , GNOMAD_E_AMR , # GNOMAD_E_ASJ, GNOMAD_E_EAS , GNOMAD_E_FIN , GNOMAD_E_NFE , GNOMAD_E_OTH , GNOMAD_E_SAS , GNOMAD_G_AFR , GNOMAD_G_AMR , # GNOMAD_G_ASJ, GNOMAD_G_EAS , GNOMAD_G_FIN , GNOMAD_G_NFE , GNOMAD_G_OTH , GNOMAD_G_SAS ] # Possible pathogenicitySources: (POLYPHEN, MUTATION_TASTER, SIFT), (REVEL, MVP), CADD, REMM # REMM is trained on non-coding regulatory regions # *WARNING* if you enable CADD or REMM ensure that you have downloaded and installed the CADD/REMM tabix files # and updated their location in the application.properties. Exomiser will not run without this. pathogenicitySources : [ REVEL , MVP ] #this is the standard exomiser order. 
steps : [ failedVariantFilter : { }, variantEffectFilter : { remove : [ FIVE_PRIME_UTR_EXON_VARIANT , FIVE_PRIME_UTR_INTRON_VARIANT , THREE_PRIME_UTR_EXON_VARIANT , THREE_PRIME_UTR_INTRON_VARIANT , NON_CODING_TRANSCRIPT_EXON_VARIANT , NON_CODING_TRANSCRIPT_INTRON_VARIANT , CODING_TRANSCRIPT_INTRON_VARIANT , UPSTREAM_GENE_VARIANT , DOWNSTREAM_GENE_VARIANT , INTERGENIC_VARIANT , REGULATORY_REGION_VARIANT ] }, frequencyFilter : { maxFrequency : 2.0 }, pathogenicityFilter : { keepNonPathogenic : true }, inheritanceFilter : { }, omimPrioritiser : { }, hiPhivePrioritiser : { } ] 9. PhEval Run make pheval run","title":"PhEval Pipeline Exomiser Runner"},{"location":"exomiser_pipeline/#pheval-pipeline-exomiser-runner","text":"","title":"PhEval Pipeline Exomiser Runner"},{"location":"exomiser_pipeline/#step-by-step-to-pheval-run-pipeline-with-exomiserrunner","text":"","title":"Step by Step to PhEval Run Pipeline (with ExomiserRunner)"},{"location":"exomiser_pipeline/#1-download-exomiser-software","text":"wget https://github.com/exomiser/Exomiser/releases/download/13.2.0/exomiser-cli-13.2.0-distribution.zip","title":"1. Download Exomiser Software"},{"location":"exomiser_pipeline/#2-download-phenotype-data","text":"wget https://data.monarchinitiative.org/exomiser/latest/2302_hg19.zip wget https://data.monarchinitiative.org/exomiser/latest/2302_hg38.zip wget https://data.monarchinitiative.org/exomiser/latest/2302_phenotype.zip","title":"2. Download Phenotype Data"},{"location":"exomiser_pipeline/#3-unzip-data","text":"# unzip the distribution and data files - this will create a directory called 'exomiser-cli-13.2.0' in the current working directory unzip exomiser-cli-13.2.0-distribution.zip unzip 2302_hg19.zip -d exomiser-cli-13.2.0/data unzip 2302_hg38.zip -d exomiser-cli-13.2.0/data","title":"3. 
Unzip data"},{"location":"exomiser_pipeline/#4-clone-pheval-repo-and-follow-steps-described-in-pipeline-documentation","text":"git clone https://github.com/monarch-initiative/pheval.git cd pheval poetry shell poetry install pip install pheval.exomiser","title":"4. Clone PhEval repo and follow steps described in Pipeline Documentation:"},{"location":"exomiser_pipeline/#5-set-pheval-config-yaml-file","text":"directories : tmp : data/tmp exomiser : /path_where_exomiser_was_extracted phenotype : /path_where_phenotype_was_extracted workspace : /pheval's_path # path where pheval was cloned corpora : - id : small_test scrambled : - factor : 0.5 - factor : 0.7 custom_variants : - id : no_phenotype configs : - tool : exomiser version : 13.2.0 configuration : default exomiser_db : semsim1 runs : - tool : exomiser configuration : default corpus : small_test corpusvariant : scrambled-0.5 version : 13.2.0","title":"5. Set PhEval Config YAML File"},{"location":"exomiser_pipeline/#6-generate-makefile-based-on-configuration","text":"bash ./resources/generatemakefile.sh","title":"6. Generate Makefile based on configuration"},{"location":"exomiser_pipeline/#7-exomiser-runner-requires-the-following-configuration","text":"The config.yaml file should be formatted like the example below and must be placed in exomiser: /path_where_exomiser_was_extracted declared in pheval-config.yaml file. tool : exomiser tool_version : 13.2.0 variant_analysis : True gene_analysis : True disease_analysis : True tool_specific_configuration_options : environment : local exomiser_software_directory : . analysis_configuration_file : preset-exome-analysis.yml max_jobs : 0 application_properties : remm_version : cadd_version : hg19_data_version : 2302 hg19_local_frequency_path : hg38_data_version : 2302 phenotype_data_version : 2302 cache_type : cache_caffeine_spec : post_process : score_name : combinedScore sort_order : DESCENDING","title":"7. 
Exomiser Runner requires the following configuration"},{"location":"exomiser_pipeline/#8-preset-exome-analysis-file","text":"Exomiser requires a preset-exome-analysis.yml file saved at /path_where_exomiser_was_extracted/preset-exome-analysis.yml This is an example of preset-exome-analysis.yml file ## Exomiser Analysis Template. # These are all the possible options for running exomiser. Use this as a template for # your own set-up. --- analysisMode : PASS_ONLY inheritanceModes : { AUTOSOMAL_DOMINANT : 0.1 , AUTOSOMAL_RECESSIVE_HOM_ALT : 0.1 , AUTOSOMAL_RECESSIVE_COMP_HET : 2.0 , X_DOMINANT : 0.1 , X_RECESSIVE_HOM_ALT : 0.1 , X_RECESSIVE_COMP_HET : 2.0 , MITOCHONDRIAL : 0.2 } frequencySources : [ THOUSAND_GENOMES , TOPMED , UK10K , ESP_AFRICAN_AMERICAN , ESP_EUROPEAN_AMERICAN , ESP_ALL , EXAC_AFRICAN_INC_AFRICAN_AMERICAN , EXAC_AMERICAN , EXAC_SOUTH_ASIAN , EXAC_EAST_ASIAN , EXAC_FINNISH , EXAC_NON_FINNISH_EUROPEAN , EXAC_OTHER , GNOMAD_E_AFR , GNOMAD_E_AMR , # GNOMAD_E_ASJ, GNOMAD_E_EAS , GNOMAD_E_FIN , GNOMAD_E_NFE , GNOMAD_E_OTH , GNOMAD_E_SAS , GNOMAD_G_AFR , GNOMAD_G_AMR , # GNOMAD_G_ASJ, GNOMAD_G_EAS , GNOMAD_G_FIN , GNOMAD_G_NFE , GNOMAD_G_OTH , GNOMAD_G_SAS ] # Possible pathogenicitySources: (POLYPHEN, MUTATION_TASTER, SIFT), (REVEL, MVP), CADD, REMM # REMM is trained on non-coding regulatory regions # *WARNING* if you enable CADD or REMM ensure that you have downloaded and installed the CADD/REMM tabix files # and updated their location in the application.properties. Exomiser will not run without this. pathogenicitySources : [ REVEL , MVP ] #this is the standard exomiser order. 
steps : [ failedVariantFilter : { }, variantEffectFilter : { remove : [ FIVE_PRIME_UTR_EXON_VARIANT , FIVE_PRIME_UTR_INTRON_VARIANT , THREE_PRIME_UTR_EXON_VARIANT , THREE_PRIME_UTR_INTRON_VARIANT , NON_CODING_TRANSCRIPT_EXON_VARIANT , NON_CODING_TRANSCRIPT_INTRON_VARIANT , CODING_TRANSCRIPT_INTRON_VARIANT , UPSTREAM_GENE_VARIANT , DOWNSTREAM_GENE_VARIANT , INTERGENIC_VARIANT , REGULATORY_REGION_VARIANT ] }, frequencyFilter : { maxFrequency : 2.0 }, pathogenicityFilter : { keepNonPathogenic : true }, inheritanceFilter : { }, omimPrioritiser : { }, hiPhivePrioritiser : { } ]","title":"8. Preset Exome Analysis File"},{"location":"exomiser_pipeline/#9-pheval-run","text":"make pheval run","title":"9. PhEval Run"},{"location":"pipeline/","text":"PhEval Pipeline TLDR The Pipeline presented on PhEval preprint was moved to a new repository - Monarch PhEval . NOTE: The default Monarch PhEval pipeline, as proposed in the paper preprint, requires approximately 1 TB of disk space. Learn how to modify the pipeline configuration here to customize the experiments. 1. Clone Monarch PhEval git clone https://github.com/monarch-initiative/monarch_pheval.git 2. Installing PhEval Pipeline dependencies Enter in the cloned folder and enter the following commands: poetry shell poetry install 3. Executing Pipeline make pheval Pipeline Description The Pipeline is divided in three main steps 1. Data Preparation Phase The data preparation phase, checks the completeness of the disease, gene and variant input data and optionally preparing simulated VCF files if required, gives the user the ability to randomise phenotypic profiles using the PhEval corpus scramble command utility, allowing for the assessment of how well VGPAs handle noise and less specific phenotypic profiles when making predict. 2. Runner Phase The runner phase is structured into three stages: prepare, run, and post-process. 
The prepare step plays a crucial role in adapting the input data to meet the specific requirements of the tool. In the run step, the VGPA is executed, applying the selected algorithm to the prepared data and generating the tool-specific outputs. Within the run stage, an essential task is the generation of input command files for the algorithm. These files serve as collections of individual commands, each tailored to run the targeted VGPA on specific samples. These commands are configured with the appropriate inputs, outputs and specific configuration settings, allowing for the automated and efficient processing of large corpora. Finally, the post-processing step takes care of harmonising the tool-specific outputs into standardised PhEval TSV format, ensuring uniformity and ease of analysis of results from all VGPAs. In this context, the tool-specific output is condensed to provide only two essential elements, the entity of interest, which can either be a variant, gene, or disease, and its corresponding score. PhEval then assumes the responsibility of subsequent standardisation processes. This involves the reranking of the results in a uniform manner, ensuring that fair and comprehensive comparisons can be made between tools. 3. Analysis Phase In the analysis phase, PhEval generates comprehensive statistical reports based on standardised outputs from the runner phase. Customising PhEval Pipeline Experiments The PhEval pipeline is orchestrated using a Makefile Jinja template strategy. Therefore, to describe a new experiment in the pipeline, the user needs to generate a Makefile workflow based on a configuration file. In the resources folder are the following files responsible for Makefile generation: \ud83d\udce6resources \u2523 \ud83d\udcdcMakefile.j2 \u2523 \ud83d\udcdccustom.Makefile \u2523 \ud83d\udcdcgeneratemakefile.sh \u2517 \ud83d\udcdcpheval-config.yaml Let's begin by describing the pheval-config.yaml file and its structure. 
PhEval Configuration File This file is responsible define the experiment settings and will be used to generate the Makefile using a Jinja template which consumes this YAML configuration file. Directories Section The data and tmp properties are mandatory and must be specified in this section. data property refers to the folder location where the necessary phenotypic data for the pipeline will be downloaded and extracted. tmp property points to the folder where all temporary intermediate files will be generated. directories : data : data tmp : data/tmp Corpora Section The corpora section specifies which corpus will be used in the experiment. In this example is defined LIRICAL corpus, A small comparison corpus created for benchmarking the LIRICAL system which contains 385 case reports. The user needs to specify corpus id and it must be equals to the corpora folder structure, e.g. \ud83d\udce6corpora \u2503 \u2523 \ud83d\udcc2lirical \u2503 \u2523 \u2523 \ud83d\udcc2small_version \u2503 \u2523 \u2523 \u2523 \ud83d\udcc2phenopackets \u2503 \u2523 \u2523 \u2523 \u2523 \ud83d\udcdcPATIENT1.json \u2503 \u2523 \u2523 \u2523 \u2523 \ud83d\udcdcPATIENT2.json \u2503 \u2523 \u2523 \u2523 \ud83d\udcc2vcf \u2503 \u2523 \u2523 \u2523 \u2523 \ud83d\udcdcPATIENT1.vcf.gz \u2503 \u2523 \u2523 \u2523 \u2523 \ud83d\udcdcPATIENT2.vcf.gz \u2503 \u2523 \u2523 \u2523 \ud83d\udcdccorpus.yml \u2503 \u2523 \u2523 \u2523 \ud83d\udcdctemplate_exome_hg19.vcf.gz corpora : - id : lirical variant : small_version Configs Section The configs section holds all custom configurations for the different VGPAs. It must declare: - tool: VGPA tool name. - id: it's an arbiratry unique identifier that will be used in the runs section - version: VGPA tool version configs : - tool : phen2gene id : phen2gene-1.2.3 version : 1.2.3 configs section can also deal with special VGPA data preparation steps, for example, Semantic Similarity ingestions into Exomiser phenotypic database e.g. 
configs : - tool : exomiser id : exomiser-semsim-ingest-13.3.0 version : 13.3.0 phenotype : 2309 preprocessing : - phenio-monarch-hp-hp.0.4.semsimian.sql phenotype property describes the Exomiser phenotype database version and the preprocessing section will execute SQL scripts into that phenotypic database. Runs Section The \"runs\" section will integrate all previously described sections and pass them to pheval VGPA for concrete execution. tool property specifies which runner will be called corpus and corpusvariant must match properties declared on the corpora section . version should correspond to the tool version configuration must match the id described on the configuration section . runs : - tool : exomiser corpus : lirical corpusvariant : small_version version : 13.3.0 configuration : exomiser-semsim-ingest-13.3.0 Generating new Makefile based on PhEval configuration file \ud83d\udce6resources \u2523 \ud83d\udcdcgeneratemakefile.sh \u2517 \ud83d\udcdcpheval-config.yaml To generate a new Makefile, simply execute the generatemakefile.sh script, which encapsulates the Makefile rendering process dynamically filling it using the pheval-config.yaml configuration file. ./resources/generatemakefile.sh","title":"PhEval Pipeline"},{"location":"pipeline/#pheval-pipeline","text":"","title":"PhEval Pipeline"},{"location":"pipeline/#tldr","text":"The Pipeline presented on PhEval preprint was moved to a new repository - Monarch PhEval . NOTE: The default Monarch PhEval pipeline, as proposed in the paper preprint, requires approximately 1 TB of disk space. Learn how to modify the pipeline configuration here to customize the experiments.","title":"TLDR"},{"location":"pipeline/#1-clone-monarch-pheval","text":"git clone https://github.com/monarch-initiative/monarch_pheval.git","title":"1. 
Clone Monarch PhEval"},{"location":"pipeline/#2-installing-pheval-pipeline-dependencies","text":"Enter the cloned folder and run the following commands: poetry shell poetry install","title":"2. Installing PhEval Pipeline dependencies"},{"location":"pipeline/#3-executing-pipeline","text":"make pheval","title":"3. Executing Pipeline"},{"location":"pipeline/#pipeline-description","text":"The Pipeline is divided into three main steps","title":"Pipeline Description"},{"location":"pipeline/#1-data-preparation-phase","text":"The data preparation phase checks the completeness of the disease, gene and variant input data, optionally prepares simulated VCF files if required, and gives the user the ability to randomise phenotypic profiles using the PhEval corpus scramble command utility, allowing for the assessment of how well VGPAs handle noise and less specific phenotypic profiles when making predictions.","title":"1. Data Preparation Phase"},{"location":"pipeline/#2-runner-phase","text":"The runner phase is structured into three stages: prepare, run, and post-process. The prepare step plays a crucial role in adapting the input data to meet the specific requirements of the tool. In the run step, the VGPA is executed, applying the selected algorithm to the prepared data and generating the tool-specific outputs. Within the run stage, an essential task is the generation of input command files for the algorithm. These files serve as collections of individual commands, each tailored to run the targeted VGPA on specific samples. These commands are configured with the appropriate inputs, outputs and specific configuration settings, allowing for the automated and efficient processing of large corpora. Finally, the post-processing step takes care of harmonising the tool-specific outputs into standardised PhEval TSV format, ensuring uniformity and ease of analysis of results from all VGPAs. 
In this context, the tool-specific output is condensed to provide only two essential elements, the entity of interest, which can either be a variant, gene, or disease, and its corresponding score. PhEval then assumes the responsibility of subsequent standardisation processes. This involves the reranking of the results in a uniform manner, ensuring that fair and comprehensive comparisons can be made between tools.","title":"2. Runner Phase"},{"location":"pipeline/#3-analysis-phase","text":"In the analysis phase, PhEval generates comprehensive statistical reports based on standardised outputs from the runner phase.","title":"3. Analysis Phase"},{"location":"pipeline/#customising-pheval-pipeline-experiments","text":"The PhEval pipeline is orchestrated using a Makefile Jinja template strategy. Therefore, to describe a new experiment in the pipeline, the user needs to generate a Makefile workflow based on a configuration file. In the resources folder are the following files responsible for Makefile generation: \ud83d\udce6resources \u2523 \ud83d\udcdcMakefile.j2 \u2523 \ud83d\udcdccustom.Makefile \u2523 \ud83d\udcdcgeneratemakefile.sh \u2517 \ud83d\udcdcpheval-config.yaml Let's begin by describing the pheval-config.yaml file and its structure.","title":"Customising PhEval Pipeline Experiments"},{"location":"pipeline/#pheval-configuration-file","text":"This file is responsible for defining the experiment settings and will be used to generate the Makefile using a Jinja template which consumes this YAML configuration file.","title":"PhEval Configuration File"},{"location":"pipeline/#directories-section","text":"The data and tmp properties are mandatory and must be specified in this section. The data property refers to the folder location where the necessary phenotypic data for the pipeline will be downloaded and extracted. The tmp property points to the folder where all temporary intermediate files will be generated. 
directories : data : data tmp : data/tmp","title":"Directories Section"},{"location":"pipeline/#corpora-section","text":"The corpora section specifies which corpus will be used in the experiment. In this example is defined LIRICAL corpus, A small comparison corpus created for benchmarking the LIRICAL system which contains 385 case reports. The user needs to specify corpus id and it must be equals to the corpora folder structure, e.g. \ud83d\udce6corpora \u2503 \u2523 \ud83d\udcc2lirical \u2503 \u2523 \u2523 \ud83d\udcc2small_version \u2503 \u2523 \u2523 \u2523 \ud83d\udcc2phenopackets \u2503 \u2523 \u2523 \u2523 \u2523 \ud83d\udcdcPATIENT1.json \u2503 \u2523 \u2523 \u2523 \u2523 \ud83d\udcdcPATIENT2.json \u2503 \u2523 \u2523 \u2523 \ud83d\udcc2vcf \u2503 \u2523 \u2523 \u2523 \u2523 \ud83d\udcdcPATIENT1.vcf.gz \u2503 \u2523 \u2523 \u2523 \u2523 \ud83d\udcdcPATIENT2.vcf.gz \u2503 \u2523 \u2523 \u2523 \ud83d\udcdccorpus.yml \u2503 \u2523 \u2523 \u2523 \ud83d\udcdctemplate_exome_hg19.vcf.gz corpora : - id : lirical variant : small_version","title":"Corpora Section"},{"location":"pipeline/#configs-section","text":"The configs section holds all custom configurations for the different VGPAs. It must declare: - tool: VGPA tool name. - id: it's an arbiratry unique identifier that will be used in the runs section - version: VGPA tool version configs : - tool : phen2gene id : phen2gene-1.2.3 version : 1.2.3 configs section can also deal with special VGPA data preparation steps, for example, Semantic Similarity ingestions into Exomiser phenotypic database e.g. 
configs : - tool : exomiser id : exomiser-semsim-ingest-13.3.0 version : 13.3.0 phenotype : 2309 preprocessing : - phenio-monarch-hp-hp.0.4.semsimian.sql phenotype property describes the Exomiser phenotype database version and the preprocessing section will execute SQL scripts into that phenotypic database.","title":"Configs Section"},{"location":"pipeline/#runs-section","text":"The \"runs\" section will integrate all previously described sections and pass them to pheval VGPA for concrete execution. tool property specifies which runner will be called corpus and corpusvariant must match properties declared on the corpora section . version should correspond to the tool version configuration must match the id described on the configuration section . runs : - tool : exomiser corpus : lirical corpusvariant : small_version version : 13.3.0 configuration : exomiser-semsim-ingest-13.3.0","title":"Runs Section"},{"location":"pipeline/#generating-new-makefile-based-on-pheval-configuration-file","text":"\ud83d\udce6resources \u2523 \ud83d\udcdcgeneratemakefile.sh \u2517 \ud83d\udcdcpheval-config.yaml To generate a new Makefile, simply execute the generatemakefile.sh script, which encapsulates the Makefile rendering process dynamically filling it using the pheval-config.yaml configuration file. 
./resources/generatemakefile.sh","title":"Generating new Makefile based on PhEval configuration file"},{"location":"plugins/","text":"A full list of implemented PhEval runners is given below along with links to the original tools: Tool PhEval plugin Comment Exomiser Exomiser runner The link to the original tool can be found here Phen2Gene Phen2Gene runner The link to the original tool can be found here LIRICAL LIRICAL runner The link to the original tool can be found here SvAnna SvAnna runner The link to the original tool can be found here GADO GADO runner The link to the original tool can be found here Template Template runner OntoGPT OntoGPT runner ELDER ELDER runner MALCO MALCO runner AI MARRVEL AI MARRVEL runner The link to the original tool can be found here OAK OAK runner","title":"Plugins"},{"location":"roadmap/","text":"Roadmap The Roadmap is a rough plan; changes are expected throughout the year. 2023 Q1 Finalising the PhEval architecture (draft is done) End-to-end pipeline for testing PhEval with Exomiser and two versions of HPO Submitting a poster to Biocuration which outlines the full vision Q2 Focus on an analytic framework around PhEval, focusing on studying how changes to ontologies affect changes in variant prioritisation Extend phenotype pipeline to enable base releases and alternative patterns Q3 Improving the analytic framework of PhEval, especially phenotype analysis All intermediate files of pipeline have a corresponding LinkML model Focus on studying the effect of KG snippets (p2ds) on VP performance Q4 Drafting a PhEval paper Building standalone pipeline that reports changes in algorithm behaviours to ontology developers.","title":"Roadmap"},{"location":"roadmap/#roadmap","text":"The Roadmap is a rough plan; changes are expected throughout the year.","title":"Roadmap"},{"location":"roadmap/#2023","text":"","title":"2023"},{"location":"roadmap/#q1","text":"Finalising the PhEval architecture (draft is done) End-to-end pipeline for testing 
PhEval with Exomiser and two versions of HPO Submitting a poster to Biocuration which outlines the full vision","title":"Q1"},{"location":"roadmap/#q2","text":"Focus on an analytic framework around PhEval, focusing on studying how changes to ontologies affect changes in variant prioritisation Extend phenotype pipeline to enable base releases and alternative patterns","title":"Q2"},{"location":"roadmap/#q3","text":"Improving the analytic framework of PhEval, especially phenotype analysis All intermediate files of pipeline have a corresponding LinkML model Focus on studying the effect of KG snippets (p2ds) on VP performance","title":"Q3"},{"location":"roadmap/#q4","text":"Drafting a PhEval paper Building standalone pipeline that reports changes in algorithm behaviours to ontology developers.","title":"Q4"},{"location":"styleguide/","text":"Monarch Style Guide for PhEval No code in CLI methods","title":"Monarch Style Guide for PhEval"},{"location":"styleguide/#monarch-style-guide-for-pheval","text":"No code in CLI methods","title":"Monarch Style Guide for PhEval"},{"location":"api/pheval/cli/","text":"main main CLI method for PhEval Args: verbose (int, optional): Verbose flag. quiet (bool, optional): Quiet flag. Usage: main [OPTIONS] COMMAND [ARGS]... Options: Name Type Description Default -v , --verbose integer range ( 0 and above) N/A 0 -q , --quiet text N/A None --help boolean Show this message and exit. False pheval pheval Usage: pheval [OPTIONS] COMMAND [ARGS]... Options: Name Type Description Default --help boolean Show this message and exit. 
False Subcommands run : PhEval Runner Command Line Interface run PhEval Runner Command Line Interface Args: input_dir (Path): The input directory (relative path: e.g exomiser-13.11) testdata_dir (Path): The input directory (relative path: e.g ./data runner (str): Runner implementation (e.g exomiser-13.11) tmp_dir (Path): The path of the temporary directory (optional) output_dir (Path): The path of the output directory config (Path): The path of the configuration file (optional e.g., config.yaml) version (str): The version of the tool implementation Usage: pheval run [OPTIONS] Options: Name Type Description Default --input-dir , -i Path The input directory (relative path: e.g exomiser-13.11) _required --testdata-dir , -t Path The input directory (relative path: e.g ./data) _required --runner , -r text Runner implementation (e.g exomiser-13.11) _required --tmp-dir , -m Path The path of the temporary directory (optional) None --output-dir , -o Path The path of the output directory _required --config , -c Path The path of the configuration file (optional e.g config.yaml) None --version , -v text Version of the tool implementation. None --help boolean Show this message and exit. False pheval-utils pheval_utils Usage: pheval-utils [OPTIONS] COMMAND [ARGS]... Options: Name Type Description Default --help boolean Show this message and exit. False Subcommands create-spiked-vcfs : generate-benchmark-stats : Benchmark the gene/variant/disease prioritisation performance for runs. generate-stats-plot : Generate bar plot from benchmark db. prepare-corpus : scramble-phenopackets : Generate noisy phenopackets from existing ones. semsim-scramble : Scrambles semsim profile multiplying score value by scramble factor semsim-to-exomiserdb : ingests semsim file into exomiser phenotypic database update-phenopackets : Update gene symbols and identifiers for phenopackets. create-spiked-vcfs Create spiked VCF from either a Phenopacket or a Phenopacket directory. 
Args: phenopacket_path (Path): Path to a single Phenopacket file (optional). phenopacket_dir (Path): Path to a directory containing Phenopacket files (optional). output_dir (Path): The directory to store the generated spiked VCF file(s). hg19_template_vcf (Path): Path to the hg19 template VCF file (optional). hg38_template_vcf (Path): Path to the hg38 template VCF file (optional). hg19_vcf_dir (Path): Path to the directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): Path to the directory containing the hg38 VCF files (optional). Usage: pheval-utils create-spiked-vcfs [OPTIONS] Options: Name Type Description Default --phenopacket-path , -p Path Path to phenopacket. NOTE: This argument is mutually exclusive with arguments: [phenopacket_dir]. None --phenopacket-dir , -P Path Path to phenopacket directory for updating. NOTE: This argument is mutually exclusive with arguments: [phenopacket_path]. None --hg19-template-vcf , -hg19 Path Template hg19 VCF file NOTE: This argument is mutually exclusive with arguments: [hg19_vcf_dir]. None --hg38-template-vcf , -hg38 Path Template hg38 VCF file NOTE: This argument is mutually exclusive with arguments: [hg38_vcf_dir]. None --hg19-vcf-dir , -hg19-dir Path Path to directory containing hg19 VCF templates. NOTE: This argument is mutually exclusive with arguments: [hg19_template_vcf]. None --hg38-vcf-dir , -hg38-dir Path Path to directory containing hg38 VCF templates. NOTE: This argument is mutually exclusive with arguments: [hg38_template_vcf]. None --output-dir , -O Path Path for creation of output directory vcf --help boolean Show this message and exit. False generate-benchmark-stats Benchmark the gene/variant/disease prioritisation performance for runs. Usage: pheval-utils generate-benchmark-stats [OPTIONS] Options: Name Type Description Default --run-yaml , -r Path Path to yaml configuration file for benchmarking. _required --help boolean Show this message and exit. 
False generate-stats-plot Generate bar plot from benchmark db. Usage: pheval-utils generate-stats-plot [OPTIONS] Options: Name Type Description Default --benchmark-db , -b Path Path to benchmark db output by PhEval benchmark commands. _required --run-data , -r Path Path to yaml configuration file for benchmarking. _required --help boolean Show this message and exit. False prepare-corpus Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating gene identifiers. Args: phenopacket_dir (Path): The path to the directory containing Phenopackets. variant_analysis (bool): If True, check for complete variant records in the Phenopackets. gene_analysis (bool): If True, check for complete gene records in the Phenopackets. disease_analysis (bool): If True, check for complete disease records in the Phenopackets. gene_identifier (str): Identifier for updating gene identifiers, if applicable. hg19_template_vcf (Path): Path to the hg19 template VCF file (optional). hg38_template_vcf (Path): Path to the hg38 template VCF file (optional). hg19_vcf_dir (Path): Path to the directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): Path to the directory containing the hg38 VCF files (optional). output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files. Notes: To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir or hg38_vcf_dir is required. Usage: pheval-utils prepare-corpus [OPTIONS] Options: Name Type Description Default --phenopacket-dir , -p Path Path to phenopacket corpus directory.. _required --variant-analysis / --no-variant-analysis boolean Specify whether to check for complete variant records in the phenopackets. False --gene-analysis / --no-gene-analysis boolean Specify whether to check for complete gene records in the phenopackets. 
False --disease-analysis / --no-disease-analysis boolean Specify whether to check for complete disease records in the phenopackets. False --gene-identifier , -g choice ( ensembl_id | entrez_id | hgnc_id ) Gene identifier to update in phenopacket None --hg19-template-vcf , -hg19 Path Template hg19 VCF file NOTE: This argument is mutually exclusive with arguments: [hg19_vcf_dir]. None --hg38-template-vcf , -hg38 Path Template hg38 VCF file NOTE: This argument is mutually exclusive with arguments: [hg38_vcf_dir]. None --hg19-vcf-dir , -hg19-dir Path Path to directory containing hg19 VCF templates. NOTE: This argument is mutually exclusive with arguments: [hg19_template_vcf]. None --hg38-vcf-dir , -hg38-dir Path Path to directory containing hg38 VCF templates. NOTE: This argument is mutually exclusive with arguments: [hg38_template_vcf]. None --output-dir , -o Path Path to output prepared corpus. prepared_corpus --help boolean Show this message and exit. False scramble-phenopackets Generate noisy phenopackets from existing ones. Usage: pheval-utils scramble-phenopackets [OPTIONS] Options: Name Type Description Default --phenopacket-path , -p Path Path to phenopacket. NOTE: This argument is mutually exclusive with arguments: [phenopacket_dir]. None --phenopacket-dir , -P Path Path to phenopackets directory. NOTE: This argument is mutually exclusive with arguments: [phenopacket_path]. None --scramble-factor , -s float Scramble factor for randomising phenopacket phenotypic profiles. 0.5 --output-dir , -O Path Path for creation of output directory noisy_phenopackets --local-ontology-cache , -l Path Path to the local ontology cache, e.g., path to the hp.obo. None --help boolean Show this message and exit. 
False semsim-scramble Scrambles semsim profile multiplying score value by scramble factor Args: input (Path): Path file that points out to the semsim profile output (Path): Path file that points out to the output file score_column (List[str]): Score column(s) that will be scrambled scramble_factor (float): Scramble Magnitude Usage: pheval-utils semsim-scramble [OPTIONS] Options: Name Type Description Default --input , -i Path Path to the semantic similarity profile to be scrambled. _required --output , -o Path Path where the scrambled semsim file will be written. _required --score-column , -c choice ( jaccard_similarity | dice_similarity | phenodigm_score ) Score column that will be scrambled _required --scramble-factor , -s float Scramble Magnitude (noise) that will be applied to semantic similarity score column (e.g. jaccard similarity). 0.5 --help boolean Show this message and exit. False semsim-to-exomiserdb ingests semsim file into exomiser phenotypic database Args: input_file (Path): semsim input file. e.g phenio-plus-hp-mp.0.semsimian.tsv object_prefix (str): object prefix. e.g. MP subject_prefix (str): subject prefix e.g HP db_path (Path): Exomiser Phenotypic Database Folder Path. (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/) Usage: pheval-utils semsim-to-exomiserdb [OPTIONS] Options: Name Type Description Default --input-file , -i Path Semsim input file. _required --object-prefix text Object Prefix. e.g. MP _required --subject-prefix text Subject Prefix. e.g. HP _required --db-path , -d Path Exomiser Phenotypic Database Folder Path. (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/). This is the path where the phenotypic database folder will be written out. _required --help boolean Show this message and exit. False update-phenopackets Update gene symbols and identifiers for phenopackets. Usage: pheval-utils update-phenopackets [OPTIONS] Options: Name Type Description Default --phenopacket-path , -p Path Path to phenopacket. 
NOTE: This argument is mutually exclusive with arguments: [phenopacket_dir]. None --phenopacket-dir , -P Path Path to phenopacket directory for updating. NOTE: This argument is mutually exclusive with arguments: [phenopacket_path]. None --output-dir , -o Path Path to write phenopacket. _required --gene-identifier , -g choice ( ensembl_id | entrez_id | hgnc_id ) Gene identifier to add to phenopacket ensembl_id --help boolean Show this message and exit. False","title":"Cli"},{"location":"api/pheval/cli/#main","text":"main CLI method for PhEval Args: verbose (int, optional): Verbose flag. quiet (bool, optional): Quiet Flag. Usage: main [OPTIONS] COMMAND [ARGS]... Options: Name Type Description Default -v , --verbose integer range ( 0 and above) N/A 0 -q , --quiet text N/A None --help boolean Show this message and exit. 
False Subcommands run : PhEval Runner Command Line Interface","title":"pheval"},{"location":"api/pheval/cli/#run","text":"PhEval Runner Command Line Interface Args: input_dir (Path): The input directory (relative path: e.g exomiser-13.11) testdata_dir (Path): The input directory (relative path: e.g ./data runner (str): Runner implementation (e.g exomiser-13.11) tmp_dir (Path): The path of the temporary directory (optional) output_dir (Path): The path of the output directory config (Path): The path of the configuration file (optional e.g., config.yaml) version (str): The version of the tool implementation Usage: pheval run [OPTIONS] Options: Name Type Description Default --input-dir , -i Path The input directory (relative path: e.g exomiser-13.11) _required --testdata-dir , -t Path The input directory (relative path: e.g ./data) _required --runner , -r text Runner implementation (e.g exomiser-13.11) _required --tmp-dir , -m Path The path of the temporary directory (optional) None --output-dir , -o Path The path of the output directory _required --config , -c Path The path of the configuration file (optional e.g config.yaml) None --version , -v text Version of the tool implementation. None --help boolean Show this message and exit. False","title":"run"},{"location":"api/pheval/cli/#pheval-utils","text":"pheval_utils Usage: pheval-utils [OPTIONS] COMMAND [ARGS]... Options: Name Type Description Default --help boolean Show this message and exit. False Subcommands create-spiked-vcfs : generate-benchmark-stats : Benchmark the gene/variant/disease prioritisation performance for runs. generate-stats-plot : Generate bar plot from benchmark db. prepare-corpus : scramble-phenopackets : Generate noisy phenopackets from existing ones. 
semsim-scramble : Scrambles semsim profile multiplying score value by scramble factor semsim-to-exomiserdb : ingests semsim file into exomiser phenotypic database update-phenopackets : Update gene symbols and identifiers for phenopackets.","title":"pheval-utils"},{"location":"api/pheval/cli/#create-spiked-vcfs","text":"Create spiked VCF from either a Phenopacket or a Phenopacket directory. Args: phenopacket_path (Path): Path to a single Phenopacket file (optional). phenopacket_dir (Path): Path to a directory containing Phenopacket files (optional). output_dir (Path): The directory to store the generated spiked VCF file(s). hg19_template_vcf (Path): Path to the hg19 template VCF file (optional). hg38_template_vcf (Path): Path to the hg38 template VCF file (optional). hg19_vcf_dir (Path): Path to the directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): Path to the directory containing the hg38 VCF files (optional). Usage: pheval-utils create-spiked-vcfs [OPTIONS] Options: Name Type Description Default --phenopacket-path , -p Path Path to phenopacket. NOTE: This argument is mutually exclusive with arguments: [phenopacket_dir]. None --phenopacket-dir , -P Path Path to phenopacket directory for updating. NOTE: This argument is mutually exclusive with arguments: [phenopacket_path]. None --hg19-template-vcf , -hg19 Path Template hg19 VCF file NOTE: This argument is mutually exclusive with arguments: [hg19_vcf_dir]. None --hg38-template-vcf , -hg38 Path Template hg38 VCF file NOTE: This argument is mutually exclusive with arguments: [hg38_vcf_dir]. None --hg19-vcf-dir , -hg19-dir Path Path to directory containing hg19 VCF templates. NOTE: This argument is mutually exclusive with arguments: [hg19_template_vcf]. None --hg38-vcf-dir , -hg38-dir Path Path to directory containing hg38 VCF templates. NOTE: This argument is mutually exclusive with arguments: [hg38_template_vcf]. 
None --output-dir , -O Path Path for creation of output directory vcf --help boolean Show this message and exit. False","title":"create-spiked-vcfs"},{"location":"api/pheval/cli/#generate-benchmark-stats","text":"Benchmark the gene/variant/disease prioritisation performance for runs. Usage: pheval-utils generate-benchmark-stats [OPTIONS] Options: Name Type Description Default --run-yaml , -r Path Path to yaml configuration file for benchmarking. _required --help boolean Show this message and exit. False","title":"generate-benchmark-stats"},{"location":"api/pheval/cli/#generate-stats-plot","text":"Generate bar plot from benchmark db. Usage: pheval-utils generate-stats-plot [OPTIONS] Options: Name Type Description Default --benchmark-db , -b Path Path to benchmark db output by PhEval benchmark commands. _required --run-data , -r Path Path to yaml configuration file for benchmarking. _required --help boolean Show this message and exit. False","title":"generate-stats-plot"},{"location":"api/pheval/cli/#prepare-corpus","text":"Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating gene identifiers. Args: phenopacket_dir (Path): The path to the directory containing Phenopackets. variant_analysis (bool): If True, check for complete variant records in the Phenopackets. gene_analysis (bool): If True, check for complete gene records in the Phenopackets. disease_analysis (bool): If True, check for complete disease records in the Phenopackets. gene_identifier (str): Identifier for updating gene identifiers, if applicable. hg19_template_vcf (Path): Path to the hg19 template VCF file (optional). hg38_template_vcf (Path): Path to the hg38 template VCF file (optional). hg19_vcf_dir (Path): Path to the directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): Path to the directory containing the hg38 VCF files (optional). 
output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files. Notes: To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir or hg38_vcf_dir is required. Usage: pheval-utils prepare-corpus [OPTIONS] Options: Name Type Description Default --phenopacket-dir , -p Path Path to phenopacket corpus directory.. _required --variant-analysis / --no-variant-analysis boolean Specify whether to check for complete variant records in the phenopackets. False --gene-analysis / --no-gene-analysis boolean Specify whether to check for complete gene records in the phenopackets. False --disease-analysis / --no-disease-analysis boolean Specify whether to check for complete disease records in the phenopackets. False --gene-identifier , -g choice ( ensembl_id | entrez_id | hgnc_id ) Gene identifier to update in phenopacket None --hg19-template-vcf , -hg19 Path Template hg19 VCF file NOTE: This argument is mutually exclusive with arguments: [hg19_vcf_dir]. None --hg38-template-vcf , -hg38 Path Template hg38 VCF file NOTE: This argument is mutually exclusive with arguments: [hg38_vcf_dir]. None --hg19-vcf-dir , -hg19-dir Path Path to directory containing hg19 VCF templates. NOTE: This argument is mutually exclusive with arguments: [hg19_template_vcf]. None --hg38-vcf-dir , -hg38-dir Path Path to directory containing hg38 VCF templates. NOTE: This argument is mutually exclusive with arguments: [hg38_template_vcf]. None --output-dir , -o Path Path to output prepared corpus. prepared_corpus --help boolean Show this message and exit. False","title":"prepare-corpus"},{"location":"api/pheval/cli/#scramble-phenopackets","text":"Generate noisy phenopackets from existing ones. Usage: pheval-utils scramble-phenopackets [OPTIONS] Options: Name Type Description Default --phenopacket-path , -p Path Path to phenopacket. NOTE: This argument is mutually exclusive with arguments: [phenopacket_dir]. 
None --phenopacket-dir , -P Path Path to phenopackets directory. NOTE: This argument is mutually exclusive with arguments: [phenopacket_path]. None --scramble-factor , -s float Scramble factor for randomising phenopacket phenotypic profiles. 0.5 --output-dir , -O Path Path for creation of output directory noisy_phenopackets --local-ontology-cache , -l Path Path to the local ontology cache, e.g., path to the hp.obo. None --help boolean Show this message and exit. False","title":"scramble-phenopackets"},{"location":"api/pheval/cli/#semsim-scramble","text":"Scrambles semsim profile multiplying score value by scramble factor Args: input (Path): Path file that points out to the semsim profile output (Path): Path file that points out to the output file score_column (List[str]): Score column(s) that will be scrambled scramble_factor (float): Scramble Magnitude Usage: pheval-utils semsim-scramble [OPTIONS] Options: Name Type Description Default --input , -i Path Path to the semantic similarity profile to be scrambled. _required --output , -o Path Path where the scrambled semsim file will be written. _required --score-column , -c choice ( jaccard_similarity | dice_similarity | phenodigm_score ) Score column that will be scrambled _required --scramble-factor , -s float Scramble Magnitude (noise) that will be applied to semantic similarity score column (e.g. jaccard similarity). 0.5 --help boolean Show this message and exit. False","title":"semsim-scramble"},{"location":"api/pheval/cli/#semsim-to-exomiserdb","text":"ingests semsim file into exomiser phenotypic database Args: input_file (Path): semsim input file. e.g phenio-plus-hp-mp.0.semsimian.tsv object_prefix (str): object prefix. e.g. MP subject_prefix (str): subject prefix e.g HP db_path (Path): Exomiser Phenotypic Database Folder Path. (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/) Usage: pheval-utils semsim-to-exomiserdb [OPTIONS] Options: Name Type Description Default --input-file , -i Path Semsim input file. 
_required --object-prefix text Object Prefix. e.g. MP _required --subject-prefix text Subject Prefix. e.g. HP _required --db-path , -d Path Exomiser Phenotypic Database Folder Path. (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/). This is the path where the phenotypic database folder will be written out. _required --help boolean Show this message and exit. False","title":"semsim-to-exomiserdb"},{"location":"api/pheval/cli/#update-phenopackets","text":"Update gene symbols and identifiers for phenopackets. Usage: pheval-utils update-phenopackets [OPTIONS] Options: Name Type Description Default --phenopacket-path , -p Path Path to phenopacket. NOTE: This argument is mutually exclusive with arguments: [phenopacket_dir]. None --phenopacket-dir , -P Path Path to phenopacket directory for updating. NOTE: This argument is mutually exclusive with arguments: [phenopacket_path]. None --output-dir , -o Path Path to write phenopacket. _required --gene-identifier , -g choice ( ensembl_id | entrez_id | hgnc_id ) Gene identifier to add to phenopacket ensembl_id --help boolean Show this message and exit. False","title":"update-phenopackets"},{"location":"api/pheval/config_parser/","text":"InputDirConfig dataclass Class for defining the fields within the input directory config. Parameters: Name Type Description Default tool str Name of the tool implementation (e.g. exomiser/phen2gene) required tool_version str Version of the tool implementation required variant_analysis bool Whether to extract prioritised variants from results. required gene_analysis bool Whether to extract prioritised genes from results. required disease_analysis bool Whether to extract prioritised diseases from results. 
required tool_specific_configuration_options Any Tool specific configurations required Source code in src/pheval/config_parser.py 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 @serde @dataclass class InputDirConfig : \"\"\" Class for defining the fields within the input directory config. Args: tool (str): Name of the tool implementation (e.g. exomiser/phen2gene) tool_version (str): Version of the tool implementation variant_analysis (bool): Whether to extract prioritised variants from results. gene_analysis (bool): Whether to extract prioritised genes from results. disease_analysis (bool): Whether to extract prioritised diseases from results. tool_specific_configuration_options (Any): Tool specific configurations \"\"\" tool : str tool_version : str variant_analysis : bool gene_analysis : bool disease_analysis : bool tool_specific_configuration_options : Any parse_input_dir_config ( input_dir ) Reads the config file. Source code in src/pheval/config_parser.py 35 36 37 38 39 40 def parse_input_dir_config ( input_dir : Path ) -> InputDirConfig : \"\"\"Reads the config file.\"\"\" with open ( Path ( input_dir ) . joinpath ( \"config.yaml\" ), \"r\" ) as config_file : config = yaml . safe_load ( config_file ) config_file . close () return from_yaml ( InputDirConfig , yaml . dump ( config ))","title":"Config parser"},{"location":"api/pheval/config_parser/#src.pheval.config_parser.InputDirConfig","text":"Class for defining the fields within the input directory config. Parameters: Name Type Description Default tool str Name of the tool implementation (e.g. exomiser/phen2gene) required tool_version str Version of the tool implementation required variant_analysis bool Whether to extract prioritised variants from results. required gene_analysis bool Whether to extract prioritised genes from results. required disease_analysis bool Whether to extract prioritised diseases from results. 
required tool_specific_configuration_options Any Tool specific configurations required Source code in src/pheval/config_parser.py 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 @serde @dataclass class InputDirConfig : \"\"\" Class for defining the fields within the input directory config. Args: tool (str): Name of the tool implementation (e.g. exomiser/phen2gene) tool_version (str): Version of the tool implementation variant_analysis (bool): Whether to extract prioritised variants from results. gene_analysis (bool): Whether to extract prioritised genes from results. disease_analysis (bool): Whether to extract prioritised diseases from results. tool_specific_configuration_options (Any): Tool specific configurations \"\"\" tool : str tool_version : str variant_analysis : bool gene_analysis : bool disease_analysis : bool tool_specific_configuration_options : Any","title":"InputDirConfig"},{"location":"api/pheval/config_parser/#src.pheval.config_parser.parse_input_dir_config","text":"Reads the config file. Source code in src/pheval/config_parser.py 35 36 37 38 39 40 def parse_input_dir_config ( input_dir : Path ) -> InputDirConfig : \"\"\"Reads the config file.\"\"\" with open ( Path ( input_dir ) . joinpath ( \"config.yaml\" ), \"r\" ) as config_file : config = yaml . safe_load ( config_file ) config_file . close () return from_yaml ( InputDirConfig , yaml . dump ( config ))","title":"parse_input_dir_config"},{"location":"api/pheval/run_metadata/","text":"BasicOutputRunMetaData dataclass Class for defining variables for the run metadata. 
Args: tool (str): Name of the tool implementation tool_version (str): Version of the tool implementation config (Path): Path to the config file located in the input directory run_timestamp (int): Time taken for run to complete corpus (Path): Path to corpus used in pheval run tool_specific_configuration_options (Any): Special field that can be overwritten by tool implementations to contain any extra tool specific configurations used in the run Source code in src/pheval/run_metadata.py 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 @serde @dataclass class BasicOutputRunMetaData : \"\"\"Class for defining variables for the run metadata. Args: tool (str): Name of the tool implementation tool_version (str): Version of the tool implementation config (Path): Path to the config file located in the input directory run_timestamp (int): Time taken for run to complete corpus (Path): Path to corpus used in pheval run tool_specific_configuration_options (Any): Special field that can be overwritten by tool implementations to contain any extra tool specific configurations used in the run \"\"\" tool : str tool_version : str config : Path run_timestamp : int corpus : Path tool_specific_configuration_options : Any = None","title":"Run metadata"},{"location":"api/pheval/run_metadata/#src.pheval.run_metadata.BasicOutputRunMetaData","text":"Class for defining variables for the run metadata. 
Args: tool (str): Name of the tool implementation tool_version (str): Version of the tool implementation config (Path): Path to the config file located in the input directory run_timestamp (int): Time taken for run to complete corpus (Path): Path to corpus used in pheval run tool_specific_configuration_options (Any): Special field that can be overwritten by tool implementations to contain any extra tool specific configurations used in the run Source code in src/pheval/run_metadata.py 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 @serde @dataclass class BasicOutputRunMetaData : \"\"\"Class for defining variables for the run metadata. Args: tool (str): Name of the tool implementation tool_version (str): Version of the tool implementation config (Path): Path to the config file located in the input directory run_timestamp (int): Time taken for run to complete corpus (Path): Path to corpus used in pheval run tool_specific_configuration_options (Any): Special field that can be overwritten by tool implementations to contain any extra tool specific configurations used in the run \"\"\" tool : str tool_version : str config : Path run_timestamp : int corpus : Path tool_specific_configuration_options : Any = None","title":"BasicOutputRunMetaData"},{"location":"api/pheval/analyse/analysis/","text":"benchmark_run_comparisons ( run_config ) Benchmark prioritisation performance for several runs. Parameters: Name Type Description Default run_config Config Run configurations. required Source code in src/pheval/analyse/analysis.py 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 def benchmark_run_comparisons ( run_config : Config , ) -> None : \"\"\" Benchmark prioritisation performance for several runs. Args: run_config (Config): Run configurations. \"\"\" gene_analysis_runs = Config ( benchmark_name = run_config . benchmark_name , runs = [ run for run in run_config . 
runs if run . gene_analysis ], plot_customisation = run_config . plot_customisation , ) variant_analysis_runs = Config ( benchmark_name = run_config . benchmark_name , runs = [ run for run in run_config . runs if run . variant_analysis ], plot_customisation = run_config . plot_customisation , ) disease_analysis_runs = Config ( benchmark_name = run_config . benchmark_name , runs = [ run for run in run_config . runs if run . disease_analysis ], plot_customisation = run_config . plot_customisation , ) if gene_analysis_runs . runs : _run_benchmark_comparison ( run_config = gene_analysis_runs , benchmark_generator = GeneBenchmarkRunOutputGenerator ( plot_customisation = gene_analysis_runs . plot_customisation . gene_plots ), ) if variant_analysis_runs . runs : _run_benchmark_comparison ( run_config = variant_analysis_runs , benchmark_generator = VariantBenchmarkRunOutputGenerator ( plot_customisation = variant_analysis_runs . plot_customisation . variant_plots ), ) if disease_analysis_runs . runs : _run_benchmark_comparison ( run_config = disease_analysis_runs , benchmark_generator = DiseaseBenchmarkRunOutputGenerator ( plot_customisation = disease_analysis_runs . plot_customisation . disease_plots ), )","title":"Analysis"},{"location":"api/pheval/analyse/analysis/#src.pheval.analyse.analysis.benchmark_run_comparisons","text":"Benchmark prioritisation performance for several runs. Parameters: Name Type Description Default run_config Config Run configurations. required Source code in src/pheval/analyse/analysis.py 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 def benchmark_run_comparisons ( run_config : Config , ) -> None : \"\"\" Benchmark prioritisation performance for several runs. Args: run_config (Config): Run configurations. \"\"\" gene_analysis_runs = Config ( benchmark_name = run_config . benchmark_name , runs = [ run for run in run_config . runs if run . 
gene_analysis ], plot_customisation = run_config . plot_customisation , ) variant_analysis_runs = Config ( benchmark_name = run_config . benchmark_name , runs = [ run for run in run_config . runs if run . variant_analysis ], plot_customisation = run_config . plot_customisation , ) disease_analysis_runs = Config ( benchmark_name = run_config . benchmark_name , runs = [ run for run in run_config . runs if run . disease_analysis ], plot_customisation = run_config . plot_customisation , ) if gene_analysis_runs . runs : _run_benchmark_comparison ( run_config = gene_analysis_runs , benchmark_generator = GeneBenchmarkRunOutputGenerator ( plot_customisation = gene_analysis_runs . plot_customisation . gene_plots ), ) if variant_analysis_runs . runs : _run_benchmark_comparison ( run_config = variant_analysis_runs , benchmark_generator = VariantBenchmarkRunOutputGenerator ( plot_customisation = variant_analysis_runs . plot_customisation . variant_plots ), ) if disease_analysis_runs . runs : _run_benchmark_comparison ( run_config = disease_analysis_runs , benchmark_generator = DiseaseBenchmarkRunOutputGenerator ( plot_customisation = disease_analysis_runs . plot_customisation . disease_plots ), )","title":"benchmark_run_comparisons"},{"location":"api/pheval/analyse/assess_prioritisation_base/","text":"AssessPrioritisationBase Source code in src/pheval/analyse/assess_prioritisation_base.py 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 class AssessPrioritisationBase : def __init__ ( self , db_connection : BenchmarkDBManager , table_name : str , column : str , threshold : float , score_order : str , ): \"\"\" Initialise AssessPrioritisationBase class Args: db_connection (BenchmarkDBManager): DB connection. 
table_name (str): Table name. column (str): Column name. threshold (float): Threshold for scores score_order (str): Score order for results, either ascending or descending \"\"\" self . threshold = threshold self . score_order = score_order self . db_connection = db_connection self . conn = db_connection . conn self . column = column self . table_name = table_name db_connection . add_column_integer_default ( table_name = table_name , column = self . column , default = 0 ) def _assess_with_threshold_ascending_order ( self , result_entry : Union [ RankedPhEvalGeneResult , RankedPhEvalDiseaseResult , RankedPhEvalVariantResult ], ) -> int : \"\"\" Record the prioritisation rank if it meets the ascending order threshold. Args: result_entry (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]): Ranked PhEval result entry Returns: int: Recorded prioritisation rank \"\"\" if float ( self . threshold ) > float ( result_entry . score ): return result_entry . rank else : return 0 def _assess_with_threshold ( self , result_entry : Union [ RankedPhEvalGeneResult , RankedPhEvalDiseaseResult , RankedPhEvalVariantResult ], ) -> int : \"\"\" Record the prioritisation rank if it meets the score threshold. Args: result_entry (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]): Ranked PhEval result entry Returns: int: Recorded prioritisation rank \"\"\" if float ( self . threshold ) < float ( result_entry . score ): return result_entry . rank else : return 0 def _record_matched_entity ( self , standardised_result : Union [ RankedPhEvalGeneResult , RankedPhEvalDiseaseResult , RankedPhEvalVariantResult ], ) -> int : \"\"\" Return the rank result - handling the specification of a threshold. Args: standardised_result (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]): Ranked PhEval disease result entry Returns: int: Recorded entity prioritisation rank \"\"\" if float ( self . 
threshold ) == 0.0 : return standardised_result . rank else : return ( self . _assess_with_threshold ( standardised_result ) if self . score_order != \"ascending\" else self . _assess_with_threshold_ascending_order ( standardised_result , ) ) __init__ ( db_connection , table_name , column , threshold , score_order ) Initialise AssessPrioritisationBase class Parameters: Name Type Description Default db_connection BenchmarkDBManager DB connection. required table_name str Table name. required column str Column name. required threshold float Threshold for scores required score_order str Score order for results, either ascending or descending required Source code in src/pheval/analyse/assess_prioritisation_base.py 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 def __init__ ( self , db_connection : BenchmarkDBManager , table_name : str , column : str , threshold : float , score_order : str , ): \"\"\" Initialise AssessPrioritisationBase class Args: db_connection (BenchmarkDBManager): DB connection. table_name (str): Table name. column (str): Column name. threshold (float): Threshold for scores score_order (str): Score order for results, either ascending or descending \"\"\" self . threshold = threshold self . score_order = score_order self . db_connection = db_connection self . conn = db_connection . conn self . column = column self . table_name = table_name db_connection . add_column_integer_default ( table_name = table_name , column = self . 
column , default = 0 )","title":"Assess prioritisation base"},{"location":"api/pheval/analyse/assess_prioritisation_base/#src.pheval.analyse.assess_prioritisation_base.AssessPrioritisationBase","text":"Source code in src/pheval/analyse/assess_prioritisation_base.py 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 class AssessPrioritisationBase : def __init__ ( self , db_connection : BenchmarkDBManager , table_name : str , column : str , threshold : float , score_order : str , ): \"\"\" Initialise AssessPrioritisationBase class Args: db_connection (BenchmarkDBManager): DB connection. table_name (str): Table name. column (str): Column name. threshold (float): Threshold for scores score_order (str): Score order for results, either ascending or descending \"\"\" self . threshold = threshold self . score_order = score_order self . db_connection = db_connection self . conn = db_connection . conn self . column = column self . table_name = table_name db_connection . add_column_integer_default ( table_name = table_name , column = self . column , default = 0 ) def _assess_with_threshold_ascending_order ( self , result_entry : Union [ RankedPhEvalGeneResult , RankedPhEvalDiseaseResult , RankedPhEvalVariantResult ], ) -> int : \"\"\" Record the prioritisation rank if it meets the ascending order threshold. Args: result_entry (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]): Ranked PhEval result entry Returns: int: Recorded prioritisation rank \"\"\" if float ( self . threshold ) > float ( result_entry . score ): return result_entry . 
rank else : return 0 def _assess_with_threshold ( self , result_entry : Union [ RankedPhEvalGeneResult , RankedPhEvalDiseaseResult , RankedPhEvalVariantResult ], ) -> int : \"\"\" Record the prioritisation rank if it meets the score threshold. Args: result_entry (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]): Ranked PhEval result entry Returns: int: Recorded prioritisation rank \"\"\" if float ( self . threshold ) < float ( result_entry . score ): return result_entry . rank else : return 0 def _record_matched_entity ( self , standardised_result : Union [ RankedPhEvalGeneResult , RankedPhEvalDiseaseResult , RankedPhEvalVariantResult ], ) -> int : \"\"\" Return the rank result - handling the specification of a threshold. Args: standardised_result (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]): Ranked PhEval disease result entry Returns: int: Recorded entity prioritisation rank \"\"\" if float ( self . threshold ) == 0.0 : return standardised_result . rank else : return ( self . _assess_with_threshold ( standardised_result ) if self . score_order != \"ascending\" else self . _assess_with_threshold_ascending_order ( standardised_result , ) )","title":"AssessPrioritisationBase"},{"location":"api/pheval/analyse/assess_prioritisation_base/#src.pheval.analyse.assess_prioritisation_base.AssessPrioritisationBase.__init__","text":"Initialise AssessPrioritisationBase class Parameters: Name Type Description Default db_connection BenchmarkDBManager DB connection. required table_name str Table name. required column str Column name. 
required threshold float Threshold for scores required score_order str Score order for results, either ascending or descending required Source code in src/pheval/analyse/assess_prioritisation_base.py 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 def __init__ ( self , db_connection : BenchmarkDBManager , table_name : str , column : str , threshold : float , score_order : str , ): \"\"\" Initialise AssessPrioritisationBase class Args: db_connection (BenchmarkDBManager): DB connection. table_name (str): Table name. column (str): Column name. threshold (float): Threshold for scores score_order (str): Score order for results, either ascending or descending \"\"\" self . threshold = threshold self . score_order = score_order self . db_connection = db_connection self . conn = db_connection . conn self . column = column self . table_name = table_name db_connection . add_column_integer_default ( table_name = table_name , column = self . column , default = 0 )","title":"__init__"},{"location":"api/pheval/analyse/benchmark_db_manager/","text":"BenchmarkDBManager Class to connect to database. Source code in src/pheval/analyse/benchmark_db_manager.py 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 class BenchmarkDBManager : \"\"\" Class to connect to database. \"\"\" def __init__ ( self , benchmark_name : str ): \"\"\"Initialise the BenchmarkDBManager class.\"\"\" self . conn = self . get_connection ( f \" { benchmark_name } \" if str ( benchmark_name ) . 
endswith ( \".db\" ) else f \" { benchmark_name } .db\" ) def initialise ( self ): \"\"\"Initialise the duckdb connection.\"\"\" self . add_contains_function () @staticmethod def get_connection ( db_name : str ) -> DuckDBPyConnection : \"\"\" Get a connection to the database. Returns: DuckDBPyConnection: Connection to the database. \"\"\" conn = duckdb . connect ( db_name ) return conn def add_column_integer_default ( self , table_name : str , column : str , default : int = 0 ) -> None : \"\"\" Add a column to an existing table with an integer default value. Args: table_name (str): Name of the table. column (str): Name of the column to add. default (int): Default integer value to add. \"\"\" try : self . conn . execute ( f 'ALTER TABLE { table_name } ADD COLUMN \" { column } \" INTEGER DEFAULT { default } ' ) self . conn . execute ( f 'UPDATE { table_name } SET \" { column } \" = ?' , ( default ,)) self . conn . commit () except duckdb . CatalogException : pass def drop_table ( self , table_name : str ) -> None : \"\"\" Drop a table from the database. Args: table_name: Name of the table to drop from the database \"\"\" self . conn . execute ( f \"\"\"DROP TABLE IF EXISTS \" { table_name } \";\"\"\" ) @staticmethod def contains_entity_function ( entity : str , known_causative_entity : str ) -> bool : \"\"\" Determines if a known causative entity is present within an entity or list of entities. Args: entity (str): The entity to be checked. It can be a single entity or a string representation of a list. known_causative_entity (str): The entity to search for within the `entity`. Returns: bool: `True` if `known_causative_entity` is found in `entity` (or its list representation), `False` otherwise. \"\"\" list_pattern = re . compile ( r \"^\\[\\s*(?:[^\\[\\],\\s]+(?:\\s*,\\s*[^\\[\\],\\s]+)*)?\\s*]$\" ) if list_pattern . match ( str ( entity )): list_representation = ast . 
literal_eval ( entity ) if isinstance ( list_representation , list ): return known_causative_entity in list_representation return known_causative_entity == entity def add_contains_function ( self ) -> None : \"\"\" Adds a custom `contains_entity_function` to the DuckDB connection if it does not already exist. \"\"\" result = self . conn . execute ( \"SELECT * FROM duckdb_functions() WHERE function_name = ?\" , [ \"contains_entity_function\" ] ) . fetchall () if not result : self . conn . create_function ( \"contains_entity_function\" , self . contains_entity_function ) def parse_table_into_dataclass ( self , table_name : str , dataclass : Union [ Type [ RankedPhEvalGeneResult ], Type [ RankedPhEvalVariantResult ], Type [ RankedPhEvalDiseaseResult ], ], ) -> Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ]: \"\"\" Parses a DuckDB table into a list of dataclass instances. Args: table_name (str): The name of the DuckDB table to be parsed. dataclass (Union[Type[RankedPhEvalGeneResult], Type[RankedPhEvalVariantResult], Type[RankedPhEvalDiseaseResult]]): The dataclass type to which each row in the table should be mapped. Returns: List[dataclass]: A list of instances of the provided dataclass, each representing a row from the table. \"\"\" result = ( self . conn . execute ( f \"SELECT * FROM ' { table_name } '\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) return [ dataclass ( ** row ) for row in result ] def check_table_exists ( self , table_name : str ) -> bool : \"\"\" Check if a table exists in the connected DuckDB database. Args: table_name (str): The name of the table to check for existence. Returns: bool: Returns `True` if the table exists in the database, `False` otherwise. \"\"\" result = self . conn . execute ( f \"SELECT * FROM information_schema.tables WHERE table_name = ' { table_name } '\" ) . 
fetchall () if result : return True return False def close ( self ): \"\"\"Close the connection to the database.\"\"\" self . conn . close () __init__ ( benchmark_name ) Initialise the BenchmarkDBManager class. Source code in src/pheval/analyse/benchmark_db_manager.py 20 21 22 23 24 def __init__ ( self , benchmark_name : str ): \"\"\"Initialise the BenchmarkDBManager class.\"\"\" self . conn = self . get_connection ( f \" { benchmark_name } \" if str ( benchmark_name ) . endswith ( \".db\" ) else f \" { benchmark_name } .db\" ) add_column_integer_default ( table_name , column , default = 0 ) Add a column to an existing table with an integer default value. Args: table_name (str): Name of the table. column (str): Name of the column to add. default (int): Default integer value to add. Source code in src/pheval/analyse/benchmark_db_manager.py 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 def add_column_integer_default ( self , table_name : str , column : str , default : int = 0 ) -> None : \"\"\" Add a column to an existing table with an integer default value. Args: table_name (str): Name of the table. column (str): Name of the column to add. default (int): Default integer value to add. \"\"\" try : self . conn . execute ( f 'ALTER TABLE { table_name } ADD COLUMN \" { column } \" INTEGER DEFAULT { default } ' ) self . conn . execute ( f 'UPDATE { table_name } SET \" { column } \" = ?' , ( default ,)) self . conn . commit () except duckdb . CatalogException : pass add_contains_function () Adds a custom contains_entity_function to the DuckDB connection if it does not already exist. Source code in src/pheval/analyse/benchmark_db_manager.py 84 85 86 87 88 89 90 91 92 def add_contains_function ( self ) -> None : \"\"\" Adds a custom `contains_entity_function` to the DuckDB connection if it does not already exist. \"\"\" result = self . conn . execute ( \"SELECT * FROM duckdb_functions() WHERE function_name = ?\" , [ \"contains_entity_function\" ] ) . 
fetchall () if not result : self . conn . create_function ( \"contains_entity_function\" , self . contains_entity_function ) check_table_exists ( table_name ) Check if a table exists in the connected DuckDB database. Args: table_name (str): The name of the table to check for existence. Returns: bool: Returns True if the table exists in the database, False otherwise. Source code in src/pheval/analyse/benchmark_db_manager.py 123 124 125 126 127 128 129 130 131 132 133 134 135 136 def check_table_exists ( self , table_name : str ) -> bool : \"\"\" Check if a table exists in the connected DuckDB database. Args: table_name (str): The name of the table to check for existence. Returns: bool: Returns `True` if the table exists in the database, `False` otherwise. \"\"\" result = self . conn . execute ( f \"SELECT * FROM information_schema.tables WHERE table_name = ' { table_name } '\" ) . fetchall () if result : return True return False close () Close the connection to the database. Source code in src/pheval/analyse/benchmark_db_manager.py 138 139 140 def close ( self ): \"\"\"Close the connection to the database.\"\"\" self . conn . close () contains_entity_function ( entity , known_causative_entity ) staticmethod Determines if a known causative entity is present within an entity or list of entities. Args: entity (str): The entity to be checked. It can be a single entity or a string representation of a list. known_causative_entity (str): The entity to search for within the entity . Returns: Name Type Description bool bool True if known_causative_entity is found in entity (or its list representation), False otherwise. Source code in src/pheval/analyse/benchmark_db_manager.py 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 @staticmethod def contains_entity_function ( entity : str , known_causative_entity : str ) -> bool : \"\"\" Determines if a known causative entity is present within an entity or list of entities. Args: entity (str): The entity to be checked. 
It can be a single entity or a string representation of a list. known_causative_entity (str): The entity to search for within the `entity`. Returns: bool: `True` if `known_causative_entity` is found in `entity` (or its list representation), `False` otherwise. \"\"\" list_pattern = re . compile ( r \"^\\[\\s*(?:[^\\[\\],\\s]+(?:\\s*,\\s*[^\\[\\],\\s]+)*)?\\s*]$\" ) if list_pattern . match ( str ( entity )): list_representation = ast . literal_eval ( entity ) if isinstance ( list_representation , list ): return known_causative_entity in list_representation return known_causative_entity == entity drop_table ( table_name ) Drop a table from the database. Args: table_name: Name of the table to drop from the database Source code in src/pheval/analyse/benchmark_db_manager.py 57 58 59 60 61 62 63 def drop_table ( self , table_name : str ) -> None : \"\"\" Drop a table from the database. Args: table_name: Name of the table to drop from the database \"\"\" self . conn . execute ( f \"\"\"DROP TABLE IF EXISTS \" { table_name } \";\"\"\" ) get_connection ( db_name ) staticmethod Get a connection to the database. Returns: DuckDBPyConnection: Connection to the database. Source code in src/pheval/analyse/benchmark_db_manager.py 30 31 32 33 34 35 36 37 38 @staticmethod def get_connection ( db_name : str ) -> DuckDBPyConnection : \"\"\" Get a connection to the database. Returns: DuckDBPyConnection: Connection to the database. \"\"\" conn = duckdb . connect ( db_name ) return conn initialise () Initialise the duckdb connection. Source code in src/pheval/analyse/benchmark_db_manager.py 26 27 28 def initialise ( self ): \"\"\"Initialise the duckdb connection.\"\"\" self . add_contains_function () parse_table_into_dataclass ( table_name , dataclass ) Parses a DuckDB table into a list of dataclass instances. Args: table_name (str): The name of the DuckDB table to be parsed. 
dataclass (Union[Type[RankedPhEvalGeneResult], Type[RankedPhEvalVariantResult], Type[RankedPhEvalDiseaseResult]]): The dataclass type to which each row in the table should be mapped. Returns: Type Description Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ]] List[dataclass]: A list of instances of the provided dataclass, each representing a row from the table. Source code in src/pheval/analyse/benchmark_db_manager.py 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 def parse_table_into_dataclass ( self , table_name : str , dataclass : Union [ Type [ RankedPhEvalGeneResult ], Type [ RankedPhEvalVariantResult ], Type [ RankedPhEvalDiseaseResult ], ], ) -> Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ]: \"\"\" Parses a DuckDB table into a list of dataclass instances. Args: table_name (str): The name of the DuckDB table to be parsed. dataclass (Union[Type[RankedPhEvalGeneResult], Type[RankedPhEvalVariantResult], Type[RankedPhEvalDiseaseResult]]): The dataclass type to which each row in the table should be mapped. Returns: List[dataclass]: A list of instances of the provided dataclass, each representing a row from the table. \"\"\" result = ( self . conn . execute ( f \"SELECT * FROM ' { table_name } '\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) return [ dataclass ( ** row ) for row in result ]","title":"Benchmark db manager"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager","text":"Class to connect to database. 
Source code in src/pheval/analyse/benchmark_db_manager.py 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 class BenchmarkDBManager : \"\"\" Class to connect to database. \"\"\" def __init__ ( self , benchmark_name : str ): \"\"\"Initialise the BenchmarkDBManager class.\"\"\" self . conn = self . get_connection ( f \" { benchmark_name } \" if str ( benchmark_name ) . endswith ( \".db\" ) else f \" { benchmark_name } .db\" ) def initialise ( self ): \"\"\"Initialise the duckdb connection.\"\"\" self . add_contains_function () @staticmethod def get_connection ( db_name : str ) -> DuckDBPyConnection : \"\"\" Get a connection to the database. Returns: DuckDBPyConnection: Connection to the database. \"\"\" conn = duckdb . connect ( db_name ) return conn def add_column_integer_default ( self , table_name : str , column : str , default : int = 0 ) -> None : \"\"\" Add a column to an existing table with an integer default value. Args: table_name (str): Name of the table. column (str): Name of the column to add. default (int): Default integer value to add. \"\"\" try : self . conn . execute ( f 'ALTER TABLE { table_name } ADD COLUMN \" { column } \" INTEGER DEFAULT { default } ' ) self . conn . execute ( f 'UPDATE { table_name } SET \" { column } \" = ?' , ( default ,)) self . conn . commit () except duckdb . CatalogException : pass def drop_table ( self , table_name : str ) -> None : \"\"\" Drop a table from the database. Args: table_name: Name of the table to drop from the database \"\"\" self . conn . 
execute ( f \"\"\"DROP TABLE IF EXISTS \" { table_name } \";\"\"\" ) @staticmethod def contains_entity_function ( entity : str , known_causative_entity : str ) -> bool : \"\"\" Determines if a known causative entity is present within an entity or list of entities. Args: entity (str): The entity to be checked. It can be a single entity or a string representation of a list. known_causative_entity (str): The entity to search for within the `entity`. Returns: bool: `True` if `known_causative_entity` is found in `entity` (or its list representation), `False` otherwise. \"\"\" list_pattern = re . compile ( r \"^\\[\\s*(?:[^\\[\\],\\s]+(?:\\s*,\\s*[^\\[\\],\\s]+)*)?\\s*]$\" ) if list_pattern . match ( str ( entity )): list_representation = ast . literal_eval ( entity ) if isinstance ( list_representation , list ): return known_causative_entity in list_representation return known_causative_entity == entity def add_contains_function ( self ) -> None : \"\"\" Adds a custom `contains_entity_function` to the DuckDB connection if it does not already exist. \"\"\" result = self . conn . execute ( \"SELECT * FROM duckdb_functions() WHERE function_name = ?\" , [ \"contains_entity_function\" ] ) . fetchall () if not result : self . conn . create_function ( \"contains_entity_function\" , self . contains_entity_function ) def parse_table_into_dataclass ( self , table_name : str , dataclass : Union [ Type [ RankedPhEvalGeneResult ], Type [ RankedPhEvalVariantResult ], Type [ RankedPhEvalDiseaseResult ], ], ) -> Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ]: \"\"\" Parses a DuckDB table into a list of dataclass instances. Args: table_name (str): The name of the DuckDB table to be parsed. dataclass (Union[Type[RankedPhEvalGeneResult], Type[RankedPhEvalVariantResult], Type[RankedPhEvalDiseaseResult]]): The dataclass type to which each row in the table should be mapped. 
Returns: List[dataclass]: A list of instances of the provided dataclass, each representing a row from the table. \"\"\" result = ( self . conn . execute ( f \"SELECT * FROM ' { table_name } '\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) return [ dataclass ( ** row ) for row in result ] def check_table_exists ( self , table_name : str ) -> bool : \"\"\" Check if a table exists in the connected DuckDB database. Args: table_name (str): The name of the table to check for existence. Returns: bool: Returns `True` if the table exists in the database, `False` otherwise. \"\"\" result = self . conn . execute ( f \"SELECT * FROM information_schema.tables WHERE table_name = ' { table_name } '\" ) . fetchall () if result : return True return False def close ( self ): \"\"\"Close the connection to the database.\"\"\" self . conn . close ()","title":"BenchmarkDBManager"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.__init__","text":"Initialise the BenchmarkDBManager class. Source code in src/pheval/analyse/benchmark_db_manager.py 20 21 22 23 24 def __init__ ( self , benchmark_name : str ): \"\"\"Initialise the BenchmarkDBManager class.\"\"\" self . conn = self . get_connection ( f \" { benchmark_name } \" if str ( benchmark_name ) . endswith ( \".db\" ) else f \" { benchmark_name } .db\" )","title":"__init__"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.add_column_integer_default","text":"Add a column to an existing table with an integer default value. Args: table_name (str): Name of the table. column (str): Name of the column to add. default (int): Default integer value to add. 
Source code in src/pheval/analyse/benchmark_db_manager.py 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 def add_column_integer_default ( self , table_name : str , column : str , default : int = 0 ) -> None : \"\"\" Add a column to an existing table with an integer default value. Args: table_name (str): Name of the table. column (str): Name of the column to add. default (int): Default integer value to add. \"\"\" try : self . conn . execute ( f 'ALTER TABLE { table_name } ADD COLUMN \" { column } \" INTEGER DEFAULT { default } ' ) self . conn . execute ( f 'UPDATE { table_name } SET \" { column } \" = ?' , ( default ,)) self . conn . commit () except duckdb . CatalogException : pass","title":"add_column_integer_default"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.add_contains_function","text":"Adds a custom contains_entity_function to the DuckDB connection if it does not already exist. Source code in src/pheval/analyse/benchmark_db_manager.py 84 85 86 87 88 89 90 91 92 def add_contains_function ( self ) -> None : \"\"\" Adds a custom `contains_entity_function` to the DuckDB connection if it does not already exist. \"\"\" result = self . conn . execute ( \"SELECT * FROM duckdb_functions() WHERE function_name = ?\" , [ \"contains_entity_function\" ] ) . fetchall () if not result : self . conn . create_function ( \"contains_entity_function\" , self . contains_entity_function )","title":"add_contains_function"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.check_table_exists","text":"Check if a table exists in the connected DuckDB database. Args: table_name (str): The name of the table to check for existence. Returns: bool: Returns True if the table exists in the database, False otherwise. 
Source code in src/pheval/analyse/benchmark_db_manager.py 123 124 125 126 127 128 129 130 131 132 133 134 135 136 def check_table_exists ( self , table_name : str ) -> bool : \"\"\" Check if a table exists in the connected DuckDB database. Args: table_name (str): The name of the table to check for existence. Returns: bool: Returns `True` if the table exists in the database, `False` otherwise. \"\"\" result = self . conn . execute ( f \"SELECT * FROM information_schema.tables WHERE table_name = ' { table_name } '\" ) . fetchall () if result : return True return False","title":"check_table_exists"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.close","text":"Close the connection to the database. Source code in src/pheval/analyse/benchmark_db_manager.py 138 139 140 def close ( self ): \"\"\"Close the connection to the database.\"\"\" self . conn . close ()","title":"close"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.contains_entity_function","text":"Determines if a known causative entity is present within an entity or list of entities. Args: entity (str): The entity to be checked. It can be a single entity or a string representation of a list. known_causative_entity (str): The entity to search for within the entity . Returns: Name Type Description bool bool True if known_causative_entity is found in entity (or its list representation), False otherwise. Source code in src/pheval/analyse/benchmark_db_manager.py 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 @staticmethod def contains_entity_function ( entity : str , known_causative_entity : str ) -> bool : \"\"\" Determines if a known causative entity is present within an entity or list of entities. Args: entity (str): The entity to be checked. It can be a single entity or a string representation of a list. 
known_causative_entity (str): The entity to search for within the `entity`. Returns: bool: `True` if `known_causative_entity` is found in `entity` (or its list representation), `False` otherwise. \"\"\" list_pattern = re . compile ( r \"^\\[\\s*(?:[^\\[\\],\\s]+(?:\\s*,\\s*[^\\[\\],\\s]+)*)?\\s*]$\" ) if list_pattern . match ( str ( entity )): list_representation = ast . literal_eval ( entity ) if isinstance ( list_representation , list ): return known_causative_entity in list_representation return known_causative_entity == entity","title":"contains_entity_function"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.drop_table","text":"Drop a table from the database. Args: table_name: Name of the table to drop from the database Source code in src/pheval/analyse/benchmark_db_manager.py 57 58 59 60 61 62 63 def drop_table ( self , table_name : str ) -> None : \"\"\" Drop a table from the database. Args: table_name: Name of the table to drop from the database \"\"\" self . conn . execute ( f \"\"\"DROP TABLE IF EXISTS \" { table_name } \";\"\"\" )","title":"drop_table"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.get_connection","text":"Get a connection to the database. Returns: DuckDBPyConnection: Connection to the database. Source code in src/pheval/analyse/benchmark_db_manager.py 30 31 32 33 34 35 36 37 38 @staticmethod def get_connection ( db_name : str ) -> DuckDBPyConnection : \"\"\" Get a connection to the database. Returns: DuckDBPyConnection: Connection to the database. \"\"\" conn = duckdb . connect ( db_name ) return conn","title":"get_connection"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.initialise","text":"Initialise the duckdb connection. 
Source code in src/pheval/analyse/benchmark_db_manager.py 26 27 28 def initialise ( self ): \"\"\"Initialise the duckdb connection.\"\"\" self . add_contains_function ()","title":"initialise"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.parse_table_into_dataclass","text":"Parses a DuckDB table into a list of dataclass instances. Args: table_name (str): The name of the DuckDB table to be parsed. dataclass (Union[Type[RankedPhEvalGeneResult], Type[RankedPhEvalVariantResult], Type[RankedPhEvalDiseaseResult]]): The dataclass type to which each row in the table should be mapped. Returns: Type Description Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ]] List[dataclass]: A list of instances of the provided dataclass, each representing a row from the table. Source code in src/pheval/analyse/benchmark_db_manager.py 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 def parse_table_into_dataclass ( self , table_name : str , dataclass : Union [ Type [ RankedPhEvalGeneResult ], Type [ RankedPhEvalVariantResult ], Type [ RankedPhEvalDiseaseResult ], ], ) -> Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ]: \"\"\" Parses a DuckDB table into a list of dataclass instances. Args: table_name (str): The name of the DuckDB table to be parsed. dataclass (Union[Type[RankedPhEvalGeneResult], Type[RankedPhEvalVariantResult], Type[RankedPhEvalDiseaseResult]]): The dataclass type to which each row in the table should be mapped. Returns: List[dataclass]: A list of instances of the provided dataclass, each representing a row from the table. \"\"\" result = ( self . conn . execute ( f \"SELECT * FROM ' { table_name } '\" ) . fetchdf () . 
to_dict ( orient = \"records\" ) ) return [ dataclass ( ** row ) for row in result ]","title":"parse_table_into_dataclass"},{"location":"api/pheval/analyse/benchmark_generator/","text":"BenchmarkRunOutputGenerator dataclass Base class for recording data required for generating benchmarking outputs. Attributes: Name Type Description plot_customisation SinglePlotCustomisation Customisation for plot. prioritisation_type_string str Prioritisation type string. y_label str Label for the y-axis in benchmarking outputs. generate_benchmark_run_results Callable Callable to generate benchmark run results. Takes parameters: input and results directory, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file str Suffix for the rank comparison file. Source code in src/pheval/analyse/benchmark_generator.py 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 @dataclass class BenchmarkRunOutputGenerator : \"\"\"Base class for recording data required for generating benchmarking outputs. Attributes: plot_customisation (SinglePlotCustomisation): Customisation for plot. prioritisation_type_string (str): Prioritisation type string. y_label (str): Label for the y-axis in benchmarking outputs. generate_benchmark_run_results (Callable): Callable to generate benchmark run results. Takes parameters: input and results directory, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file (str): Suffix for the rank comparison file. \"\"\" plot_customisation : SinglePlotCustomisation prioritisation_type_string : str y_label : str generate_benchmark_run_results : Callable [[ str , RunConfig , str , float ], BenchmarkRunResults ] stats_comparison_file : str DiseaseBenchmarkRunOutputGenerator dataclass Bases: BenchmarkRunOutputGenerator Subclass of BenchmarkRunOutputGenerator specialised for producing disease prioritisation benchmarking outputs. 
This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for disease prioritisation benchmarking. Attributes: Name Type Description plot_customisation SinglePlotCustomisation Customisation for plot. prioritisation_type_string str Prioritisation type string. Defaults to DISEASE_PRIORITISATION_TYPE_STR. y_label str Label for the y-axis in disease prioritisation benchmarking outputs. Defaults to DISEASE_PLOT_Y_LABEL. generate_benchmark_run_results Callable Callable to generate disease prioritisation benchmark run results. Defaults to benchmark_disease_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file str Suffix for the disease rank comparison file. Defaults to \"-disease_summary\". Source code in src/pheval/analyse/benchmark_generator.py 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 @dataclass class DiseaseBenchmarkRunOutputGenerator ( BenchmarkRunOutputGenerator ): \"\"\" Subclass of BenchmarkRunOutputGenerator specialised for producing disease prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for disease prioritisation benchmarking. Attributes: plot_customisation (SinglePlotCustomisation): Customisation for plot. prioritisation_type_string (str): Prioritisation type string. Defaults to DISEASE_PRIORITISATION_TYPE_STR. y_label (str): Label for the y-axis in disease prioritisation benchmarking outputs. Defaults to DISEASE_PLOT_Y_LABEL. generate_benchmark_run_results (Callable): Callable to generate disease prioritisation benchmark run results. Defaults to benchmark_disease_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file (str): Suffix for the disease rank comparison file. 
Defaults to \"-disease_summary\". \"\"\" plot_customisation : SinglePlotCustomisation = None prioritisation_type_string : str = \"disease\" y_label : str = \"Known diseases (%)\" generate_benchmark_run_results : Callable [[ str , RunConfig , str , float ], BenchmarkRunResults ] = ( benchmark_disease_prioritisation ) stats_comparison_file : str = \"disease_summary\" GeneBenchmarkRunOutputGenerator dataclass Bases: BenchmarkRunOutputGenerator Subclass of BenchmarkRunOutputGenerator specialised for producing gene prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for gene prioritisation benchmarking. Attributes: Name Type Description plot_customisation SinglePlotCustomisation Customisation for plot. prioritisation_type_string str Prioritisation type string. Defaults to GENE_PRIORITISATION_TYPE_STR. y_label str Label for the y-axis in gene prioritisation benchmarking outputs. Defaults to GENE_PLOT_Y_LABEL. generate_benchmark_run_results Callable Callable to generate gene prioritisation benchmark run results. Defaults to benchmark_gene_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file str Suffix for the gene rank comparison file. Defaults to \"-gene_summary\". Source code in src/pheval/analyse/benchmark_generator.py 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 @dataclass class GeneBenchmarkRunOutputGenerator ( BenchmarkRunOutputGenerator ): \"\"\" Subclass of BenchmarkRunOutputGenerator specialised for producing gene prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for gene prioritisation benchmarking. Attributes: plot_customisation (SinglePlotCustomisation): Customisation for plot. prioritisation_type_string (str): Prioritisation type string. 
Defaults to GENE_PRIORITISATION_TYPE_STR. y_label (str): Label for the y-axis in gene prioritisation benchmarking outputs. Defaults to GENE_PLOT_Y_LABEL. generate_benchmark_run_results (Callable): Callable to generate gene prioritisation benchmark run results. Defaults to benchmark_gene_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file (str): Suffix for the gene rank comparison file. Defaults to \"-gene_summary\". \"\"\" plot_customisation : SinglePlotCustomisation = None prioritisation_type_string : str = \"gene\" y_label : str = \"Disease-causing genes (%)\" generate_benchmark_run_results : Callable [[ str , RunConfig , str , float ], BenchmarkRunResults ] = ( benchmark_gene_prioritisation ) stats_comparison_file : str = \"gene_summary\" VariantBenchmarkRunOutputGenerator dataclass Bases: BenchmarkRunOutputGenerator Subclass of BenchmarkRunOutputGenerator specialised for producing variant prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for variant prioritisation benchmarking. Attributes: Name Type Description plot_customisation SinglePlotCustomisation Customisation for plot. prioritisation_type_string str Prioritisation type string. Defaults to VARIANT_PRIORITISATION_TYPE_STR. y_label str Label for the y-axis in variant prioritisation benchmarking outputs. Defaults to VARIANT_PLOT_Y_LABEL. generate_benchmark_run_results Callable Callable to generate variant prioritisation benchmark run results. Defaults to benchmark_variant_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file str Suffix for the variant rank comparison file. Defaults to \"-variant_summary\". 
Source code in src/pheval/analyse/benchmark_generator.py 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 @dataclass class VariantBenchmarkRunOutputGenerator ( BenchmarkRunOutputGenerator ): \"\"\" Subclass of BenchmarkRunOutputGenerator specialised for producing variant prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for variant prioritisation benchmarking. Attributes: plot_customisation (SinglePlotCustomisation): Customisation for plot. prioritisation_type_string (str): Prioritisation type string. Defaults to VARIANT_PRIORITISATION_TYPE_STR. y_label (str): Label for the y-axis in variant prioritisation benchmarking outputs. Defaults to VARIANT_PLOT_Y_LABEL. generate_benchmark_run_results (Callable): Callable to generate variant prioritisation benchmark run results. Defaults to benchmark_variant_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file (str): Suffix for the variant rank comparison file. Defaults to \"-variant_summary\". \"\"\" plot_customisation : SinglePlotCustomisation = None prioritisation_type_string : str = \"variant\" y_label : str = \"Disease-causing variants (%)\" generate_benchmark_run_results : Callable [[ str , RunConfig , str , float ], BenchmarkRunResults ] = ( benchmark_variant_prioritisation ) stats_comparison_file : str = \"variant_summary\"","title":"Benchmark generator"},{"location":"api/pheval/analyse/benchmark_generator/#src.pheval.analyse.benchmark_generator.BenchmarkRunOutputGenerator","text":"Base class for recording data required for generating benchmarking outputs. Attributes: Name Type Description plot_customisation SinglePlotCustomisation Customisation for plot. prioritisation_type_string str Prioritisation type string. y_label str Label for the y-axis in benchmarking outputs. 
generate_benchmark_run_results Callable Callable to generate benchmark run results. Takes parameters: input and results directory, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file str Suffix for the rank comparison file. Source code in src/pheval/analyse/benchmark_generator.py 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 @dataclass class BenchmarkRunOutputGenerator : \"\"\"Base class for recording data required for generating benchmarking outputs. Attributes: plot_customisation (SinglePlotCustomisation): Customisation for plot. prioritisation_type_string (str): Prioritisation type string. y_label (str): Label for the y-axis in benchmarking outputs. generate_benchmark_run_results (Callable): Callable to generate benchmark run results. Takes parameters: input and results directory, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file (str): Suffix for the rank comparison file. \"\"\" plot_customisation : SinglePlotCustomisation prioritisation_type_string : str y_label : str generate_benchmark_run_results : Callable [[ str , RunConfig , str , float ], BenchmarkRunResults ] stats_comparison_file : str","title":"BenchmarkRunOutputGenerator"},{"location":"api/pheval/analyse/benchmark_generator/#src.pheval.analyse.benchmark_generator.DiseaseBenchmarkRunOutputGenerator","text":"Bases: BenchmarkRunOutputGenerator Subclass of BenchmarkRunOutputGenerator specialised for producing disease prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for disease prioritisation benchmarking. Attributes: Name Type Description plot_customisation SinglePlotCustomisation Customisation for plot. prioritisation_type_string str Prioritisation type string. Defaults to DISEASE_PRIORITISATION_TYPE_STR. y_label str Label for the y-axis in disease prioritisation benchmarking outputs. Defaults to DISEASE_PLOT_Y_LABEL. 
generate_benchmark_run_results Callable Callable to generate disease prioritisation benchmark run results. Defaults to benchmark_disease_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file str Suffix for the disease rank comparison file. Defaults to \"-disease_summary\". Source code in src/pheval/analyse/benchmark_generator.py 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 @dataclass class DiseaseBenchmarkRunOutputGenerator ( BenchmarkRunOutputGenerator ): \"\"\" Subclass of BenchmarkRunOutputGenerator specialised for producing disease prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for disease prioritisation benchmarking. Attributes: plot_customisation (SinglePlotCustomisation): Customisation for plot. prioritisation_type_string (str): Prioritisation type string. Defaults to DISEASE_PRIORITISATION_TYPE_STR. y_label (str): Label for the y-axis in disease prioritisation benchmarking outputs. Defaults to DISEASE_PLOT_Y_LABEL. generate_benchmark_run_results (Callable): Callable to generate disease prioritisation benchmark run results. Defaults to benchmark_disease_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file (str): Suffix for the disease rank comparison file. Defaults to \"-disease_summary\". 
\"\"\" plot_customisation : SinglePlotCustomisation = None prioritisation_type_string : str = \"disease\" y_label : str = \"Known diseases (%)\" generate_benchmark_run_results : Callable [[ str , RunConfig , str , float ], BenchmarkRunResults ] = ( benchmark_disease_prioritisation ) stats_comparison_file : str = \"disease_summary\"","title":"DiseaseBenchmarkRunOutputGenerator"},{"location":"api/pheval/analyse/benchmark_generator/#src.pheval.analyse.benchmark_generator.GeneBenchmarkRunOutputGenerator","text":"Bases: BenchmarkRunOutputGenerator Subclass of BenchmarkRunOutputGenerator specialised for producing gene prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for gene prioritisation benchmarking. Attributes: Name Type Description plot_customisation SinglePlotCustomisation Customisation for plot. prioritisation_type_string str Prioritisation type string. Defaults to GENE_PRIORITISATION_TYPE_STR. y_label str Label for the y-axis in gene prioritisation benchmarking outputs. Defaults to GENE_PLOT_Y_LABEL. generate_benchmark_run_results Callable Callable to generate gene prioritisation benchmark run results. Defaults to benchmark_gene_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file str Suffix for the gene rank comparison file. Defaults to \"-gene_summary\". Source code in src/pheval/analyse/benchmark_generator.py 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 @dataclass class GeneBenchmarkRunOutputGenerator ( BenchmarkRunOutputGenerator ): \"\"\" Subclass of BenchmarkRunOutputGenerator specialised for producing gene prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for gene prioritisation benchmarking. 
Attributes: plot_customisation (SinglePlotCustomisation): Customisation for plot. prioritisation_type_string (str): Prioritisation type string. Defaults to GENE_PRIORITISATION_TYPE_STR. y_label (str): Label for the y-axis in gene prioritisation benchmarking outputs. Defaults to GENE_PLOT_Y_LABEL. generate_benchmark_run_results (Callable): Callable to generate gene prioritisation benchmark run results. Defaults to benchmark_gene_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file (str): Suffix for the gene rank comparison file. Defaults to \"-gene_summary\". \"\"\" plot_customisation : SinglePlotCustomisation = None prioritisation_type_string : str = \"gene\" y_label : str = \"Disease-causing genes (%)\" generate_benchmark_run_results : Callable [[ str , RunConfig , str , float ], BenchmarkRunResults ] = ( benchmark_gene_prioritisation ) stats_comparison_file : str = \"gene_summary\"","title":"GeneBenchmarkRunOutputGenerator"},{"location":"api/pheval/analyse/benchmark_generator/#src.pheval.analyse.benchmark_generator.VariantBenchmarkRunOutputGenerator","text":"Bases: BenchmarkRunOutputGenerator Subclass of BenchmarkRunOutputGenerator specialised for producing variant prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for variant prioritisation benchmarking. Attributes: Name Type Description plot_customisation SinglePlotCustomisation Customisation for plot. prioritisation_type_string str Prioritisation type string. Defaults to VARIANT_PRIORITISATION_TYPE_STR. y_label str Label for the y-axis in variant prioritisation benchmarking outputs. Defaults to VARIANT_PLOT_Y_LABEL. generate_benchmark_run_results Callable Callable to generate variant prioritisation benchmark run results. Defaults to benchmark_variant_prioritisation. 
Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file str Suffix for the variant rank comparison file. Defaults to \"-variant_summary\". Source code in src/pheval/analyse/benchmark_generator.py 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 @dataclass class VariantBenchmarkRunOutputGenerator ( BenchmarkRunOutputGenerator ): \"\"\" Subclass of BenchmarkRunOutputGenerator specialised for producing variant prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for variant prioritisation benchmarking. Attributes: plot_customisation (SinglePlotCustomisation): Customisation for plot. prioritisation_type_string (str): Prioritisation type string. Defaults to VARIANT_PRIORITISATION_TYPE_STR. y_label (str): Label for the y-axis in variant prioritisation benchmarking outputs. Defaults to VARIANT_PLOT_Y_LABEL. generate_benchmark_run_results (Callable): Callable to generate variant prioritisation benchmark run results. Defaults to benchmark_variant_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file (str): Suffix for the variant rank comparison file. Defaults to \"-variant_summary\". \"\"\" plot_customisation : SinglePlotCustomisation = None prioritisation_type_string : str = \"variant\" y_label : str = \"Disease-causing variants (%)\" generate_benchmark_run_results : Callable [[ str , RunConfig , str , float ], BenchmarkRunResults ] = ( benchmark_variant_prioritisation ) stats_comparison_file : str = \"variant_summary\"","title":"VariantBenchmarkRunOutputGenerator"},{"location":"api/pheval/analyse/benchmarking_data/","text":"BenchmarkRunResults dataclass Benchmarking results for a run. Attributes: Name Type Description rank_stats RankStats Statistics related to benchmark. 
binary_classification_stats BinaryClassificationStats Binary statistics related to benchmark. results_dir Path Path to the result directory. Defaults to None. benchmark_name str Name of the benchmark run. Defaults to None. phenopacket_dir Path Path to the phenopacket directory. Defaults to None. Source code in src/pheval/analyse/benchmarking_data.py 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 @dataclass class BenchmarkRunResults : \"\"\" Benchmarking results for a run. Attributes: rank_stats (RankStats): Statistics related to benchmark. binary_classification_stats (BinaryClassificationStats): Binary statistics related to benchmark. results_dir (Path, optional): Path to the result directory. Defaults to None. benchmark_name (str, optional): Name of the benchmark run. Defaults to None. phenopacket_dir (Path, optional): Path to the phenopacket directory. Defaults to None. \"\"\" rank_stats : RankStats binary_classification_stats : BinaryClassificationStats results_dir : Path = None benchmark_name : str = None phenopacket_dir : Path = None","title":"Benchmarking data"},{"location":"api/pheval/analyse/benchmarking_data/#src.pheval.analyse.benchmarking_data.BenchmarkRunResults","text":"Benchmarking results for a run. Attributes: Name Type Description rank_stats RankStats Statistics related to benchmark. binary_classification_stats BinaryClassificationStats Binary statistics related to benchmark. results_dir Path Path to the result directory. Defaults to None. benchmark_name str Name of the benchmark run. Defaults to None. phenopacket_dir Path Path to the phenopacket directory. Defaults to None. Source code in src/pheval/analyse/benchmarking_data.py 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 @dataclass class BenchmarkRunResults : \"\"\" Benchmarking results for a run. Attributes: rank_stats (RankStats): Statistics related to benchmark. binary_classification_stats (BinaryClassificationStats): Binary statistics related to benchmark. 
results_dir (Path, optional): Path to the result directory. Defaults to None. benchmark_name (str, optional): Name of the benchmark run. Defaults to None. phenopacket_dir (Path, optional): Path to the phenopacket directory. Defaults to None. \"\"\" rank_stats : RankStats binary_classification_stats : BinaryClassificationStats results_dir : Path = None benchmark_name : str = None phenopacket_dir : Path = None","title":"BenchmarkRunResults"},{"location":"api/pheval/analyse/binary_classification_stats/","text":"BinaryClassificationStats dataclass A data class representing counts of different categories in binary classification. Attributes: Name Type Description true_positives int The count of true positive instances - i.e., the number of known entities ranked 1 in the results. true_negatives int The count of true negative instances - i.e., the number of non-relevant entities ranked at a position other than 1 in the results. false_positives int The count of false positive instances - i.e., the number of non-relevant entities ranked at position 1 in the results. false_negatives int The count of false negative instances - i.e., the number of known entities ranked at a position other than 1 in the results. 
Source code in src/pheval/analyse/binary_classification_stats.py 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 @dataclass class BinaryClassificationStats : \"\"\" A data class representing counts of different categories in binary classification. Attributes: true_positives (int): The count of true positive instances - i.e., the number of known entities ranked 1 in the results. true_negatives (int): The count of true negative instances - i.e., the number of non-relevant entities ranked at a position other than 1 in the results. false_positives (int): The count of false positive instances - i.e., the number of non-relevant entities ranked at position 1 in the results. false_negatives (int): The count of false negative instances - i.e., the number of known entities ranked at a position other than 1 in the results. 
\"\"\" true_positives : int = 0 true_negatives : int = 0 false_positives : int = 0 false_negatives : int = 0 labels : List = field ( default_factory = list ) scores : List = field ( default_factory = list ) @staticmethod def remove_relevant_ranks ( pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ) -> List [ int ]: \"\"\" Remove the relevant entity ranks from all result ranks Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. Returns: List[int]: A list of the ranks with the relevant entity ranks removed. \"\"\" all_result_ranks = [ pheval_result . rank for pheval_result in pheval_results ] for rank in relevant_ranks : if rank in all_result_ranks : all_result_ranks . remove ( rank ) continue return all_result_ranks def add_classification_for_known_entities ( self , relevant_ranks : List [ int ]) -> None : \"\"\" Update binary classification metrics for known entities based on their ranking. Args: relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" for rank in relevant_ranks : if rank == 1 : self . true_positives += 1 elif rank != 1 : self . false_negatives += 1 def add_classification_for_other_entities ( self , ranks : List [ int ]) -> None : \"\"\" Update binary classification metrics for other entities based on their ranking. Args: ranks (List[int]): A list of the ranks for all other entities. \"\"\" for rank in ranks : if rank == 1 : self . false_positives += 1 elif rank != 1 : self . 
true_negatives += 1 def add_labels_and_scores ( self , pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ): \"\"\" Adds scores and labels from the PhEval results. Args: pheval_results (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): List of all PhEval results relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" relevant_ranks_copy = relevant_ranks . copy () for result in pheval_results : self . scores . append ( result . score ) label = 1 if result . rank in relevant_ranks_copy else 0 self . labels . append ( label ) relevant_ranks_copy . remove ( result . rank ) if label == 1 else None def add_classification ( self , pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ) -> None : \"\"\" Update binary classification metrics for known and unknown entities based on their ranks. Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" self . add_classification_for_known_entities ( relevant_ranks ) self . add_classification_for_other_entities ( self . remove_relevant_ranks ( pheval_results , relevant_ranks ) ) self . add_labels_and_scores ( pheval_results , relevant_ranks ) def sensitivity ( self ) -> float : \"\"\" Calculate sensitivity. Sensitivity measures the proportion of actual positive instances correctly identified by the model. Returns: float: The sensitivity of the model, calculated as true positives divided by the sum of true positives and false negatives. Returns 0 if both true positives and false negatives are zero. \"\"\" return ( self . 
true_positives / ( self . true_positives + self . false_negatives ) if ( self . true_positives + self . false_negatives ) > 0 else 0.0 ) def specificity ( self ) -> float : \"\"\" Calculate specificity. Specificity measures the proportion of actual negative instances correctly identified by the model. Returns: float: The specificity of the model, calculated as true negatives divided by the sum of true negatives and false positives. Returns 0.0 if both true negatives and false positives are zero. \"\"\" return ( self . true_negatives / ( self . true_negatives + self . false_positives ) if ( self . true_negatives + self . false_positives ) > 0 else 0.0 ) def precision ( self ) -> float : \"\"\" Calculate precision. Precision measures the proportion of correctly predicted positive instances out of all instances predicted as positive. Returns: float: The precision of the model, calculated as true positives divided by the sum of true positives and false positives. Returns 0.0 if both true positives and false positives are zero. \"\"\" return ( self . true_positives / ( self . true_positives + self . false_positives ) if ( self . true_positives + self . false_positives ) > 0 else 0.0 ) def negative_predictive_value ( self ) -> float : \"\"\" Calculate Negative Predictive Value (NPV). NPV measures the proportion of correctly predicted negative instances out of all instances predicted negative. Returns: float: The Negative Predictive Value of the model, calculated as true negatives divided by the sum of true negatives and false negatives. Returns 0.0 if both true negatives and false negatives are zero. \"\"\" return ( self . true_negatives / ( self . true_negatives + self . false_negatives ) if ( self . true_negatives + self . false_negatives ) > 0 else 0.0 ) def false_positive_rate ( self ) -> float : \"\"\" Calculate False Positive Rate (FPR). FPR measures the proportion of actual negative instances that are incorrectly predicted as positive. 
Returns: float: The False Positive Rate of the model, calculated as false positives divided by the sum of false positives and true negatives. Returns 0.0 if both false positives and true negatives are zero. \"\"\" return ( self . false_positives / ( self . false_positives + self . true_negatives ) if ( self . false_positives + self . true_negatives ) > 0 else 0.0 ) def false_discovery_rate ( self ) -> float : \"\"\" Calculate False Discovery Rate (FDR). FDR measures the proportion of instances predicted as positive that are actually negative. Returns: float: The False Discovery Rate of the model, calculated as false positives divided by the sum of false positives and true positives. Returns 0.0 if both false positives and true positives are zero. \"\"\" return ( self . false_positives / ( self . false_positives + self . true_positives ) if ( self . false_positives + self . true_positives ) > 0 else 0.0 ) def false_negative_rate ( self ) -> float : \"\"\" Calculate False Negative Rate (FNR). FNR measures the proportion of instances that are actually positive but predicted as negative. Returns: float: The False Negative Rate of the model, calculated as false negatives divided by the sum of false negatives and true positives. Returns 0.0 if both false negatives and true positives are zero. \"\"\" return ( self . false_negatives / ( self . false_negatives + self . true_positives ) if ( self . false_negatives + self . true_positives ) > 0 else 0.0 ) def accuracy ( self ) -> float : \"\"\" Calculate Accuracy. Accuracy measures the proportion of correctly predicted instances out of all instances. Returns: float: The Accuracy of the model, calculated as the sum of true positives and true negatives divided by the sum of true positives, false positives, true negatives, and false negatives. Returns 0.0 if the total sum of counts is zero. \"\"\" return ( ( self . true_positives + self . true_negatives ) / ( self . true_positives + self . false_positives + self . 
true_negatives + self . false_negatives ) if ( self . true_positives + self . false_negatives + self . true_negatives + self . false_negatives ) > 0 else 0.0 ) def f1_score ( self ) -> float : \"\"\" Calculate F1 Score. F1 Score is the harmonic mean of precision and recall, providing a balance between false positives and false negatives. Returns: float: The F1 Score of the model, calculated as 2 * TP / (2 * TP + FP + FN). Returns 0.0 if the denominator is zero. \"\"\" return ( ( 2 * self . true_positives ) / (( 2 * self . true_positives ) + self . false_positives + self . false_negatives ) if ( self . true_positives + self . false_positives + self . false_negatives ) > 0 else 0.0 ) def matthews_correlation_coefficient ( self ) -> float : \"\"\" Calculate Matthews Correlation Coefficient (MCC). MCC is a measure of the quality of binary classifications, accounting for imbalances in the data. Returns: float: The Matthews Correlation Coefficient of the model, calculated as ((TP * TN) - (FP * FN)) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)). Returns 0.0 if the denominator is zero. \"\"\" return ( ( ( self . true_positives * self . true_negatives ) - ( self . false_positives * self . false_negatives ) ) / ( sqrt ( ( self . true_positives + self . false_positives ) * ( self . true_positives + self . false_negatives ) * ( self . true_negatives + self . false_positives ) * ( self . true_negatives + self . false_negatives ) ) ) if ( self . true_positives + self . false_negatives + self . true_negatives + self . false_negatives ) > 0 else 0.0 ) accuracy () Calculate Accuracy. Accuracy measures the proportion of correctly predicted instances out of all instances. Returns: Name Type Description float float The Accuracy of the model, calculated as the sum of true positives and true negatives divided by float the sum of true positives, false positives, true negatives, and false negatives. float Returns 0.0 if the total sum of counts is zero. 
Source code in src/pheval/analyse/binary_classification_stats.py 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 def accuracy ( self ) -> float : \"\"\" Calculate Accuracy. Accuracy measures the proportion of correctly predicted instances out of all instances. Returns: float: The Accuracy of the model, calculated as the sum of true positives and true negatives divided by the sum of true positives, false positives, true negatives, and false negatives. Returns 0.0 if the total sum of counts is zero. \"\"\" return ( ( self . true_positives + self . true_negatives ) / ( self . true_positives + self . false_positives + self . true_negatives + self . false_negatives ) if ( self . true_positives + self . false_negatives + self . true_negatives + self . false_negatives ) > 0 else 0.0 ) add_classification ( pheval_results , relevant_ranks ) Update binary classification metrics for known and unknown entities based on their ranks. Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. Source code in src/pheval/analyse/binary_classification_stats.py 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 def add_classification ( self , pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ) -> None : \"\"\" Update binary classification metrics for known and unknown entities based on their ranks. Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" self . 
add_classification_for_known_entities ( relevant_ranks ) self . add_classification_for_other_entities ( self . remove_relevant_ranks ( pheval_results , relevant_ranks ) ) self . add_labels_and_scores ( pheval_results , relevant_ranks ) add_classification_for_known_entities ( relevant_ranks ) Update binary classification metrics for known entities based on their ranking. Parameters: Name Type Description Default relevant_ranks List [ int ] A list of the ranks associated with the known entities. required Source code in src/pheval/analyse/binary_classification_stats.py 63 64 65 66 67 68 69 70 71 72 73 74 def add_classification_for_known_entities ( self , relevant_ranks : List [ int ]) -> None : \"\"\" Update binary classification metrics for known entities based on their ranking. Args: relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" for rank in relevant_ranks : if rank == 1 : self . true_positives += 1 elif rank != 1 : self . false_negatives += 1 add_classification_for_other_entities ( ranks ) Update binary classification metrics for other entities based on their ranking. Parameters: Name Type Description Default ranks List [ int ] A list of the ranks for all other entities. required Source code in src/pheval/analyse/binary_classification_stats.py 76 77 78 79 80 81 82 83 84 85 86 87 def add_classification_for_other_entities ( self , ranks : List [ int ]) -> None : \"\"\" Update binary classification metrics for other entities based on their ranking. Args: ranks (List[int]): A list of the ranks for all other entities. \"\"\" for rank in ranks : if rank == 1 : self . false_positives += 1 elif rank != 1 : self . true_negatives += 1 add_labels_and_scores ( pheval_results , relevant_ranks ) Adds scores and labels from the PhEval results. Parameters: Name Type Description Default relevant_ranks List [ int ] A list of the ranks associated with the known entities. 
required Source code in src/pheval/analyse/binary_classification_stats.py 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 def add_labels_and_scores ( self , pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ): \"\"\" Adds scores and labels from the PhEval results. Args: pheval_results (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): List of all PhEval results relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" relevant_ranks_copy = relevant_ranks . copy () for result in pheval_results : self . scores . append ( result . score ) label = 1 if result . rank in relevant_ranks_copy else 0 self . labels . append ( label ) relevant_ranks_copy . remove ( result . rank ) if label == 1 else None f1_score () Calculate F1 Score. F1 Score is the harmonic mean of precision and recall, providing a balance between false positives and false negatives. Returns: Name Type Description float float The F1 Score of the model, calculated as 2 * TP / (2 * TP + FP + FN). float Returns 0.0 if the denominator is zero. Source code in src/pheval/analyse/binary_classification_stats.py 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 def f1_score ( self ) -> float : \"\"\" Calculate F1 Score. F1 Score is the harmonic mean of precision and recall, providing a balance between false positives and false negatives. Returns: float: The F1 Score of the model, calculated as 2 * TP / (2 * TP + FP + FN). Returns 0.0 if the denominator is zero. \"\"\" return ( ( 2 * self . true_positives ) / (( 2 * self . true_positives ) + self . false_positives + self . false_negatives ) if ( self . true_positives + self . false_positives + self . false_negatives ) > 0 else 0.0 ) false_discovery_rate () Calculate False Discovery Rate (FDR). 
FDR measures the proportion of instances predicted as positive that are actually negative. Returns: Name Type Description float float The False Discovery Rate of the model, calculated as false positives divided by the sum of float false positives and true positives. Returns 0.0 if both false positives and true positives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 def false_discovery_rate ( self ) -> float : \"\"\" Calculate False Discovery Rate (FDR). FDR measures the proportion of instances predicted as positive that are actually negative. Returns: float: The False Discovery Rate of the model, calculated as false positives divided by the sum of false positives and true positives. Returns 0.0 if both false positives and true positives are zero. \"\"\" return ( self . false_positives / ( self . false_positives + self . true_positives ) if ( self . false_positives + self . true_positives ) > 0 else 0.0 ) false_negative_rate () Calculate False Negative Rate (FNR). FNR measures the proportion of instances that are actually positive but predicted as negative. Returns: Name Type Description float float The False Negative Rate of the model, calculated as false negatives divided by the sum of float false negatives and true positives. Returns 0.0 if both false negatives and true positives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 def false_negative_rate ( self ) -> float : \"\"\" Calculate False Negative Rate (FNR). FNR measures the proportion of instances that are actually positive but predicted as negative. Returns: float: The False Negative Rate of the model, calculated as false negatives divided by the sum of false negatives and true positives. Returns 0.0 if both false negatives and true positives are zero. \"\"\" return ( self . false_negatives / ( self . false_negatives + self . 
true_positives ) if ( self . false_negatives + self . true_positives ) > 0 else 0.0 ) false_positive_rate () Calculate False Positive Rate (FPR). FPR measures the proportion of instances predicted as positive that are actually negative. Returns: Name Type Description float float The False Positive Rate of the model, calculated as false positives divided by the sum of float false positives and true negatives. Returns 0.0 if both false positives and true negatives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 def false_positive_rate ( self ) -> float : \"\"\" Calculate False Positive Rate (FPR). FPR measures the proportion of instances predicted as positive that are actually negative. Returns: float: The False Positive Rate of the model, calculated as false positives divided by the sum of false positives and true negatives. Returns 0.0 if both false positives and true negatives are zero. \"\"\" return ( self . false_positives / ( self . false_positives + self . true_negatives ) if ( self . false_positives + self . true_negatives ) > 0 else 0.0 ) matthews_correlation_coefficient () Calculate Matthews Correlation Coefficient (MCC). MCC is a measure of the quality of binary classifications, accounting for imbalances in the data. Returns: Name Type Description float float The Matthews Correlation Coefficient of the model, calculated as float ((TP * TN) - (FP * FN)) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)). float Returns 0.0 if the denominator is zero. Source code in src/pheval/analyse/binary_classification_stats.py 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 def matthews_correlation_coefficient ( self ) -> float : \"\"\" Calculate Matthews Correlation Coefficient (MCC). MCC is a measure of the quality of binary classifications, accounting for imbalances in the data. 
Returns: float: The Matthews Correlation Coefficient of the model, calculated as ((TP * TN) - (FP * FN)) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)). Returns 0.0 if the denominator is zero. \"\"\" return ( ( ( self . true_positives * self . true_negatives ) - ( self . false_positives * self . false_negatives ) ) / ( sqrt ( ( self . true_positives + self . false_positives ) * ( self . true_positives + self . false_negatives ) * ( self . true_negatives + self . false_positives ) * ( self . true_negatives + self . false_negatives ) ) ) if ( self . true_positives + self . false_negatives + self . true_negatives + self . false_negatives ) > 0 else 0.0 ) negative_predictive_value () Calculate Negative Predictive Value (NPV). NPV measures the proportion of correctly predicted negative instances out of all instances predicted negative. Returns: Name Type Description float float The Negative Predictive Value of the model, calculated as true negatives divided by the sum of float true negatives and false negatives. Returns 0.0 if both true negatives and false negatives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 def negative_predictive_value ( self ) -> float : \"\"\" Calculate Negative Predictive Value (NPV). NPV measures the proportion of correctly predicted negative instances out of all instances predicted negative. Returns: float: The Negative Predictive Value of the model, calculated as true negatives divided by the sum of true negatives and false negatives. Returns 0.0 if both true negatives and false negatives are zero. \"\"\" return ( self . true_negatives / ( self . true_negatives + self . false_negatives ) if ( self . true_negatives + self . false_negatives ) > 0 else 0.0 ) precision () Calculate precision. Precision measures the proportion of correctly predicted positive instances out of all instances predicted as positive. 
Returns: Name Type Description float float The precision of the model, calculated as true positives divided by the sum of true positives float and false positives. Returns 0.0 if both true positives and false positives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 def precision ( self ) -> float : \"\"\" Calculate precision. Precision measures the proportion of correctly predicted positive instances out of all instances predicted as positive. Returns: float: The precision of the model, calculated as true positives divided by the sum of true positives and false positives. Returns 0.0 if both true positives and false positives are zero. \"\"\" return ( self . true_positives / ( self . true_positives + self . false_positives ) if ( self . true_positives + self . false_positives ) > 0 else 0.0 ) remove_relevant_ranks ( pheval_results , relevant_ranks ) staticmethod Remove the relevant entity ranks from all result ranks Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. Returns: Type Description List [ int ] List[int]: A list of the ranks with the relevant entity ranks removed. Source code in src/pheval/analyse/binary_classification_stats.py 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 @staticmethod def remove_relevant_ranks ( pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ) -> List [ int ]: \"\"\" Remove the relevant entity ranks from all result ranks Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. 
relevant_ranks (List[int]): A list of the ranks associated with the known entities. Returns: List[int]: A list of the ranks with the relevant entity ranks removed. \"\"\" all_result_ranks = [ pheval_result . rank for pheval_result in pheval_results ] for rank in relevant_ranks : if rank in all_result_ranks : all_result_ranks . remove ( rank ) continue return all_result_ranks sensitivity () Calculate sensitivity. Sensitivity measures the proportion of actual positive instances correctly identified by the model. Returns: Name Type Description float float The sensitivity of the model, calculated as true positives divided by the sum of true positives float and false negatives. Returns 0 if both true positives and false negatives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 def sensitivity ( self ) -> float : \"\"\" Calculate sensitivity. Sensitivity measures the proportion of actual positive instances correctly identified by the model. Returns: float: The sensitivity of the model, calculated as true positives divided by the sum of true positives and false negatives. Returns 0 if both true positives and false negatives are zero. \"\"\" return ( self . true_positives / ( self . true_positives + self . false_negatives ) if ( self . true_positives + self . false_negatives ) > 0 else 0.0 ) specificity () Calculate specificity. Specificity measures the proportion of actual negative instances correctly identified by the model. Returns: Name Type Description float float The specificity of the model, calculated as true negatives divided by the sum of true negatives float and false positives. Returns 0.0 if both true negatives and false positives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 def specificity ( self ) -> float : \"\"\" Calculate specificity. 
Specificity measures the proportion of actual negative instances correctly identified by the model. Returns: float: The specificity of the model, calculated as true negatives divided by the sum of true negatives and false positives. Returns 0.0 if both true negatives and false positives are zero. \"\"\" return ( self . true_negatives / ( self . true_negatives + self . false_positives ) if ( self . true_negatives + self . false_positives ) > 0 else 0.0 )","title":"Binary classification stats"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats","text":"A data class representing counts of different categories in binary classification. Attributes: Name Type Description true_positives int The count of true positive instances - i.e., the number of known entities ranked 1 in the results. true_negatives int The count of true negative instances - i.e., the number of non-relevant entities ranked at a position other than 1 in the results. false_positives int The count of false positive instances - i.e., the number of non-relevant entities ranked at position 1 in the results. false_negatives int The count of false negative instances - i.e., the number of known entities ranked at a position other than 1 in the results. 
Source code in src/pheval/analyse/binary_classification_stats.py 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 @dataclass class BinaryClassificationStats : \"\"\" A data class representing counts of different categories in binary classification. Attributes: true_positives (int): The count of true positive instances - i.e., the number of known entities ranked 1 in the results. true_negatives (int): The count of true negative instances - i.e., the number of non-relevant entities ranked at a position other than 1 in the results. false_positives (int): The count of false positive instances - i.e., the number of non-relevant entities ranked at position 1 in the results. false_negatives (int): The count of false negative instances - i.e., the number of known entities ranked at a position other than 1 in the results. 
\"\"\" true_positives : int = 0 true_negatives : int = 0 false_positives : int = 0 false_negatives : int = 0 labels : List = field ( default_factory = list ) scores : List = field ( default_factory = list ) @staticmethod def remove_relevant_ranks ( pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ) -> List [ int ]: \"\"\" Remove the relevant entity ranks from all result ranks Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. Returns: List[int]: A list of the ranks with the relevant entity ranks removed. \"\"\" all_result_ranks = [ pheval_result . rank for pheval_result in pheval_results ] for rank in relevant_ranks : if rank in all_result_ranks : all_result_ranks . remove ( rank ) continue return all_result_ranks def add_classification_for_known_entities ( self , relevant_ranks : List [ int ]) -> None : \"\"\" Update binary classification metrics for known entities based on their ranking. Args: relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" for rank in relevant_ranks : if rank == 1 : self . true_positives += 1 elif rank != 1 : self . false_negatives += 1 def add_classification_for_other_entities ( self , ranks : List [ int ]) -> None : \"\"\" Update binary classification metrics for other entities based on their ranking. Args: ranks (List[int]): A list of the ranks for all other entities. \"\"\" for rank in ranks : if rank == 1 : self . false_positives += 1 elif rank != 1 : self . 
true_negatives += 1 def add_labels_and_scores ( self , pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ): \"\"\" Adds scores and labels from the PhEval results. Args: pheval_results (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): List of all PhEval results relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" relevant_ranks_copy = relevant_ranks . copy () for result in pheval_results : self . scores . append ( result . score ) label = 1 if result . rank in relevant_ranks_copy else 0 self . labels . append ( label ) relevant_ranks_copy . remove ( result . rank ) if label == 1 else None def add_classification ( self , pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ) -> None : \"\"\" Update binary classification metrics for known and unknown entities based on their ranks. Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" self . add_classification_for_known_entities ( relevant_ranks ) self . add_classification_for_other_entities ( self . remove_relevant_ranks ( pheval_results , relevant_ranks ) ) self . add_labels_and_scores ( pheval_results , relevant_ranks ) def sensitivity ( self ) -> float : \"\"\" Calculate sensitivity. Sensitivity measures the proportion of actual positive instances correctly identified by the model. Returns: float: The sensitivity of the model, calculated as true positives divided by the sum of true positives and false negatives. Returns 0 if both true positives and false negatives are zero. \"\"\" return ( self . 
true_positives / ( self . true_positives + self . false_negatives ) if ( self . true_positives + self . false_negatives ) > 0 else 0.0 ) def specificity ( self ) -> float : \"\"\" Calculate specificity. Specificity measures the proportion of actual negative instances correctly identified by the model. Returns: float: The specificity of the model, calculated as true negatives divided by the sum of true negatives and false positives. Returns 0.0 if both true negatives and false positives are zero. \"\"\" return ( self . true_negatives / ( self . true_negatives + self . false_positives ) if ( self . true_negatives + self . false_positives ) > 0 else 0.0 ) def precision ( self ) -> float : \"\"\" Calculate precision. Precision measures the proportion of correctly predicted positive instances out of all instances predicted as positive. Returns: float: The precision of the model, calculated as true positives divided by the sum of true positives and false positives. Returns 0.0 if both true positives and false positives are zero. \"\"\" return ( self . true_positives / ( self . true_positives + self . false_positives ) if ( self . true_positives + self . false_positives ) > 0 else 0.0 ) def negative_predictive_value ( self ) -> float : \"\"\" Calculate Negative Predictive Value (NPV). NPV measures the proportion of correctly predicted negative instances out of all instances predicted negative. Returns: float: The Negative Predictive Value of the model, calculated as true negatives divided by the sum of true negatives and false negatives. Returns 0.0 if both true negatives and false negatives are zero. \"\"\" return ( self . true_negatives / ( self . true_negatives + self . false_negatives ) if ( self . true_negatives + self . false_negatives ) > 0 else 0.0 ) def false_positive_rate ( self ) -> float : \"\"\" Calculate False Positive Rate (FPR). FPR measures the proportion of instances predicted as positive that are actually negative. 
Returns: float: The False Positive Rate of the model, calculated as false positives divided by the sum of false positives and true negatives. Returns 0.0 if both false positives and true negatives are zero. \"\"\" return ( self . false_positives / ( self . false_positives + self . true_negatives ) if ( self . false_positives + self . true_negatives ) > 0 else 0.0 ) def false_discovery_rate ( self ) -> float : \"\"\" Calculate False Discovery Rate (FDR). FDR measures the proportion of instances predicted as positive that are actually negative. Returns: float: The False Discovery Rate of the model, calculated as false positives divided by the sum of false positives and true positives. Returns 0.0 if both false positives and true positives are zero. \"\"\" return ( self . false_positives / ( self . false_positives + self . true_positives ) if ( self . false_positives + self . true_positives ) > 0 else 0.0 ) def false_negative_rate ( self ) -> float : \"\"\" Calculate False Negative Rate (FNR). FNR measures the proportion of instances that are actually positive but predicted as negative. Returns: float: The False Negative Rate of the model, calculated as false negatives divided by the sum of false negatives and true positives. Returns 0.0 if both false negatives and true positives are zero. \"\"\" return ( self . false_negatives / ( self . false_negatives + self . true_positives ) if ( self . false_negatives + self . true_positives ) > 0 else 0.0 ) def accuracy ( self ) -> float : \"\"\" Calculate Accuracy. Accuracy measures the proportion of correctly predicted instances out of all instances. Returns: float: The Accuracy of the model, calculated as the sum of true positives and true negatives divided by the sum of true positives, false positives, true negatives, and false negatives. Returns 0.0 if the total sum of counts is zero. \"\"\" return ( ( self . true_positives + self . true_negatives ) / ( self . true_positives + self . false_positives + self . 
true_negatives + self . false_negatives ) if ( self . true_positives + self . false_negatives + self . true_negatives + self . false_negatives ) > 0 else 0.0 ) def f1_score ( self ) -> float : \"\"\" Calculate F1 Score. F1 Score is the harmonic mean of precision and recall, providing a balance between false positives and false negatives. Returns: float: The F1 Score of the model, calculated as 2 * TP / (2 * TP + FP + FN). Returns 0.0 if the denominator is zero. \"\"\" return ( ( 2 * self . true_positives ) / (( 2 * self . true_positives ) + self . false_positives + self . false_negatives ) if ( self . true_positives + self . false_positives + self . false_negatives ) > 0 else 0.0 ) def matthews_correlation_coefficient ( self ) -> float : \"\"\" Calculate Matthews Correlation Coefficient (MCC). MCC is a measure of the quality of binary classifications, accounting for imbalances in the data. Returns: float: The Matthews Correlation Coefficient of the model, calculated as ((TP * TN) - (FP * FN)) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)). Returns 0.0 if the denominator is zero. \"\"\" return ( ( ( self . true_positives * self . true_negatives ) - ( self . false_positives * self . false_negatives ) ) / ( sqrt ( ( self . true_positives + self . false_positives ) * ( self . true_positives + self . false_negatives ) * ( self . true_negatives + self . false_positives ) * ( self . true_negatives + self . false_negatives ) ) ) if ( self . true_positives + self . false_negatives + self . true_negatives + self . false_negatives ) > 0 else 0.0 )","title":"BinaryClassificationStats"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.accuracy","text":"Calculate Accuracy. Accuracy measures the proportion of correctly predicted instances out of all instances. 
Returns: Name Type Description float float The Accuracy of the model, calculated as the sum of true positives and true negatives divided by float the sum of true positives, false positives, true negatives, and false negatives. float Returns 0.0 if the total sum of counts is zero. Source code in src/pheval/analyse/binary_classification_stats.py 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 def accuracy ( self ) -> float : \"\"\" Calculate Accuracy. Accuracy measures the proportion of correctly predicted instances out of all instances. Returns: float: The Accuracy of the model, calculated as the sum of true positives and true negatives divided by the sum of true positives, false positives, true negatives, and false negatives. Returns 0.0 if the total sum of counts is zero. \"\"\" return ( ( self . true_positives + self . true_negatives ) / ( self . true_positives + self . false_positives + self . true_negatives + self . false_negatives ) if ( self . true_positives + self . false_negatives + self . true_negatives + self . false_negatives ) > 0 else 0.0 )","title":"accuracy"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.add_classification","text":"Update binary classification metrics for known and unknown entities based on their ranks. Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. 
Source code in src/pheval/analyse/binary_classification_stats.py 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 def add_classification ( self , pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ) -> None : \"\"\" Update binary classification metrics for known and unknown entities based on their ranks. Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" self . add_classification_for_known_entities ( relevant_ranks ) self . add_classification_for_other_entities ( self . remove_relevant_ranks ( pheval_results , relevant_ranks ) ) self . add_labels_and_scores ( pheval_results , relevant_ranks )","title":"add_classification"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.add_classification_for_known_entities","text":"Update binary classification metrics for known entities based on their ranking. Parameters: Name Type Description Default relevant_ranks List [ int ] A list of the ranks associated with the known entities. required Source code in src/pheval/analyse/binary_classification_stats.py 63 64 65 66 67 68 69 70 71 72 73 74 def add_classification_for_known_entities ( self , relevant_ranks : List [ int ]) -> None : \"\"\" Update binary classification metrics for known entities based on their ranking. Args: relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" for rank in relevant_ranks : if rank == 1 : self . true_positives += 1 elif rank != 1 : self . 
false_negatives += 1","title":"add_classification_for_known_entities"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.add_classification_for_other_entities","text":"Update binary classification metrics for other entities based on their ranking. Parameters: Name Type Description Default ranks List [ int ] A list of the ranks for all other entities. required Source code in src/pheval/analyse/binary_classification_stats.py 76 77 78 79 80 81 82 83 84 85 86 87 def add_classification_for_other_entities ( self , ranks : List [ int ]) -> None : \"\"\" Update binary classification metrics for other entities based on their ranking. Args: ranks (List[int]): A list of the ranks for all other entities. \"\"\" for rank in ranks : if rank == 1 : self . false_positives += 1 elif rank != 1 : self . true_negatives += 1","title":"add_classification_for_other_entities"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.add_labels_and_scores","text":"Adds scores and labels from the PhEval results. Parameters: Name Type Description Default relevant_ranks List [ int ] A list of the ranks associated with the known entities. required Source code in src/pheval/analyse/binary_classification_stats.py 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 def add_labels_and_scores ( self , pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ): \"\"\" Adds scores and labels from the PhEval results. Args: pheval_results (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): List of all PhEval results relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" relevant_ranks_copy = relevant_ranks . 
copy () for result in pheval_results : self . scores . append ( result . score ) label = 1 if result . rank in relevant_ranks_copy else 0 self . labels . append ( label ) relevant_ranks_copy . remove ( result . rank ) if label == 1 else None","title":"add_labels_and_scores"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.f1_score","text":"Calculate F1 Score. F1 Score is the harmonic mean of precision and recall, providing a balance between false positives and false negatives. Returns: Name Type Description float float The F1 Score of the model, calculated as 2 * TP / (2 * TP + FP + FN). float Returns 0.0 if the denominator is zero. Source code in src/pheval/analyse/binary_classification_stats.py 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 def f1_score ( self ) -> float : \"\"\" Calculate F1 Score. F1 Score is the harmonic mean of precision and recall, providing a balance between false positives and false negatives. Returns: float: The F1 Score of the model, calculated as 2 * TP / (2 * TP + FP + FN). Returns 0.0 if the denominator is zero. \"\"\" return ( ( 2 * self . true_positives ) / (( 2 * self . true_positives ) + self . false_positives + self . false_negatives ) if ( self . true_positives + self . false_positives + self . false_negatives ) > 0 else 0.0 )","title":"f1_score"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.false_discovery_rate","text":"Calculate False Discovery Rate (FDR). FDR measures the proportion of instances predicted as positive that are actually negative. Returns: Name Type Description float float The False Discovery Rate of the model, calculated as false positives divided by the sum of float false positives and true positives. Returns 0.0 if both false positives and true positives are zero. 
Source code in src/pheval/analyse/binary_classification_stats.py 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 def false_discovery_rate ( self ) -> float : \"\"\" Calculate False Discovery Rate (FDR). FDR measures the proportion of instances predicted as positive that are actually negative. Returns: float: The False Discovery Rate of the model, calculated as false positives divided by the sum of false positives and true positives. Returns 0.0 if both false positives and true positives are zero. \"\"\" return ( self . false_positives / ( self . false_positives + self . true_positives ) if ( self . false_positives + self . true_positives ) > 0 else 0.0 )","title":"false_discovery_rate"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.false_negative_rate","text":"Calculate False Negative Rate (FNR). FNR measures the proportion of instances that are actually positive but predicted as negative. Returns: Name Type Description float float The False Negative Rate of the model, calculated as false negatives divided by the sum of float false negatives and true positives. Returns 0.0 if both false negatives and true positives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 def false_negative_rate ( self ) -> float : \"\"\" Calculate False Negative Rate (FNR). FNR measures the proportion of instances that are actually positive but predicted as negative. Returns: float: The False Negative Rate of the model, calculated as false negatives divided by the sum of false negatives and true positives. Returns 0.0 if both false negatives and true positives are zero. \"\"\" return ( self . false_negatives / ( self . false_negatives + self . true_positives ) if ( self . false_negatives + self . 
true_positives ) > 0 else 0.0 )","title":"false_negative_rate"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.false_positive_rate","text":"Calculate False Positive Rate (FPR). FPR measures the proportion of instances predicted as positive that are actually negative. Returns: Name Type Description float float The False Positive Rate of the model, calculated as false positives divided by the sum of float false positives and true negatives. Returns 0.0 if both false positives and true negatives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 def false_positive_rate ( self ) -> float : \"\"\" Calculate False Positive Rate (FPR). FPR measures the proportion of instances predicted as positive that are actually negative. Returns: float: The False Positive Rate of the model, calculated as false positives divided by the sum of false positives and true negatives. Returns 0.0 if both false positives and true negatives are zero. \"\"\" return ( self . false_positives / ( self . false_positives + self . true_negatives ) if ( self . false_positives + self . true_negatives ) > 0 else 0.0 )","title":"false_positive_rate"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.matthews_correlation_coefficient","text":"Calculate Matthews Correlation Coefficient (MCC). MCC is a measure of the quality of binary classifications, accounting for imbalances in the data. Returns: Name Type Description float float The Matthews Correlation Coefficient of the model, calculated as float ((TP * TN) - (FP * FN)) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)). float Returns 0.0 if the denominator is zero. 
Source code in src/pheval/analyse/binary_classification_stats.py 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 def matthews_correlation_coefficient ( self ) -> float : \"\"\" Calculate Matthews Correlation Coefficient (MCC). MCC is a measure of the quality of binary classifications, accounting for imbalances in the data. Returns: float: The Matthews Correlation Coefficient of the model, calculated as ((TP * TN) - (FP * FN)) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)). Returns 0.0 if the denominator is zero. \"\"\" return ( ( ( self . true_positives * self . true_negatives ) - ( self . false_positives * self . false_negatives ) ) / ( sqrt ( ( self . true_positives + self . false_positives ) * ( self . true_positives + self . false_negatives ) * ( self . true_negatives + self . false_positives ) * ( self . true_negatives + self . false_negatives ) ) ) if ( self . true_positives + self . false_negatives + self . true_negatives + self . false_negatives ) > 0 else 0.0 )","title":"matthews_correlation_coefficient"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.negative_predictive_value","text":"Calculate Negative Predictive Value (NPV). NPV measures the proportion of correctly predicted negative instances out of all instances predicted negative. Returns: Name Type Description float float The Negative Predictive Value of the model, calculated as true negatives divided by the sum of float true negatives and false negatives. Returns 0.0 if both true negatives and false negatives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 def negative_predictive_value ( self ) -> float : \"\"\" Calculate Negative Predictive Value (NPV). 
NPV measures the proportion of correctly predicted negative instances out of all instances predicted negative. Returns: float: The Negative Predictive Value of the model, calculated as true negatives divided by the sum of true negatives and false negatives. Returns 0.0 if both true negatives and false negatives are zero. \"\"\" return ( self . true_negatives / ( self . true_negatives + self . false_negatives ) if ( self . true_negatives + self . false_negatives ) > 0 else 0.0 )","title":"negative_predictive_value"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.precision","text":"Calculate precision. Precision measures the proportion of correctly predicted positive instances out of all instances predicted as positive. Returns: Name Type Description float float The precision of the model, calculated as true positives divided by the sum of true positives float and false positives. Returns 0.0 if both true positives and false positives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 def precision ( self ) -> float : \"\"\" Calculate precision. Precision measures the proportion of correctly predicted positive instances out of all instances predicted as positive. Returns: float: The precision of the model, calculated as true positives divided by the sum of true positives and false positives. Returns 0.0 if both true positives and false positives are zero. \"\"\" return ( self . true_positives / ( self . true_positives + self . false_positives ) if ( self . true_positives + self . 
false_positives ) > 0 else 0.0 )","title":"precision"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.remove_relevant_ranks","text":"Remove the relevant entity ranks from all result ranks Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. Returns: Type Description List [ int ] List[int]: A list of the ranks with the relevant entity ranks removed. Source code in src/pheval/analyse/binary_classification_stats.py 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 @staticmethod def remove_relevant_ranks ( pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ) -> List [ int ]: \"\"\" Remove the relevant entity ranks from all result ranks Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. Returns: List[int]: A list of the ranks with the relevant entity ranks removed. \"\"\" all_result_ranks = [ pheval_result . rank for pheval_result in pheval_results ] for rank in relevant_ranks : if rank in all_result_ranks : all_result_ranks . remove ( rank ) continue return all_result_ranks","title":"remove_relevant_ranks"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.sensitivity","text":"Calculate sensitivity. Sensitivity measures the proportion of actual positive instances correctly identified by the model. 
Returns: Name Type Description float float The sensitivity of the model, calculated as true positives divided by the sum of true positives float and false negatives. Returns 0 if both true positives and false negatives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 def sensitivity ( self ) -> float : \"\"\" Calculate sensitivity. Sensitivity measures the proportion of actual positive instances correctly identified by the model. Returns: float: The sensitivity of the model, calculated as true positives divided by the sum of true positives and false negatives. Returns 0 if both true positives and false negatives are zero. \"\"\" return ( self . true_positives / ( self . true_positives + self . false_negatives ) if ( self . true_positives + self . false_negatives ) > 0 else 0.0 )","title":"sensitivity"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.specificity","text":"Calculate specificity. Specificity measures the proportion of actual negative instances correctly identified by the model. Returns: Name Type Description float float The specificity of the model, calculated as true negatives divided by the sum of true negatives float and false positives. Returns 0.0 if both true negatives and false positives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 def specificity ( self ) -> float : \"\"\" Calculate specificity. Specificity measures the proportion of actual negative instances correctly identified by the model. Returns: float: The specificity of the model, calculated as true negatives divided by the sum of true negatives and false positives. Returns 0.0 if both true negatives and false positives are zero. \"\"\" return ( self . true_negatives / ( self . true_negatives + self . 
false_positives ) if ( self . true_negatives + self . false_positives ) > 0 else 0.0 )","title":"specificity"},{"location":"api/pheval/analyse/disease_prioritisation_analysis/","text":"AssessDiseasePrioritisation Bases: AssessPrioritisationBase Class for assessing disease prioritisation based on thresholds and scoring orders. Source code in src/pheval/analyse/disease_prioritisation_analysis.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 class AssessDiseasePrioritisation ( AssessPrioritisationBase ): \"\"\"Class for assessing disease prioritisation based on thresholds and scoring orders.\"\"\" def assess_disease_prioritisation ( self , standardised_disease_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess disease prioritisation. This method assesses the prioritisation of diseases based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_disease_result_path (Path): Path to the standardised disease TSV result. phenopacket_path (Path): Path to the phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"SELECT * FROM { self . table_name } WHERE phenopacket = ? \" , ( phenopacket_path . name ,), ) . fetchdf () for _i , row in df . iterrows (): result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_disease_result_path } ' \" f \"WHERE contains_entity_function(CAST(COALESCE(disease_identifier, '') AS VARCHAR),\" f \" ' { row [ 'disease_identifier' ] } ') \" f \"OR contains_entity_function(CAST(COALESCE(disease_name, '') AS VARCHAR), \" f \"' { row [ 'disease_name' ] } ')\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : disease_match = self . 
_record_matched_entity ( RankedPhEvalDiseaseResult ( ** result [ 0 ])) relevant_ranks . append ( disease_match ) primary_key = f \" { phenopacket_path . name } - { row [ 'disease_identifier' ] } \" self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( disease_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_disease_result_path ), RankedPhEvalDiseaseResult ), relevant_ranks , ) assess_disease_prioritisation ( standardised_disease_result_path , phenopacket_path , binary_classification_stats ) Assess disease prioritisation. This method assesses the prioritisation of diseases based on the provided criteria and records ranks using a PrioritisationRankRecorder. Parameters: Name Type Description Default standardised_disease_result_path Path Path to the standardised disease TSV result. required phenopacket_path Path Path to the phenopacket. required binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required Source code in src/pheval/analyse/disease_prioritisation_analysis.py 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 def assess_disease_prioritisation ( self , standardised_disease_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess disease prioritisation. This method assesses the prioritisation of diseases based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_disease_result_path (Path): Path to the standardised disease TSV result. phenopacket_path (Path): Path to the phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . 
execute ( f \"SELECT * FROM { self . table_name } WHERE phenopacket = ? \" , ( phenopacket_path . name ,), ) . fetchdf () for _i , row in df . iterrows (): result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_disease_result_path } ' \" f \"WHERE contains_entity_function(CAST(COALESCE(disease_identifier, '') AS VARCHAR),\" f \" ' { row [ 'disease_identifier' ] } ') \" f \"OR contains_entity_function(CAST(COALESCE(disease_name, '') AS VARCHAR), \" f \"' { row [ 'disease_name' ] } ')\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : disease_match = self . _record_matched_entity ( RankedPhEvalDiseaseResult ( ** result [ 0 ])) relevant_ranks . append ( disease_match ) primary_key = f \" { phenopacket_path . name } - { row [ 'disease_identifier' ] } \" self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( disease_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_disease_result_path ), RankedPhEvalDiseaseResult ), relevant_ranks , ) assess_phenopacket_disease_prioritisation ( phenopacket_path , run , disease_binary_classification_stats , disease_benchmarker ) Assess disease prioritisation for a Phenopacket by comparing PhEval standardised disease results against the recorded causative diseases for a proband in the Phenopacket. Parameters: Name Type Description Default phenopacket_path Path Path to the Phenopacket. required run RunConfig Run configuration. required disease_binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required disease_benchmarker AssessDiseasePrioritisation AssessDiseasePrioritisation class instance. 
required Source code in src/pheval/analyse/disease_prioritisation_analysis.py 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 def assess_phenopacket_disease_prioritisation ( phenopacket_path : Path , run : RunConfig , disease_binary_classification_stats : BinaryClassificationStats , disease_benchmarker : AssessDiseasePrioritisation , ) -> None : \"\"\" Assess disease prioritisation for a Phenopacket by comparing PhEval standardised disease results against the recorded causative diseases for a proband in the Phenopacket. Args: phenopacket_path (Path): Path to the Phenopacket. run (RunConfig): Run configuration. disease_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. disease_benchmarker (AssessDiseasePrioritisation): AssessDiseasePrioritisation class instance. \"\"\" standardised_disease_result_path = run . results_dir . joinpath ( f \"pheval_disease_results/ { phenopacket_path . stem } -pheval_disease_result.tsv\" ) disease_benchmarker . assess_disease_prioritisation ( standardised_disease_result_path , phenopacket_path , disease_binary_classification_stats , ) benchmark_disease_prioritisation ( benchmark_name , run , score_order , threshold ) Benchmark a directory based on disease prioritisation results. Parameters: Name Type Description Default benchmark_name str Name of the benchmark. required run RunConfig Run configuration. required score_order str The order in which scores are arranged. required threshold float Threshold for assessment. required Returns: Name Type Description BenchmarkRunResults An object containing benchmarking results for disease prioritisation, including ranks and rank statistics for the benchmarked directory. 
Source code in src/pheval/analyse/disease_prioritisation_analysis.py 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 def benchmark_disease_prioritisation ( benchmark_name : str , run : RunConfig , score_order : str , threshold : float , ): \"\"\" Benchmark a directory based on disease prioritisation results. Args: benchmark_name (str): Name of the benchmark. run (RunConfig): Run configuration. score_order (str): The order in which scores are arranged. threshold (float): Threshold for assessment. Returns: BenchmarkRunResults: An object containing benchmarking results for disease prioritisation, including ranks and rank statistics for the benchmarked directory. \"\"\" disease_binary_classification_stats = BinaryClassificationStats () db_connection = BenchmarkDBManager ( benchmark_name ) db_connection . initialise () disease_benchmarker = AssessDiseasePrioritisation ( db_connection , f \" { run . phenopacket_dir . parents [ 0 ] . name } _disease\" , run . run_identifier , threshold , score_order , ) for phenopacket_path in all_files ( run . phenopacket_dir ): assess_phenopacket_disease_prioritisation ( phenopacket_path , run , disease_binary_classification_stats , disease_benchmarker , ) db_connection . close () disease_rank_stats = RankStats () disease_rank_stats . add_ranks ( benchmark_name = benchmark_name , table_name = f \" { run . phenopacket_dir . parents [ 0 ] . name } _disease\" , column_name = str ( run . run_identifier ), ) return BenchmarkRunResults ( rank_stats = disease_rank_stats , benchmark_name = run . run_identifier , binary_classification_stats = disease_binary_classification_stats , phenopacket_dir = run . 
phenopacket_dir , )","title":"Disease prioritisation analysis"},{"location":"api/pheval/analyse/disease_prioritisation_analysis/#src.pheval.analyse.disease_prioritisation_analysis.AssessDiseasePrioritisation","text":"Bases: AssessPrioritisationBase Class for assessing disease prioritisation based on thresholds and scoring orders. Source code in src/pheval/analyse/disease_prioritisation_analysis.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 class AssessDiseasePrioritisation ( AssessPrioritisationBase ): \"\"\"Class for assessing disease prioritisation based on thresholds and scoring orders.\"\"\" def assess_disease_prioritisation ( self , standardised_disease_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess disease prioritisation. This method assesses the prioritisation of diseases based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_disease_result_path (Path): Path to the standardised disease TSV result. phenopacket_path (Path): Path to the phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"SELECT * FROM { self . table_name } WHERE phenopacket = ? \" , ( phenopacket_path . name ,), ) . fetchdf () for _i , row in df . iterrows (): result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_disease_result_path } ' \" f \"WHERE contains_entity_function(CAST(COALESCE(disease_identifier, '') AS VARCHAR),\" f \" ' { row [ 'disease_identifier' ] } ') \" f \"OR contains_entity_function(CAST(COALESCE(disease_name, '') AS VARCHAR), \" f \"' { row [ 'disease_name' ] } ')\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : disease_match = self . 
_record_matched_entity ( RankedPhEvalDiseaseResult ( ** result [ 0 ])) relevant_ranks . append ( disease_match ) primary_key = f \" { phenopacket_path . name } - { row [ 'disease_identifier' ] } \" self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( disease_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_disease_result_path ), RankedPhEvalDiseaseResult ), relevant_ranks , )","title":"AssessDiseasePrioritisation"},{"location":"api/pheval/analyse/disease_prioritisation_analysis/#src.pheval.analyse.disease_prioritisation_analysis.AssessDiseasePrioritisation.assess_disease_prioritisation","text":"Assess disease prioritisation. This method assesses the prioritisation of diseases based on the provided criteria and records ranks using a PrioritisationRankRecorder. Parameters: Name Type Description Default standardised_disease_result_path Path Path to the standardised disease TSV result. required phenopacket_path Path Path to the phenopacket. required binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required Source code in src/pheval/analyse/disease_prioritisation_analysis.py 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 def assess_disease_prioritisation ( self , standardised_disease_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess disease prioritisation. This method assesses the prioritisation of diseases based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_disease_result_path (Path): Path to the standardised disease TSV result. phenopacket_path (Path): Path to the phenopacket. 
binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"SELECT * FROM { self . table_name } WHERE phenopacket = ? \" , ( phenopacket_path . name ,), ) . fetchdf () for _i , row in df . iterrows (): result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_disease_result_path } ' \" f \"WHERE contains_entity_function(CAST(COALESCE(disease_identifier, '') AS VARCHAR),\" f \" ' { row [ 'disease_identifier' ] } ') \" f \"OR contains_entity_function(CAST(COALESCE(disease_name, '') AS VARCHAR), \" f \"' { row [ 'disease_name' ] } ')\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : disease_match = self . _record_matched_entity ( RankedPhEvalDiseaseResult ( ** result [ 0 ])) relevant_ranks . append ( disease_match ) primary_key = f \" { phenopacket_path . name } - { row [ 'disease_identifier' ] } \" self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( disease_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_disease_result_path ), RankedPhEvalDiseaseResult ), relevant_ranks , )","title":"assess_disease_prioritisation"},{"location":"api/pheval/analyse/disease_prioritisation_analysis/#src.pheval.analyse.disease_prioritisation_analysis.assess_phenopacket_disease_prioritisation","text":"Assess disease prioritisation for a Phenopacket by comparing PhEval standardised disease results against the recorded causative diseases for a proband in the Phenopacket. Parameters: Name Type Description Default phenopacket_path Path Path to the Phenopacket. required run RunConfig Run configuration. required disease_binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. 
required disease_benchmarker AssessDiseasePrioritisation AssessDiseasePrioritisation class instance. required Source code in src/pheval/analyse/disease_prioritisation_analysis.py 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 def assess_phenopacket_disease_prioritisation ( phenopacket_path : Path , run : RunConfig , disease_binary_classification_stats : BinaryClassificationStats , disease_benchmarker : AssessDiseasePrioritisation , ) -> None : \"\"\" Assess disease prioritisation for a Phenopacket by comparing PhEval standardised disease results against the recorded causative diseases for a proband in the Phenopacket. Args: phenopacket_path (Path): Path to the Phenopacket. run (RunConfig): Run configuration. disease_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. disease_benchmarker (AssessDiseasePrioritisation): AssessDiseasePrioritisation class instance. \"\"\" standardised_disease_result_path = run . results_dir . joinpath ( f \"pheval_disease_results/ { phenopacket_path . stem } -pheval_disease_result.tsv\" ) disease_benchmarker . assess_disease_prioritisation ( standardised_disease_result_path , phenopacket_path , disease_binary_classification_stats , )","title":"assess_phenopacket_disease_prioritisation"},{"location":"api/pheval/analyse/disease_prioritisation_analysis/#src.pheval.analyse.disease_prioritisation_analysis.benchmark_disease_prioritisation","text":"Benchmark a directory based on disease prioritisation results. Parameters: Name Type Description Default benchmark_name str Name of the benchmark. required run RunConfig Run configuration. required score_order str The order in which scores are arranged. required threshold float Threshold for assessment. required Returns: Name Type Description BenchmarkRunResults An object containing benchmarking results for disease prioritisation, including ranks and rank statistics for the benchmarked directory. 
Source code in src/pheval/analyse/disease_prioritisation_analysis.py 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 def benchmark_disease_prioritisation ( benchmark_name : str , run : RunConfig , score_order : str , threshold : float , ): \"\"\" Benchmark a directory based on disease prioritisation results. Args: benchmark_name (str): Name of the benchmark. run (RunConfig): Run configuration. score_order (str): The order in which scores are arranged. threshold (float): Threshold for assessment. Returns: BenchmarkRunResults: An object containing benchmarking results for disease prioritisation, including ranks and rank statistics for the benchmarked directory. \"\"\" disease_binary_classification_stats = BinaryClassificationStats () db_connection = BenchmarkDBManager ( benchmark_name ) db_connection . initialise () disease_benchmarker = AssessDiseasePrioritisation ( db_connection , f \" { run . phenopacket_dir . parents [ 0 ] . name } _disease\" , run . run_identifier , threshold , score_order , ) for phenopacket_path in all_files ( run . phenopacket_dir ): assess_phenopacket_disease_prioritisation ( phenopacket_path , run , disease_binary_classification_stats , disease_benchmarker , ) db_connection . close () disease_rank_stats = RankStats () disease_rank_stats . add_ranks ( benchmark_name = benchmark_name , table_name = f \" { run . phenopacket_dir . parents [ 0 ] . name } _disease\" , column_name = str ( run . run_identifier ), ) return BenchmarkRunResults ( rank_stats = disease_rank_stats , benchmark_name = run . run_identifier , binary_classification_stats = disease_binary_classification_stats , phenopacket_dir = run . 
phenopacket_dir , )","title":"benchmark_disease_prioritisation"},{"location":"api/pheval/analyse/gene_prioritisation_analysis/","text":"AssessGenePrioritisation Bases: AssessPrioritisationBase Class for assessing gene prioritisation based on thresholds and scoring orders. Source code in src/pheval/analyse/gene_prioritisation_analysis.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 class AssessGenePrioritisation ( AssessPrioritisationBase ): \"\"\"Class for assessing gene prioritisation based on thresholds and scoring orders.\"\"\" def assess_gene_prioritisation ( self , standardised_gene_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess gene prioritisation. This method assesses the prioritisation of genes based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_gene_result_path (Path): Path to the standardised gene TSV result. phenopacket_path (Path): Path to the Phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"\"\"SELECT * FROM { self . table_name } WHERE phenopacket = ' { phenopacket_path . name } '\"\"\" ) . fetchdf () for _i , row in df . iterrows (): result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_gene_result_path } ' \" f \"WHERE contains_entity_function(CAST(COALESCE(gene_identifier, '') AS VARCHAR),\" f \" ' { row [ 'gene_identifier' ] } ') \" f \"OR contains_entity_function(CAST(COALESCE(gene_symbol, '') AS VARCHAR), \" f \"' { row [ 'gene_symbol' ] } ')\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : gene_match = self . _record_matched_entity ( RankedPhEvalGeneResult ( ** result [ 0 ])) relevant_ranks . 
append ( gene_match ) primary_key = f \" { phenopacket_path . name } - { row [ 'gene_symbol' ] } \" self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( gene_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_gene_result_path ), RankedPhEvalGeneResult ), relevant_ranks , ) assess_gene_prioritisation ( standardised_gene_result_path , phenopacket_path , binary_classification_stats ) Assess gene prioritisation. This method assesses the prioritisation of genes based on the provided criteria and records ranks using a PrioritisationRankRecorder. Parameters: Name Type Description Default standardised_gene_result_path Path Path to the standardised gene TSV result. required phenopacket_path Path Path to the Phenopacket. required binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required Source code in src/pheval/analyse/gene_prioritisation_analysis.py 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 def assess_gene_prioritisation ( self , standardised_gene_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess gene prioritisation. This method assesses the prioritisation of genes based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_gene_result_path (Path): Path to the standardised gene TSV result. phenopacket_path (Path): Path to the Phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"\"\"SELECT * FROM { self . table_name } WHERE phenopacket = ' { phenopacket_path . name } '\"\"\" ) . fetchdf () for _i , row in df . iterrows (): result = ( self . 
conn . execute ( f \"SELECT * FROM ' { standardised_gene_result_path } ' \" f \"WHERE contains_entity_function(CAST(COALESCE(gene_identifier, '') AS VARCHAR),\" f \" ' { row [ 'gene_identifier' ] } ') \" f \"OR contains_entity_function(CAST(COALESCE(gene_symbol, '') AS VARCHAR), \" f \"' { row [ 'gene_symbol' ] } ')\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : gene_match = self . _record_matched_entity ( RankedPhEvalGeneResult ( ** result [ 0 ])) relevant_ranks . append ( gene_match ) primary_key = f \" { phenopacket_path . name } - { row [ 'gene_symbol' ] } \" self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( gene_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_gene_result_path ), RankedPhEvalGeneResult ), relevant_ranks , ) assess_phenopacket_gene_prioritisation ( phenopacket_path , run , gene_binary_classification_stats , gene_benchmarker ) Assess gene prioritisation for a Phenopacket by comparing PhEval standardised gene results against the recorded causative genes for a proband in the Phenopacket. Parameters: Name Type Description Default phenopacket_path Path Path to the Phenopacket. required run RunConfig Run configuration. required gene_binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required gene_benchmarker AssessGenePrioritisation AssessGenePrioritisation class instance. 
required Source code in src/pheval/analyse/gene_prioritisation_analysis.py 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 def assess_phenopacket_gene_prioritisation ( phenopacket_path : Path , run : RunConfig , gene_binary_classification_stats : BinaryClassificationStats , gene_benchmarker : AssessGenePrioritisation , ) -> None : \"\"\" Assess gene prioritisation for a Phenopacket by comparing PhEval standardised gene results against the recorded causative genes for a proband in the Phenopacket. Args: phenopacket_path (Path): Path to the Phenopacket. run (RunConfig): Run configuration. gene_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. gene_benchmarker (AssessGenePrioritisation): AssessGenePrioritisation class instance. \"\"\" standardised_gene_result_path = run . results_dir . joinpath ( f \"pheval_gene_results/ { phenopacket_path . stem } -pheval_gene_result.tsv\" ) gene_benchmarker . assess_gene_prioritisation ( standardised_gene_result_path , phenopacket_path , gene_binary_classification_stats , ) benchmark_gene_prioritisation ( benchmark_name , run , score_order , threshold ) Benchmark a directory based on gene prioritisation results. Args: benchmark_name (str): Name of the benchmark. run (RunConfig): Run configuration. score_order (str): The order in which scores are arranged. threshold (float): Threshold for assessment. Returns: BenchmarkRunResults: An object containing benchmarking results for gene prioritisation, including ranks and rank statistics for the benchmarked directory. 
Source code in src/pheval/analyse/gene_prioritisation_analysis.py 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 def benchmark_gene_prioritisation ( benchmark_name : str , run : RunConfig , score_order : str , threshold : float , ) -> BenchmarkRunResults : \"\"\" Benchmark a directory based on gene prioritisation results. Args: benchmark_name (str): Name of the benchmark. run (RunConfig): Run configuration. score_order (str): The order in which scores are arranged. threshold (float): Threshold for assessment. Returns: BenchmarkRunResults: An object containing benchmarking results for gene prioritisation, including ranks and rank statistics for the benchmarked directory. \"\"\" gene_binary_classification_stats = BinaryClassificationStats () db_connection = BenchmarkDBManager ( benchmark_name ) db_connection . initialise () gene_benchmarker = AssessGenePrioritisation ( db_connection , f \" { run . phenopacket_dir . parents [ 0 ] . name } \" f \"_gene\" , run . run_identifier , threshold , score_order , ) for phenopacket_path in all_files ( run . phenopacket_dir ): assess_phenopacket_gene_prioritisation ( phenopacket_path , run , gene_binary_classification_stats , gene_benchmarker , ) db_connection . close () gene_rank_stats = RankStats () gene_rank_stats . add_ranks ( benchmark_name = benchmark_name , table_name = f \" { run . phenopacket_dir . parents [ 0 ] . name } _gene\" , column_name = str ( run . run_identifier ), ) return BenchmarkRunResults ( rank_stats = gene_rank_stats , benchmark_name = run . run_identifier , binary_classification_stats = gene_binary_classification_stats , phenopacket_dir = run . 
phenopacket_dir , )","title":"Gene prioritisation analysis"},{"location":"api/pheval/analyse/gene_prioritisation_analysis/#src.pheval.analyse.gene_prioritisation_analysis.AssessGenePrioritisation","text":"Bases: AssessPrioritisationBase Class for assessing gene prioritisation based on thresholds and scoring orders. Source code in src/pheval/analyse/gene_prioritisation_analysis.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 class AssessGenePrioritisation ( AssessPrioritisationBase ): \"\"\"Class for assessing gene prioritisation based on thresholds and scoring orders.\"\"\" def assess_gene_prioritisation ( self , standardised_gene_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess gene prioritisation. This method assesses the prioritisation of genes based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_gene_result_path (Path): Path to the standardised gene TSV result. phenopacket_path (Path): Path to the Phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"\"\"SELECT * FROM { self . table_name } WHERE phenopacket = ' { phenopacket_path . name } '\"\"\" ) . fetchdf () for _i , row in df . iterrows (): result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_gene_result_path } ' \" f \"WHERE contains_entity_function(CAST(COALESCE(gene_identifier, '') AS VARCHAR),\" f \" ' { row [ 'gene_identifier' ] } ') \" f \"OR contains_entity_function(CAST(COALESCE(gene_symbol, '') AS VARCHAR), \" f \"' { row [ 'gene_symbol' ] } ')\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : gene_match = self . _record_matched_entity ( RankedPhEvalGeneResult ( ** result [ 0 ])) relevant_ranks . 
append ( gene_match ) primary_key = f \" { phenopacket_path . name } - { row [ 'gene_symbol' ] } \" self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( gene_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_gene_result_path ), RankedPhEvalGeneResult ), relevant_ranks , )","title":"AssessGenePrioritisation"},{"location":"api/pheval/analyse/gene_prioritisation_analysis/#src.pheval.analyse.gene_prioritisation_analysis.AssessGenePrioritisation.assess_gene_prioritisation","text":"Assess gene prioritisation. This method assesses the prioritisation of genes based on the provided criteria and records ranks using a PrioritisationRankRecorder. Parameters: Name Type Description Default standardised_gene_result_path Path Path to the standardised gene TSV result. required phenopacket_path Path Path to the Phenopacket. required binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required Source code in src/pheval/analyse/gene_prioritisation_analysis.py 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 def assess_gene_prioritisation ( self , standardised_gene_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess gene prioritisation. This method assesses the prioritisation of genes based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_gene_result_path (Path): Path to the standardised gene TSV result. phenopacket_path (Path): Path to the Phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"\"\"SELECT * FROM { self . 
table_name } WHERE phenopacket = ' { phenopacket_path . name } '\"\"\" ) . fetchdf () for _i , row in df . iterrows (): result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_gene_result_path } ' \" f \"WHERE contains_entity_function(CAST(COALESCE(gene_identifier, '') AS VARCHAR),\" f \" ' { row [ 'gene_identifier' ] } ') \" f \"OR contains_entity_function(CAST(COALESCE(gene_symbol, '') AS VARCHAR), \" f \"' { row [ 'gene_symbol' ] } ')\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : gene_match = self . _record_matched_entity ( RankedPhEvalGeneResult ( ** result [ 0 ])) relevant_ranks . append ( gene_match ) primary_key = f \" { phenopacket_path . name } - { row [ 'gene_symbol' ] } \" self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( gene_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_gene_result_path ), RankedPhEvalGeneResult ), relevant_ranks , )","title":"assess_gene_prioritisation"},{"location":"api/pheval/analyse/gene_prioritisation_analysis/#src.pheval.analyse.gene_prioritisation_analysis.assess_phenopacket_gene_prioritisation","text":"Assess gene prioritisation for a Phenopacket by comparing PhEval standardised gene results against the recorded causative genes for a proband in the Phenopacket. Parameters: Name Type Description Default phenopacket_path Path Path to the Phenopacket. required run RunConfig Run configuration. required gene_binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required gene_benchmarker AssessGenePrioritisation AssessGenePrioritisation class instance. 
required Source code in src/pheval/analyse/gene_prioritisation_analysis.py 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 def assess_phenopacket_gene_prioritisation ( phenopacket_path : Path , run : RunConfig , gene_binary_classification_stats : BinaryClassificationStats , gene_benchmarker : AssessGenePrioritisation , ) -> None : \"\"\" Assess gene prioritisation for a Phenopacket by comparing PhEval standardised gene results against the recorded causative genes for a proband in the Phenopacket. Args: phenopacket_path (Path): Path to the Phenopacket. run (RunConfig): Run configuration. gene_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. gene_benchmarker (AssessGenePrioritisation): AssessGenePrioritisation class instance. \"\"\" standardised_gene_result_path = run . results_dir . joinpath ( f \"pheval_gene_results/ { phenopacket_path . stem } -pheval_gene_result.tsv\" ) gene_benchmarker . assess_gene_prioritisation ( standardised_gene_result_path , phenopacket_path , gene_binary_classification_stats , )","title":"assess_phenopacket_gene_prioritisation"},{"location":"api/pheval/analyse/gene_prioritisation_analysis/#src.pheval.analyse.gene_prioritisation_analysis.benchmark_gene_prioritisation","text":"Benchmark a directory based on gene prioritisation results. Args: benchmark_name (str): Name of the benchmark. run (RunConfig): Run configuration. score_order (str): The order in which scores are arranged. threshold (float): Threshold for assessment. Returns: BenchmarkRunResults: An object containing benchmarking results for gene prioritisation, including ranks and rank statistics for the benchmarked directory. 
Source code in src/pheval/analyse/gene_prioritisation_analysis.py 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 def benchmark_gene_prioritisation ( benchmark_name : str , run : RunConfig , score_order : str , threshold : float , ) -> BenchmarkRunResults : \"\"\" Benchmark a directory based on gene prioritisation results. Args: benchmark_name (str): Name of the benchmark. run (RunConfig): Run configuration. score_order (str): The order in which scores are arranged. threshold (float): Threshold for assessment. Returns: BenchmarkRunResults: An object containing benchmarking results for gene prioritisation, including ranks and rank statistics for the benchmarked directory. \"\"\" gene_binary_classification_stats = BinaryClassificationStats () db_connection = BenchmarkDBManager ( benchmark_name ) db_connection . initialise () gene_benchmarker = AssessGenePrioritisation ( db_connection , f \" { run . phenopacket_dir . parents [ 0 ] . name } \" f \"_gene\" , run . run_identifier , threshold , score_order , ) for phenopacket_path in all_files ( run . phenopacket_dir ): assess_phenopacket_gene_prioritisation ( phenopacket_path , run , gene_binary_classification_stats , gene_benchmarker , ) db_connection . close () gene_rank_stats = RankStats () gene_rank_stats . add_ranks ( benchmark_name = benchmark_name , table_name = f \" { run . phenopacket_dir . parents [ 0 ] . name } _gene\" , column_name = str ( run . run_identifier ), ) return BenchmarkRunResults ( rank_stats = gene_rank_stats , benchmark_name = run . run_identifier , binary_classification_stats = gene_binary_classification_stats , phenopacket_dir = run . phenopacket_dir , )","title":"benchmark_gene_prioritisation"},{"location":"api/pheval/analyse/generate_plots/","text":"PlotGenerator Class to generate plots. 
Source code in src/pheval/analyse/generate_plots.py 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 class PlotGenerator : \"\"\"Class to generate plots.\"\"\" palette_hex_codes = [ \"#f4ae3d\" , \"#ee5825\" , \"#2b7288\" , \"#9a84b2\" , \"#0c604c\" , \"#c94c4c\" , \"#3d8e83\" , \"#725ac1\" , \"#e7ba52\" , 
\"#1b9e77\" , ] def __init__ ( self , benchmark_name : str ): \"\"\" Initialise the PlotGenerator class. Note: `self.stats` will be used to store statistics data. `self.mrr` will store Mean Reciprocal Rank (MRR) values. Matplotlib settings are configured to remove the right and top axes spines for generated plots. \"\"\" self . benchmark_name = benchmark_name self . stats , self . mrr = [], [] matplotlib . rcParams [ \"axes.spines.right\" ] = False matplotlib . rcParams [ \"axes.spines.top\" ] = False @staticmethod def _create_run_identifier ( results_dir : Path ) -> str : \"\"\" Create a run identifier from a path. Args: results_dir (Path): The directory path for results. Returns: str: A string representing the run identifier created from the given path. \"\"\" return f \" { Path ( results_dir ) . parents [ 0 ] . name } _ { trim_corpus_results_directory_suffix ( Path ( results_dir ) . name ) } \" def return_benchmark_name ( self , benchmark_result : BenchmarkRunResults ) -> str : \"\"\" Return the benchmark name for a run. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. Returns: str: The benchmark name obtained from the given BenchmarkRunResults instance. \"\"\" return ( benchmark_result . benchmark_name if benchmark_result . results_dir is None else self . _create_run_identifier ( benchmark_result . results_dir ) ) def _generate_stacked_bar_plot_data ( self , benchmark_result : BenchmarkRunResults ) -> None : \"\"\" Generate data in the correct format for dataframe creation for a stacked bar plot, appending to the self.stats attribute of the class. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. \"\"\" rank_stats = benchmark_result . rank_stats self . stats . append ( { \"Run\" : self . return_benchmark_name ( benchmark_result ), \"Top\" : benchmark_result . rank_stats . percentage_top (), \"2-3\" : rank_stats . percentage_difference ( rank_stats . percentage_top3 (), rank_stats . 
percentage_top () ), \"4-5\" : rank_stats . percentage_difference ( rank_stats . percentage_top5 (), rank_stats . percentage_top3 () ), \"6-10\" : rank_stats . percentage_difference ( rank_stats . percentage_top10 (), rank_stats . percentage_top5 () ), \">10\" : rank_stats . percentage_difference ( rank_stats . percentage_found (), rank_stats . percentage_top10 () ), \"Missed\" : rank_stats . percentage_difference ( 100 , rank_stats . percentage_found ()), } ) def _generate_stats_mrr_bar_plot_data ( self , benchmark_result : BenchmarkRunResults ) -> None : \"\"\" Generate data in the correct format for dataframe creation for MRR (Mean Reciprocal Rank) bar plot, appending to the self.mrr attribute of the class. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. \"\"\" self . mrr . extend ( [ { \"Rank\" : \"MRR\" , \"Percentage\" : benchmark_result . rank_stats . return_mean_reciprocal_rank (), \"Run\" : self . return_benchmark_name ( benchmark_result ), } ] ) def generate_stacked_bar_plot ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a stacked bar plot and Mean Reciprocal Rank (MRR) bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" for benchmark_result in benchmarking_results : self . _generate_stacked_bar_plot_data ( benchmark_result ) self . _generate_stats_mrr_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) plt . clf () stats_df . set_index ( \"Run\" ) . plot ( kind = \"bar\" , stacked = True , color = self . palette_hex_codes , ylabel = benchmark_generator . y_label , edgecolor = \"white\" , ) . legend ( loc = \"center left\" , bbox_to_anchor = ( 1.0 , 0.5 )) if benchmark_generator . plot_customisation . 
rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 100 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) mrr_df = pd . DataFrame ( self . mrr ) mrr_df . set_index ( \"Run\" ) . plot ( kind = \"bar\" , color = self . palette_hex_codes , ylabel = f \" { benchmark_generator . prioritisation_type_string . capitalize () } mean reciprocal rank\" , legend = False , edgecolor = \"white\" , ) plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } results - mean reciprocal rank\" ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _mrr.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) def _generate_cumulative_bar_plot_data ( self , benchmark_result : BenchmarkRunResults ): \"\"\" Generate data in the correct format for dataframe creation for a cumulative bar plot, appending to the self.stats attribute of the class. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. \"\"\" rank_stats = benchmark_result . rank_stats run_identifier = self . return_benchmark_name ( benchmark_result ) self . stats . extend ( [ { \"Rank\" : \"Top\" , \"Percentage\" : rank_stats . percentage_top () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Top3\" , \"Percentage\" : rank_stats . percentage_top3 () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Top5\" , \"Percentage\" : rank_stats . percentage_top5 () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Top10\" , \"Percentage\" : rank_stats . percentage_top10 () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Found\" , \"Percentage\" : rank_stats . 
percentage_found () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Missed\" , \"Percentage\" : rank_stats . percentage_difference ( 100 , rank_stats . percentage_found () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"MRR\" , \"Percentage\" : rank_stats . return_mean_reciprocal_rank (), \"Run\" : run_identifier , }, ] ) def generate_cumulative_bar ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a cumulative bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" for benchmark_result in benchmarking_results : self . _generate_cumulative_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) plt . clf () sns . catplot ( data = stats_df , kind = \"bar\" , x = \"Rank\" , y = \"Percentage\" , hue = \"Run\" , palette = self . palette_hex_codes , edgecolor = \"white\" , legend = False , ) . set ( xlabel = \"Rank\" , ylabel = benchmark_generator . y_label ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 ), ncol = 3 , title = \"Run\" ) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Cumulative Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . 
prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) def _generate_non_cumulative_bar_plot_data ( self , benchmark_result : BenchmarkRunResults ) -> [ dict ]: \"\"\" Generate data in the correct format for dataframe creation for a non-cumulative bar plot, appending to the self.stats attribute of the class. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. \"\"\" rank_stats = benchmark_result . rank_stats run_identifier = self . return_benchmark_name ( benchmark_result ) self . stats . extend ( [ { \"Rank\" : \"Top\" , \"Percentage\" : rank_stats . percentage_top () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"2-3\" , \"Percentage\" : rank_stats . percentage_difference ( rank_stats . percentage_top3 (), rank_stats . percentage_top () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"4-5\" , \"Percentage\" : rank_stats . percentage_difference ( rank_stats . percentage_top5 (), rank_stats . percentage_top3 () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"6-10\" , \"Percentage\" : rank_stats . percentage_difference ( rank_stats . percentage_top10 (), rank_stats . percentage_top5 () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \">10\" , \"Percentage\" : rank_stats . percentage_difference ( rank_stats . percentage_found (), rank_stats . percentage_top10 () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Missed\" , \"Percentage\" : rank_stats . percentage_difference ( 100 , rank_stats . percentage_found () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"MRR\" , \"Percentage\" : rank_stats . return_mean_reciprocal_rank (), \"Run\" : run_identifier , }, ] ) def generate_roc_curve ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ): \"\"\" Generate and plot Receiver Operating Characteristic (ROC) curves for binary classification benchmark results. 
Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () for i , benchmark_result in enumerate ( benchmarking_results ): fpr , tpr , thresh = roc_curve ( benchmark_result . binary_classification_stats . labels , benchmark_result . binary_classification_stats . scores , pos_label = 1 , ) roc_auc = auc ( fpr , tpr ) plt . plot ( fpr , tpr , label = f \" { self . return_benchmark_name ( benchmark_result ) } ROC Curve (AUC = { roc_auc : .2f } )\" , color = self . palette_hex_codes [ i ], ) plt . plot ( linestyle = \"--\" , color = \"gray\" ) plt . xlabel ( \"False Positive Rate\" ) plt . ylabel ( \"True Positive Rate\" ) if benchmark_generator . plot_customisation . roc_curve_title is None : plt . title ( \"Receiver Operating Characteristic (ROC) Curve\" ) else : plt . title ( benchmark_generator . plot_customisation . roc_curve_title ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 )) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _roc_curve.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) def generate_precision_recall ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ): \"\"\" Generate and plot Precision-Recall curves for binary classification benchmark results. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () plt . figure () for i , benchmark_result in enumerate ( benchmarking_results ): precision , recall , thresh = precision_recall_curve ( benchmark_result . binary_classification_stats . labels , benchmark_result . binary_classification_stats . 
scores , ) precision_recall_auc = auc ( recall , precision ) plt . plot ( recall , precision , label = f \" { self . return_benchmark_name ( benchmark_result ) } Precision-Recall Curve \" f \"(AUC = { precision_recall_auc : .2f } )\" , color = self . palette_hex_codes [ i ], ) plt . plot ( linestyle = \"--\" , color = \"gray\" ) plt . xlabel ( \"Recall\" ) plt . ylabel ( \"Precision\" ) if benchmark_generator . plot_customisation . precision_recall_title is None : plt . title ( \"Precision-Recall Curve\" ) else : plt . title ( benchmark_generator . plot_customisation . precision_recall_title ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 )) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _pr_curve.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) def generate_non_cumulative_bar ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a non-cumulative bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () for benchmark_result in benchmarking_results : self . _generate_non_cumulative_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) sns . catplot ( data = stats_df , kind = \"bar\" , x = \"Rank\" , y = \"Percentage\" , hue = \"Run\" , palette = self . palette_hex_codes , edgecolor = \"white\" , legend = False , ) . set ( xlabel = \"Rank\" , ylabel = benchmark_generator . y_label ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 ), ncol = 3 , title = \"Run\" ) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Non-Cumulative Rank Stats\" ) else : plt . 
title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) __init__ ( benchmark_name ) Initialise the PlotGenerator class. Note: self.stats will be used to store statistics data. self.mrr will store Mean Reciprocal Rank (MRR) values. Matplotlib settings are configured to remove the right and top axes spines for generated plots. Source code in src/pheval/analyse/generate_plots.py 50 51 52 53 54 55 56 57 58 59 60 61 62 def __init__ ( self , benchmark_name : str ): \"\"\" Initialise the PlotGenerator class. Note: `self.stats` will be used to store statistics data. `self.mrr` will store Mean Reciprocal Rank (MRR) values. Matplotlib settings are configured to remove the right and top axes spines for generated plots. \"\"\" self . benchmark_name = benchmark_name self . stats , self . mrr = [], [] matplotlib . rcParams [ \"axes.spines.right\" ] = False matplotlib . rcParams [ \"axes.spines.top\" ] = False generate_cumulative_bar ( benchmarking_results , benchmark_generator ) Generate a cumulative bar plot. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required Source code in src/pheval/analyse/generate_plots.py 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 def generate_cumulative_bar ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a cumulative bar plot. 
Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" for benchmark_result in benchmarking_results : self . _generate_cumulative_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) plt . clf () sns . catplot ( data = stats_df , kind = \"bar\" , x = \"Rank\" , y = \"Percentage\" , hue = \"Run\" , palette = self . palette_hex_codes , edgecolor = \"white\" , legend = False , ) . set ( xlabel = \"Rank\" , ylabel = benchmark_generator . y_label ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 ), ncol = 3 , title = \"Run\" ) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Cumulative Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) generate_non_cumulative_bar ( benchmarking_results , benchmark_generator ) Generate a non-cumulative bar plot. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. 
required Source code in src/pheval/analyse/generate_plots.py 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 def generate_non_cumulative_bar ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a non-cumulative bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () for benchmark_result in benchmarking_results : self . _generate_non_cumulative_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) sns . catplot ( data = stats_df , kind = \"bar\" , x = \"Rank\" , y = \"Percentage\" , hue = \"Run\" , palette = self . palette_hex_codes , edgecolor = \"white\" , legend = False , ) . set ( xlabel = \"Rank\" , ylabel = benchmark_generator . y_label ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 ), ncol = 3 , title = \"Run\" ) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Non-Cumulative Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) generate_precision_recall ( benchmarking_results , benchmark_generator ) Generate and plot Precision-Recall curves for binary classification benchmark results. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. 
required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required Source code in src/pheval/analyse/generate_plots.py 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 def generate_precision_recall ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ): \"\"\" Generate and plot Precision-Recall curves for binary classification benchmark results. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () plt . figure () for i , benchmark_result in enumerate ( benchmarking_results ): precision , recall , thresh = precision_recall_curve ( benchmark_result . binary_classification_stats . labels , benchmark_result . binary_classification_stats . scores , ) precision_recall_auc = auc ( recall , precision ) plt . plot ( recall , precision , label = f \" { self . return_benchmark_name ( benchmark_result ) } Precision-Recall Curve \" f \"(AUC = { precision_recall_auc : .2f } )\" , color = self . palette_hex_codes [ i ], ) plt . plot ( linestyle = \"--\" , color = \"gray\" ) plt . xlabel ( \"Recall\" ) plt . ylabel ( \"Precision\" ) if benchmark_generator . plot_customisation . precision_recall_title is None : plt . title ( \"Precision-Recall Curve\" ) else : plt . title ( benchmark_generator . plot_customisation . precision_recall_title ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 )) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . 
prioritisation_type_string } _pr_curve.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) generate_roc_curve ( benchmarking_results , benchmark_generator ) Generate and plot Receiver Operating Characteristic (ROC) curves for binary classification benchmark results. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required Source code in src/pheval/analyse/generate_plots.py 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 def generate_roc_curve ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ): \"\"\" Generate and plot Receiver Operating Characteristic (ROC) curves for binary classification benchmark results. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () for i , benchmark_result in enumerate ( benchmarking_results ): fpr , tpr , thresh = roc_curve ( benchmark_result . binary_classification_stats . labels , benchmark_result . binary_classification_stats . scores , pos_label = 1 , ) roc_auc = auc ( fpr , tpr ) plt . plot ( fpr , tpr , label = f \" { self . return_benchmark_name ( benchmark_result ) } ROC Curve (AUC = { roc_auc : .2f } )\" , color = self . palette_hex_codes [ i ], ) plt . plot ( linestyle = \"--\" , color = \"gray\" ) plt . xlabel ( \"False Positive Rate\" ) plt . ylabel ( \"True Positive Rate\" ) if benchmark_generator . plot_customisation . roc_curve_title is None : plt . title ( \"Receiver Operating Characteristic (ROC) Curve\" ) else : plt . title ( benchmark_generator . 
plot_customisation . roc_curve_title ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 )) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _roc_curve.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) generate_stacked_bar_plot ( benchmarking_results , benchmark_generator ) Generate a stacked bar plot and Mean Reciprocal Rank (MRR) bar plot. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required Source code in src/pheval/analyse/generate_plots.py 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 def generate_stacked_bar_plot ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a stacked bar plot and Mean Reciprocal Rank (MRR) bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" for benchmark_result in benchmarking_results : self . _generate_stacked_bar_plot_data ( benchmark_result ) self . _generate_stats_mrr_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) plt . clf () stats_df . set_index ( \"Run\" ) . plot ( kind = \"bar\" , stacked = True , color = self . palette_hex_codes , ylabel = benchmark_generator . y_label , edgecolor = \"white\" , ) . legend ( loc = \"center left\" , bbox_to_anchor = ( 1.0 , 0.5 )) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . 
prioritisation_type_string . capitalize () } Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 100 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) mrr_df = pd . DataFrame ( self . mrr ) mrr_df . set_index ( \"Run\" ) . plot ( kind = \"bar\" , color = self . palette_hex_codes , ylabel = f \" { benchmark_generator . prioritisation_type_string . capitalize () } mean reciprocal rank\" , legend = False , edgecolor = \"white\" , ) plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } results - mean reciprocal rank\" ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _mrr.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) return_benchmark_name ( benchmark_result ) Return the benchmark name for a run. Parameters: Name Type Description Default benchmark_result BenchmarkRunResults The benchmarking results for a run. required Returns: Name Type Description str str The benchmark name obtained from the given BenchmarkRunResults instance. Source code in src/pheval/analyse/generate_plots.py 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 def return_benchmark_name ( self , benchmark_result : BenchmarkRunResults ) -> str : \"\"\" Return the benchmark name for a run. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. Returns: str: The benchmark name obtained from the given BenchmarkRunResults instance. \"\"\" return ( benchmark_result . benchmark_name if benchmark_result . results_dir is None else self . _create_run_identifier ( benchmark_result . results_dir ) ) generate_plots ( benchmark_name , benchmarking_results , benchmark_generator , generate_from_db = False ) Generate summary statistics bar plots for prioritisation. 
This method generates summary statistics bar plots based on the provided benchmarking results and plot type. Parameters: Name Type Description Default benchmarking_results list [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required generate_from_db bool Specify whether to generate plots from the db file. Defaults to False. False Source code in src/pheval/analyse/generate_plots.py 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 def generate_plots ( benchmark_name : str , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , generate_from_db : bool = False , ) -> None : \"\"\" Generate summary statistics bar plots for prioritisation. This method generates summary statistics bar plots based on the provided benchmarking results and plot type. Args: benchmarking_results (list[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. generate_from_db (bool): Specify whether to generate plots from the db file. Defaults to False. \"\"\" plot_generator = PlotGenerator ( benchmark_name ) if not generate_from_db : plot_generator . generate_roc_curve ( benchmarking_results , benchmark_generator ) plot_generator . generate_precision_recall ( benchmarking_results , benchmark_generator ) if benchmark_generator . plot_customisation . plot_type == \"bar_stacked\" : plot_generator . generate_stacked_bar_plot ( benchmarking_results , benchmark_generator ) elif benchmark_generator . plot_customisation . plot_type == \"bar_cumulative\" : plot_generator . generate_cumulative_bar ( benchmarking_results , benchmark_generator ) elif benchmark_generator . plot_customisation . 
plot_type == \"bar_non_cumulative\" : plot_generator . generate_non_cumulative_bar ( benchmarking_results , benchmark_generator ) generate_plots_from_benchmark_summary_db ( benchmark_db , run_data ) Generate bar plot from summary benchmark results. Reads a summary of benchmark results from a benchmark db and generates a bar plot based on the analysis type and plot type. Parameters: Name Type Description Default benchmark_db Path Path to the summary TSV file containing benchmark results. required run_data Path Path to YAML benchmarking configuration file. required Source code in src/pheval/analyse/generate_plots.py 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 def generate_plots_from_benchmark_summary_db ( benchmark_db : Path , run_data : Path , ): \"\"\" Generate bar plot from summary benchmark results. Reads a summary of benchmark results from a benchmark db and generates a bar plot based on the analysis type and plot type. Args: benchmark_db (Path): Path to the summary TSV file containing benchmark results. run_data (Path): Path to YAML benchmarking configuration file. \"\"\" benchmark_stats_summary = parse_benchmark_db ( benchmark_db ) config = parse_run_config ( run_data ) if benchmark_stats_summary . gene_results : generate_plots ( config . benchmark_name , benchmark_stats_summary . gene_results , GeneBenchmarkRunOutputGenerator ( config . plot_customisation . gene_plots ), True , ) if benchmark_stats_summary . variant_results : generate_plots ( config . benchmark_name , benchmark_stats_summary . variant_results , VariantBenchmarkRunOutputGenerator ( config . plot_customisation . variant_plots ), True , ) elif benchmark_stats_summary . disease_results : generate_plots ( config . benchmark_name , benchmark_stats_summary . disease_results , DiseaseBenchmarkRunOutputGenerator ( config . plot_customisation . 
disease_plots ), True , ) trim_corpus_results_directory_suffix ( corpus_results_directory ) Trim the suffix from the corpus results directory name. Parameters: Name Type Description Default corpus_results_directory Path The directory path containing corpus results. required Returns: Name Type Description Path Path The Path object with the suffix removed from the directory name. Source code in src/pheval/analyse/generate_plots.py 21 22 23 24 25 26 27 28 29 30 31 def trim_corpus_results_directory_suffix ( corpus_results_directory : Path ) -> Path : \"\"\" Trim the suffix from the corpus results directory name. Args: corpus_results_directory (Path): The directory path containing corpus results. Returns: Path: The Path object with the suffix removed from the directory name. \"\"\" return Path ( str ( corpus_results_directory ) . replace ( \"_results\" , \"\" ))","title":"Generate plots"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.PlotGenerator","text":"Class to generate plots. 
Source code in src/pheval/analyse/generate_plots.py 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 class PlotGenerator : \"\"\"Class to generate plots.\"\"\" palette_hex_codes = [ \"#f4ae3d\" , \"#ee5825\" , \"#2b7288\" , \"#9a84b2\" , \"#0c604c\" , \"#c94c4c\" , \"#3d8e83\" , \"#725ac1\" , \"#e7ba52\" , 
\"#1b9e77\" , ] def __init__ ( self , benchmark_name : str ): \"\"\" Initialise the PlotGenerator class. Note: `self.stats` will be used to store statistics data. `self.mrr` will store Mean Reciprocal Rank (MRR) values. Matplotlib settings are configured to remove the right and top axes spines for generated plots. \"\"\" self . benchmark_name = benchmark_name self . stats , self . mrr = [], [] matplotlib . rcParams [ \"axes.spines.right\" ] = False matplotlib . rcParams [ \"axes.spines.top\" ] = False @staticmethod def _create_run_identifier ( results_dir : Path ) -> str : \"\"\" Create a run identifier from a path. Args: results_dir (Path): The directory path for results. Returns: str: A string representing the run identifier created from the given path. \"\"\" return f \" { Path ( results_dir ) . parents [ 0 ] . name } _ { trim_corpus_results_directory_suffix ( Path ( results_dir ) . name ) } \" def return_benchmark_name ( self , benchmark_result : BenchmarkRunResults ) -> str : \"\"\" Return the benchmark name for a run. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. Returns: str: The benchmark name obtained from the given BenchmarkRunResults instance. \"\"\" return ( benchmark_result . benchmark_name if benchmark_result . results_dir is None else self . _create_run_identifier ( benchmark_result . results_dir ) ) def _generate_stacked_bar_plot_data ( self , benchmark_result : BenchmarkRunResults ) -> None : \"\"\" Generate data in the correct format for dataframe creation for a stacked bar plot, appending to the self.stats attribute of the class. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. \"\"\" rank_stats = benchmark_result . rank_stats self . stats . append ( { \"Run\" : self . return_benchmark_name ( benchmark_result ), \"Top\" : benchmark_result . rank_stats . percentage_top (), \"2-3\" : rank_stats . percentage_difference ( rank_stats . percentage_top3 (), rank_stats . 
percentage_top () ), \"4-5\" : rank_stats . percentage_difference ( rank_stats . percentage_top5 (), rank_stats . percentage_top3 () ), \"6-10\" : rank_stats . percentage_difference ( rank_stats . percentage_top10 (), rank_stats . percentage_top5 () ), \">10\" : rank_stats . percentage_difference ( rank_stats . percentage_found (), rank_stats . percentage_top10 () ), \"Missed\" : rank_stats . percentage_difference ( 100 , rank_stats . percentage_found ()), } ) def _generate_stats_mrr_bar_plot_data ( self , benchmark_result : BenchmarkRunResults ) -> None : \"\"\" Generate data in the correct format for dataframe creation for MRR (Mean Reciprocal Rank) bar plot, appending to the self.mrr attribute of the class. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. \"\"\" self . mrr . extend ( [ { \"Rank\" : \"MRR\" , \"Percentage\" : benchmark_result . rank_stats . return_mean_reciprocal_rank (), \"Run\" : self . return_benchmark_name ( benchmark_result ), } ] ) def generate_stacked_bar_plot ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a stacked bar plot and Mean Reciprocal Rank (MRR) bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" for benchmark_result in benchmarking_results : self . _generate_stacked_bar_plot_data ( benchmark_result ) self . _generate_stats_mrr_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) plt . clf () stats_df . set_index ( \"Run\" ) . plot ( kind = \"bar\" , stacked = True , color = self . palette_hex_codes , ylabel = benchmark_generator . y_label , edgecolor = \"white\" , ) . legend ( loc = \"center left\" , bbox_to_anchor = ( 1.0 , 0.5 )) if benchmark_generator . plot_customisation . 
rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 100 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) mrr_df = pd . DataFrame ( self . mrr ) mrr_df . set_index ( \"Run\" ) . plot ( kind = \"bar\" , color = self . palette_hex_codes , ylabel = f \" { benchmark_generator . prioritisation_type_string . capitalize () } mean reciprocal rank\" , legend = False , edgecolor = \"white\" , ) plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } results - mean reciprocal rank\" ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _mrr.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) def _generate_cumulative_bar_plot_data ( self , benchmark_result : BenchmarkRunResults ): \"\"\" Generate data in the correct format for dataframe creation for a cumulative bar plot, appending to the self.stats attribute of the class. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. \"\"\" rank_stats = benchmark_result . rank_stats run_identifier = self . return_benchmark_name ( benchmark_result ) self . stats . extend ( [ { \"Rank\" : \"Top\" , \"Percentage\" : rank_stats . percentage_top () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Top3\" , \"Percentage\" : rank_stats . percentage_top3 () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Top5\" , \"Percentage\" : rank_stats . percentage_top5 () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Top10\" , \"Percentage\" : rank_stats . percentage_top10 () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Found\" , \"Percentage\" : rank_stats . 
percentage_found () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Missed\" , \"Percentage\" : rank_stats . percentage_difference ( 100 , rank_stats . percentage_found () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"MRR\" , \"Percentage\" : rank_stats . return_mean_reciprocal_rank (), \"Run\" : run_identifier , }, ] ) def generate_cumulative_bar ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a cumulative bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" for benchmark_result in benchmarking_results : self . _generate_cumulative_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) plt . clf () sns . catplot ( data = stats_df , kind = \"bar\" , x = \"Rank\" , y = \"Percentage\" , hue = \"Run\" , palette = self . palette_hex_codes , edgecolor = \"white\" , legend = False , ) . set ( xlabel = \"Rank\" , ylabel = benchmark_generator . y_label ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 ), ncol = 3 , title = \"Run\" ) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Cumulative Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . 
prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) def _generate_non_cumulative_bar_plot_data ( self , benchmark_result : BenchmarkRunResults ) -> [ dict ]: \"\"\" Generate data in the correct format for dataframe creation for a non-cumulative bar plot, appending to the self.stats attribute of the class. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. \"\"\" rank_stats = benchmark_result . rank_stats run_identifier = self . return_benchmark_name ( benchmark_result ) self . stats . extend ( [ { \"Rank\" : \"Top\" , \"Percentage\" : rank_stats . percentage_top () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"2-3\" , \"Percentage\" : rank_stats . percentage_difference ( rank_stats . percentage_top3 (), rank_stats . percentage_top () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"4-5\" , \"Percentage\" : rank_stats . percentage_difference ( rank_stats . percentage_top5 (), rank_stats . percentage_top3 () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"6-10\" , \"Percentage\" : rank_stats . percentage_difference ( rank_stats . percentage_top10 (), rank_stats . percentage_top5 () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \">10\" , \"Percentage\" : rank_stats . percentage_difference ( rank_stats . percentage_found (), rank_stats . percentage_top10 () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Missed\" , \"Percentage\" : rank_stats . percentage_difference ( 100 , rank_stats . percentage_found () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"MRR\" , \"Percentage\" : rank_stats . return_mean_reciprocal_rank (), \"Run\" : run_identifier , }, ] ) def generate_roc_curve ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ): \"\"\" Generate and plot Receiver Operating Characteristic (ROC) curves for binary classification benchmark results. 
Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () for i , benchmark_result in enumerate ( benchmarking_results ): fpr , tpr , thresh = roc_curve ( benchmark_result . binary_classification_stats . labels , benchmark_result . binary_classification_stats . scores , pos_label = 1 , ) roc_auc = auc ( fpr , tpr ) plt . plot ( fpr , tpr , label = f \" { self . return_benchmark_name ( benchmark_result ) } ROC Curve (AUC = { roc_auc : .2f } )\" , color = self . palette_hex_codes [ i ], ) plt . plot ( linestyle = \"--\" , color = \"gray\" ) plt . xlabel ( \"False Positive Rate\" ) plt . ylabel ( \"True Positive Rate\" ) if benchmark_generator . plot_customisation . roc_curve_title is None : plt . title ( \"Receiver Operating Characteristic (ROC) Curve\" ) else : plt . title ( benchmark_generator . plot_customisation . roc_curve_title ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 )) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _roc_curve.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) def generate_precision_recall ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ): \"\"\" Generate and plot Precision-Recall curves for binary classification benchmark results. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () plt . figure () for i , benchmark_result in enumerate ( benchmarking_results ): precision , recall , thresh = precision_recall_curve ( benchmark_result . binary_classification_stats . labels , benchmark_result . binary_classification_stats . 
scores , ) precision_recall_auc = auc ( recall , precision ) plt . plot ( recall , precision , label = f \" { self . return_benchmark_name ( benchmark_result ) } Precision-Recall Curve \" f \"(AUC = { precision_recall_auc : .2f } )\" , color = self . palette_hex_codes [ i ], ) plt . plot ( linestyle = \"--\" , color = \"gray\" ) plt . xlabel ( \"Recall\" ) plt . ylabel ( \"Precision\" ) if benchmark_generator . plot_customisation . precision_recall_title is None : plt . title ( \"Precision-Recall Curve\" ) else : plt . title ( benchmark_generator . plot_customisation . precision_recall_title ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 )) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _pr_curve.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) def generate_non_cumulative_bar ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a non-cumulative bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () for benchmark_result in benchmarking_results : self . _generate_non_cumulative_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) sns . catplot ( data = stats_df , kind = \"bar\" , x = \"Rank\" , y = \"Percentage\" , hue = \"Run\" , palette = self . palette_hex_codes , edgecolor = \"white\" , legend = False , ) . set ( xlabel = \"Rank\" , ylabel = benchmark_generator . y_label ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 ), ncol = 3 , title = \"Run\" ) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Non-Cumulative Rank Stats\" ) else : plt . 
title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , )","title":"PlotGenerator"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.PlotGenerator.__init__","text":"Initialise the PlotGenerator class. Note: self.stats will be used to store statistics data. self.mrr will store Mean Reciprocal Rank (MRR) values. Matplotlib settings are configured to remove the right and top axes spines for generated plots. Source code in src/pheval/analyse/generate_plots.py 50 51 52 53 54 55 56 57 58 59 60 61 62 def __init__ ( self , benchmark_name : str ): \"\"\" Initialise the PlotGenerator class. Note: `self.stats` will be used to store statistics data. `self.mrr` will store Mean Reciprocal Rank (MRR) values. Matplotlib settings are configured to remove the right and top axes spines for generated plots. \"\"\" self . benchmark_name = benchmark_name self . stats , self . mrr = [], [] matplotlib . rcParams [ \"axes.spines.right\" ] = False matplotlib . rcParams [ \"axes.spines.top\" ] = False","title":"__init__"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.PlotGenerator.generate_cumulative_bar","text":"Generate a cumulative bar plot. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. 
required Source code in src/pheval/analyse/generate_plots.py 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 def generate_cumulative_bar ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a cumulative bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" for benchmark_result in benchmarking_results : self . _generate_cumulative_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) plt . clf () sns . catplot ( data = stats_df , kind = \"bar\" , x = \"Rank\" , y = \"Percentage\" , hue = \"Run\" , palette = self . palette_hex_codes , edgecolor = \"white\" , legend = False , ) . set ( xlabel = \"Rank\" , ylabel = benchmark_generator . y_label ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 ), ncol = 3 , title = \"Run\" ) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Cumulative Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , )","title":"generate_cumulative_bar"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.PlotGenerator.generate_non_cumulative_bar","text":"Generate a non-cumulative bar plot. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. 
required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required Source code in src/pheval/analyse/generate_plots.py 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 def generate_non_cumulative_bar ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a non-cumulative bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () for benchmark_result in benchmarking_results : self . _generate_non_cumulative_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) sns . catplot ( data = stats_df , kind = \"bar\" , x = \"Rank\" , y = \"Percentage\" , hue = \"Run\" , palette = self . palette_hex_codes , edgecolor = \"white\" , legend = False , ) . set ( xlabel = \"Rank\" , ylabel = benchmark_generator . y_label ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 ), ncol = 3 , title = \"Run\" ) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Non-Cumulative Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . 
prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , )","title":"generate_non_cumulative_bar"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.PlotGenerator.generate_precision_recall","text":"Generate and plot Precision-Recall curves for binary classification benchmark results. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required Source code in src/pheval/analyse/generate_plots.py 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 def generate_precision_recall ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ): \"\"\" Generate and plot Precision-Recall curves for binary classification benchmark results. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () plt . figure () for i , benchmark_result in enumerate ( benchmarking_results ): precision , recall , thresh = precision_recall_curve ( benchmark_result . binary_classification_stats . labels , benchmark_result . binary_classification_stats . scores , ) precision_recall_auc = auc ( recall , precision ) plt . plot ( recall , precision , label = f \" { self . return_benchmark_name ( benchmark_result ) } Precision-Recall Curve \" f \"(AUC = { precision_recall_auc : .2f } )\" , color = self . palette_hex_codes [ i ], ) plt . plot ( linestyle = \"--\" , color = \"gray\" ) plt . xlabel ( \"Recall\" ) plt . ylabel ( \"Precision\" ) if benchmark_generator . plot_customisation . 
precision_recall_title is None : plt . title ( \"Precision-Recall Curve\" ) else : plt . title ( benchmark_generator . plot_customisation . precision_recall_title ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 )) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _pr_curve.svg\" , format = \"svg\" , bbox_inches = \"tight\" , )","title":"generate_precision_recall"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.PlotGenerator.generate_roc_curve","text":"Generate and plot Receiver Operating Characteristic (ROC) curves for binary classification benchmark results. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required Source code in src/pheval/analyse/generate_plots.py 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 def generate_roc_curve ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ): \"\"\" Generate and plot Receiver Operating Characteristic (ROC) curves for binary classification benchmark results. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () for i , benchmark_result in enumerate ( benchmarking_results ): fpr , tpr , thresh = roc_curve ( benchmark_result . binary_classification_stats . labels , benchmark_result . binary_classification_stats . scores , pos_label = 1 , ) roc_auc = auc ( fpr , tpr ) plt . plot ( fpr , tpr , label = f \" { self . 
return_benchmark_name ( benchmark_result ) } ROC Curve (AUC = { roc_auc : .2f } )\" , color = self . palette_hex_codes [ i ], ) plt . plot ( linestyle = \"--\" , color = \"gray\" ) plt . xlabel ( \"False Positive Rate\" ) plt . ylabel ( \"True Positive Rate\" ) if benchmark_generator . plot_customisation . roc_curve_title is None : plt . title ( \"Receiver Operating Characteristic (ROC) Curve\" ) else : plt . title ( benchmark_generator . plot_customisation . roc_curve_title ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 )) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _roc_curve.svg\" , format = \"svg\" , bbox_inches = \"tight\" , )","title":"generate_roc_curve"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.PlotGenerator.generate_stacked_bar_plot","text":"Generate a stacked bar plot and Mean Reciprocal Rank (MRR) bar plot. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required Source code in src/pheval/analyse/generate_plots.py 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 def generate_stacked_bar_plot ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a stacked bar plot and Mean Reciprocal Rank (MRR) bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" for benchmark_result in benchmarking_results : self . 
_generate_stacked_bar_plot_data ( benchmark_result ) self . _generate_stats_mrr_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) plt . clf () stats_df . set_index ( \"Run\" ) . plot ( kind = \"bar\" , stacked = True , color = self . palette_hex_codes , ylabel = benchmark_generator . y_label , edgecolor = \"white\" , ) . legend ( loc = \"center left\" , bbox_to_anchor = ( 1.0 , 0.5 )) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 100 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) mrr_df = pd . DataFrame ( self . mrr ) mrr_df . set_index ( \"Run\" ) . plot ( kind = \"bar\" , color = self . palette_hex_codes , ylabel = f \" { benchmark_generator . prioritisation_type_string . capitalize () } mean reciprocal rank\" , legend = False , edgecolor = \"white\" , ) plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } results - mean reciprocal rank\" ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _mrr.svg\" , format = \"svg\" , bbox_inches = \"tight\" , )","title":"generate_stacked_bar_plot"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.PlotGenerator.return_benchmark_name","text":"Return the benchmark name for a run. Parameters: Name Type Description Default benchmark_result BenchmarkRunResults The benchmarking results for a run. required Returns: Name Type Description str str The benchmark name obtained from the given BenchmarkRunResults instance. 
Source code in src/pheval/analyse/generate_plots.py 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 def return_benchmark_name ( self , benchmark_result : BenchmarkRunResults ) -> str : \"\"\" Return the benchmark name for a run. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. Returns: str: The benchmark name obtained from the given BenchmarkRunResults instance. \"\"\" return ( benchmark_result . benchmark_name if benchmark_result . results_dir is None else self . _create_run_identifier ( benchmark_result . results_dir ) )","title":"return_benchmark_name"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.generate_plots","text":"Generate summary statistics bar plots for prioritisation. This method generates summary statistics bar plots based on the provided benchmarking results and plot type. Parameters: Name Type Description Default benchmarking_results list [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required generate_from_db bool Specify whether to generate plots from the db file. Defaults to False. False Source code in src/pheval/analyse/generate_plots.py 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 def generate_plots ( benchmark_name : str , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , generate_from_db : bool = False , ) -> None : \"\"\" Generate summary statistics bar plots for prioritisation. This method generates summary statistics bar plots based on the provided benchmarking results and plot type. Args: benchmarking_results (list[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. 
generate_from_db (bool): Specify whether to generate plots from the db file. Defaults to False. \"\"\" plot_generator = PlotGenerator ( benchmark_name ) if not generate_from_db : plot_generator . generate_roc_curve ( benchmarking_results , benchmark_generator ) plot_generator . generate_precision_recall ( benchmarking_results , benchmark_generator ) if benchmark_generator . plot_customisation . plot_type == \"bar_stacked\" : plot_generator . generate_stacked_bar_plot ( benchmarking_results , benchmark_generator ) elif benchmark_generator . plot_customisation . plot_type == \"bar_cumulative\" : plot_generator . generate_cumulative_bar ( benchmarking_results , benchmark_generator ) elif benchmark_generator . plot_customisation . plot_type == \"bar_non_cumulative\" : plot_generator . generate_non_cumulative_bar ( benchmarking_results , benchmark_generator )","title":"generate_plots"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.generate_plots_from_benchmark_summary_db","text":"Generate bar plot from summary benchmark results. Reads a summary of benchmark results from a benchmark db and generates a bar plot based on the analysis type and plot type. Parameters: Name Type Description Default benchmark_db Path Path to the summary TSV file containing benchmark results. required run_data Path Path to YAML benchmarking configuration file. required Source code in src/pheval/analyse/generate_plots.py 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 def generate_plots_from_benchmark_summary_db ( benchmark_db : Path , run_data : Path , ): \"\"\" Generate bar plot from summary benchmark results. Reads a summary of benchmark results from a benchmark db and generates a bar plot based on the analysis type and plot type. Args: benchmark_db (Path): Path to the summary TSV file containing benchmark results. 
run_data (Path): Path to YAML benchmarking configuration file. \"\"\" benchmark_stats_summary = parse_benchmark_db ( benchmark_db ) config = parse_run_config ( run_data ) if benchmark_stats_summary . gene_results : generate_plots ( config . benchmark_name , benchmark_stats_summary . gene_results , GeneBenchmarkRunOutputGenerator ( config . plot_customisation . gene_plots ), True , ) if benchmark_stats_summary . variant_results : generate_plots ( config . benchmark_name , benchmark_stats_summary . variant_results , VariantBenchmarkRunOutputGenerator ( config . plot_customisation . variant_plots ), True , ) elif benchmark_stats_summary . disease_results : generate_plots ( config . benchmark_name , benchmark_stats_summary . disease_results , DiseaseBenchmarkRunOutputGenerator ( config . plot_customisation . disease_plots ), True , )","title":"generate_plots_from_benchmark_summary_db"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.trim_corpus_results_directory_suffix","text":"Trim the suffix from the corpus results directory name. Parameters: Name Type Description Default corpus_results_directory Path The directory path containing corpus results. required Returns: Name Type Description Path Path The Path object with the suffix removed from the directory name. Source code in src/pheval/analyse/generate_plots.py 21 22 23 24 25 26 27 28 29 30 31 def trim_corpus_results_directory_suffix ( corpus_results_directory : Path ) -> Path : \"\"\" Trim the suffix from the corpus results directory name. Args: corpus_results_directory (Path): The directory path containing corpus results. Returns: Path: The Path object with the suffix removed from the directory name. \"\"\" return Path ( str ( corpus_results_directory ) . 
replace ( \"_results\" , \"\" ))","title":"trim_corpus_results_directory_suffix"},{"location":"api/pheval/analyse/generate_summary_outputs/","text":"create_comparison_table ( comparison_table_name , connector , drop_columns , run_identifier_1 , run_identifier_2 , table_name ) Create rank comparison tables. Args: comparison_table_name (str): Name of the comparison table to create. connector (BenchmarkDBManager): DBConnector instance. drop_columns (List[str]): List of columns to drop. run_identifier_1 (str): The first run identifier. run_identifier_2 (str): The second run identifier. table_name (str): Name of the table to extract ranks from Source code in src/pheval/analyse/generate_summary_outputs.py 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 def create_comparison_table ( comparison_table_name : str , connector : BenchmarkDBManager , drop_columns : List [ str ], run_identifier_1 : str , run_identifier_2 : str , table_name : str , ) -> None : \"\"\" Create rank comparison tables. Args: comparison_table_name (str): Name of the comparison table to create. connector (BenchmarkDBManager): DBConnector instance. drop_columns (List[str]): List of columns to drop. run_identifier_1 (str): The first run identifier. run_identifier_2 (str): The second run identifier. table_name (str): Name of the table to extract ranks from \"\"\" connector . drop_table ( comparison_table_name ) excluded_columns = tuple ( drop_columns + [ \"identifier\" ]) if drop_columns else ( \"identifier\" ,) connector . conn . execute ( f 'CREATE TABLE \" { comparison_table_name } \" AS SELECT * ' f \"EXCLUDE { excluded_columns } FROM { table_name } \" ) connector . conn . execute ( f \"\"\"ALTER TABLE \" { comparison_table_name } \" ADD COLUMN rank_change VARCHAR;\"\"\" ) connector . conn . 
execute ( f 'UPDATE \" { comparison_table_name } \" SET rank_change = CASE WHEN \" { run_identifier_1 } \" = 0 ' f 'AND \" { run_identifier_2 } \" != 0 ' f \"THEN 'GAINED' WHEN \\\" { run_identifier_1 } \\\" != 0 AND \\\" { run_identifier_2 } \\\" = 0 THEN 'LOST' ELSE \" f 'CAST (\" { run_identifier_1 } \" - \" { run_identifier_2 } \" AS VARCHAR) END;' ) connector . conn . commit () generate_benchmark_comparison_output ( benchmark_name , benchmarking_results , run_identifiers , benchmark_generator , table_name ) Generate prioritisation outputs for benchmarking multiple runs. This function generates comparison outputs for benchmarking multiple runs. It compares the results between pairs of BenchmarkRunResults instances in benchmarking_results and generates rank comparison outputs using RankComparisonGenerator for each pair. Parameters: Name Type Description Default benchmark_name str Name of the benchmark. required benchmarking_results List [ BenchmarkRunResults ] A list containing BenchmarkRunResults instances representing the benchmarking results of multiple runs. required run_identifiers List [ str ] A list of run identifiers. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required table_name str The name of the table where ranks are stored. required Source code in src/pheval/analyse/generate_summary_outputs.py 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 def generate_benchmark_comparison_output ( benchmark_name : str , benchmarking_results : List [ BenchmarkRunResults ], run_identifiers : List [ str ], benchmark_generator : BenchmarkRunOutputGenerator , table_name : str , ) -> None : \"\"\" Generate prioritisation outputs for benchmarking multiple runs. This function generates comparison outputs for benchmarking multiple runs. 
It compares the results between pairs of `BenchmarkRunResults` instances in `benchmarking_results` and generates rank comparison outputs using `RankComparisonGenerator` for each pair. Args: benchmark_name (str): Name of the benchmark. benchmarking_results (List[BenchmarkRunResults]): A list containing BenchmarkRunResults instances representing the benchmarking results of multiple runs. run_identifiers (List[str]): A list of run identifiers. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. table_name (str): The name of the table where ranks are stored. \"\"\" output_prefix = benchmark_generator . prioritisation_type_string connector = BenchmarkDBManager ( benchmark_name ) for pair in itertools . combinations ( [ str ( result . benchmark_name ) for result in benchmarking_results ], 2 ): run_identifier_1 = pair [ 0 ] run_identifier_2 = pair [ 1 ] drop_columns = [ run for run in run_identifiers if run not in pair ] comparison_table_name = get_new_table_name ( run_identifier_1 , run_identifier_2 , output_prefix ) create_comparison_table ( comparison_table_name , connector , drop_columns , run_identifier_1 , run_identifier_2 , table_name , ) generate_plots ( benchmark_name , benchmarking_results , benchmark_generator , ) get_new_table_name ( run_identifier_1 , run_identifier_2 , output_prefix ) Get the new table name for rank comparison tables. Args: run_identifier_1: The first run identifier. run_identifier_2: The second run identifier. output_prefix: The output prefix of the table Returns: The new table name. Source code in src/pheval/analyse/generate_summary_outputs.py 10 11 12 13 14 15 16 17 18 19 20 def get_new_table_name ( run_identifier_1 : str , run_identifier_2 : str , output_prefix : str ) -> str : \"\"\" Get the new table name for rank comparison tables. Args: run_identifier_1: The first run identifier. run_identifier_2: The second run identifier. 
output_prefix: The output prefix of the table Returns: The new table name. \"\"\" return f \" { run_identifier_1 } _vs_\" f \" { run_identifier_2 } _\" f \" { output_prefix } _rank_comparison\"","title":"Generate summary outputs"},{"location":"api/pheval/analyse/generate_summary_outputs/#src.pheval.analyse.generate_summary_outputs.create_comparison_table","text":"Create rank comparison tables. Args: comparison_table_name (str): Name of the comparison table to create. connector (BenchmarkDBManager): DBConnector instance. drop_columns (List[str]): List of columns to drop. run_identifier_1 (str): The first run identifier. run_identifier_2 (str): The second run identifier. table_name (str): Name of the table to extract ranks from Source code in src/pheval/analyse/generate_summary_outputs.py 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 def create_comparison_table ( comparison_table_name : str , connector : BenchmarkDBManager , drop_columns : List [ str ], run_identifier_1 : str , run_identifier_2 : str , table_name : str , ) -> None : \"\"\" Create rank comparison tables. Args: comparison_table_name (str): Name of the comparison table to create. connector (BenchmarkDBManager): DBConnector instance. drop_columns (List[str]): List of columns to drop. run_identifier_1 (str): The first run identifier. run_identifier_2 (str): The second run identifier. table_name (str): Name of the table to extract ranks from \"\"\" connector . drop_table ( comparison_table_name ) excluded_columns = tuple ( drop_columns + [ \"identifier\" ]) if drop_columns else ( \"identifier\" ,) connector . conn . execute ( f 'CREATE TABLE \" { comparison_table_name } \" AS SELECT * ' f \"EXCLUDE { excluded_columns } FROM { table_name } \" ) connector . conn . execute ( f \"\"\"ALTER TABLE \" { comparison_table_name } \" ADD COLUMN rank_change VARCHAR;\"\"\" ) connector . conn . 
execute ( f 'UPDATE \" { comparison_table_name } \" SET rank_change = CASE WHEN \" { run_identifier_1 } \" = 0 ' f 'AND \" { run_identifier_2 } \" != 0 ' f \"THEN 'GAINED' WHEN \\\" { run_identifier_1 } \\\" != 0 AND \\\" { run_identifier_2 } \\\" = 0 THEN 'LOST' ELSE \" f 'CAST (\" { run_identifier_1 } \" - \" { run_identifier_2 } \" AS VARCHAR) END;' ) connector . conn . commit ()","title":"create_comparison_table"},{"location":"api/pheval/analyse/generate_summary_outputs/#src.pheval.analyse.generate_summary_outputs.generate_benchmark_comparison_output","text":"Generate prioritisation outputs for benchmarking multiple runs. This function generates comparison outputs for benchmarking multiple runs. It compares the results between pairs of BenchmarkRunResults instances in benchmarking_results and generates rank comparison outputs using RankComparisonGenerator for each pair. Parameters: Name Type Description Default benchmark_name str Name of the benchmark. required benchmarking_results List [ BenchmarkRunResults ] A list containing BenchmarkRunResults instances representing the benchmarking results of multiple runs. required run_identifiers List [ str ] A list of run identifiers. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required table_name str The name of the table where ranks are stored. required Source code in src/pheval/analyse/generate_summary_outputs.py 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 def generate_benchmark_comparison_output ( benchmark_name : str , benchmarking_results : List [ BenchmarkRunResults ], run_identifiers : List [ str ], benchmark_generator : BenchmarkRunOutputGenerator , table_name : str , ) -> None : \"\"\" Generate prioritisation outputs for benchmarking multiple runs. This function generates comparison outputs for benchmarking multiple runs. 
It compares the results between pairs of `BenchmarkRunResults` instances in `benchmarking_results` and generates rank comparison outputs using `RankComparisonGenerator` for each pair. Args: benchmark_name (str): Name of the benchmark. benchmarking_results (List[BenchmarkRunResults]): A list containing BenchmarkRunResults instances representing the benchmarking results of multiple runs. run_identifiers (List[str]): A list of run identifiers. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. table_name (str): The name of the table where ranks are stored. \"\"\" output_prefix = benchmark_generator . prioritisation_type_string connector = BenchmarkDBManager ( benchmark_name ) for pair in itertools . combinations ( [ str ( result . benchmark_name ) for result in benchmarking_results ], 2 ): run_identifier_1 = pair [ 0 ] run_identifier_2 = pair [ 1 ] drop_columns = [ run for run in run_identifiers if run not in pair ] comparison_table_name = get_new_table_name ( run_identifier_1 , run_identifier_2 , output_prefix ) create_comparison_table ( comparison_table_name , connector , drop_columns , run_identifier_1 , run_identifier_2 , table_name , ) generate_plots ( benchmark_name , benchmarking_results , benchmark_generator , )","title":"generate_benchmark_comparison_output"},{"location":"api/pheval/analyse/generate_summary_outputs/#src.pheval.analyse.generate_summary_outputs.get_new_table_name","text":"Get the new table name for rank comparison tables. Args: run_identifier_1: The first run identifier. run_identifier_2: The second run identifier. output_prefix: The output prefix of the table Returns: The new table name. Source code in src/pheval/analyse/generate_summary_outputs.py 10 11 12 13 14 15 16 17 18 19 20 def get_new_table_name ( run_identifier_1 : str , run_identifier_2 : str , output_prefix : str ) -> str : \"\"\" Get the new table name for rank comparison tables. 
Args: run_identifier_1: The first run identifier. run_identifier_2: The second run identifier. output_prefix: The output prefix of the table Returns: The new table name. \"\"\" return f \" { run_identifier_1 } _vs_\" f \" { run_identifier_2 } _\" f \" { output_prefix } _rank_comparison\"","title":"get_new_table_name"},{"location":"api/pheval/analyse/parse_benchmark_summary/","text":"parse_benchmark_db ( benchmarking_db ) Read the summary benchmark TSV output generated from the benchmark-comparison command. Parameters: Name Type Description Default benchmarking_db Path Path to the benchmark db. required Returns: Name Type Description BenchmarkSummaryResults BenchmarkSummaryResults A dataclass containing all benchmarking results contained in the db. Source code in src/pheval/analyse/parse_benchmark_summary.py 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 def parse_benchmark_db ( benchmarking_db : Path ) -> BenchmarkSummaryResults : \"\"\" Read the summary benchmark TSV output generated from the benchmark-comparison command. Args: benchmarking_db (Path): Path to the benchmark db. Returns: BenchmarkSummaryResults: A dataclass containing all benchmarking results contained in the db. \"\"\" db_connector = BenchmarkDBManager ( benchmarking_db ) gene_benchmarking_results , disease_benchmarking_results , variant_benchmarking_results = ( None , None , None , ) if db_connector . check_table_exists ( \"gene_summary\" ): gene_benchmarking_results = parse_benchmark_results ( db_connector . conn . execute ( \"SELECT * FROM gene_summary\" ) . fetchdf () ) if db_connector . check_table_exists ( \"disease_summary\" ): disease_benchmarking_results = parse_benchmark_results ( db_connector . conn . execute ( \"SELECT * FROM disease_summary\" ) . fetchdf () ) if db_connector . check_table_exists ( \"variant_summary\" ): variant_benchmarking_results = parse_benchmark_results ( db_connector . conn . 
execute ( \"SELECT * FROM variant_summary\" ) . fetchdf () ) return BenchmarkSummaryResults ( gene_results = gene_benchmarking_results , disease_results = disease_benchmarking_results , variant_results = variant_benchmarking_results , ) parse_benchmark_results ( benchmark_summary_table ) Parse benchmark results from a DataFrame. Parameters: Name Type Description Default benchmark_summary_table DataFrame DataFrame containing benchmark results. required Returns: Type Description List [ BenchmarkRunResults ] List[BenchmarkRunResults]: A list of BenchmarkRunResults objects parsed from the DataFrame. Source code in src/pheval/analyse/parse_benchmark_summary.py 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 def parse_benchmark_results ( benchmark_summary_table : pd . DataFrame ) -> List [ BenchmarkRunResults ]: \"\"\" Parse benchmark results from a DataFrame. Args: benchmark_summary_table (pd.DataFrame): DataFrame containing benchmark results. Returns: List[BenchmarkRunResults]: A list of BenchmarkRunResults objects parsed from the DataFrame. \"\"\" results = [] for _ , row in benchmark_summary_table . iterrows (): benchmarking_result = BenchmarkRunResults ( rank_stats = RankStats ( top = row [ \"top\" ], top3 = row [ \"top3\" ], top5 = row [ \"top5\" ], top10 = row [ \"top10\" ], found = row [ \"found\" ], total = row [ \"total\" ], mrr = row [ \"mean_reciprocal_rank\" ], ), benchmark_name = row [ \"results_directory_path\" ], binary_classification_stats = BinaryClassificationStats (), ) results . append ( benchmarking_result ) return results","title":"Parse benchmark summary"},{"location":"api/pheval/analyse/parse_benchmark_summary/#src.pheval.analyse.parse_benchmark_summary.parse_benchmark_db","text":"Read the summary benchmark TSV output generated from the benchmark-comparison command. Parameters: Name Type Description Default benchmarking_db Path Path to the benchmark db. 
required Returns: Name Type Description BenchmarkSummaryResults BenchmarkSummaryResults A dataclass containing all benchmarking results contained in the db. Source code in src/pheval/analyse/parse_benchmark_summary.py 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 def parse_benchmark_db ( benchmarking_db : Path ) -> BenchmarkSummaryResults : \"\"\" Read the summary benchmark TSV output generated from the benchmark-comparison command. Args: benchmarking_db (Path): Path to the benchmark db. Returns: BenchmarkSummaryResults: A dataclass containing all benchmarking results contained in the db. \"\"\" db_connector = BenchmarkDBManager ( benchmarking_db ) gene_benchmarking_results , disease_benchmarking_results , variant_benchmarking_results = ( None , None , None , ) if db_connector . check_table_exists ( \"gene_summary\" ): gene_benchmarking_results = parse_benchmark_results ( db_connector . conn . execute ( \"SELECT * FROM gene_summary\" ) . fetchdf () ) if db_connector . check_table_exists ( \"disease_summary\" ): disease_benchmarking_results = parse_benchmark_results ( db_connector . conn . execute ( \"SELECT * FROM disease_summary\" ) . fetchdf () ) if db_connector . check_table_exists ( \"variant_summary\" ): variant_benchmarking_results = parse_benchmark_results ( db_connector . conn . execute ( \"SELECT * FROM variant_summary\" ) . fetchdf () ) return BenchmarkSummaryResults ( gene_results = gene_benchmarking_results , disease_results = disease_benchmarking_results , variant_results = variant_benchmarking_results , )","title":"parse_benchmark_db"},{"location":"api/pheval/analyse/parse_benchmark_summary/#src.pheval.analyse.parse_benchmark_summary.parse_benchmark_results","text":"Parse benchmark results from a DataFrame. Parameters: Name Type Description Default benchmark_summary_table DataFrame DataFrame containing benchmark results. 
required Returns: Type Description List [ BenchmarkRunResults ] List[BenchmarkRunResults]: A list of BenchmarkRunResults objects parsed from the DataFrame. Source code in src/pheval/analyse/parse_benchmark_summary.py 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 def parse_benchmark_results ( benchmark_summary_table : pd . DataFrame ) -> List [ BenchmarkRunResults ]: \"\"\" Parse benchmark results from a DataFrame. Args: benchmark_summary_table (pd.DataFrame): DataFrame containing benchmark results. Returns: List[BenchmarkRunResults]: A list of BenchmarkRunResults objects parsed from the DataFrame. \"\"\" results = [] for _ , row in benchmark_summary_table . iterrows (): benchmarking_result = BenchmarkRunResults ( rank_stats = RankStats ( top = row [ \"top\" ], top3 = row [ \"top3\" ], top5 = row [ \"top5\" ], top10 = row [ \"top10\" ], found = row [ \"found\" ], total = row [ \"total\" ], mrr = row [ \"mean_reciprocal_rank\" ], ), benchmark_name = row [ \"results_directory_path\" ], binary_classification_stats = BinaryClassificationStats (), ) results . append ( benchmarking_result ) return results","title":"parse_benchmark_results"},{"location":"api/pheval/analyse/parse_corpus/","text":"CorpusParser Class for parsing phenopacket corpus and retrieving known variants/genes/diseases. 
Source code in src/pheval/analyse/parse_corpus.py 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 class CorpusParser : \"\"\"Class for parsing phenopacket corpus and retrieving known variants/genes/diseases.\"\"\" def __init__ ( self , benchmark_name : str , phenopacket_dir : Path ) -> None : \"\"\" Initialise the CorpusParser class. Args: phenopacket_dir (Path): Path to the Phenopacket directory. \"\"\" self . phenopacket_dir = phenopacket_dir self . conn = BenchmarkDBManager ( benchmark_name ) . conn self . table_name = phenopacket_dir . parents [ 0 ] . name def _create_gene_table ( self ) -> None : \"\"\" Create the Gene benchmarking table if it doesn't already exist. \"\"\" self . conn . execute ( f \"\"\" CREATE TABLE IF NOT EXISTS { self . table_name } _gene ( identifier VARCHAR(255) PRIMARY KEY, phenopacket VARCHAR, gene_symbol VARCHAR, gene_identifier VARCHAR ) \"\"\" ) def _create_variant_table ( self ) -> None : \"\"\" Create the Variant benchmarking table if it doesn't already exist. \"\"\" self . conn . execute ( f \"\"\" CREATE TABLE IF NOT EXISTS { self . table_name } _variant ( identifier VARCHAR(255) PRIMARY KEY, phenopacket VARCHAR, chrom VARCHAR, pos INTEGER, \"ref\" VARCHAR, alt VARCHAR ) \"\"\" ) def _create_disease_table ( self ): \"\"\" Create the Disease benchmarking table if it doesn't already exist. \"\"\" self . conn . execute ( f \"\"\" CREATE TABLE IF NOT EXISTS { self . 
table_name } _disease ( identifier VARCHAR(255) PRIMARY KEY, phenopacket VARCHAR, disease_identifier VARCHAR, disease_name VARCHAR ) \"\"\" ) def _create_tables ( self , benchmark_generator : BenchmarkRunOutputGenerator ) -> None : \"\"\" Create tables based on the benchmarking analysis specified. Args: benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type. \"\"\" if isinstance ( benchmark_generator , GeneBenchmarkRunOutputGenerator ): self . _create_gene_table () if isinstance ( benchmark_generator , VariantBenchmarkRunOutputGenerator ): self . _create_variant_table () if isinstance ( benchmark_generator , DiseaseBenchmarkRunOutputGenerator ): self . _create_disease_table () def _insert_genes ( self , phenopacket_path : Path , genes : List [ ProbandCausativeGene ]) -> None : \"\"\" Insert known disease-causing genes into the Gene benchmarking table. Args: phenopacket_path(Path): Path to the Phenopacket file. genes(List[ProbandCausativeGene]): List of known genes associated with the proband. \"\"\" for gene in genes : identifier = f \" { phenopacket_path . name } - { gene . gene_symbol } \" self . conn . execute ( f \"\"\" INSERT OR IGNORE INTO { self . table_name } _gene (identifier, phenopacket, gene_symbol, gene_identifier) VALUES (?, ?, ?, ?) \"\"\" , ( identifier , phenopacket_path . name , gene . gene_symbol , gene . gene_identifier ), ) def _insert_variants ( self , phenopacket_path : Path , variants : List [ GenomicVariant ]) -> None : \"\"\" Insert known variants into the Variant benchmarking table. Args: phenopacket_path (Path): Path to the Phenopacket file.: variants (List[GenomicVariant]): List of known variants associated with the proband. \"\"\" for variant in variants : identifier = ( f \" { phenopacket_path . name } - { variant . chrom } - { variant . pos } - { variant . ref } - { variant . alt } \" ) self . conn . execute ( f \"\"\" INSERT OR IGNORE INTO { self . 
table_name } _variant (identifier, phenopacket, chrom, pos, \"ref\", alt) VALUES (?, ?, ?, ?, ?, ?) \"\"\" , ( identifier , phenopacket_path . name , variant . chrom , variant . pos , variant . ref , variant . alt , ), ) def _insert_diseases ( self , phenopacket_path : Path , diseases : List [ ProbandDisease ]) -> None : \"\"\" Insert known diseases into the Disease benchmarking table. Args: phenopacket_path (Path): Path to the Phenopacket file.: diseases (List[ProbandDisease]): List of known diseases associated with the proband. \"\"\" for disease in diseases : identifier = f \" { phenopacket_path . name } - { disease . disease_identifier } \" self . conn . execute ( f \"INSERT OR IGNORE INTO { self . table_name } _disease \" f \"(identifier, phenopacket, disease_identifier, disease_name) VALUES (?, ?, ?, ?)\" , ( identifier , phenopacket_path . name , disease . disease_identifier , disease . disease_name , ), ) def parse_corpus ( self , benchmark_generator : BenchmarkRunOutputGenerator ) -> None : \"\"\" Parse the phenopacket corpus and add known genes/variants/diseases to relevant benchmarking tables. Args: benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type. \"\"\" self . _create_tables ( benchmark_generator ) for phenopacket_path in all_files ( self . phenopacket_dir ): if isinstance ( benchmark_generator , GeneBenchmarkRunOutputGenerator ): genes = _obtain_causative_genes ( phenopacket_path ) self . _insert_genes ( phenopacket_path , genes ) if isinstance ( benchmark_generator , VariantBenchmarkRunOutputGenerator ): variants = _obtain_causative_variants ( phenopacket_path ) self . _insert_variants ( phenopacket_path , variants ) if isinstance ( benchmark_generator , DiseaseBenchmarkRunOutputGenerator ): diseases = _obtain_causative_diseases ( phenopacket_path ) self . _insert_diseases ( phenopacket_path , diseases ) self . conn . 
close () __init__ ( benchmark_name , phenopacket_dir ) Initialise the CorpusParser class. Args: phenopacket_dir (Path): Path to the Phenopacket directory. Source code in src/pheval/analyse/parse_corpus.py 68 69 70 71 72 73 74 75 76 def __init__ ( self , benchmark_name : str , phenopacket_dir : Path ) -> None : \"\"\" Initialise the CorpusParser class. Args: phenopacket_dir (Path): Path to the Phenopacket directory. \"\"\" self . phenopacket_dir = phenopacket_dir self . conn = BenchmarkDBManager ( benchmark_name ) . conn self . table_name = phenopacket_dir . parents [ 0 ] . name parse_corpus ( benchmark_generator ) Parse the phenopacket corpus and add known genes/variants/diseases to relevant benchmarking tables. Args: benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type. Source code in src/pheval/analyse/parse_corpus.py 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 def parse_corpus ( self , benchmark_generator : BenchmarkRunOutputGenerator ) -> None : \"\"\" Parse the phenopacket corpus and add known genes/variants/diseases to relevant benchmarking tables. Args: benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type. \"\"\" self . _create_tables ( benchmark_generator ) for phenopacket_path in all_files ( self . phenopacket_dir ): if isinstance ( benchmark_generator , GeneBenchmarkRunOutputGenerator ): genes = _obtain_causative_genes ( phenopacket_path ) self . _insert_genes ( phenopacket_path , genes ) if isinstance ( benchmark_generator , VariantBenchmarkRunOutputGenerator ): variants = _obtain_causative_variants ( phenopacket_path ) self . _insert_variants ( phenopacket_path , variants ) if isinstance ( benchmark_generator , DiseaseBenchmarkRunOutputGenerator ): diseases = _obtain_causative_diseases ( phenopacket_path ) self . _insert_diseases ( phenopacket_path , diseases ) self . conn . 
close ()","title":"Parse corpus"},{"location":"api/pheval/analyse/parse_corpus/#src.pheval.analyse.parse_corpus.CorpusParser","text":"Class for parsing phenopacket corpus and retrieving known variants/genes/diseases. Source code in src/pheval/analyse/parse_corpus.py 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 class CorpusParser : \"\"\"Class for parsing phenopacket corpus and retrieving known variants/genes/diseases.\"\"\" def __init__ ( self , benchmark_name : str , phenopacket_dir : Path ) -> None : \"\"\" Initialise the CorpusParser class. Args: phenopacket_dir (Path): Path to the Phenopacket directory. \"\"\" self . phenopacket_dir = phenopacket_dir self . conn = BenchmarkDBManager ( benchmark_name ) . conn self . table_name = phenopacket_dir . parents [ 0 ] . name def _create_gene_table ( self ) -> None : \"\"\" Create the Gene benchmarking table if it doesn't already exist. \"\"\" self . conn . execute ( f \"\"\" CREATE TABLE IF NOT EXISTS { self . table_name } _gene ( identifier VARCHAR(255) PRIMARY KEY, phenopacket VARCHAR, gene_symbol VARCHAR, gene_identifier VARCHAR ) \"\"\" ) def _create_variant_table ( self ) -> None : \"\"\" Create the Variant benchmarking table if it doesn't already exist. \"\"\" self . conn . execute ( f \"\"\" CREATE TABLE IF NOT EXISTS { self . 
table_name } _variant ( identifier VARCHAR(255) PRIMARY KEY, phenopacket VARCHAR, chrom VARCHAR, pos INTEGER, \"ref\" VARCHAR, alt VARCHAR ) \"\"\" ) def _create_disease_table ( self ): \"\"\" Create the Disease benchmarking table if it doesn't already exist. \"\"\" self . conn . execute ( f \"\"\" CREATE TABLE IF NOT EXISTS { self . table_name } _disease ( identifier VARCHAR(255) PRIMARY KEY, phenopacket VARCHAR, disease_identifier VARCHAR, disease_name VARCHAR ) \"\"\" ) def _create_tables ( self , benchmark_generator : BenchmarkRunOutputGenerator ) -> None : \"\"\" Create tables based on the benchmarking analysis specified. Args: benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type. \"\"\" if isinstance ( benchmark_generator , GeneBenchmarkRunOutputGenerator ): self . _create_gene_table () if isinstance ( benchmark_generator , VariantBenchmarkRunOutputGenerator ): self . _create_variant_table () if isinstance ( benchmark_generator , DiseaseBenchmarkRunOutputGenerator ): self . _create_disease_table () def _insert_genes ( self , phenopacket_path : Path , genes : List [ ProbandCausativeGene ]) -> None : \"\"\" Insert known disease-causing genes into the Gene benchmarking table. Args: phenopacket_path(Path): Path to the Phenopacket file. genes(List[ProbandCausativeGene]): List of known genes associated with the proband. \"\"\" for gene in genes : identifier = f \" { phenopacket_path . name } - { gene . gene_symbol } \" self . conn . execute ( f \"\"\" INSERT OR IGNORE INTO { self . table_name } _gene (identifier, phenopacket, gene_symbol, gene_identifier) VALUES (?, ?, ?, ?) \"\"\" , ( identifier , phenopacket_path . name , gene . gene_symbol , gene . gene_identifier ), ) def _insert_variants ( self , phenopacket_path : Path , variants : List [ GenomicVariant ]) -> None : \"\"\" Insert known variants into the Variant benchmarking table. 
Args: phenopacket_path (Path): Path to the Phenopacket file.: variants (List[GenomicVariant]): List of known variants associated with the proband. \"\"\" for variant in variants : identifier = ( f \" { phenopacket_path . name } - { variant . chrom } - { variant . pos } - { variant . ref } - { variant . alt } \" ) self . conn . execute ( f \"\"\" INSERT OR IGNORE INTO { self . table_name } _variant (identifier, phenopacket, chrom, pos, \"ref\", alt) VALUES (?, ?, ?, ?, ?, ?) \"\"\" , ( identifier , phenopacket_path . name , variant . chrom , variant . pos , variant . ref , variant . alt , ), ) def _insert_diseases ( self , phenopacket_path : Path , diseases : List [ ProbandDisease ]) -> None : \"\"\" Insert known diseases into the Disease benchmarking table. Args: phenopacket_path (Path): Path to the Phenopacket file.: diseases (List[ProbandDisease]): List of known diseases associated with the proband. \"\"\" for disease in diseases : identifier = f \" { phenopacket_path . name } - { disease . disease_identifier } \" self . conn . execute ( f \"INSERT OR IGNORE INTO { self . table_name } _disease \" f \"(identifier, phenopacket, disease_identifier, disease_name) VALUES (?, ?, ?, ?)\" , ( identifier , phenopacket_path . name , disease . disease_identifier , disease . disease_name , ), ) def parse_corpus ( self , benchmark_generator : BenchmarkRunOutputGenerator ) -> None : \"\"\" Parse the phenopacket corpus and add known genes/variants/diseases to relevant benchmarking tables. Args: benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type. \"\"\" self . _create_tables ( benchmark_generator ) for phenopacket_path in all_files ( self . phenopacket_dir ): if isinstance ( benchmark_generator , GeneBenchmarkRunOutputGenerator ): genes = _obtain_causative_genes ( phenopacket_path ) self . 
_insert_genes ( phenopacket_path , genes ) if isinstance ( benchmark_generator , VariantBenchmarkRunOutputGenerator ): variants = _obtain_causative_variants ( phenopacket_path ) self . _insert_variants ( phenopacket_path , variants ) if isinstance ( benchmark_generator , DiseaseBenchmarkRunOutputGenerator ): diseases = _obtain_causative_diseases ( phenopacket_path ) self . _insert_diseases ( phenopacket_path , diseases ) self . conn . close ()","title":"CorpusParser"},{"location":"api/pheval/analyse/parse_corpus/#src.pheval.analyse.parse_corpus.CorpusParser.__init__","text":"Initialise the CorpusParser class. Args: phenopacket_dir (Path): Path to the Phenopacket directory. Source code in src/pheval/analyse/parse_corpus.py 68 69 70 71 72 73 74 75 76 def __init__ ( self , benchmark_name : str , phenopacket_dir : Path ) -> None : \"\"\" Initialise the CorpusParser class. Args: phenopacket_dir (Path): Path to the Phenopacket directory. \"\"\" self . phenopacket_dir = phenopacket_dir self . conn = BenchmarkDBManager ( benchmark_name ) . conn self . table_name = phenopacket_dir . parents [ 0 ] . name","title":"__init__"},{"location":"api/pheval/analyse/parse_corpus/#src.pheval.analyse.parse_corpus.CorpusParser.parse_corpus","text":"Parse the phenopacket corpus and add known genes/variants/diseases to relevant benchmarking tables. Args: benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type. Source code in src/pheval/analyse/parse_corpus.py 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 def parse_corpus ( self , benchmark_generator : BenchmarkRunOutputGenerator ) -> None : \"\"\" Parse the phenopacket corpus and add known genes/variants/diseases to relevant benchmarking tables. Args: benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type. \"\"\" self . _create_tables ( benchmark_generator ) for phenopacket_path in all_files ( self . 
phenopacket_dir ): if isinstance ( benchmark_generator , GeneBenchmarkRunOutputGenerator ): genes = _obtain_causative_genes ( phenopacket_path ) self . _insert_genes ( phenopacket_path , genes ) if isinstance ( benchmark_generator , VariantBenchmarkRunOutputGenerator ): variants = _obtain_causative_variants ( phenopacket_path ) self . _insert_variants ( phenopacket_path , variants ) if isinstance ( benchmark_generator , DiseaseBenchmarkRunOutputGenerator ): diseases = _obtain_causative_diseases ( phenopacket_path ) self . _insert_diseases ( phenopacket_path , diseases ) self . conn . close ()","title":"parse_corpus"},{"location":"api/pheval/analyse/prioritisation_result_types/","text":"DiseasePrioritisationResult dataclass Store rank data for known diseases. Attributes: Name Type Description phenopacket_path Path Path to the phenopacket. disease ProbandDisease The proband disease. rank int The assigned rank for the disease. Defaults to 0. Source code in src/pheval/analyse/prioritisation_result_types.py 39 40 41 42 43 44 45 46 47 48 49 50 51 52 @dataclass class DiseasePrioritisationResult : \"\"\" Store rank data for known diseases. Attributes: phenopacket_path (Path): Path to the phenopacket. disease (ProbandDisease): The proband disease. rank (int): The assigned rank for the disease. Defaults to 0. \"\"\" phenopacket_path : Path disease : ProbandDisease rank : int = 0 GenePrioritisationResult dataclass Store rank data for causative genes. Attributes: Name Type Description phenopacket_path Path Path to the phenopacket. gene str The causative gene. rank int The assigned rank for the gene. Defaults to 0. Source code in src/pheval/analyse/prioritisation_result_types.py 7 8 9 10 11 12 13 14 15 16 17 18 19 20 @dataclass class GenePrioritisationResult : \"\"\" Store rank data for causative genes. Attributes: phenopacket_path (Path): Path to the phenopacket. gene (str): The causative gene. rank (int): The assigned rank for the gene. Defaults to 0. 
\"\"\" phenopacket_path : Path gene : str rank : int = 0 VariantPrioritisationResult dataclass Store rank data for variants. Attributes: Name Type Description phenopacket_path Path Path to the phenopacket. variant GenomicVariant The genomic variant. rank int The assigned rank for the variant. Defaults to 0. Source code in src/pheval/analyse/prioritisation_result_types.py 23 24 25 26 27 28 29 30 31 32 33 34 35 36 @dataclass class VariantPrioritisationResult : \"\"\" Store rank data for variants. Attributes: phenopacket_path (Path): Path to the phenopacket. variant (GenomicVariant): The genomic variant. rank (int): The assigned rank for the variant. Defaults to 0. \"\"\" phenopacket_path : Path variant : GenomicVariant rank : int = 0","title":"Prioritisation result types"},{"location":"api/pheval/analyse/prioritisation_result_types/#src.pheval.analyse.prioritisation_result_types.DiseasePrioritisationResult","text":"Store rank data for known diseases. Attributes: Name Type Description phenopacket_path Path Path to the phenopacket. disease ProbandDisease The proband disease. rank int The assigned rank for the disease. Defaults to 0. Source code in src/pheval/analyse/prioritisation_result_types.py 39 40 41 42 43 44 45 46 47 48 49 50 51 52 @dataclass class DiseasePrioritisationResult : \"\"\" Store rank data for known diseases. Attributes: phenopacket_path (Path): Path to the phenopacket. disease (ProbandDisease): The proband disease. rank (int): The assigned rank for the disease. Defaults to 0. \"\"\" phenopacket_path : Path disease : ProbandDisease rank : int = 0","title":"DiseasePrioritisationResult"},{"location":"api/pheval/analyse/prioritisation_result_types/#src.pheval.analyse.prioritisation_result_types.GenePrioritisationResult","text":"Store rank data for causative genes. Attributes: Name Type Description phenopacket_path Path Path to the phenopacket. gene str The causative gene. rank int The assigned rank for the gene. Defaults to 0. 
Source code in src/pheval/analyse/prioritisation_result_types.py 7 8 9 10 11 12 13 14 15 16 17 18 19 20 @dataclass class GenePrioritisationResult : \"\"\" Store rank data for causative genes. Attributes: phenopacket_path (Path): Path to the phenopacket. gene (str): The causative gene. rank (int): The assigned rank for the gene. Defaults to 0. \"\"\" phenopacket_path : Path gene : str rank : int = 0","title":"GenePrioritisationResult"},{"location":"api/pheval/analyse/prioritisation_result_types/#src.pheval.analyse.prioritisation_result_types.VariantPrioritisationResult","text":"Store rank data for variants. Attributes: Name Type Description phenopacket_path Path Path to the phenopacket. variant GenomicVariant The genomic variant. rank int The assigned rank for the variant. Defaults to 0. Source code in src/pheval/analyse/prioritisation_result_types.py 23 24 25 26 27 28 29 30 31 32 33 34 35 36 @dataclass class VariantPrioritisationResult : \"\"\" Store rank data for variants. Attributes: phenopacket_path (Path): Path to the phenopacket. variant (GenomicVariant): The genomic variant. rank (int): The assigned rank for the variant. Defaults to 0. \"\"\" phenopacket_path : Path variant : GenomicVariant rank : int = 0","title":"VariantPrioritisationResult"},{"location":"api/pheval/analyse/rank_stats/","text":"RankStats dataclass Store statistics related to ranking. Attributes: Name Type Description top int Count of top-ranked matches. top3 int Count of matches within the top 3 ranks. top5 int Count of matches within the top 5 ranks. top10 int Count of matches within the top 10 ranks. found int Count of found matches. total int Total count of matches. reciprocal_ranks List [ float ] List of reciprocal ranks. relevant_ranks List [ List [ int ]] Nested list of ranks for the known entities for all cases in a run. mrr float Mean Reciprocal Rank (MRR). Defaults to None. 
Source code in src/pheval/analyse/rank_stats.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 @dataclass class RankStats : \"\"\"Store statistics related to ranking. Attributes: top (int): Count of top-ranked matches. top3 (int): Count of matches within the top 3 ranks. top5 (int): Count of matches within the top 5 ranks. top10 (int): Count of matches within the top 10 ranks. found (int): Count of found matches. total (int): Total count of matches. reciprocal_ranks (List[float]): List of reciprocal ranks. relevant_ranks List[List[int]]: Nested list of ranks for the known entities for all cases in a run. mrr (float): Mean Reciprocal Rank (MRR). Defaults to None. 
\"\"\" top : int = 0 top3 : int = 0 top5 : int = 0 top10 : int = 0 found : int = 0 total : int = 0 reciprocal_ranks : List = field ( default_factory = list ) relevant_result_ranks : List [ List [ int ]] = field ( default_factory = list ) mrr : float = None def add_ranks ( self , benchmark_name : str , table_name : str , column_name : str ) -> None : \"\"\" Add ranks to RankStats instance from table. Args: table_name (str): Name of the table to add ranks from. column_name (str): Name of the column to add ranks from.: \"\"\" conn = BenchmarkDBManager ( benchmark_name ) . conn self . top = self . _execute_count_query ( conn , table_name , column_name , \" = 1\" ) self . top3 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 3\" ) self . top5 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 5\" ) self . top10 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 10\" ) self . found = self . _execute_count_query ( conn , table_name , column_name , \" > 0\" ) self . total = self . _execute_count_query ( conn , table_name , column_name , \" >= 0\" ) self . reciprocal_ranks = self . _fetch_reciprocal_ranks ( conn , table_name , column_name ) self . relevant_result_ranks = self . _fetch_relevant_ranks ( conn , table_name , column_name ) conn . close () @staticmethod def _execute_count_query ( conn : DuckDBPyConnection , table_name : str , column_name : str , condition : str ) -> int : \"\"\" Execute count query on table. Args: conn (DuckDBPyConnection): Connection to the database. table_name (str): Name of the table to execute count query on. column_name (str): Name of the column to execute count query on. condition (str): Condition to execute count query. Returns: int: Count query result. \"\"\" query = f 'SELECT COUNT(*) FROM { table_name } WHERE \" { column_name } \" { condition } ' return conn . execute ( query ) . 
fetchone ()[ 0 ] @staticmethod def _fetch_reciprocal_ranks ( conn : DuckDBPyConnection , table_name : str , column_name : str ) -> List [ float ]: \"\"\" Fetch reciprocal ranks from table. Args: conn (DuckDBPyConnection): Connection to the database. table_name (str): Name of the table to fetch reciprocal ranks from. column_name (str): Name of the column to fetch reciprocal ranks from. Returns: List[float]: List of reciprocal ranks. \"\"\" query = f 'SELECT \" { column_name } \" FROM { table_name } ' return [ 1 / rank [ 0 ] if rank [ 0 ] > 0 else 0 for rank in conn . execute ( query ) . fetchall ()] @staticmethod def _fetch_relevant_ranks ( conn : DuckDBPyConnection , table_name : str , column_name : str ) -> List [ List [ int ]]: \"\"\" Fetch relevant ranks from table. Args: conn (DuckDBPyConnection): Connection to the database. table_name (str): Name of the table to fetch relevant ranks from. column_name (str): Name of the column to fetch relevant ranks from. Returns: List[List[int]]: List of relevant ranks. \"\"\" query = ( f 'SELECT LIST(\" { column_name } \") as values_list FROM { table_name } GROUP BY phenopacket' ) return [ rank [ 0 ] for rank in conn . execute ( query ) . fetchall ()] def percentage_rank ( self , value : int ) -> float : \"\"\" Calculate the percentage rank. Args: value (int): The value for which the percentage rank needs to be calculated. Returns: float: The calculated percentage rank based on the provided value and the total count. \"\"\" return 100 * value / self . total def percentage_top ( self ) -> float : \"\"\" Calculate the percentage of top matches. Returns: float: The percentage of top matches compared to the total count. \"\"\" return self . percentage_rank ( self . top ) def percentage_top3 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 3. Returns: float: The percentage of matches within the top 3 compared to the total count. \"\"\" return self . percentage_rank ( self . 
top3 ) def percentage_top5 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 5. Returns: float: The percentage of matches within the top 5 compared to the total count. \"\"\" return self . percentage_rank ( self . top5 ) def percentage_top10 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 10. Returns: float: The percentage of matches within the top 10 compared to the total count. \"\"\" return self . percentage_rank ( self . top10 ) def percentage_found ( self ) -> float : \"\"\" Calculate the percentage of matches found. Returns: float: The percentage of matches found compared to the total count. \"\"\" return self . percentage_rank ( self . found ) @staticmethod def percentage_difference ( percentage_value_1 : float , percentage_value_2 : float ) -> float : \"\"\" Calculate the percentage difference between two percentage values. Args: percentage_value_1 (float): The first percentage value. percentage_value_2 (float): The second percentage value. Returns: float: The difference between the two percentage values. \"\"\" return percentage_value_1 - percentage_value_2 def mean_reciprocal_rank ( self ) -> float : \"\"\" Calculate the Mean Reciprocal Rank (MRR) for the stored ranks. The Mean Reciprocal Rank is computed as the mean of the reciprocal ranks for the found cases. If the total number of cases differs from the number of found cases, this method extends the reciprocal ranks list with zeroes for missing cases. Returns: float: The calculated Mean Reciprocal Rank. \"\"\" if len ( self . reciprocal_ranks ) != self . total : missing_cases = self . total - self . found self . reciprocal_ranks . extend ([ 0 ] * missing_cases ) return mean ( self . reciprocal_ranks ) return mean ( self . reciprocal_ranks ) def return_mean_reciprocal_rank ( self ) -> float : \"\"\" Retrieve or calculate the Mean Reciprocal Rank (MRR). 
If a pre-calculated MRR value exists (stored in the 'mrr' attribute), this method returns that value. Otherwise, it computes the Mean Reciprocal Rank using the 'mean_reciprocal_rank' method. Returns: float: The Mean Reciprocal Rank value. \"\"\" if self . mrr is not None : return self . mrr else : return self . mean_reciprocal_rank () def precision_at_k ( self , k : int ) -> float : \"\"\" Calculate the precision at k. Precision at k is the ratio of relevant items in the top-k predictions to the total number of predictions. It measures the accuracy of the top-k predictions made by a model. Args: k (int): The number of top predictions to consider. Returns: float: The precision at k, ranging from 0.0 to 1.0. A higher precision indicates a better performance in identifying relevant items in the top-k predictions. \"\"\" k_attr = getattr ( self , f \"top { k } \" ) if k > 1 else self . top return k_attr / ( self . total * k ) @staticmethod def _average_precision_at_k ( number_of_relevant_entities_at_k : int , precision_at_k : float ) -> float : \"\"\" Calculate the Average Precision at k. Average Precision at k (AP@k) is a metric used to evaluate the precision of a ranked retrieval system. It measures the precision at each relevant position up to k and takes the average. Args: number_of_relevant_entities_at_k (int): The count of relevant entities in the top-k predictions. precision_at_k (float): The precision at k - the sum of the precision values at each relevant position. Returns: float: The Average Precision at k, ranging from 0.0 to 1.0. A higher value indicates better precision in the top-k predictions. \"\"\" return ( ( 1 / number_of_relevant_entities_at_k ) * precision_at_k if number_of_relevant_entities_at_k > 0 else 0.0 ) def mean_average_precision_at_k ( self , k : int ) -> float : \"\"\" Calculate the Mean Average Precision at k. Mean Average Precision at k (MAP@k) is a performance metric for ranked data. 
It calculates the average precision at k for each result rank and then takes the mean across all queries. Args: k (int): The number of top predictions to consider for precision calculation. Returns: float: The Mean Average Precision at k, ranging from 0.0 to 1.0. A higher value indicates better performance in ranking relevant entities higher in the predictions. \"\"\" cumulative_average_precision_scores = 0 for result_ranks in self . relevant_result_ranks : precision_at_k , number_of_relevant_entities_at_k = 0 , 0 for rank in result_ranks : if 0 < rank <= k : number_of_relevant_entities_at_k += 1 precision_at_k += number_of_relevant_entities_at_k / rank cumulative_average_precision_scores += self . _average_precision_at_k ( number_of_relevant_entities_at_k , precision_at_k ) return ( 1 / self . total ) * cumulative_average_precision_scores def f_beta_score_at_k ( self , percentage_at_k : float , k : int ) -> float : \"\"\" Calculate the F-beta score at k. The F-beta score is a metric that combines precision and recall, with beta controlling the emphasis on precision. The Beta value is set to the value of 1 to allow for equal weighting for both precision and recall. This method computes the F-beta score at a specific percentage threshold within the top-k predictions. Args: percentage_at_k (float): The percentage of true positive predictions within the top-k. k (int): The number of top predictions to consider. Returns: float: The F-beta score at k, ranging from 0.0 to 1.0. A higher score indicates better trade-off between precision and recall. \"\"\" precision = self . precision_at_k ( k ) recall_at_k = percentage_at_k / 100 return ( ( 2 * precision * recall_at_k ) / ( precision + recall_at_k ) if ( precision + recall_at_k ) > 0 else 0 ) def mean_normalised_discounted_cumulative_gain ( self , k : int ) -> float : \"\"\" Calculate the mean Normalised Discounted Cumulative Gain (NDCG) for a given rank cutoff. 
NDCG measures the effectiveness of a ranking by considering both the relevance and the order of items. Args: k (int): The rank cutoff for calculating NDCG. Returns: float: The mean NDCG score across all query results. \"\"\" ndcg_scores = [] for result_ranks in self . relevant_result_ranks : result_ranks = [ rank for rank in result_ranks if rank <= k ] result_ranks = [ 3 if i in result_ranks else 0 for i in range ( k )] ideal_ranking = sorted ( result_ranks , reverse = True ) ndcg_scores . append ( ndcg_score ( np . asarray ([ ideal_ranking ]), np . asarray ([ result_ranks ]))) return np . mean ( ndcg_scores ) add_ranks ( benchmark_name , table_name , column_name ) Add ranks to RankStats instance from table. Args: table_name (str): Name of the table to add ranks from. column_name (str): Name of the column to add ranks from.: Source code in src/pheval/analyse/rank_stats.py 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 def add_ranks ( self , benchmark_name : str , table_name : str , column_name : str ) -> None : \"\"\" Add ranks to RankStats instance from table. Args: table_name (str): Name of the table to add ranks from. column_name (str): Name of the column to add ranks from.: \"\"\" conn = BenchmarkDBManager ( benchmark_name ) . conn self . top = self . _execute_count_query ( conn , table_name , column_name , \" = 1\" ) self . top3 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 3\" ) self . top5 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 5\" ) self . top10 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 10\" ) self . found = self . _execute_count_query ( conn , table_name , column_name , \" > 0\" ) self . total = self . _execute_count_query ( conn , table_name , column_name , \" >= 0\" ) self . reciprocal_ranks = self . _fetch_reciprocal_ranks ( conn , table_name , column_name ) self . relevant_result_ranks = self . 
_fetch_relevant_ranks ( conn , table_name , column_name ) conn . close () f_beta_score_at_k ( percentage_at_k , k ) Calculate the F-beta score at k. The F-beta score is a metric that combines precision and recall, with beta controlling the emphasis on precision. The Beta value is set to the value of 1 to allow for equal weighting for both precision and recall. This method computes the F-beta score at a specific percentage threshold within the top-k predictions. Parameters: Name Type Description Default percentage_at_k float The percentage of true positive predictions within the top-k. required k int The number of top predictions to consider. required Returns: Name Type Description float float The F-beta score at k, ranging from 0.0 to 1.0. A higher score indicates better trade-off between precision and recall. Source code in src/pheval/analyse/rank_stats.py 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 def f_beta_score_at_k ( self , percentage_at_k : float , k : int ) -> float : \"\"\" Calculate the F-beta score at k. The F-beta score is a metric that combines precision and recall, with beta controlling the emphasis on precision. The Beta value is set to the value of 1 to allow for equal weighting for both precision and recall. This method computes the F-beta score at a specific percentage threshold within the top-k predictions. Args: percentage_at_k (float): The percentage of true positive predictions within the top-k. k (int): The number of top predictions to consider. Returns: float: The F-beta score at k, ranging from 0.0 to 1.0. A higher score indicates better trade-off between precision and recall. \"\"\" precision = self . precision_at_k ( k ) recall_at_k = percentage_at_k / 100 return ( ( 2 * precision * recall_at_k ) / ( precision + recall_at_k ) if ( precision + recall_at_k ) > 0 else 0 ) mean_average_precision_at_k ( k ) Calculate the Mean Average Precision at k. 
Mean Average Precision at k (MAP@k) is a performance metric for ranked data. It calculates the average precision at k for each result rank and then takes the mean across all queries. Parameters: Name Type Description Default k int The number of top predictions to consider for precision calculation. required Returns: Name Type Description float float The Mean Average Precision at k, ranging from 0.0 to 1.0. A higher value indicates better performance in ranking relevant entities higher in the predictions. Source code in src/pheval/analyse/rank_stats.py 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 def mean_average_precision_at_k ( self , k : int ) -> float : \"\"\" Calculate the Mean Average Precision at k. Mean Average Precision at k (MAP@k) is a performance metric for ranked data. It calculates the average precision at k for each result rank and then takes the mean across all queries. Args: k (int): The number of top predictions to consider for precision calculation. Returns: float: The Mean Average Precision at k, ranging from 0.0 to 1.0. A higher value indicates better performance in ranking relevant entities higher in the predictions. \"\"\" cumulative_average_precision_scores = 0 for result_ranks in self . relevant_result_ranks : precision_at_k , number_of_relevant_entities_at_k = 0 , 0 for rank in result_ranks : if 0 < rank <= k : number_of_relevant_entities_at_k += 1 precision_at_k += number_of_relevant_entities_at_k / rank cumulative_average_precision_scores += self . _average_precision_at_k ( number_of_relevant_entities_at_k , precision_at_k ) return ( 1 / self . total ) * cumulative_average_precision_scores mean_normalised_discounted_cumulative_gain ( k ) Calculate the mean Normalised Discounted Cumulative Gain (NDCG) for a given rank cutoff. NDCG measures the effectiveness of a ranking by considering both the relevance and the order of items. 
Parameters: Name Type Description Default k int The rank cutoff for calculating NDCG. required Returns: Name Type Description float float The mean NDCG score across all query results. Source code in src/pheval/analyse/rank_stats.py 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 def mean_normalised_discounted_cumulative_gain ( self , k : int ) -> float : \"\"\" Calculate the mean Normalised Discounted Cumulative Gain (NDCG) for a given rank cutoff. NDCG measures the effectiveness of a ranking by considering both the relevance and the order of items. Args: k (int): The rank cutoff for calculating NDCG. Returns: float: The mean NDCG score across all query results. \"\"\" ndcg_scores = [] for result_ranks in self . relevant_result_ranks : result_ranks = [ rank for rank in result_ranks if rank <= k ] result_ranks = [ 3 if i in result_ranks else 0 for i in range ( k )] ideal_ranking = sorted ( result_ranks , reverse = True ) ndcg_scores . append ( ndcg_score ( np . asarray ([ ideal_ranking ]), np . asarray ([ result_ranks ]))) return np . mean ( ndcg_scores ) mean_reciprocal_rank () Calculate the Mean Reciprocal Rank (MRR) for the stored ranks. The Mean Reciprocal Rank is computed as the mean of the reciprocal ranks for the found cases. If the total number of cases differs from the number of found cases, this method extends the reciprocal ranks list with zeroes for missing cases. Returns: Name Type Description float float The calculated Mean Reciprocal Rank. Source code in src/pheval/analyse/rank_stats.py 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 def mean_reciprocal_rank ( self ) -> float : \"\"\" Calculate the Mean Reciprocal Rank (MRR) for the stored ranks. The Mean Reciprocal Rank is computed as the mean of the reciprocal ranks for the found cases. If the total number of cases differs from the number of found cases, this method extends the reciprocal ranks list with zeroes for missing cases. 
Returns: float: The calculated Mean Reciprocal Rank. \"\"\" if len ( self . reciprocal_ranks ) != self . total : missing_cases = self . total - self . found self . reciprocal_ranks . extend ([ 0 ] * missing_cases ) return mean ( self . reciprocal_ranks ) return mean ( self . reciprocal_ranks ) percentage_difference ( percentage_value_1 , percentage_value_2 ) staticmethod Calculate the percentage difference between two percentage values. Parameters: Name Type Description Default percentage_value_1 float The first percentage value. required percentage_value_2 float The second percentage value. required Returns: Name Type Description float float The difference between the two percentage values. Source code in src/pheval/analyse/rank_stats.py 167 168 169 170 171 172 173 174 175 176 177 178 179 @staticmethod def percentage_difference ( percentage_value_1 : float , percentage_value_2 : float ) -> float : \"\"\" Calculate the percentage difference between two percentage values. Args: percentage_value_1 (float): The first percentage value. percentage_value_2 (float): The second percentage value. Returns: float: The difference between the two percentage values. \"\"\" return percentage_value_1 - percentage_value_2 percentage_found () Calculate the percentage of matches found. Returns: Name Type Description float float The percentage of matches found compared to the total count. Source code in src/pheval/analyse/rank_stats.py 158 159 160 161 162 163 164 165 def percentage_found ( self ) -> float : \"\"\" Calculate the percentage of matches found. Returns: float: The percentage of matches found compared to the total count. \"\"\" return self . percentage_rank ( self . found ) percentage_rank ( value ) Calculate the percentage rank. Parameters: Name Type Description Default value int The value for which the percentage rank needs to be calculated. required Returns: Name Type Description float float The calculated percentage rank based on the provided value and the total count. 
Source code in src/pheval/analyse/rank_stats.py 110 111 112 113 114 115 116 117 118 119 120 def percentage_rank ( self , value : int ) -> float : \"\"\" Calculate the percentage rank. Args: value (int): The value for which the percentage rank needs to be calculated. Returns: float: The calculated percentage rank based on the provided value and the total count. \"\"\" return 100 * value / self . total percentage_top () Calculate the percentage of top matches. Returns: Name Type Description float float The percentage of top matches compared to the total count. Source code in src/pheval/analyse/rank_stats.py 122 123 124 125 126 127 128 129 def percentage_top ( self ) -> float : \"\"\" Calculate the percentage of top matches. Returns: float: The percentage of top matches compared to the total count. \"\"\" return self . percentage_rank ( self . top ) percentage_top10 () Calculate the percentage of matches within the top 10. Returns: Name Type Description float float The percentage of matches within the top 10 compared to the total count. Source code in src/pheval/analyse/rank_stats.py 149 150 151 152 153 154 155 156 def percentage_top10 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 10. Returns: float: The percentage of matches within the top 10 compared to the total count. \"\"\" return self . percentage_rank ( self . top10 ) percentage_top3 () Calculate the percentage of matches within the top 3. Returns: Name Type Description float float The percentage of matches within the top 3 compared to the total count. Source code in src/pheval/analyse/rank_stats.py 131 132 133 134 135 136 137 138 def percentage_top3 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 3. Returns: float: The percentage of matches within the top 3 compared to the total count. \"\"\" return self . percentage_rank ( self . top3 ) percentage_top5 () Calculate the percentage of matches within the top 5. 
Returns: Name Type Description float float The percentage of matches within the top 5 compared to the total count. Source code in src/pheval/analyse/rank_stats.py 140 141 142 143 144 145 146 147 def percentage_top5 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 5. Returns: float: The percentage of matches within the top 5 compared to the total count. \"\"\" return self . percentage_rank ( self . top5 ) precision_at_k ( k ) Calculate the precision at k. Precision at k is the ratio of relevant items in the top-k predictions to the total number of predictions. It measures the accuracy of the top-k predictions made by a model. Parameters: Name Type Description Default k int The number of top predictions to consider. required Returns: Name Type Description float float The precision at k, ranging from 0.0 to 1.0. float A higher precision indicates a better performance in identifying relevant items in the top-k predictions. Source code in src/pheval/analyse/rank_stats.py 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 def precision_at_k ( self , k : int ) -> float : \"\"\" Calculate the precision at k. Precision at k is the ratio of relevant items in the top-k predictions to the total number of predictions. It measures the accuracy of the top-k predictions made by a model. Args: k (int): The number of top predictions to consider. Returns: float: The precision at k, ranging from 0.0 to 1.0. A higher precision indicates a better performance in identifying relevant items in the top-k predictions. \"\"\" k_attr = getattr ( self , f \"top { k } \" ) if k > 1 else self . top return k_attr / ( self . total * k ) return_mean_reciprocal_rank () Retrieve or calculate the Mean Reciprocal Rank (MRR). If a pre-calculated MRR value exists (stored in the 'mrr' attribute), this method returns that value. Otherwise, it computes the Mean Reciprocal Rank using the 'mean_reciprocal_rank' method. 
Returns: Name Type Description float float The Mean Reciprocal Rank value. Source code in src/pheval/analyse/rank_stats.py 200 201 202 203 204 205 206 207 208 209 210 211 212 213 def return_mean_reciprocal_rank ( self ) -> float : \"\"\" Retrieve or calculate the Mean Reciprocal Rank (MRR). If a pre-calculated MRR value exists (stored in the 'mrr' attribute), this method returns that value. Otherwise, it computes the Mean Reciprocal Rank using the 'mean_reciprocal_rank' method. Returns: float: The Mean Reciprocal Rank value. \"\"\" if self . mrr is not None : return self . mrr else : return self . mean_reciprocal_rank () RankStatsWriter Class for writing the rank stats to a file. Source code in src/pheval/analyse/rank_stats.py 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 class RankStatsWriter : \"\"\"Class for writing the rank stats to a file.\"\"\" def __init__ ( self , benchmark_name : str , table_name : str ): \"\"\" Initialise the RankStatsWriter class Args: table_name (str): Name of table to add statistics. \"\"\" self . table_name = table_name self . benchmark_name = benchmark_name conn = BenchmarkDBManager ( benchmark_name ) . conn conn . execute ( f 'CREATE TABLE IF NOT EXISTS \" { self . 
table_name } \" (' f \"results_directory_path VARCHAR,\" f \"top INT,\" f \"top3 INT,\" f \"top5 INT,\" f \"top10 INT,\" f '\"found\" INT,' f \"total INT,\" f \"mean_reciprocal_rank FLOAT,\" f \"percentage_top FLOAT,\" f \"percentage_top3 FLOAT,\" f \"percentage_top5 FLOAT,\" f \"percentage_top10 FLOAT,\" f \"percentage_found FLOAT,\" f '\"precision@1\" FLOAT,' f '\"precision@3\" FLOAT,' f '\"precision@5\" FLOAT,' f '\"precision@10\" FLOAT,' f '\"MAP@1\" FLOAT,' f '\"MAP@3\" FLOAT,' f '\"MAP@5\" FLOAT,' f '\"MAP@10\" FLOAT,' f '\"f_beta_score@1\" FLOAT,' f '\"f_beta_score@3\"FLOAT,' f '\"f_beta_score@5\" FLOAT,' f '\"f_beta_score@10\" FLOAT,' f '\"NDCG@3\" FLOAT,' f '\"NDCG@5\" FLOAT,' f '\"NDCG@10\" FLOAT,' f \"true_positives INT,\" f \"false_positives INT,\" f \"true_negatives INT,\" f \"false_negatives INT,\" f \"sensitivity FLOAT,\" f \"specificity FLOAT,\" f '\"precision\" FLOAT,' f \"negative_predictive_value FLOAT,\" f \"false_positive_rate FLOAT,\" f \"false_discovery_rate FLOAT,\" f \"false_negative_rate FLOAT,\" f \"accuracy FLOAT,\" f \"f1_score FLOAT,\" f \"matthews_correlation_coefficient FLOAT, )\" ) conn . close () def add_statistics_entry ( self , run_identifier : str , rank_stats : RankStats , binary_classification : BinaryClassificationStats , ): \"\"\" Add statistics row to table for a run. Args: run_identifier (str): The run identifier. rank_stats (RankStats): RankStats object for the run. binary_classification (BinaryClassificationStats): BinaryClassificationStats object for the run. \"\"\" conn = BenchmarkDBManager ( self . benchmark_name ) . conn conn . execute ( f ' INSERT INTO \" { self . table_name } \" VALUES ( ' f \"' { run_identifier } ',\" f \" { rank_stats . top } ,\" f \" { rank_stats . top3 } ,\" f \" { rank_stats . top5 } ,\" f \" { rank_stats . top10 } ,\" f \" { rank_stats . found } ,\" f \" { rank_stats . total } ,\" f \" { rank_stats . mean_reciprocal_rank () } ,\" f \" { rank_stats . percentage_top () } ,\" f \" { rank_stats . 
percentage_top3 () } ,\" f \" { rank_stats . percentage_top5 () } ,\" f \" { rank_stats . percentage_top10 () } ,\" f \" { rank_stats . percentage_found () } ,\" f \" { rank_stats . precision_at_k ( 1 ) } ,\" f \" { rank_stats . precision_at_k ( 3 ) } ,\" f \" { rank_stats . precision_at_k ( 5 ) } ,\" f \" { rank_stats . precision_at_k ( 10 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 1 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 3 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 5 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 10 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 1 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 3 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 5 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 10 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 3 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 5 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 10 ) } ,\" f \" { binary_classification . true_positives } ,\" f \" { binary_classification . false_positives } ,\" f \" { binary_classification . true_negatives } ,\" f \" { binary_classification . false_negatives } ,\" f \" { binary_classification . sensitivity () } ,\" f \" { binary_classification . specificity () } ,\" f \" { binary_classification . precision () } ,\" f \" { binary_classification . negative_predictive_value () } ,\" f \" { binary_classification . false_positive_rate () } ,\" f \" { binary_classification . false_discovery_rate () } ,\" f \" { binary_classification . false_negative_rate () } ,\" f \" { binary_classification . accuracy () } ,\" f \" { binary_classification . f1_score () } ,\" f \" { binary_classification . matthews_correlation_coefficient () } )\" ) conn . 
close () __init__ ( benchmark_name , table_name ) Initialise the RankStatsWriter class Args: table_name (str): Name of table to add statistics. Source code in src/pheval/analyse/rank_stats.py 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 def __init__ ( self , benchmark_name : str , table_name : str ): \"\"\" Initialise the RankStatsWriter class Args: table_name (str): Name of table to add statistics. \"\"\" self . table_name = table_name self . benchmark_name = benchmark_name conn = BenchmarkDBManager ( benchmark_name ) . conn conn . execute ( f 'CREATE TABLE IF NOT EXISTS \" { self . table_name } \" (' f \"results_directory_path VARCHAR,\" f \"top INT,\" f \"top3 INT,\" f \"top5 INT,\" f \"top10 INT,\" f '\"found\" INT,' f \"total INT,\" f \"mean_reciprocal_rank FLOAT,\" f \"percentage_top FLOAT,\" f \"percentage_top3 FLOAT,\" f \"percentage_top5 FLOAT,\" f \"percentage_top10 FLOAT,\" f \"percentage_found FLOAT,\" f '\"precision@1\" FLOAT,' f '\"precision@3\" FLOAT,' f '\"precision@5\" FLOAT,' f '\"precision@10\" FLOAT,' f '\"MAP@1\" FLOAT,' f '\"MAP@3\" FLOAT,' f '\"MAP@5\" FLOAT,' f '\"MAP@10\" FLOAT,' f '\"f_beta_score@1\" FLOAT,' f '\"f_beta_score@3\"FLOAT,' f '\"f_beta_score@5\" FLOAT,' f '\"f_beta_score@10\" FLOAT,' f '\"NDCG@3\" FLOAT,' f '\"NDCG@5\" FLOAT,' f '\"NDCG@10\" FLOAT,' f \"true_positives INT,\" f \"false_positives INT,\" f \"true_negatives INT,\" f \"false_negatives INT,\" f \"sensitivity FLOAT,\" f \"specificity FLOAT,\" f '\"precision\" FLOAT,' f \"negative_predictive_value FLOAT,\" f \"false_positive_rate FLOAT,\" f \"false_discovery_rate FLOAT,\" f \"false_negative_rate FLOAT,\" f \"accuracy FLOAT,\" f \"f1_score FLOAT,\" f \"matthews_correlation_coefficient FLOAT, )\" ) conn . 
close () add_statistics_entry ( run_identifier , rank_stats , binary_classification ) Add statistics row to table for a run. Args: run_identifier (str): The run identifier. rank_stats (RankStats): RankStats object for the run. binary_classification (BinaryClassificationStats): BinaryClassificationStats object for the run. Source code in src/pheval/analyse/rank_stats.py 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 def add_statistics_entry ( self , run_identifier : str , rank_stats : RankStats , binary_classification : BinaryClassificationStats , ): \"\"\" Add statistics row to table for a run. Args: run_identifier (str): The run identifier. rank_stats (RankStats): RankStats object for the run. binary_classification (BinaryClassificationStats): BinaryClassificationStats object for the run. \"\"\" conn = BenchmarkDBManager ( self . benchmark_name ) . conn conn . execute ( f ' INSERT INTO \" { self . table_name } \" VALUES ( ' f \"' { run_identifier } ',\" f \" { rank_stats . top } ,\" f \" { rank_stats . top3 } ,\" f \" { rank_stats . top5 } ,\" f \" { rank_stats . top10 } ,\" f \" { rank_stats . found } ,\" f \" { rank_stats . total } ,\" f \" { rank_stats . mean_reciprocal_rank () } ,\" f \" { rank_stats . percentage_top () } ,\" f \" { rank_stats . percentage_top3 () } ,\" f \" { rank_stats . percentage_top5 () } ,\" f \" { rank_stats . percentage_top10 () } ,\" f \" { rank_stats . percentage_found () } ,\" f \" { rank_stats . precision_at_k ( 1 ) } ,\" f \" { rank_stats . precision_at_k ( 3 ) } ,\" f \" { rank_stats . precision_at_k ( 5 ) } ,\" f \" { rank_stats . precision_at_k ( 10 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 1 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 3 ) } ,\" f \" { rank_stats . 
mean_average_precision_at_k ( 5 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 10 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 1 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 3 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 5 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 10 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 3 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 5 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 10 ) } ,\" f \" { binary_classification . true_positives } ,\" f \" { binary_classification . false_positives } ,\" f \" { binary_classification . true_negatives } ,\" f \" { binary_classification . false_negatives } ,\" f \" { binary_classification . sensitivity () } ,\" f \" { binary_classification . specificity () } ,\" f \" { binary_classification . precision () } ,\" f \" { binary_classification . negative_predictive_value () } ,\" f \" { binary_classification . false_positive_rate () } ,\" f \" { binary_classification . false_discovery_rate () } ,\" f \" { binary_classification . false_negative_rate () } ,\" f \" { binary_classification . accuracy () } ,\" f \" { binary_classification . f1_score () } ,\" f \" { binary_classification . matthews_correlation_coefficient () } )\" ) conn . close ()","title":"Rank stats"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats","text":"Store statistics related to ranking. Attributes: Name Type Description top int Count of top-ranked matches. top3 int Count of matches within the top 3 ranks. top5 int Count of matches within the top 5 ranks. top10 int Count of matches within the top 10 ranks. found int Count of found matches. total int Total count of matches. reciprocal_ranks List [ float ] List of reciprocal ranks. 
relevant_ranks List [ List [ int ]] Nested list of ranks for the known entities for all cases in a run. mrr float Mean Reciprocal Rank (MRR). Defaults to None. Source code in src/pheval/analyse/rank_stats.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 @dataclass class RankStats : \"\"\"Store statistics related to ranking. Attributes: top (int): Count of top-ranked matches. top3 (int): Count of matches within the top 3 ranks. top5 (int): Count of matches within the top 5 ranks. top10 (int): Count of matches within the top 10 ranks. found (int): Count of found matches. total (int): Total count of matches. reciprocal_ranks (List[float]): List of reciprocal ranks. relevant_ranks List[List[int]]: Nested list of ranks for the known entities for all cases in a run. mrr (float): Mean Reciprocal Rank (MRR). Defaults to None. 
\"\"\" top : int = 0 top3 : int = 0 top5 : int = 0 top10 : int = 0 found : int = 0 total : int = 0 reciprocal_ranks : List = field ( default_factory = list ) relevant_result_ranks : List [ List [ int ]] = field ( default_factory = list ) mrr : float = None def add_ranks ( self , benchmark_name : str , table_name : str , column_name : str ) -> None : \"\"\" Add ranks to RankStats instance from table. Args: table_name (str): Name of the table to add ranks from. column_name (str): Name of the column to add ranks from.: \"\"\" conn = BenchmarkDBManager ( benchmark_name ) . conn self . top = self . _execute_count_query ( conn , table_name , column_name , \" = 1\" ) self . top3 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 3\" ) self . top5 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 5\" ) self . top10 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 10\" ) self . found = self . _execute_count_query ( conn , table_name , column_name , \" > 0\" ) self . total = self . _execute_count_query ( conn , table_name , column_name , \" >= 0\" ) self . reciprocal_ranks = self . _fetch_reciprocal_ranks ( conn , table_name , column_name ) self . relevant_result_ranks = self . _fetch_relevant_ranks ( conn , table_name , column_name ) conn . close () @staticmethod def _execute_count_query ( conn : DuckDBPyConnection , table_name : str , column_name : str , condition : str ) -> int : \"\"\" Execute count query on table. Args: conn (DuckDBPyConnection): Connection to the database. table_name (str): Name of the table to execute count query on. column_name (str): Name of the column to execute count query on. condition (str): Condition to execute count query. Returns: int: Count query result. \"\"\" query = f 'SELECT COUNT(*) FROM { table_name } WHERE \" { column_name } \" { condition } ' return conn . execute ( query ) . 
fetchone ()[ 0 ] @staticmethod def _fetch_reciprocal_ranks ( conn : DuckDBPyConnection , table_name : str , column_name : str ) -> List [ float ]: \"\"\" Fetch reciprocal ranks from table. Args: conn (DuckDBPyConnection): Connection to the database. table_name (str): Name of the table to fetch reciprocal ranks from. column_name (str): Name of the column to fetch reciprocal ranks from. Returns: List[float]: List of reciprocal ranks. \"\"\" query = f 'SELECT \" { column_name } \" FROM { table_name } ' return [ 1 / rank [ 0 ] if rank [ 0 ] > 0 else 0 for rank in conn . execute ( query ) . fetchall ()] @staticmethod def _fetch_relevant_ranks ( conn : DuckDBPyConnection , table_name : str , column_name : str ) -> List [ List [ int ]]: \"\"\" Fetch relevant ranks from table. Args: conn (DuckDBPyConnection): Connection to the database. table_name (str): Name of the table to fetch relevant ranks from. column_name (str): Name of the column to fetch relevant ranks from. Returns: List[List[int]]: List of relevant ranks. \"\"\" query = ( f 'SELECT LIST(\" { column_name } \") as values_list FROM { table_name } GROUP BY phenopacket' ) return [ rank [ 0 ] for rank in conn . execute ( query ) . fetchall ()] def percentage_rank ( self , value : int ) -> float : \"\"\" Calculate the percentage rank. Args: value (int): The value for which the percentage rank needs to be calculated. Returns: float: The calculated percentage rank based on the provided value and the total count. \"\"\" return 100 * value / self . total def percentage_top ( self ) -> float : \"\"\" Calculate the percentage of top matches. Returns: float: The percentage of top matches compared to the total count. \"\"\" return self . percentage_rank ( self . top ) def percentage_top3 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 3. Returns: float: The percentage of matches within the top 3 compared to the total count. \"\"\" return self . percentage_rank ( self . 
top3 ) def percentage_top5 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 5. Returns: float: The percentage of matches within the top 5 compared to the total count. \"\"\" return self . percentage_rank ( self . top5 ) def percentage_top10 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 10. Returns: float: The percentage of matches within the top 10 compared to the total count. \"\"\" return self . percentage_rank ( self . top10 ) def percentage_found ( self ) -> float : \"\"\" Calculate the percentage of matches found. Returns: float: The percentage of matches found compared to the total count. \"\"\" return self . percentage_rank ( self . found ) @staticmethod def percentage_difference ( percentage_value_1 : float , percentage_value_2 : float ) -> float : \"\"\" Calculate the percentage difference between two percentage values. Args: percentage_value_1 (float): The first percentage value. percentage_value_2 (float): The second percentage value. Returns: float: The difference between the two percentage values. \"\"\" return percentage_value_1 - percentage_value_2 def mean_reciprocal_rank ( self ) -> float : \"\"\" Calculate the Mean Reciprocal Rank (MRR) for the stored ranks. The Mean Reciprocal Rank is computed as the mean of the reciprocal ranks for the found cases. If the total number of cases differs from the number of found cases, this method extends the reciprocal ranks list with zeroes for missing cases. Returns: float: The calculated Mean Reciprocal Rank. \"\"\" if len ( self . reciprocal_ranks ) != self . total : missing_cases = self . total - self . found self . reciprocal_ranks . extend ([ 0 ] * missing_cases ) return mean ( self . reciprocal_ranks ) return mean ( self . reciprocal_ranks ) def return_mean_reciprocal_rank ( self ) -> float : \"\"\" Retrieve or calculate the Mean Reciprocal Rank (MRR). 
If a pre-calculated MRR value exists (stored in the 'mrr' attribute), this method returns that value. Otherwise, it computes the Mean Reciprocal Rank using the 'mean_reciprocal_rank' method. Returns: float: The Mean Reciprocal Rank value. \"\"\" if self . mrr is not None : return self . mrr else : return self . mean_reciprocal_rank () def precision_at_k ( self , k : int ) -> float : \"\"\" Calculate the precision at k. Precision at k is the ratio of relevant items in the top-k predictions to the total number of predictions. It measures the accuracy of the top-k predictions made by a model. Args: k (int): The number of top predictions to consider. Returns: float: The precision at k, ranging from 0.0 to 1.0. A higher precision indicates a better performance in identifying relevant items in the top-k predictions. \"\"\" k_attr = getattr ( self , f \"top { k } \" ) if k > 1 else self . top return k_attr / ( self . total * k ) @staticmethod def _average_precision_at_k ( number_of_relevant_entities_at_k : int , precision_at_k : float ) -> float : \"\"\" Calculate the Average Precision at k. Average Precision at k (AP@k) is a metric used to evaluate the precision of a ranked retrieval system. It measures the precision at each relevant position up to k and takes the average. Args: number_of_relevant_entities_at_k (int): The count of relevant entities in the top-k predictions. precision_at_k (float): The precision at k - the sum of the precision values at each relevant position. Returns: float: The Average Precision at k, ranging from 0.0 to 1.0. A higher value indicates better precision in the top-k predictions. \"\"\" return ( ( 1 / number_of_relevant_entities_at_k ) * precision_at_k if number_of_relevant_entities_at_k > 0 else 0.0 ) def mean_average_precision_at_k ( self , k : int ) -> float : \"\"\" Calculate the Mean Average Precision at k. Mean Average Precision at k (MAP@k) is a performance metric for ranked data. 
It calculates the average precision at k for each result rank and then takes the mean across all queries. Args: k (int): The number of top predictions to consider for precision calculation. Returns: float: The Mean Average Precision at k, ranging from 0.0 to 1.0. A higher value indicates better performance in ranking relevant entities higher in the predictions. \"\"\" cumulative_average_precision_scores = 0 for result_ranks in self . relevant_result_ranks : precision_at_k , number_of_relevant_entities_at_k = 0 , 0 for rank in result_ranks : if 0 < rank <= k : number_of_relevant_entities_at_k += 1 precision_at_k += number_of_relevant_entities_at_k / rank cumulative_average_precision_scores += self . _average_precision_at_k ( number_of_relevant_entities_at_k , precision_at_k ) return ( 1 / self . total ) * cumulative_average_precision_scores def f_beta_score_at_k ( self , percentage_at_k : float , k : int ) -> float : \"\"\" Calculate the F-beta score at k. The F-beta score is a metric that combines precision and recall, with beta controlling the emphasis on precision. The Beta value is set to the value of 1 to allow for equal weighting for both precision and recall. This method computes the F-beta score at a specific percentage threshold within the top-k predictions. Args: percentage_at_k (float): The percentage of true positive predictions within the top-k. k (int): The number of top predictions to consider. Returns: float: The F-beta score at k, ranging from 0.0 to 1.0. A higher score indicates better trade-off between precision and recall. \"\"\" precision = self . precision_at_k ( k ) recall_at_k = percentage_at_k / 100 return ( ( 2 * precision * recall_at_k ) / ( precision + recall_at_k ) if ( precision + recall_at_k ) > 0 else 0 ) def mean_normalised_discounted_cumulative_gain ( self , k : int ) -> float : \"\"\" Calculate the mean Normalised Discounted Cumulative Gain (NDCG) for a given rank cutoff. 
NDCG measures the effectiveness of a ranking by considering both the relevance and the order of items. Args: k (int): The rank cutoff for calculating NDCG. Returns: float: The mean NDCG score across all query results. \"\"\" ndcg_scores = [] for result_ranks in self . relevant_result_ranks : result_ranks = [ rank for rank in result_ranks if rank <= k ] result_ranks = [ 3 if i in result_ranks else 0 for i in range ( k )] ideal_ranking = sorted ( result_ranks , reverse = True ) ndcg_scores . append ( ndcg_score ( np . asarray ([ ideal_ranking ]), np . asarray ([ result_ranks ]))) return np . mean ( ndcg_scores )","title":"RankStats"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.add_ranks","text":"Add ranks to RankStats instance from table. Args: table_name (str): Name of the table to add ranks from. column_name (str): Name of the column to add ranks from.: Source code in src/pheval/analyse/rank_stats.py 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 def add_ranks ( self , benchmark_name : str , table_name : str , column_name : str ) -> None : \"\"\" Add ranks to RankStats instance from table. Args: table_name (str): Name of the table to add ranks from. column_name (str): Name of the column to add ranks from.: \"\"\" conn = BenchmarkDBManager ( benchmark_name ) . conn self . top = self . _execute_count_query ( conn , table_name , column_name , \" = 1\" ) self . top3 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 3\" ) self . top5 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 5\" ) self . top10 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 10\" ) self . found = self . _execute_count_query ( conn , table_name , column_name , \" > 0\" ) self . total = self . _execute_count_query ( conn , table_name , column_name , \" >= 0\" ) self . reciprocal_ranks = self . 
_fetch_reciprocal_ranks ( conn , table_name , column_name ) self . relevant_result_ranks = self . _fetch_relevant_ranks ( conn , table_name , column_name ) conn . close ()","title":"add_ranks"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.f_beta_score_at_k","text":"Calculate the F-beta score at k. The F-beta score is a metric that combines precision and recall, with beta controlling the emphasis on precision. The Beta value is set to the value of 1 to allow for equal weighting for both precision and recall. This method computes the F-beta score at a specific percentage threshold within the top-k predictions. Parameters: Name Type Description Default percentage_at_k float The percentage of true positive predictions within the top-k. required k int The number of top predictions to consider. required Returns: Name Type Description float float The F-beta score at k, ranging from 0.0 to 1.0. A higher score indicates better trade-off between precision and recall. Source code in src/pheval/analyse/rank_stats.py 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 def f_beta_score_at_k ( self , percentage_at_k : float , k : int ) -> float : \"\"\" Calculate the F-beta score at k. The F-beta score is a metric that combines precision and recall, with beta controlling the emphasis on precision. The Beta value is set to the value of 1 to allow for equal weighting for both precision and recall. This method computes the F-beta score at a specific percentage threshold within the top-k predictions. Args: percentage_at_k (float): The percentage of true positive predictions within the top-k. k (int): The number of top predictions to consider. Returns: float: The F-beta score at k, ranging from 0.0 to 1.0. A higher score indicates better trade-off between precision and recall. \"\"\" precision = self . 
precision_at_k ( k ) recall_at_k = percentage_at_k / 100 return ( ( 2 * precision * recall_at_k ) / ( precision + recall_at_k ) if ( precision + recall_at_k ) > 0 else 0 )","title":"f_beta_score_at_k"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.mean_average_precision_at_k","text":"Calculate the Mean Average Precision at k. Mean Average Precision at k (MAP@k) is a performance metric for ranked data. It calculates the average precision at k for each result rank and then takes the mean across all queries. Parameters: Name Type Description Default k int The number of top predictions to consider for precision calculation. required Returns: Name Type Description float float The Mean Average Precision at k, ranging from 0.0 to 1.0. A higher value indicates better performance in ranking relevant entities higher in the predictions. Source code in src/pheval/analyse/rank_stats.py 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 def mean_average_precision_at_k ( self , k : int ) -> float : \"\"\" Calculate the Mean Average Precision at k. Mean Average Precision at k (MAP@k) is a performance metric for ranked data. It calculates the average precision at k for each result rank and then takes the mean across all queries. Args: k (int): The number of top predictions to consider for precision calculation. Returns: float: The Mean Average Precision at k, ranging from 0.0 to 1.0. A higher value indicates better performance in ranking relevant entities higher in the predictions. \"\"\" cumulative_average_precision_scores = 0 for result_ranks in self . relevant_result_ranks : precision_at_k , number_of_relevant_entities_at_k = 0 , 0 for rank in result_ranks : if 0 < rank <= k : number_of_relevant_entities_at_k += 1 precision_at_k += number_of_relevant_entities_at_k / rank cumulative_average_precision_scores += self . 
_average_precision_at_k ( number_of_relevant_entities_at_k , precision_at_k ) return ( 1 / self . total ) * cumulative_average_precision_scores","title":"mean_average_precision_at_k"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.mean_normalised_discounted_cumulative_gain","text":"Calculate the mean Normalised Discounted Cumulative Gain (NDCG) for a given rank cutoff. NDCG measures the effectiveness of a ranking by considering both the relevance and the order of items. Parameters: Name Type Description Default k int The rank cutoff for calculating NDCG. required Returns: Name Type Description float float The mean NDCG score across all query results. Source code in src/pheval/analyse/rank_stats.py 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 def mean_normalised_discounted_cumulative_gain ( self , k : int ) -> float : \"\"\" Calculate the mean Normalised Discounted Cumulative Gain (NDCG) for a given rank cutoff. NDCG measures the effectiveness of a ranking by considering both the relevance and the order of items. Args: k (int): The rank cutoff for calculating NDCG. Returns: float: The mean NDCG score across all query results. \"\"\" ndcg_scores = [] for result_ranks in self . relevant_result_ranks : result_ranks = [ rank for rank in result_ranks if rank <= k ] result_ranks = [ 3 if i in result_ranks else 0 for i in range ( k )] ideal_ranking = sorted ( result_ranks , reverse = True ) ndcg_scores . append ( ndcg_score ( np . asarray ([ ideal_ranking ]), np . asarray ([ result_ranks ]))) return np . mean ( ndcg_scores )","title":"mean_normalised_discounted_cumulative_gain"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.mean_reciprocal_rank","text":"Calculate the Mean Reciprocal Rank (MRR) for the stored ranks. The Mean Reciprocal Rank is computed as the mean of the reciprocal ranks for the found cases. 
If the total number of cases differs from the number of found cases, this method extends the reciprocal ranks list with zeroes for missing cases. Returns: Name Type Description float float The calculated Mean Reciprocal Rank. Source code in src/pheval/analyse/rank_stats.py 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 def mean_reciprocal_rank ( self ) -> float : \"\"\" Calculate the Mean Reciprocal Rank (MRR) for the stored ranks. The Mean Reciprocal Rank is computed as the mean of the reciprocal ranks for the found cases. If the total number of cases differs from the number of found cases, this method extends the reciprocal ranks list with zeroes for missing cases. Returns: float: The calculated Mean Reciprocal Rank. \"\"\" if len ( self . reciprocal_ranks ) != self . total : missing_cases = self . total - self . found self . reciprocal_ranks . extend ([ 0 ] * missing_cases ) return mean ( self . reciprocal_ranks ) return mean ( self . reciprocal_ranks )","title":"mean_reciprocal_rank"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.percentage_difference","text":"Calculate the percentage difference between two percentage values. Parameters: Name Type Description Default percentage_value_1 float The first percentage value. required percentage_value_2 float The second percentage value. required Returns: Name Type Description float float The difference between the two percentage values. Source code in src/pheval/analyse/rank_stats.py 167 168 169 170 171 172 173 174 175 176 177 178 179 @staticmethod def percentage_difference ( percentage_value_1 : float , percentage_value_2 : float ) -> float : \"\"\" Calculate the percentage difference between two percentage values. Args: percentage_value_1 (float): The first percentage value. percentage_value_2 (float): The second percentage value. Returns: float: The difference between the two percentage values. 
\"\"\" return percentage_value_1 - percentage_value_2","title":"percentage_difference"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.percentage_found","text":"Calculate the percentage of matches found. Returns: Name Type Description float float The percentage of matches found compared to the total count. Source code in src/pheval/analyse/rank_stats.py 158 159 160 161 162 163 164 165 def percentage_found ( self ) -> float : \"\"\" Calculate the percentage of matches found. Returns: float: The percentage of matches found compared to the total count. \"\"\" return self . percentage_rank ( self . found )","title":"percentage_found"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.percentage_rank","text":"Calculate the percentage rank. Parameters: Name Type Description Default value int The value for which the percentage rank needs to be calculated. required Returns: Name Type Description float float The calculated percentage rank based on the provided value and the total count. Source code in src/pheval/analyse/rank_stats.py 110 111 112 113 114 115 116 117 118 119 120 def percentage_rank ( self , value : int ) -> float : \"\"\" Calculate the percentage rank. Args: value (int): The value for which the percentage rank needs to be calculated. Returns: float: The calculated percentage rank based on the provided value and the total count. \"\"\" return 100 * value / self . total","title":"percentage_rank"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.percentage_top","text":"Calculate the percentage of top matches. Returns: Name Type Description float float The percentage of top matches compared to the total count. Source code in src/pheval/analyse/rank_stats.py 122 123 124 125 126 127 128 129 def percentage_top ( self ) -> float : \"\"\" Calculate the percentage of top matches. Returns: float: The percentage of top matches compared to the total count. 
\"\"\" return self . percentage_rank ( self . top )","title":"percentage_top"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.percentage_top10","text":"Calculate the percentage of matches within the top 10. Returns: Name Type Description float float The percentage of matches within the top 10 compared to the total count. Source code in src/pheval/analyse/rank_stats.py 149 150 151 152 153 154 155 156 def percentage_top10 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 10. Returns: float: The percentage of matches within the top 10 compared to the total count. \"\"\" return self . percentage_rank ( self . top10 )","title":"percentage_top10"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.percentage_top3","text":"Calculate the percentage of matches within the top 3. Returns: Name Type Description float float The percentage of matches within the top 3 compared to the total count. Source code in src/pheval/analyse/rank_stats.py 131 132 133 134 135 136 137 138 def percentage_top3 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 3. Returns: float: The percentage of matches within the top 3 compared to the total count. \"\"\" return self . percentage_rank ( self . top3 )","title":"percentage_top3"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.percentage_top5","text":"Calculate the percentage of matches within the top 5. Returns: Name Type Description float float The percentage of matches within the top 5 compared to the total count. Source code in src/pheval/analyse/rank_stats.py 140 141 142 143 144 145 146 147 def percentage_top5 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 5. Returns: float: The percentage of matches within the top 5 compared to the total count. \"\"\" return self . percentage_rank ( self . 
top5 )","title":"percentage_top5"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.precision_at_k","text":"Calculate the precision at k. Precision at k is the ratio of relevant items in the top-k predictions to the total number of predictions. It measures the accuracy of the top-k predictions made by a model. Parameters: Name Type Description Default k int The number of top predictions to consider. required Returns: Name Type Description float float The precision at k, ranging from 0.0 to 1.0. float A higher precision indicates a better performance in identifying relevant items in the top-k predictions. Source code in src/pheval/analyse/rank_stats.py 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 def precision_at_k ( self , k : int ) -> float : \"\"\" Calculate the precision at k. Precision at k is the ratio of relevant items in the top-k predictions to the total number of predictions. It measures the accuracy of the top-k predictions made by a model. Args: k (int): The number of top predictions to consider. Returns: float: The precision at k, ranging from 0.0 to 1.0. A higher precision indicates a better performance in identifying relevant items in the top-k predictions. \"\"\" k_attr = getattr ( self , f \"top { k } \" ) if k > 1 else self . top return k_attr / ( self . total * k )","title":"precision_at_k"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.return_mean_reciprocal_rank","text":"Retrieve or calculate the Mean Reciprocal Rank (MRR). If a pre-calculated MRR value exists (stored in the 'mrr' attribute), this method returns that value. Otherwise, it computes the Mean Reciprocal Rank using the 'mean_reciprocal_rank' method. Returns: Name Type Description float float The Mean Reciprocal Rank value. 
Source code in src/pheval/analyse/rank_stats.py 200 201 202 203 204 205 206 207 208 209 210 211 212 213 def return_mean_reciprocal_rank ( self ) -> float : \"\"\" Retrieve or calculate the Mean Reciprocal Rank (MRR). If a pre-calculated MRR value exists (stored in the 'mrr' attribute), this method returns that value. Otherwise, it computes the Mean Reciprocal Rank using the 'mean_reciprocal_rank' method. Returns: float: The Mean Reciprocal Rank value. \"\"\" if self . mrr is not None : return self . mrr else : return self . mean_reciprocal_rank ()","title":"return_mean_reciprocal_rank"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStatsWriter","text":"Class for writing the rank stats to a file. Source code in src/pheval/analyse/rank_stats.py 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 class RankStatsWriter : \"\"\"Class for writing the rank stats to a file.\"\"\" def __init__ ( self , benchmark_name : str , table_name : str ): \"\"\" Initialise the RankStatsWriter class Args: table_name (str): Name of table to add statistics. \"\"\" self . table_name = table_name self . benchmark_name = benchmark_name conn = BenchmarkDBManager ( benchmark_name ) . conn conn . execute ( f 'CREATE TABLE IF NOT EXISTS \" { self . 
table_name } \" (' f \"results_directory_path VARCHAR,\" f \"top INT,\" f \"top3 INT,\" f \"top5 INT,\" f \"top10 INT,\" f '\"found\" INT,' f \"total INT,\" f \"mean_reciprocal_rank FLOAT,\" f \"percentage_top FLOAT,\" f \"percentage_top3 FLOAT,\" f \"percentage_top5 FLOAT,\" f \"percentage_top10 FLOAT,\" f \"percentage_found FLOAT,\" f '\"precision@1\" FLOAT,' f '\"precision@3\" FLOAT,' f '\"precision@5\" FLOAT,' f '\"precision@10\" FLOAT,' f '\"MAP@1\" FLOAT,' f '\"MAP@3\" FLOAT,' f '\"MAP@5\" FLOAT,' f '\"MAP@10\" FLOAT,' f '\"f_beta_score@1\" FLOAT,' f '\"f_beta_score@3\"FLOAT,' f '\"f_beta_score@5\" FLOAT,' f '\"f_beta_score@10\" FLOAT,' f '\"NDCG@3\" FLOAT,' f '\"NDCG@5\" FLOAT,' f '\"NDCG@10\" FLOAT,' f \"true_positives INT,\" f \"false_positives INT,\" f \"true_negatives INT,\" f \"false_negatives INT,\" f \"sensitivity FLOAT,\" f \"specificity FLOAT,\" f '\"precision\" FLOAT,' f \"negative_predictive_value FLOAT,\" f \"false_positive_rate FLOAT,\" f \"false_discovery_rate FLOAT,\" f \"false_negative_rate FLOAT,\" f \"accuracy FLOAT,\" f \"f1_score FLOAT,\" f \"matthews_correlation_coefficient FLOAT, )\" ) conn . close () def add_statistics_entry ( self , run_identifier : str , rank_stats : RankStats , binary_classification : BinaryClassificationStats , ): \"\"\" Add statistics row to table for a run. Args: run_identifier (str): The run identifier. rank_stats (RankStats): RankStats object for the run. binary_classification (BinaryClassificationStats): BinaryClassificationStats object for the run. \"\"\" conn = BenchmarkDBManager ( self . benchmark_name ) . conn conn . execute ( f ' INSERT INTO \" { self . table_name } \" VALUES ( ' f \"' { run_identifier } ',\" f \" { rank_stats . top } ,\" f \" { rank_stats . top3 } ,\" f \" { rank_stats . top5 } ,\" f \" { rank_stats . top10 } ,\" f \" { rank_stats . found } ,\" f \" { rank_stats . total } ,\" f \" { rank_stats . mean_reciprocal_rank () } ,\" f \" { rank_stats . percentage_top () } ,\" f \" { rank_stats . 
percentage_top3 () } ,\" f \" { rank_stats . percentage_top5 () } ,\" f \" { rank_stats . percentage_top10 () } ,\" f \" { rank_stats . percentage_found () } ,\" f \" { rank_stats . precision_at_k ( 1 ) } ,\" f \" { rank_stats . precision_at_k ( 3 ) } ,\" f \" { rank_stats . precision_at_k ( 5 ) } ,\" f \" { rank_stats . precision_at_k ( 10 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 1 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 3 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 5 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 10 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 1 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 3 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 5 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 10 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 3 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 5 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 10 ) } ,\" f \" { binary_classification . true_positives } ,\" f \" { binary_classification . false_positives } ,\" f \" { binary_classification . true_negatives } ,\" f \" { binary_classification . false_negatives } ,\" f \" { binary_classification . sensitivity () } ,\" f \" { binary_classification . specificity () } ,\" f \" { binary_classification . precision () } ,\" f \" { binary_classification . negative_predictive_value () } ,\" f \" { binary_classification . false_positive_rate () } ,\" f \" { binary_classification . false_discovery_rate () } ,\" f \" { binary_classification . false_negative_rate () } ,\" f \" { binary_classification . accuracy () } ,\" f \" { binary_classification . f1_score () } ,\" f \" { binary_classification . matthews_correlation_coefficient () } )\" ) conn . 
close ()","title":"RankStatsWriter"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStatsWriter.__init__","text":"Initialise the RankStatsWriter class Args: table_name (str): Name of table to add statistics. Source code in src/pheval/analyse/rank_stats.py 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 def __init__ ( self , benchmark_name : str , table_name : str ): \"\"\" Initialise the RankStatsWriter class Args: table_name (str): Name of table to add statistics. \"\"\" self . table_name = table_name self . benchmark_name = benchmark_name conn = BenchmarkDBManager ( benchmark_name ) . conn conn . execute ( f 'CREATE TABLE IF NOT EXISTS \" { self . table_name } \" (' f \"results_directory_path VARCHAR,\" f \"top INT,\" f \"top3 INT,\" f \"top5 INT,\" f \"top10 INT,\" f '\"found\" INT,' f \"total INT,\" f \"mean_reciprocal_rank FLOAT,\" f \"percentage_top FLOAT,\" f \"percentage_top3 FLOAT,\" f \"percentage_top5 FLOAT,\" f \"percentage_top10 FLOAT,\" f \"percentage_found FLOAT,\" f '\"precision@1\" FLOAT,' f '\"precision@3\" FLOAT,' f '\"precision@5\" FLOAT,' f '\"precision@10\" FLOAT,' f '\"MAP@1\" FLOAT,' f '\"MAP@3\" FLOAT,' f '\"MAP@5\" FLOAT,' f '\"MAP@10\" FLOAT,' f '\"f_beta_score@1\" FLOAT,' f '\"f_beta_score@3\"FLOAT,' f '\"f_beta_score@5\" FLOAT,' f '\"f_beta_score@10\" FLOAT,' f '\"NDCG@3\" FLOAT,' f '\"NDCG@5\" FLOAT,' f '\"NDCG@10\" FLOAT,' f \"true_positives INT,\" f \"false_positives INT,\" f \"true_negatives INT,\" f \"false_negatives INT,\" f \"sensitivity FLOAT,\" f \"specificity FLOAT,\" f '\"precision\" FLOAT,' f \"negative_predictive_value FLOAT,\" f \"false_positive_rate FLOAT,\" f \"false_discovery_rate FLOAT,\" f \"false_negative_rate FLOAT,\" f \"accuracy FLOAT,\" f \"f1_score FLOAT,\" f \"matthews_correlation_coefficient FLOAT, )\" ) 
conn . close ()","title":"__init__"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStatsWriter.add_statistics_entry","text":"Add statistics row to table for a run. Args: run_identifier (str): The run identifier. rank_stats (RankStats): RankStats object for the run. binary_classification (BinaryClassificationStats): BinaryClassificationStats object for the run. Source code in src/pheval/analyse/rank_stats.py 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 def add_statistics_entry ( self , run_identifier : str , rank_stats : RankStats , binary_classification : BinaryClassificationStats , ): \"\"\" Add statistics row to table for a run. Args: run_identifier (str): The run identifier. rank_stats (RankStats): RankStats object for the run. binary_classification (BinaryClassificationStats): BinaryClassificationStats object for the run. \"\"\" conn = BenchmarkDBManager ( self . benchmark_name ) . conn conn . execute ( f ' INSERT INTO \" { self . table_name } \" VALUES ( ' f \"' { run_identifier } ',\" f \" { rank_stats . top } ,\" f \" { rank_stats . top3 } ,\" f \" { rank_stats . top5 } ,\" f \" { rank_stats . top10 } ,\" f \" { rank_stats . found } ,\" f \" { rank_stats . total } ,\" f \" { rank_stats . mean_reciprocal_rank () } ,\" f \" { rank_stats . percentage_top () } ,\" f \" { rank_stats . percentage_top3 () } ,\" f \" { rank_stats . percentage_top5 () } ,\" f \" { rank_stats . percentage_top10 () } ,\" f \" { rank_stats . percentage_found () } ,\" f \" { rank_stats . precision_at_k ( 1 ) } ,\" f \" { rank_stats . precision_at_k ( 3 ) } ,\" f \" { rank_stats . precision_at_k ( 5 ) } ,\" f \" { rank_stats . precision_at_k ( 10 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 1 ) } ,\" f \" { rank_stats . 
mean_average_precision_at_k ( 3 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 5 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 10 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 1 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 3 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 5 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 10 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 3 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 5 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 10 ) } ,\" f \" { binary_classification . true_positives } ,\" f \" { binary_classification . false_positives } ,\" f \" { binary_classification . true_negatives } ,\" f \" { binary_classification . false_negatives } ,\" f \" { binary_classification . sensitivity () } ,\" f \" { binary_classification . specificity () } ,\" f \" { binary_classification . precision () } ,\" f \" { binary_classification . negative_predictive_value () } ,\" f \" { binary_classification . false_positive_rate () } ,\" f \" { binary_classification . false_discovery_rate () } ,\" f \" { binary_classification . false_negative_rate () } ,\" f \" { binary_classification . accuracy () } ,\" f \" { binary_classification . f1_score () } ,\" f \" { binary_classification . matthews_correlation_coefficient () } )\" ) conn . close ()","title":"add_statistics_entry"},{"location":"api/pheval/analyse/run_data_parser/","text":"Config Bases: BaseModel Store configurations for a runs. Attributes: runs (List[RunConfig]): The list of run configurations. Source code in src/pheval/analyse/run_data_parser.py 101 102 103 104 105 106 107 108 109 110 class Config ( BaseModel ): \"\"\" Store configurations for a runs. Attributes: runs (List[RunConfig]): The list of run configurations. 
\"\"\" benchmark_name : str runs : List [ RunConfig ] plot_customisation : PlotCustomisation PlotCustomisation Bases: BaseModel Store customisations for all plots. Attributes: gene_plots (SinglePlotCustomisation): Customisation for all gene benchmarking plots. disease_plots (SinglePlotCustomisation): Customisation for all disease benchmarking plots. variant_plots (SinglePlotCustomisation): Customisation for all variant benchmarking plots. Source code in src/pheval/analyse/run_data_parser.py 87 88 89 90 91 92 93 94 95 96 97 98 class PlotCustomisation ( BaseModel ): \"\"\" Store customisations for all plots. Attributes: gene_plots (SinglePlotCustomisation): Customisation for all gene benchmarking plots. disease_plots (SinglePlotCustomisation): Customisation for all disease benchmarking plots. variant_plots (SinglePlotCustomisation): Customisation for all variant benchmarking plots. \"\"\" gene_plots : SinglePlotCustomisation disease_plots : SinglePlotCustomisation variant_plots : SinglePlotCustomisation RunConfig Bases: BaseModel Store configurations for a run. Attributes: Name Type Description run_identifier str The run identifier. phenopacket_dir str The path to the phenopacket directory used for generating the results. results_dir str The path to the results directory. gene_analysis bool Whether or not to benchmark gene analysis results. variant_analysis bool Whether or not to benchmark variant analysis results. disease_analysis bool Whether or not to benchmark disease analysis results. threshold Optional [ float ] The threshold to consider for benchmarking. score_order Optional [ str ] The order of scores to consider for benchmarking, either ascending or descending. Source code in src/pheval/analyse/run_data_parser.py 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 class RunConfig ( BaseModel ): \"\"\" Store configurations for a run. 
Attributes: run_identifier (str): The run identifier. phenopacket_dir (str): The path to the phenopacket directory used for generating the results. results_dir (str): The path to the results directory. gene_analysis (bool): Whether or not to benchmark gene analysis results. variant_analysis (bool): Whether or not to benchmark variant analysis results. disease_analysis (bool): Whether or not to benchmark disease analysis results. threshold (Optional[float]): The threshold to consider for benchmarking. score_order (Optional[str]): The order of scores to consider for benchmarking, either ascending or descending. \"\"\" run_identifier : str phenopacket_dir : Path results_dir : Path gene_analysis : bool variant_analysis : bool disease_analysis : bool threshold : Optional [ float ] score_order : Optional [ str ] @root_validator ( pre = True ) def handle_blank_fields ( cls , values : dict ) -> dict : # noqa: N805 \"\"\" Root validator to handle fields that may be explicitly set to None. This method checks if 'threshold' and 'score_order' are None and assigns default values if so. Args: values (dict): The input values provided to the model. Returns: dict: The updated values with defaults applied where necessary. \"\"\" if values . get ( \"threshold\" ) is None : values [ \"threshold\" ] = 0 print ( \"setting default threshold\" ) if values . get ( \"score_order\" ) is None : values [ \"score_order\" ] = \"descending\" return values handle_blank_fields ( values ) Root validator to handle fields that may be explicitly set to None. This method checks if 'threshold' and 'score_order' are None and assigns default values if so. Parameters: Name Type Description Default values dict The input values provided to the model. required Returns: Name Type Description dict dict The updated values with defaults applied where necessary. 
Source code in src/pheval/analyse/run_data_parser.py 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 @root_validator ( pre = True ) def handle_blank_fields ( cls , values : dict ) -> dict : # noqa: N805 \"\"\" Root validator to handle fields that may be explicitly set to None. This method checks if 'threshold' and 'score_order' are None and assigns default values if so. Args: values (dict): The input values provided to the model. Returns: dict: The updated values with defaults applied where necessary. \"\"\" if values . get ( \"threshold\" ) is None : values [ \"threshold\" ] = 0 print ( \"setting default threshold\" ) if values . get ( \"score_order\" ) is None : values [ \"score_order\" ] = \"descending\" return values SinglePlotCustomisation Bases: BaseModel Store customisations for plots. Attributes: Name Type Description plot_type str The plot type. rank_plot_title str The title for the rank summary plot. roc_curve_title str The title for the roc curve plot. precision_recall_title str The title for the precision-recall plot. Source code in src/pheval/analyse/run_data_parser.py 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 class SinglePlotCustomisation ( BaseModel ): \"\"\" Store customisations for plots. Attributes: plot_type (str): The plot type. rank_plot_title (str): The title for the rank summary plot. roc_curve_title (str): The title for the roc curve plot. precision_recall_title (str): The title for the precision-recall plot. \"\"\" plot_type : Optional [ str ] = \"bar_cumulative\" rank_plot_title : Optional [ str ] roc_curve_title : Optional [ str ] precision_recall_title : Optional [ str ] @root_validator ( pre = True ) def handle_blank_fields ( cls , values : dict ) -> dict : # noqa: N805 \"\"\" Root validator to handle fields that may be explicitly set to None. This method checks if 'plot_type' is None and assigns default value if so. 
Args: values (dict): The input values provided to the model. Returns: dict: The updated values with defaults applied where necessary. \"\"\" if values . get ( \"plot_type\" ) is None : values [ \"plot_type\" ] = \"bar_cumulative\" return values handle_blank_fields ( values ) Root validator to handle fields that may be explicitly set to None. This method checks if 'plot_type' is None and assigns default value if so. Parameters: Name Type Description Default values dict The input values provided to the model. required Returns: Name Type Description dict dict The updated values with defaults applied where necessary. Source code in src/pheval/analyse/run_data_parser.py 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 @root_validator ( pre = True ) def handle_blank_fields ( cls , values : dict ) -> dict : # noqa: N805 \"\"\" Root validator to handle fields that may be explicitly set to None. This method checks if 'plot_type' is None and assigns default value if so. Args: values (dict): The input values provided to the model. Returns: dict: The updated values with defaults applied where necessary. \"\"\" if values . get ( \"plot_type\" ) is None : values [ \"plot_type\" ] = \"bar_cumulative\" return values parse_run_config ( run_config ) Parse a run configuration yaml file. Args: run_config (Path): The path to the run data yaml configuration. Returns: Config: The parsed run configurations. Source code in src/pheval/analyse/run_data_parser.py 113 114 115 116 117 118 119 120 121 122 123 124 125 def parse_run_config ( run_config : Path ) -> Config : \"\"\" Parse a run configuration yaml file. Args: run_config (Path): The path to the run data yaml configuration. Returns: Config: The parsed run configurations. \"\"\" with open ( run_config , \"r\" ) as f : config_data = yaml . safe_load ( f ) f . 
close () config = Config ( ** config_data ) return config","title":"Run data parser"},{"location":"api/pheval/analyse/run_data_parser/#src.pheval.analyse.run_data_parser.Config","text":"Bases: BaseModel Store configurations for a runs. Attributes: runs (List[RunConfig]): The list of run configurations. Source code in src/pheval/analyse/run_data_parser.py 101 102 103 104 105 106 107 108 109 110 class Config ( BaseModel ): \"\"\" Store configurations for a runs. Attributes: runs (List[RunConfig]): The list of run configurations. \"\"\" benchmark_name : str runs : List [ RunConfig ] plot_customisation : PlotCustomisation","title":"Config"},{"location":"api/pheval/analyse/run_data_parser/#src.pheval.analyse.run_data_parser.PlotCustomisation","text":"Bases: BaseModel Store customisations for all plots. Attributes: gene_plots (SinglePlotCustomisation): Customisation for all gene benchmarking plots. disease_plots (SinglePlotCustomisation): Customisation for all disease benchmarking plots. variant_plots (SinglePlotCustomisation): Customisation for all variant benchmarking plots. Source code in src/pheval/analyse/run_data_parser.py 87 88 89 90 91 92 93 94 95 96 97 98 class PlotCustomisation ( BaseModel ): \"\"\" Store customisations for all plots. Attributes: gene_plots (SinglePlotCustomisation): Customisation for all gene benchmarking plots. disease_plots (SinglePlotCustomisation): Customisation for all disease benchmarking plots. variant_plots (SinglePlotCustomisation): Customisation for all variant benchmarking plots. \"\"\" gene_plots : SinglePlotCustomisation disease_plots : SinglePlotCustomisation variant_plots : SinglePlotCustomisation","title":"PlotCustomisation"},{"location":"api/pheval/analyse/run_data_parser/#src.pheval.analyse.run_data_parser.RunConfig","text":"Bases: BaseModel Store configurations for a run. Attributes: Name Type Description run_identifier str The run identifier. 
phenopacket_dir str The path to the phenopacket directory used for generating the results. results_dir str The path to the results directory. gene_analysis bool Whether or not to benchmark gene analysis results. variant_analysis bool Whether or not to benchmark variant analysis results. disease_analysis bool Whether or not to benchmark disease analysis results. threshold Optional [ float ] The threshold to consider for benchmarking. score_order Optional [ str ] The order of scores to consider for benchmarking, either ascending or descending. Source code in src/pheval/analyse/run_data_parser.py 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 class RunConfig ( BaseModel ): \"\"\" Store configurations for a run. Attributes: run_identifier (str): The run identifier. phenopacket_dir (str): The path to the phenopacket directory used for generating the results. results_dir (str): The path to the results directory. gene_analysis (bool): Whether or not to benchmark gene analysis results. variant_analysis (bool): Whether or not to benchmark variant analysis results. disease_analysis (bool): Whether or not to benchmark disease analysis results. threshold (Optional[float]): The threshold to consider for benchmarking. score_order (Optional[str]): The order of scores to consider for benchmarking, either ascending or descending. \"\"\" run_identifier : str phenopacket_dir : Path results_dir : Path gene_analysis : bool variant_analysis : bool disease_analysis : bool threshold : Optional [ float ] score_order : Optional [ str ] @root_validator ( pre = True ) def handle_blank_fields ( cls , values : dict ) -> dict : # noqa: N805 \"\"\" Root validator to handle fields that may be explicitly set to None. This method checks if 'threshold' and 'score_order' are None and assigns default values if so. Args: values (dict): The input values provided to the model. 
Returns: dict: The updated values with defaults applied where necessary. \"\"\" if values . get ( \"threshold\" ) is None : values [ \"threshold\" ] = 0 print ( \"setting default threshold\" ) if values . get ( \"score_order\" ) is None : values [ \"score_order\" ] = \"descending\" return values","title":"RunConfig"},{"location":"api/pheval/analyse/run_data_parser/#src.pheval.analyse.run_data_parser.RunConfig.handle_blank_fields","text":"Root validator to handle fields that may be explicitly set to None. This method checks if 'threshold' and 'score_order' are None and assigns default values if so. Parameters: Name Type Description Default values dict The input values provided to the model. required Returns: Name Type Description dict dict The updated values with defaults applied where necessary. Source code in src/pheval/analyse/run_data_parser.py 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 @root_validator ( pre = True ) def handle_blank_fields ( cls , values : dict ) -> dict : # noqa: N805 \"\"\" Root validator to handle fields that may be explicitly set to None. This method checks if 'threshold' and 'score_order' are None and assigns default values if so. Args: values (dict): The input values provided to the model. Returns: dict: The updated values with defaults applied where necessary. \"\"\" if values . get ( \"threshold\" ) is None : values [ \"threshold\" ] = 0 print ( \"setting default threshold\" ) if values . get ( \"score_order\" ) is None : values [ \"score_order\" ] = \"descending\" return values","title":"handle_blank_fields"},{"location":"api/pheval/analyse/run_data_parser/#src.pheval.analyse.run_data_parser.SinglePlotCustomisation","text":"Bases: BaseModel Store customisations for plots. Attributes: Name Type Description plot_type str The plot type. rank_plot_title str The title for the rank summary plot. roc_curve_title str The title for the roc curve plot. precision_recall_title str The title for the precision-recall plot. 
Source code in src/pheval/analyse/run_data_parser.py 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 class SinglePlotCustomisation ( BaseModel ): \"\"\" Store customisations for plots. Attributes: plot_type (str): The plot type. rank_plot_title (str): The title for the rank summary plot. roc_curve_title (str): The title for the roc curve plot. precision_recall_title (str): The title for the precision-recall plot. \"\"\" plot_type : Optional [ str ] = \"bar_cumulative\" rank_plot_title : Optional [ str ] roc_curve_title : Optional [ str ] precision_recall_title : Optional [ str ] @root_validator ( pre = True ) def handle_blank_fields ( cls , values : dict ) -> dict : # noqa: N805 \"\"\" Root validator to handle fields that may be explicitly set to None. This method checks if 'plot_type' is None and assigns default value if so. Args: values (dict): The input values provided to the model. Returns: dict: The updated values with defaults applied where necessary. \"\"\" if values . get ( \"plot_type\" ) is None : values [ \"plot_type\" ] = \"bar_cumulative\" return values","title":"SinglePlotCustomisation"},{"location":"api/pheval/analyse/run_data_parser/#src.pheval.analyse.run_data_parser.SinglePlotCustomisation.handle_blank_fields","text":"Root validator to handle fields that may be explicitly set to None. This method checks if 'plot_type' is None and assigns default value if so. Parameters: Name Type Description Default values dict The input values provided to the model. required Returns: Name Type Description dict dict The updated values with defaults applied where necessary. Source code in src/pheval/analyse/run_data_parser.py 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 @root_validator ( pre = True ) def handle_blank_fields ( cls , values : dict ) -> dict : # noqa: N805 \"\"\" Root validator to handle fields that may be explicitly set to None. 
This method checks if 'plot_type' is None and assigns default value if so. Args: values (dict): The input values provided to the model. Returns: dict: The updated values with defaults applied where necessary. \"\"\" if values . get ( \"plot_type\" ) is None : values [ \"plot_type\" ] = \"bar_cumulative\" return values","title":"handle_blank_fields"},{"location":"api/pheval/analyse/run_data_parser/#src.pheval.analyse.run_data_parser.parse_run_config","text":"Parse a run configuration yaml file. Args: run_config (Path): The path to the run data yaml configuration. Returns: Config: The parsed run configurations. Source code in src/pheval/analyse/run_data_parser.py 113 114 115 116 117 118 119 120 121 122 123 124 125 def parse_run_config ( run_config : Path ) -> Config : \"\"\" Parse a run configuration yaml file. Args: run_config (Path): The path to the run data yaml configuration. Returns: Config: The parsed run configurations. \"\"\" with open ( run_config , \"r\" ) as f : config_data = yaml . safe_load ( f ) f . close () config = Config ( ** config_data ) return config","title":"parse_run_config"},{"location":"api/pheval/analyse/variant_prioritisation_analysis/","text":"AssessVariantPrioritisation Bases: AssessPrioritisationBase Class for assessing variant prioritisation based on thresholds and scoring orders. Source code in src/pheval/analyse/variant_prioritisation_analysis.py 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 class AssessVariantPrioritisation ( AssessPrioritisationBase ): \"\"\"Class for assessing variant prioritisation based on thresholds and scoring orders.\"\"\" def assess_variant_prioritisation ( self , standardised_variant_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess variant prioritisation. 
This method assesses the prioritisation of variants based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_variant_result_path (Path): Path to standardised variant TSV result. phenopacket_path (Path): Path to the phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"\"\"SELECT * FROM { self . table_name } WHERE phenopacket = ' { phenopacket_path . name } '\"\"\" ) . fetchdf () for _i , row in df . iterrows (): causative_variant = GenomicVariant ( chrom = row [ \"chrom\" ], pos = int ( row [ \"pos\" ]), ref = row [ \"ref\" ], alt = row [ \"alt\" ], ) result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_variant_result_path } ' \" f \"WHERE \" f \"chromosome == ' { causative_variant . chrom } ' AND \" f \"start == { causative_variant . pos } AND \" f \"ref == ' { causative_variant . ref } ' AND \" f \"alt == ' { causative_variant . alt } '\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : variant_match = self . _record_matched_entity ( RankedPhEvalVariantResult ( ** result [ 0 ])) relevant_ranks . append ( variant_match ) primary_key = ( f \" { phenopacket_path . name } - { causative_variant . chrom } - { causative_variant . pos } -\" f \" { causative_variant . ref } - { causative_variant . alt } \" ) self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( variant_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_variant_result_path ), RankedPhEvalVariantResult ), relevant_ranks , ) assess_variant_prioritisation ( standardised_variant_result_path , phenopacket_path , binary_classification_stats ) Assess variant prioritisation. 
This method assesses the prioritisation of variants based on the provided criteria and records ranks using a PrioritisationRankRecorder. Parameters: Name Type Description Default standardised_variant_result_path Path Path to standardised variant TSV result. required phenopacket_path Path Path to the phenopacket. required binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required Source code in src/pheval/analyse/variant_prioritisation_analysis.py 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 def assess_variant_prioritisation ( self , standardised_variant_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess variant prioritisation. This method assesses the prioritisation of variants based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_variant_result_path (Path): Path to standardised variant TSV result. phenopacket_path (Path): Path to the phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"\"\"SELECT * FROM { self . table_name } WHERE phenopacket = ' { phenopacket_path . name } '\"\"\" ) . fetchdf () for _i , row in df . iterrows (): causative_variant = GenomicVariant ( chrom = row [ \"chrom\" ], pos = int ( row [ \"pos\" ]), ref = row [ \"ref\" ], alt = row [ \"alt\" ], ) result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_variant_result_path } ' \" f \"WHERE \" f \"chromosome == ' { causative_variant . chrom } ' AND \" f \"start == { causative_variant . pos } AND \" f \"ref == ' { causative_variant . ref } ' AND \" f \"alt == ' { causative_variant . alt } '\" ) . fetchdf () . 
to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : variant_match = self . _record_matched_entity ( RankedPhEvalVariantResult ( ** result [ 0 ])) relevant_ranks . append ( variant_match ) primary_key = ( f \" { phenopacket_path . name } - { causative_variant . chrom } - { causative_variant . pos } -\" f \" { causative_variant . ref } - { causative_variant . alt } \" ) self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( variant_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_variant_result_path ), RankedPhEvalVariantResult ), relevant_ranks , ) assess_phenopacket_variant_prioritisation ( phenopacket_path , run , variant_binary_classification_stats , variant_benchmarker ) Assess variant prioritisation for a Phenopacket by comparing PhEval standardised variant results against the recorded causative variants for a proband in the Phenopacket. Parameters: Name Type Description Default phenopacket_path Path Path to the Phenopacket. required run RunConfig Run configuration. required variant_binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required variant_benchmarker AssessVariantPrioritisation AssessVariantPrioritisation class instance. required Source code in src/pheval/analyse/variant_prioritisation_analysis.py 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 def assess_phenopacket_variant_prioritisation ( phenopacket_path : Path , run : RunConfig , variant_binary_classification_stats : BinaryClassificationStats , variant_benchmarker : AssessVariantPrioritisation , ) -> None : \"\"\" Assess variant prioritisation for a Phenopacket by comparing PhEval standardised variant results against the recorded causative variants for a proband in the Phenopacket. Args: phenopacket_path (Path): Path to the Phenopacket. 
run (RunConfig): Run configuration. variant_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. variant_benchmarker (AssessVariantPrioritisation): AssessVariantPrioritisation class instance. \"\"\" standardised_variant_result_path = run . results_dir . joinpath ( f \"pheval_variant_results/ { phenopacket_path . stem } -pheval_variant_result.tsv\" ) variant_benchmarker . assess_variant_prioritisation ( standardised_variant_result_path , phenopacket_path , variant_binary_classification_stats , ) benchmark_variant_prioritisation ( benchmark_name , run , score_order , threshold ) Benchmark a directory based on variant prioritisation results. Parameters: Name Type Description Default benchmark_name str Name of the benchmark. required run RunConfig Run configuration. required score_order str The order in which scores are arranged. required threshold float Threshold for assessment. required Returns: Name Type Description BenchmarkRunResults An object containing benchmarking results for variant prioritisation, including ranks and rank statistics for the benchmarked directory. Source code in src/pheval/analyse/variant_prioritisation_analysis.py 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 def benchmark_variant_prioritisation ( benchmark_name : str , run : RunConfig , score_order : str , threshold : float , ): \"\"\" Benchmark a directory based on variant prioritisation results. Args: benchmark_name (str): Name of the benchmark. run (RunConfig): Run configuration. score_order (str): The order in which scores are arranged. threshold (float): Threshold for assessment. Returns: BenchmarkRunResults: An object containing benchmarking results for variant prioritisation, including ranks and rank statistics for the benchmarked directory. 
\"\"\" variant_binary_classification_stats = BinaryClassificationStats () db_connection = BenchmarkDBManager ( benchmark_name ) variant_benchmarker = AssessVariantPrioritisation ( db_connection , f \" { run . phenopacket_dir . parents [ 0 ] . name } \" f \"_variant\" , run . run_identifier , threshold , score_order , ) for phenopacket_path in all_files ( run . phenopacket_dir ): assess_phenopacket_variant_prioritisation ( phenopacket_path , run , variant_binary_classification_stats , variant_benchmarker , ) variant_rank_stats = RankStats () variant_rank_stats . add_ranks ( benchmark_name = benchmark_name , table_name = f \" { run . phenopacket_dir . parents [ 0 ] . name } _variant\" , column_name = str ( run . run_identifier ), ) return BenchmarkRunResults ( benchmark_name = run . run_identifier , rank_stats = variant_rank_stats , binary_classification_stats = variant_binary_classification_stats , phenopacket_dir = run . phenopacket_dir , )","title":"Variant prioritisation analysis"},{"location":"api/pheval/analyse/variant_prioritisation_analysis/#src.pheval.analyse.variant_prioritisation_analysis.AssessVariantPrioritisation","text":"Bases: AssessPrioritisationBase Class for assessing variant prioritisation based on thresholds and scoring orders. Source code in src/pheval/analyse/variant_prioritisation_analysis.py 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 class AssessVariantPrioritisation ( AssessPrioritisationBase ): \"\"\"Class for assessing variant prioritisation based on thresholds and scoring orders.\"\"\" def assess_variant_prioritisation ( self , standardised_variant_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess variant prioritisation. 
This method assesses the prioritisation of variants based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_variant_result_path (Path): Path to standardised variant TSV result. phenopacket_path (Path): Path to the phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"\"\"SELECT * FROM { self . table_name } WHERE phenopacket = ' { phenopacket_path . name } '\"\"\" ) . fetchdf () for _i , row in df . iterrows (): causative_variant = GenomicVariant ( chrom = row [ \"chrom\" ], pos = int ( row [ \"pos\" ]), ref = row [ \"ref\" ], alt = row [ \"alt\" ], ) result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_variant_result_path } ' \" f \"WHERE \" f \"chromosome == ' { causative_variant . chrom } ' AND \" f \"start == { causative_variant . pos } AND \" f \"ref == ' { causative_variant . ref } ' AND \" f \"alt == ' { causative_variant . alt } '\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : variant_match = self . _record_matched_entity ( RankedPhEvalVariantResult ( ** result [ 0 ])) relevant_ranks . append ( variant_match ) primary_key = ( f \" { phenopacket_path . name } - { causative_variant . chrom } - { causative_variant . pos } -\" f \" { causative_variant . ref } - { causative_variant . alt } \" ) self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( variant_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . 
parse_table_into_dataclass ( str ( standardised_variant_result_path ), RankedPhEvalVariantResult ), relevant_ranks , )","title":"AssessVariantPrioritisation"},{"location":"api/pheval/analyse/variant_prioritisation_analysis/#src.pheval.analyse.variant_prioritisation_analysis.AssessVariantPrioritisation.assess_variant_prioritisation","text":"Assess variant prioritisation. This method assesses the prioritisation of variants based on the provided criteria and records ranks using a PrioritisationRankRecorder. Parameters: Name Type Description Default standardised_variant_result_path Path Path to standardised variant TSV result. required phenopacket_path Path Path to the phenopacket. required binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required Source code in src/pheval/analyse/variant_prioritisation_analysis.py 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 def assess_variant_prioritisation ( self , standardised_variant_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess variant prioritisation. This method assesses the prioritisation of variants based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_variant_result_path (Path): Path to standardised variant TSV result. phenopacket_path (Path): Path to the phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"\"\"SELECT * FROM { self . table_name } WHERE phenopacket = ' { phenopacket_path . name } '\"\"\" ) . fetchdf () for _i , row in df . 
iterrows (): causative_variant = GenomicVariant ( chrom = row [ \"chrom\" ], pos = int ( row [ \"pos\" ]), ref = row [ \"ref\" ], alt = row [ \"alt\" ], ) result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_variant_result_path } ' \" f \"WHERE \" f \"chromosome == ' { causative_variant . chrom } ' AND \" f \"start == { causative_variant . pos } AND \" f \"ref == ' { causative_variant . ref } ' AND \" f \"alt == ' { causative_variant . alt } '\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : variant_match = self . _record_matched_entity ( RankedPhEvalVariantResult ( ** result [ 0 ])) relevant_ranks . append ( variant_match ) primary_key = ( f \" { phenopacket_path . name } - { causative_variant . chrom } - { causative_variant . pos } -\" f \" { causative_variant . ref } - { causative_variant . alt } \" ) self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( variant_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_variant_result_path ), RankedPhEvalVariantResult ), relevant_ranks , )","title":"assess_variant_prioritisation"},{"location":"api/pheval/analyse/variant_prioritisation_analysis/#src.pheval.analyse.variant_prioritisation_analysis.assess_phenopacket_variant_prioritisation","text":"Assess variant prioritisation for a Phenopacket by comparing PhEval standardised variant results against the recorded causative variants for a proband in the Phenopacket. Parameters: Name Type Description Default phenopacket_path Path Path to the Phenopacket. required run RunConfig Run configuration. required variant_binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required variant_benchmarker AssessVariantPrioritisation AssessVariantPrioritisation class instance. 
required Source code in src/pheval/analyse/variant_prioritisation_analysis.py 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 def assess_phenopacket_variant_prioritisation ( phenopacket_path : Path , run : RunConfig , variant_binary_classification_stats : BinaryClassificationStats , variant_benchmarker : AssessVariantPrioritisation , ) -> None : \"\"\" Assess variant prioritisation for a Phenopacket by comparing PhEval standardised variant results against the recorded causative variants for a proband in the Phenopacket. Args: phenopacket_path (Path): Path to the Phenopacket. run (RunConfig): Run configuration. variant_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. variant_benchmarker (AssessVariantPrioritisation): AssessVariantPrioritisation class instance. \"\"\" standardised_variant_result_path = run . results_dir . joinpath ( f \"pheval_variant_results/ { phenopacket_path . stem } -pheval_variant_result.tsv\" ) variant_benchmarker . assess_variant_prioritisation ( standardised_variant_result_path , phenopacket_path , variant_binary_classification_stats , )","title":"assess_phenopacket_variant_prioritisation"},{"location":"api/pheval/analyse/variant_prioritisation_analysis/#src.pheval.analyse.variant_prioritisation_analysis.benchmark_variant_prioritisation","text":"Benchmark a directory based on variant prioritisation results. Parameters: Name Type Description Default benchmark_name str Name of the benchmark. required run RunConfig Run configuration. required score_order str The order in which scores are arranged. required threshold float Threshold for assessment. required Returns: Name Type Description BenchmarkRunResults An object containing benchmarking results for variant prioritisation, including ranks and rank statistics for the benchmarked directory. 
Source code in src/pheval/analyse/variant_prioritisation_analysis.py 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 def benchmark_variant_prioritisation ( benchmark_name : str , run : RunConfig , score_order : str , threshold : float , ): \"\"\" Benchmark a directory based on variant prioritisation results. Args: benchmark_name (str): Name of the benchmark. run (RunConfig): Run configuration. score_order (str): The order in which scores are arranged. threshold (float): Threshold for assessment. Returns: BenchmarkRunResults: An object containing benchmarking results for variant prioritisation, including ranks and rank statistics for the benchmarked directory. \"\"\" variant_binary_classification_stats = BinaryClassificationStats () db_connection = BenchmarkDBManager ( benchmark_name ) variant_benchmarker = AssessVariantPrioritisation ( db_connection , f \" { run . phenopacket_dir . parents [ 0 ] . name } \" f \"_variant\" , run . run_identifier , threshold , score_order , ) for phenopacket_path in all_files ( run . phenopacket_dir ): assess_phenopacket_variant_prioritisation ( phenopacket_path , run , variant_binary_classification_stats , variant_benchmarker , ) variant_rank_stats = RankStats () variant_rank_stats . add_ranks ( benchmark_name = benchmark_name , table_name = f \" { run . phenopacket_dir . parents [ 0 ] . name } _variant\" , column_name = str ( run . run_identifier ), ) return BenchmarkRunResults ( benchmark_name = run . run_identifier , rank_stats = variant_rank_stats , binary_classification_stats = variant_binary_classification_stats , phenopacket_dir = run . 
phenopacket_dir , )","title":"benchmark_variant_prioritisation"},{"location":"api/pheval/infra/exomiserdb/","text":"DBConnection Source code in src/pheval/infra/exomiserdb.py 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 class DBConnection : connection = None def __init__ ( self , connection ): DBConnection . connection = connection @classmethod def get_connection ( cls ) -> jaydebeapi . Connection : \"\"\"Creates return new Singleton database connection\"\"\" return DBConnection . connection def close ( self ): return self . connection . close () @classmethod def get_cursor ( cls ) -> jaydebeapi . Cursor : connection = cls . get_connection () return connection . cursor () get_connection () classmethod Creates return new Singleton database connection Source code in src/pheval/infra/exomiserdb.py 49 50 51 52 @classmethod def get_connection ( cls ) -> jaydebeapi . Connection : \"\"\"Creates return new Singleton database connection\"\"\" return DBConnection . connection DBConnector Source code in src/pheval/infra/exomiserdb.py 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 class DBConnector : def __init__ ( self , jar : Path , driver : str , server : str , database : str , user : str , password : str ): self . jar = jar self . driver = driver self . server = server self . database = database self . user = user self . password = password self . dbconn = None def create_connection ( self ) -> jaydebeapi . Connection : \"\"\"creates h2 database connection\"\"\" return jaydebeapi . connect ( self . driver , f \" { self . server }{ self . database } \" , [ self . user , self . password ], self . jar , ) def __enter__ ( self ) -> jaydebeapi . Connection : self . dbconn = self . create_connection () return self . dbconn def __exit__ ( self , * other ): self . dbconn . 
close () create_connection () creates h2 database connection Source code in src/pheval/infra/exomiserdb.py 26 27 28 29 30 31 32 33 def create_connection ( self ) -> jaydebeapi . Connection : \"\"\"creates h2 database connection\"\"\" return jaydebeapi . connect ( self . driver , f \" { self . server }{ self . database } \" , [ self . user , self . password ], self . jar , ) ExomiserDB Source code in src/pheval/infra/exomiserdb.py 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 class ExomiserDB : def __init__ ( self , db_path : Path ): try : self . connector = DBConnector ( # noqa jar = os . path . join ( os . path . dirname ( __file__ ), \"../../../lib/h2-1.4.199.jar\" ), driver = \"org.h2.Driver\" , server = f \"jdbc:h2: { db_path } \" , user = \"sa\" , password = \"\" , database = \"\" , ) except Exception as e : print ( \"An exception occurred\" , e ) def import_from_semsim_file ( self , input_file : Path , subject_prefix : str , object_prefix : str ): \"\"\"imports semsim tsv profile into exomiser phenotype database Args: input_file (Path): semsim profile subject_prefix (str): Subject Prefix. e.g HP object_prefix (str): Object Prefix. e.g MP \"\"\" with self . connector as cnn : conn = DBConnection ( cnn ) reader = pl . read_csv_batched ( input_file , separator = \" \\t \" ) batch_length = 5 batches = reader . next_batches ( batch_length ) cursor = conn . get_cursor () # # TODO: Refactor this with open ( input_file , \"r\" ) as f : total = sum ( 1 for line in f ) pbar = tqdm ( total = total - 1 ) mapping_id = 1 while batches : input_data = pl . concat ( batches ) sql = _semsim2h2 ( input_data , object_prefix , subject_prefix , mapping_id = mapping_id ) cursor . execute ( sql ) len_input_data = len ( input_data ) mapping_id += len_input_data pbar . update ( len_input_data ) batches = reader . 
next_batches ( batch_length ) import_from_semsim_file ( input_file , subject_prefix , object_prefix ) imports semsim tsv profile into exomiser phenotype database Parameters: Name Type Description Default input_file Path semsim profile required subject_prefix str Subject Prefix. e.g HP required object_prefix str Object Prefix. e.g MP required Source code in src/pheval/infra/exomiserdb.py 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 def import_from_semsim_file ( self , input_file : Path , subject_prefix : str , object_prefix : str ): \"\"\"imports semsim tsv profile into exomiser phenotype database Args: input_file (Path): semsim profile subject_prefix (str): Subject Prefix. e.g HP object_prefix (str): Object Prefix. e.g MP \"\"\" with self . connector as cnn : conn = DBConnection ( cnn ) reader = pl . read_csv_batched ( input_file , separator = \" \\t \" ) batch_length = 5 batches = reader . next_batches ( batch_length ) cursor = conn . get_cursor () # # TODO: Refactor this with open ( input_file , \"r\" ) as f : total = sum ( 1 for line in f ) pbar = tqdm ( total = total - 1 ) mapping_id = 1 while batches : input_data = pl . concat ( batches ) sql = _semsim2h2 ( input_data , object_prefix , subject_prefix , mapping_id = mapping_id ) cursor . execute ( sql ) len_input_data = len ( input_data ) mapping_id += len_input_data pbar . update ( len_input_data ) batches = reader . next_batches ( batch_length )","title":"Exomiserdb"},{"location":"api/pheval/infra/exomiserdb/#src.pheval.infra.exomiserdb.DBConnection","text":"Source code in src/pheval/infra/exomiserdb.py 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 class DBConnection : connection = None def __init__ ( self , connection ): DBConnection . connection = connection @classmethod def get_connection ( cls ) -> jaydebeapi . Connection : \"\"\"Creates return new Singleton database connection\"\"\" return DBConnection . connection def close ( self ): return self . 
connection . close () @classmethod def get_cursor ( cls ) -> jaydebeapi . Cursor : connection = cls . get_connection () return connection . cursor ()","title":"DBConnection"},{"location":"api/pheval/infra/exomiserdb/#src.pheval.infra.exomiserdb.DBConnection.get_connection","text":"Creates return new Singleton database connection Source code in src/pheval/infra/exomiserdb.py 49 50 51 52 @classmethod def get_connection ( cls ) -> jaydebeapi . Connection : \"\"\"Creates return new Singleton database connection\"\"\" return DBConnection . connection","title":"get_connection"},{"location":"api/pheval/infra/exomiserdb/#src.pheval.infra.exomiserdb.DBConnector","text":"Source code in src/pheval/infra/exomiserdb.py 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 class DBConnector : def __init__ ( self , jar : Path , driver : str , server : str , database : str , user : str , password : str ): self . jar = jar self . driver = driver self . server = server self . database = database self . user = user self . password = password self . dbconn = None def create_connection ( self ) -> jaydebeapi . Connection : \"\"\"creates h2 database connection\"\"\" return jaydebeapi . connect ( self . driver , f \" { self . server }{ self . database } \" , [ self . user , self . password ], self . jar , ) def __enter__ ( self ) -> jaydebeapi . Connection : self . dbconn = self . create_connection () return self . dbconn def __exit__ ( self , * other ): self . dbconn . close ()","title":"DBConnector"},{"location":"api/pheval/infra/exomiserdb/#src.pheval.infra.exomiserdb.DBConnector.create_connection","text":"creates h2 database connection Source code in src/pheval/infra/exomiserdb.py 26 27 28 29 30 31 32 33 def create_connection ( self ) -> jaydebeapi . Connection : \"\"\"creates h2 database connection\"\"\" return jaydebeapi . connect ( self . driver , f \" { self . server }{ self . database } \" , [ self . user , self . password ], self . 
jar , )","title":"create_connection"},{"location":"api/pheval/infra/exomiserdb/#src.pheval.infra.exomiserdb.ExomiserDB","text":"Source code in src/pheval/infra/exomiserdb.py 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 class ExomiserDB : def __init__ ( self , db_path : Path ): try : self . connector = DBConnector ( # noqa jar = os . path . join ( os . path . dirname ( __file__ ), \"../../../lib/h2-1.4.199.jar\" ), driver = \"org.h2.Driver\" , server = f \"jdbc:h2: { db_path } \" , user = \"sa\" , password = \"\" , database = \"\" , ) except Exception as e : print ( \"An exception occurred\" , e ) def import_from_semsim_file ( self , input_file : Path , subject_prefix : str , object_prefix : str ): \"\"\"imports semsim tsv profile into exomiser phenotype database Args: input_file (Path): semsim profile subject_prefix (str): Subject Prefix. e.g HP object_prefix (str): Object Prefix. e.g MP \"\"\" with self . connector as cnn : conn = DBConnection ( cnn ) reader = pl . read_csv_batched ( input_file , separator = \" \\t \" ) batch_length = 5 batches = reader . next_batches ( batch_length ) cursor = conn . get_cursor () # # TODO: Refactor this with open ( input_file , \"r\" ) as f : total = sum ( 1 for line in f ) pbar = tqdm ( total = total - 1 ) mapping_id = 1 while batches : input_data = pl . concat ( batches ) sql = _semsim2h2 ( input_data , object_prefix , subject_prefix , mapping_id = mapping_id ) cursor . execute ( sql ) len_input_data = len ( input_data ) mapping_id += len_input_data pbar . update ( len_input_data ) batches = reader . next_batches ( batch_length )","title":"ExomiserDB"},{"location":"api/pheval/infra/exomiserdb/#src.pheval.infra.exomiserdb.ExomiserDB.import_from_semsim_file","text":"imports semsim tsv profile into exomiser phenotype database Parameters: Name Type Description Default input_file Path semsim profile required subject_prefix str Subject Prefix. 
e.g HP required object_prefix str Object Prefix. e.g MP required Source code in src/pheval/infra/exomiserdb.py 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 def import_from_semsim_file ( self , input_file : Path , subject_prefix : str , object_prefix : str ): \"\"\"imports semsim tsv profile into exomiser phenotype database Args: input_file (Path): semsim profile subject_prefix (str): Subject Prefix. e.g HP object_prefix (str): Object Prefix. e.g MP \"\"\" with self . connector as cnn : conn = DBConnection ( cnn ) reader = pl . read_csv_batched ( input_file , separator = \" \\t \" ) batch_length = 5 batches = reader . next_batches ( batch_length ) cursor = conn . get_cursor () # # TODO: Refactor this with open ( input_file , \"r\" ) as f : total = sum ( 1 for line in f ) pbar = tqdm ( total = total - 1 ) mapping_id = 1 while batches : input_data = pl . concat ( batches ) sql = _semsim2h2 ( input_data , object_prefix , subject_prefix , mapping_id = mapping_id ) cursor . execute ( sql ) len_input_data = len ( input_data ) mapping_id += len_input_data pbar . update ( len_input_data ) batches = reader . next_batches ( batch_length )","title":"import_from_semsim_file"},{"location":"api/pheval/post_processing/post_processing/","text":"PhEvalDiseaseResult dataclass Bases: PhEvalResult Minimal data required from tool-specific output for disease prioritisation Args: disease_name (str): Disease name for the result entry disease_identifier (str): Identifier for the disease result entry in the OMIM namespace score (str): Score for the disease result entry Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. 
Source code in src/pheval/post_processing/post_processing.py 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 @dataclass class PhEvalDiseaseResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for disease prioritisation Args: disease_name (str): Disease name for the result entry disease_identifier (str): Identifier for the disease result entry in the OMIM namespace score (str): Score for the disease result entry Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" disease_name : str disease_identifier : str score : float PhEvalGeneResult dataclass Bases: PhEvalResult Minimal data required from tool-specific output for gene prioritisation result Args: gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry score (float): The score for the gene result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. Source code in src/pheval/post_processing/post_processing.py 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 @dataclass class PhEvalGeneResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for gene prioritisation result Args: gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry score (float): The score for the gene result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. 
\"\"\" gene_symbol : Union [ List [ str ], str ] gene_identifier : Union [ List [ str ], str ] score : float PhEvalResult dataclass Base class for PhEval results. Source code in src/pheval/post_processing/post_processing.py 25 26 27 @dataclass class PhEvalResult : \"\"\"Base class for PhEval results.\"\"\" PhEvalVariantResult dataclass Bases: PhEvalResult Minimal data required from tool-specific output for variant prioritisation Args: chromosome (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. start (int): The start position of the variant end (int): The end position of the variant ref (str): The reference allele of the variant alt (str): The alternate allele of the variant score (float): The score for the variant result entry Notes: While we recommend providing the variant's chromosome in the specified format, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. Source code in src/pheval/post_processing/post_processing.py 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 @dataclass class PhEvalVariantResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for variant prioritisation Args: chromosome (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. 
start (int): The start position of the variant end (int): The end position of the variant ref (str): The reference allele of the variant alt (str): The alternate allele of the variant score (float): The score for the variant result entry Notes: While we recommend providing the variant's chromosome in the specified format, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" chromosome : str start : int end : int ref : str alt : str score : float RankedPhEvalDiseaseResult dataclass Bases: PhEvalDiseaseResult PhEval disease result with corresponding rank Args: rank (int): The rank for the result entry Source code in src/pheval/post_processing/post_processing.py 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 @dataclass class RankedPhEvalDiseaseResult ( PhEvalDiseaseResult ): \"\"\"PhEval disease result with corresponding rank Args: rank (int): The rank for the result entry \"\"\" rank : int @staticmethod def from_disease_result ( pheval_disease_result : PhEvalDiseaseResult , rank : int ): \"\"\"Return RankedPhEvalDiseaseResult from a PhEvalDiseaseResult and rank Args: pheval_disease_result (PhEvalDiseaseResult): The disease result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalDiseaseResult: The result as a RankedPhEvalDiseaseResult \"\"\" return RankedPhEvalDiseaseResult ( disease_name = pheval_disease_result . disease_name , disease_identifier = pheval_disease_result . disease_identifier , score = pheval_disease_result . 
score , rank = rank , ) from_disease_result ( pheval_disease_result , rank ) staticmethod Return RankedPhEvalDiseaseResult from a PhEvalDiseaseResult and rank Args: pheval_disease_result (PhEvalDiseaseResult): The disease result entry rank (int): The corresponding rank for the result entry Returns: Name Type Description RankedPhEvalDiseaseResult The result as a RankedPhEvalDiseaseResult Source code in src/pheval/post_processing/post_processing.py 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 @staticmethod def from_disease_result ( pheval_disease_result : PhEvalDiseaseResult , rank : int ): \"\"\"Return RankedPhEvalDiseaseResult from a PhEvalDiseaseResult and rank Args: pheval_disease_result (PhEvalDiseaseResult): The disease result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalDiseaseResult: The result as a RankedPhEvalDiseaseResult \"\"\" return RankedPhEvalDiseaseResult ( disease_name = pheval_disease_result . disease_name , disease_identifier = pheval_disease_result . disease_identifier , score = pheval_disease_result . 
score , rank = rank , ) RankedPhEvalGeneResult dataclass Bases: PhEvalGeneResult PhEval gene result with corresponding rank Args: rank (int): The rank for the result entry Source code in src/pheval/post_processing/post_processing.py 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 @dataclass class RankedPhEvalGeneResult ( PhEvalGeneResult ): \"\"\"PhEval gene result with corresponding rank Args: rank (int): The rank for the result entry \"\"\" rank : int @staticmethod def from_gene_result ( pheval_gene_result : PhEvalGeneResult , rank : int ): \"\"\"Return RankedPhEvalGeneResult from a PhEvalGeneResult and rank Args: pheval_gene_result (PhEvalGeneResult): The gene result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalGeneResult: The result as a RankedPhEvalGeneResult \"\"\" return RankedPhEvalGeneResult ( gene_symbol = pheval_gene_result . gene_symbol , gene_identifier = pheval_gene_result . gene_identifier , score = pheval_gene_result . score , rank = rank , ) from_gene_result ( pheval_gene_result , rank ) staticmethod Return RankedPhEvalGeneResult from a PhEvalGeneResult and rank Args: pheval_gene_result (PhEvalGeneResult): The gene result entry rank (int): The corresponding rank for the result entry Returns: Name Type Description RankedPhEvalGeneResult The result as a RankedPhEvalGeneResult Source code in src/pheval/post_processing/post_processing.py 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 @staticmethod def from_gene_result ( pheval_gene_result : PhEvalGeneResult , rank : int ): \"\"\"Return RankedPhEvalGeneResult from a PhEvalGeneResult and rank Args: pheval_gene_result (PhEvalGeneResult): The gene result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalGeneResult: The result as a RankedPhEvalGeneResult \"\"\" return RankedPhEvalGeneResult ( gene_symbol = pheval_gene_result . gene_symbol , gene_identifier = pheval_gene_result . 
gene_identifier , score = pheval_gene_result . score , rank = rank , ) RankedPhEvalVariantResult dataclass Bases: PhEvalVariantResult PhEval variant result with corresponding rank Args: rank (int): The rank for the result entry Source code in src/pheval/post_processing/post_processing.py 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 @dataclass class RankedPhEvalVariantResult ( PhEvalVariantResult ): \"\"\"PhEval variant result with corresponding rank Args: rank (int): The rank for the result entry \"\"\" rank : int @staticmethod def from_variant_result ( pheval_variant_result : PhEvalVariantResult , rank : int ): \"\"\"Return RankedPhEvalVariantResult from a PhEvalVariantResult and rank Args: pheval_variant_result (PhEvalVariantResult): The variant result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalVariantResult: The result as a RankedPhEvalVariantResult \"\"\" return RankedPhEvalVariantResult ( chromosome = pheval_variant_result . chromosome , start = pheval_variant_result . start , end = pheval_variant_result . end , ref = pheval_variant_result . ref , alt = pheval_variant_result . alt , score = pheval_variant_result . 
score , rank = rank , ) from_variant_result ( pheval_variant_result , rank ) staticmethod Return RankedPhEvalVariantResult from a PhEvalVariantResult and rank Args: pheval_variant_result (PhEvalVariantResult): The variant result entry rank (int): The corresponding rank for the result entry Returns: Name Type Description RankedPhEvalVariantResult The result as a RankedPhEvalVariantResult Source code in src/pheval/post_processing/post_processing.py 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 @staticmethod def from_variant_result ( pheval_variant_result : PhEvalVariantResult , rank : int ): \"\"\"Return RankedPhEvalVariantResult from a PhEvalVariantResult and rank Args: pheval_variant_result (PhEvalVariantResult): The variant result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalVariantResult: The result as a RankedPhEvalVariantResult \"\"\" return RankedPhEvalVariantResult ( chromosome = pheval_variant_result . chromosome , start = pheval_variant_result . start , end = pheval_variant_result . end , ref = pheval_variant_result . ref , alt = pheval_variant_result . alt , score = pheval_variant_result . score , rank = rank , ) ResultSorter Class for sorting PhEvalResult instances based on a given sort order. Source code in src/pheval/post_processing/post_processing.py 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 class ResultSorter : \"\"\"Class for sorting PhEvalResult instances based on a given sort order.\"\"\" def __init__ ( self , pheval_results : [ PhEvalResult ], sort_order : SortOrder ): \"\"\" Initialise ResultSorter Args: pheval_results ([PhEvalResult]): List of PhEvalResult instances to be sorted sort_order (SortOrder): Sorting order to be applied \"\"\" self . pheval_results = pheval_results self . 
sort_order = sort_order def _sort_by_decreasing_score ( self ) -> [ PhEvalResult ]: \"\"\" Sort results in descending order based on the score Returns: [PhEvalResult]: Sorted list of PhEvalResult instances. \"\"\" return sorted ( self . pheval_results , key = operator . attrgetter ( \"score\" ), reverse = True ) def _sort_by_increasing_score ( self ) -> [ PhEvalResult ]: \"\"\" Sort results in ascending order based on the score Returns: [PhEvalResult]: Sorted list of PhEvalResult instances. \"\"\" return sorted ( self . pheval_results , key = operator . attrgetter ( \"score\" ), reverse = False ) def sort_pheval_results ( self ) -> [ PhEvalResult ]: \"\"\" Sort results based on the specified sort order. Returns: [PhEvalResult]: Sorted list of PhEvalResult instances. \"\"\" return ( self . _sort_by_increasing_score () if self . sort_order == SortOrder . ASCENDING else self . _sort_by_decreasing_score () ) __init__ ( pheval_results , sort_order ) Initialise ResultSorter Parameters: Name Type Description Default pheval_results [ PhEvalResult ] List of PhEvalResult instances to be sorted required sort_order SortOrder Sorting order to be applied required Source code in src/pheval/post_processing/post_processing.py 188 189 190 191 192 193 194 195 196 197 def __init__ ( self , pheval_results : [ PhEvalResult ], sort_order : SortOrder ): \"\"\" Initialise ResultSorter Args: pheval_results ([PhEvalResult]): List of PhEvalResult instances to be sorted sort_order (SortOrder): Sorting order to be applied \"\"\" self . pheval_results = pheval_results self . sort_order = sort_order sort_pheval_results () Sort results based on the specified sort order. Returns: Type Description [ PhEvalResult ] [PhEvalResult]: Sorted list of PhEvalResult instances. Source code in src/pheval/post_processing/post_processing.py 217 218 219 220 221 222 223 224 225 226 227 228 def sort_pheval_results ( self ) -> [ PhEvalResult ]: \"\"\" Sort results based on the specified sort order. 
Returns: [PhEvalResult]: Sorted list of PhEvalResult instances. \"\"\" return ( self . _sort_by_increasing_score () if self . sort_order == SortOrder . ASCENDING else self . _sort_by_decreasing_score () ) SortOrder Bases: Enum Enumeration representing sorting orders. Source code in src/pheval/post_processing/post_processing.py 176 177 178 179 180 181 182 class SortOrder ( Enum ): \"\"\"Enumeration representing sorting orders.\"\"\" ASCENDING = 1 \"\"\"Ascending sort order.\"\"\" DESCENDING = 2 \"\"\"Descending sort order.\"\"\" ASCENDING = 1 class-attribute instance-attribute Ascending sort order. DESCENDING = 2 class-attribute instance-attribute Descending sort order. calculate_end_pos ( variant_start , variant_ref ) Calculate the end position for a variant Args: variant_start (int): The start position of the variant variant_ref (str): The reference allele of the variant Returns: Name Type Description int int The end position of the variant Source code in src/pheval/post_processing/post_processing.py 13 14 15 16 17 18 19 20 21 22 def calculate_end_pos ( variant_start : int , variant_ref : str ) -> int : \"\"\"Calculate the end position for a variant Args: variant_start (int): The start position of the variant variant_ref (str): The reference allele of the variant Returns: int: The end position of the variant \"\"\" return variant_start + len ( variant_ref ) - 1 generate_pheval_result ( pheval_result , sort_order_str , output_dir , tool_result_path ) Generate PhEval variant, gene or disease TSV result based on input results. Parameters: Name Type Description Default pheval_result [ PhEvalResult ] List of PhEvalResult instances to be processed. required sort_order_str str String representation of the desired sorting order. required output_dir Path Path to the output directory. required tool_result_path Path Path to the tool-specific result file. 
required Raises: Type Description ValueError If the results are not all the same type or an error occurs during file writing. Source code in src/pheval/post_processing/post_processing.py 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 def generate_pheval_result ( pheval_result : [ PhEvalResult ], sort_order_str : str , output_dir : Path , tool_result_path : Path , ) -> None : \"\"\" Generate PhEval variant, gene or disease TSV result based on input results. Args: pheval_result ([PhEvalResult]): List of PhEvalResult instances to be processed. sort_order_str (str): String representation of the desired sorting order. output_dir (Path): Path to the output directory. tool_result_path (Path): Path to the tool-specific result file. Raises: ValueError: If the results are not all the same type or an error occurs during file writing. \"\"\" if not pheval_result : info_log . warning ( f \"No results found for { tool_result_path . 
name } \" ) return ranked_pheval_result = _create_pheval_result ( pheval_result , sort_order_str ) if all ( isinstance ( result , PhEvalGeneResult ) for result in pheval_result ): _write_pheval_gene_result ( ranked_pheval_result , output_dir , tool_result_path ) elif all ( isinstance ( result , PhEvalVariantResult ) for result in pheval_result ): _write_pheval_variant_result ( ranked_pheval_result , output_dir , tool_result_path ) elif all ( isinstance ( result , PhEvalDiseaseResult ) for result in pheval_result ): _write_pheval_disease_result ( ranked_pheval_result , output_dir , tool_result_path ) else : raise ValueError ( \"Results are not all of the same type.\" )","title":"Post processing"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.PhEvalDiseaseResult","text":"Bases: PhEvalResult Minimal data required from tool-specific output for disease prioritisation Args: disease_name (str): Disease name for the result entry disease_identifier (str): Identifier for the disease result entry in the OMIM namespace score (str): Score for the disease result entry Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. Source code in src/pheval/post_processing/post_processing.py 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 @dataclass class PhEvalDiseaseResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for disease prioritisation Args: disease_name (str): Disease name for the result entry disease_identifier (str): Identifier for the disease result entry in the OMIM namespace score (str): Score for the disease result entry Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. 
\"\"\" disease_name : str disease_identifier : str score : float","title":"PhEvalDiseaseResult"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.PhEvalGeneResult","text":"Bases: PhEvalResult Minimal data required from tool-specific output for gene prioritisation result Args: gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry score (float): The score for the gene result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. Source code in src/pheval/post_processing/post_processing.py 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 @dataclass class PhEvalGeneResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for gene prioritisation result Args: gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry score (float): The score for the gene result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" gene_symbol : Union [ List [ str ], str ] gene_identifier : Union [ List [ str ], str ] score : float","title":"PhEvalGeneResult"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.PhEvalResult","text":"Base class for PhEval results. 
Source code in src/pheval/post_processing/post_processing.py 25 26 27 @dataclass class PhEvalResult : \"\"\"Base class for PhEval results.\"\"\"","title":"PhEvalResult"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.PhEvalVariantResult","text":"Bases: PhEvalResult Minimal data required from tool-specific output for variant prioritisation Args: chromosome (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. start (int): The start position of the variant end (int): The end position of the variant ref (str): The reference allele of the variant alt (str): The alternate allele of the variant score (float): The score for the variant result entry Notes: While we recommend providing the variant's chromosome in the specified format, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. Source code in src/pheval/post_processing/post_processing.py 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 @dataclass class PhEvalVariantResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for variant prioritisation Args: chromosome (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. 
start (int): The start position of the variant end (int): The end position of the variant ref (str): The reference allele of the variant alt (str): The alternate allele of the variant score (float): The score for the variant result entry Notes: While we recommend providing the variant's chromosome in the specified format, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" chromosome : str start : int end : int ref : str alt : str score : float","title":"PhEvalVariantResult"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.RankedPhEvalDiseaseResult","text":"Bases: PhEvalDiseaseResult PhEval disease result with corresponding rank Args: rank (int): The rank for the result entry Source code in src/pheval/post_processing/post_processing.py 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 @dataclass class RankedPhEvalDiseaseResult ( PhEvalDiseaseResult ): \"\"\"PhEval disease result with corresponding rank Args: rank (int): The rank for the result entry \"\"\" rank : int @staticmethod def from_disease_result ( pheval_disease_result : PhEvalDiseaseResult , rank : int ): \"\"\"Return RankedPhEvalDiseaseResult from a PhEvalDiseaseResult and rank Args: pheval_disease_result (PhEvalDiseaseResult): The disease result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalDiseaseResult: The result as a RankedPhEvalDiseaseResult \"\"\" return RankedPhEvalDiseaseResult ( disease_name = pheval_disease_result . disease_name , disease_identifier = pheval_disease_result . disease_identifier , score = pheval_disease_result . 
score , rank = rank , )","title":"RankedPhEvalDiseaseResult"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.RankedPhEvalDiseaseResult.from_disease_result","text":"Return RankedPhEvalDiseaseResult from a PhEvalDiseaseResult and rank Args: pheval_disease_result (PhEvalDiseaseResult): The disease result entry rank (int): The corresponding rank for the result entry Returns: Name Type Description RankedPhEvalDiseaseResult The result as a RankedPhEvalDiseaseResult Source code in src/pheval/post_processing/post_processing.py 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 @staticmethod def from_disease_result ( pheval_disease_result : PhEvalDiseaseResult , rank : int ): \"\"\"Return RankedPhEvalDiseaseResult from a PhEvalDiseaseResult and rank Args: pheval_disease_result (PhEvalDiseaseResult): The disease result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalDiseaseResult: The result as a RankedPhEvalDiseaseResult \"\"\" return RankedPhEvalDiseaseResult ( disease_name = pheval_disease_result . disease_name , disease_identifier = pheval_disease_result . disease_identifier , score = pheval_disease_result . 
score , rank = rank , )","title":"from_disease_result"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.RankedPhEvalGeneResult","text":"Bases: PhEvalGeneResult PhEval gene result with corresponding rank Args: rank (int): The rank for the result entry Source code in src/pheval/post_processing/post_processing.py 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 @dataclass class RankedPhEvalGeneResult ( PhEvalGeneResult ): \"\"\"PhEval gene result with corresponding rank Args: rank (int): The rank for the result entry \"\"\" rank : int @staticmethod def from_gene_result ( pheval_gene_result : PhEvalGeneResult , rank : int ): \"\"\"Return RankedPhEvalGeneResult from a PhEvalGeneResult and rank Args: pheval_gene_result (PhEvalGeneResult): The gene result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalGeneResult: The result as a RankedPhEvalGeneResult \"\"\" return RankedPhEvalGeneResult ( gene_symbol = pheval_gene_result . gene_symbol , gene_identifier = pheval_gene_result . gene_identifier , score = pheval_gene_result . 
score , rank = rank , )","title":"RankedPhEvalGeneResult"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.RankedPhEvalGeneResult.from_gene_result","text":"Return RankedPhEvalGeneResult from a PhEvalGeneResult and rank Args: pheval_gene_result (PhEvalGeneResult): The gene result entry rank (int): The corresponding rank for the result entry Returns: Name Type Description RankedPhEvalGeneResult The result as a RankedPhEvalGeneResult Source code in src/pheval/post_processing/post_processing.py 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 @staticmethod def from_gene_result ( pheval_gene_result : PhEvalGeneResult , rank : int ): \"\"\"Return RankedPhEvalGeneResult from a PhEvalGeneResult and rank Args: pheval_gene_result (PhEvalGeneResult): The gene result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalGeneResult: The result as a RankedPhEvalGeneResult \"\"\" return RankedPhEvalGeneResult ( gene_symbol = pheval_gene_result . gene_symbol , gene_identifier = pheval_gene_result . gene_identifier , score = pheval_gene_result . 
score , rank = rank , )","title":"from_gene_result"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.RankedPhEvalVariantResult","text":"Bases: PhEvalVariantResult PhEval variant result with corresponding rank Args: rank (int): The rank for the result entry Source code in src/pheval/post_processing/post_processing.py 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 @dataclass class RankedPhEvalVariantResult ( PhEvalVariantResult ): \"\"\"PhEval variant result with corresponding rank Args: rank (int): The rank for the result entry \"\"\" rank : int @staticmethod def from_variant_result ( pheval_variant_result : PhEvalVariantResult , rank : int ): \"\"\"Return RankedPhEvalVariantResult from a PhEvalVariantResult and rank Args: pheval_variant_result (PhEvalVariantResult): The variant result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalVariantResult: The result as a RankedPhEvalVariantResult \"\"\" return RankedPhEvalVariantResult ( chromosome = pheval_variant_result . chromosome , start = pheval_variant_result . start , end = pheval_variant_result . end , ref = pheval_variant_result . ref , alt = pheval_variant_result . alt , score = pheval_variant_result . 
score , rank = rank , )","title":"RankedPhEvalVariantResult"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.RankedPhEvalVariantResult.from_variant_result","text":"Return RankedPhEvalVariantResult from a PhEvalVariantResult and rank Args: pheval_variant_result (PhEvalVariantResult): The variant result entry rank (int): The corresponding rank for the result entry Returns: Name Type Description RankedPhEvalVariantResult The result as a RankedPhEvalVariantResult Source code in src/pheval/post_processing/post_processing.py 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 @staticmethod def from_variant_result ( pheval_variant_result : PhEvalVariantResult , rank : int ): \"\"\"Return RankedPhEvalVariantResult from a PhEvalVariantResult and rank Args: pheval_variant_result (PhEvalVariantResult): The variant result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalVariantResult: The result as a RankedPhEvalVariantResult \"\"\" return RankedPhEvalVariantResult ( chromosome = pheval_variant_result . chromosome , start = pheval_variant_result . start , end = pheval_variant_result . end , ref = pheval_variant_result . ref , alt = pheval_variant_result . alt , score = pheval_variant_result . score , rank = rank , )","title":"from_variant_result"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.ResultSorter","text":"Class for sorting PhEvalResult instances based on a given sort order. 
Source code in src/pheval/post_processing/post_processing.py 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 class ResultSorter : \"\"\"Class for sorting PhEvalResult instances based on a given sort order.\"\"\" def __init__ ( self , pheval_results : [ PhEvalResult ], sort_order : SortOrder ): \"\"\" Initialise ResultSorter Args: pheval_results ([PhEvalResult]): List of PhEvalResult instances to be sorted sort_order (SortOrder): Sorting order to be applied \"\"\" self . pheval_results = pheval_results self . sort_order = sort_order def _sort_by_decreasing_score ( self ) -> [ PhEvalResult ]: \"\"\" Sort results in descending order based on the score Returns: [PhEvalResult]: Sorted list of PhEvalResult instances. \"\"\" return sorted ( self . pheval_results , key = operator . attrgetter ( \"score\" ), reverse = True ) def _sort_by_increasing_score ( self ) -> [ PhEvalResult ]: \"\"\" Sort results in ascending order based on the score Returns: [PhEvalResult]: Sorted list of PhEvalResult instances. \"\"\" return sorted ( self . pheval_results , key = operator . attrgetter ( \"score\" ), reverse = False ) def sort_pheval_results ( self ) -> [ PhEvalResult ]: \"\"\" Sort results based on the specified sort order. Returns: [PhEvalResult]: Sorted list of PhEvalResult instances. \"\"\" return ( self . _sort_by_increasing_score () if self . sort_order == SortOrder . ASCENDING else self . 
_sort_by_decreasing_score () )","title":"ResultSorter"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.ResultSorter.__init__","text":"Initialise ResultSorter Parameters: Name Type Description Default pheval_results [ PhEvalResult ] List of PhEvalResult instances to be sorted required sort_order SortOrder Sorting order to be applied required Source code in src/pheval/post_processing/post_processing.py 188 189 190 191 192 193 194 195 196 197 def __init__ ( self , pheval_results : [ PhEvalResult ], sort_order : SortOrder ): \"\"\" Initialise ResultSorter Args: pheval_results ([PhEvalResult]): List of PhEvalResult instances to be sorted sort_order (SortOrder): Sorting order to be applied \"\"\" self . pheval_results = pheval_results self . sort_order = sort_order","title":"__init__"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.ResultSorter.sort_pheval_results","text":"Sort results based on the specified sort order. Returns: Type Description [ PhEvalResult ] [PhEvalResult]: Sorted list of PhEvalResult instances. Source code in src/pheval/post_processing/post_processing.py 217 218 219 220 221 222 223 224 225 226 227 228 def sort_pheval_results ( self ) -> [ PhEvalResult ]: \"\"\" Sort results based on the specified sort order. Returns: [PhEvalResult]: Sorted list of PhEvalResult instances. \"\"\" return ( self . _sort_by_increasing_score () if self . sort_order == SortOrder . ASCENDING else self . _sort_by_decreasing_score () )","title":"sort_pheval_results"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.SortOrder","text":"Bases: Enum Enumeration representing sorting orders. 
Source code in src/pheval/post_processing/post_processing.py 176 177 178 179 180 181 182 class SortOrder ( Enum ): \"\"\"Enumeration representing sorting orders.\"\"\" ASCENDING = 1 \"\"\"Ascending sort order.\"\"\" DESCENDING = 2 \"\"\"Descending sort order.\"\"\"","title":"SortOrder"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.SortOrder.ASCENDING","text":"Ascending sort order.","title":"ASCENDING"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.SortOrder.DESCENDING","text":"Descending sort order.","title":"DESCENDING"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.calculate_end_pos","text":"Calculate the end position for a variant Args: variant_start (int): The start position of the variant variant_ref (str): The reference allele of the variant Returns: Name Type Description int int The end position of the variant Source code in src/pheval/post_processing/post_processing.py 13 14 15 16 17 18 19 20 21 22 def calculate_end_pos ( variant_start : int , variant_ref : str ) -> int : \"\"\"Calculate the end position for a variant Args: variant_start (int): The start position of the variant variant_ref (str): The reference allele of the variant Returns: int: The end position of the variant \"\"\" return variant_start + len ( variant_ref ) - 1","title":"calculate_end_pos"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.generate_pheval_result","text":"Generate PhEval variant, gene or disease TSV result based on input results. Parameters: Name Type Description Default pheval_result [ PhEvalResult ] List of PhEvalResult instances to be processed. required sort_order_str str String representation of the desired sorting order. required output_dir Path Path to the output directory. required tool_result_path Path Path to the tool-specific result file. 
required Raises: Type Description ValueError If the results are not all the same type or an error occurs during file writing. Source code in src/pheval/post_processing/post_processing.py 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 def generate_pheval_result ( pheval_result : [ PhEvalResult ], sort_order_str : str , output_dir : Path , tool_result_path : Path , ) -> None : \"\"\" Generate PhEval variant, gene or disease TSV result based on input results. Args: pheval_result ([PhEvalResult]): List of PhEvalResult instances to be processed. sort_order_str (str): String representation of the desired sorting order. output_dir (Path): Path to the output directory. tool_result_path (Path): Path to the tool-specific result file. Raises: ValueError: If the results are not all the same type or an error occurs during file writing. \"\"\" if not pheval_result : info_log . warning ( f \"No results found for { tool_result_path . name } \" ) return ranked_pheval_result = _create_pheval_result ( pheval_result , sort_order_str ) if all ( isinstance ( result , PhEvalGeneResult ) for result in pheval_result ): _write_pheval_gene_result ( ranked_pheval_result , output_dir , tool_result_path ) elif all ( isinstance ( result , PhEvalVariantResult ) for result in pheval_result ): _write_pheval_variant_result ( ranked_pheval_result , output_dir , tool_result_path ) elif all ( isinstance ( result , PhEvalDiseaseResult ) for result in pheval_result ): _write_pheval_disease_result ( ranked_pheval_result , output_dir , tool_result_path ) else : raise ValueError ( \"Results are not all of the same type.\" )","title":"generate_pheval_result"},{"location":"api/pheval/prepare/create_noisy_phenopackets/","text":"HpoRandomiser Class for randomising phenopacket phenotypic features using Human Phenotype Ontology (HPO). 
Source code in src/pheval/prepare/create_noisy_phenopackets.py 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 class HpoRandomiser : \"\"\"Class for randomising phenopacket phenotypic features using Human Phenotype Ontology (HPO).\"\"\" def __init__ ( self , hpo_ontology : ProntoImplementation , scramble_factor : float ): \"\"\" Initialise the HpoRandomiser. Args: hpo_ontology (ProntoImplementation): The instance of the HPO ontology. scramble_factor (float): A factor for scrambling phenotypic features. \"\"\" self . hpo_ontology = hpo_ontology self . phenotypic_abnormalities = set ( hpo_ontology . roots ( predicates = [ \"HP:0000118\" ])) self . scramble_factor = scramble_factor def scramble_factor_proportions ( self , phenotypic_features : list [ PhenotypicFeature ]) -> int : \"\"\" Calculate the proportion of scrambled HPO terms based on the scramble factor. Args: phenotypic_features (list[PhenotypicFeature]): List of phenotypic features. Returns: int: The calculated number of phenotypic features to be scrambled. \"\"\" if len ( phenotypic_features ) == 1 : return 1 else : return int ( round ( len ( phenotypic_features ) * self . 
scramble_factor , 0 )) def retrieve_hpo_term ( self , hpo_id : str ) -> PhenotypicFeature : \"\"\" Retrieve an HPO term based on the provided HPO ID. Args: hpo_id (str): The HPO ID of the term to retrieve. Returns: PhenotypicFeature: The PhenotypicFeature object representing the retrieved HPO term. \"\"\" rels = self . hpo_ontology . entity_alias_map ( hpo_id ) hpo_term = \"\" . join ( rels [( list ( rels . keys ())[ 0 ])]) return PhenotypicFeature ( type = OntologyClass ( id = hpo_id , label = hpo_term )) @staticmethod def retain_real_patient_terms ( phenotypic_features : List [ PhenotypicFeature ], number_of_scrambled_terms : int , ) -> List [ PhenotypicFeature ]: \"\"\" Return a list of real patient HPO terms, retaining a specific number of non-scrambled terms. Args: phenotypic_features (List[PhenotypicFeature]): List of phenotypic features. number_of_scrambled_terms (int): The count of scrambled HPO terms. Returns: List[PhenotypicFeature]: A list of non-scrambled (real patient) HPO terms. \"\"\" if len ( phenotypic_features ) > 1 : number_of_real_id = len ( phenotypic_features ) - number_of_scrambled_terms else : number_of_real_id = 1 return random . sample ( phenotypic_features , number_of_real_id ) def convert_patient_terms_to_parent ( self , phenotypic_features : List [ PhenotypicFeature ], retained_phenotypic_features : List [ PhenotypicFeature ], number_of_scrambled_terms : int , ) -> List [ PhenotypicFeature ]: \"\"\" Convert a subset of patient HPO terms to their respective parent terms. Args: phenotypic_features (List[PhenotypicFeature]): List of all phenotypic features. retained_phenotypic_features (List[PhenotypicFeature]): List of retained non-scrambled phenotypic features. number_of_scrambled_terms (int): The count of scrambled HPO terms. Returns: List[PhenotypicFeature]: A list of HPO terms converted to their parent terms. 
Note: This method identifies a subset of patient HPO terms that are not retained among the non-scrambled phenotypic features and converts them to their respective parent terms. It then returns a list of parent HPO terms based on the provided scrambled terms count. If no remaining HPO terms are available for conversion, no parent terms are returned. \"\"\" remaining_hpo = [ i for i in phenotypic_features if i not in retained_phenotypic_features ] if len ( remaining_hpo ) == 0 : number_of_scrambled_terms = 0 hpo_terms_to_be_changed = list ( random . sample ( remaining_hpo , number_of_scrambled_terms )) parent_terms = [] for term in hpo_terms_to_be_changed : if self . hpo_ontology . label ( term . type . id ) . startswith ( \"obsolete\" ): obsolete_term = self . hpo_ontology . entity_metadata_map ( term . type . id ) updated_term = list ( obsolete_term . values ())[ 0 ][ 0 ] parents = self . hpo_ontology . hierarchical_parents ( updated_term ) else : parents = self . hpo_ontology . hierarchical_parents ( term . type . id ) if not parents : parent_terms . append ( term ) else : parent_terms . append ( self . retrieve_hpo_term ( random . choice ( parents ))) return parent_terms def create_random_hpo_terms ( self , number_of_scrambled_terms : int ) -> List [ PhenotypicFeature ]: \"\"\" Generate a list of random HPO terms. Args: number_of_scrambled_terms (int): The count of random HPO terms to be generated. Returns: List[PhenotypicFeature]: A list of randomly selected HPO terms. \"\"\" random_ids = list ( random . sample ( sorted ( self . phenotypic_abnormalities ), number_of_scrambled_terms ) ) return [ self . retrieve_hpo_term ( random_id ) for random_id in random_ids ] def randomise_hpo_terms ( self , phenotypic_features : List [ PhenotypicFeature ], ) -> List [ PhenotypicFeature ]: \"\"\" Randomise the provided phenotypic features by combining retained, parent-converted, and random HPO terms. 
Args: phenotypic_features (List[PhenotypicFeature]): List of phenotypic features to be randomised. Returns: List[PhenotypicFeature]: A list of randomised HPO terms. Note: This method randomises the provided phenotypic features by incorporating three types of HPO terms: 1. Retained Patient Terms: Non-scrambled (real patient) HPO terms retained based on the scramble factor. 2. Converted to Parent Terms: Subset of HPO terms converted to their respective parent terms. 3. Random HPO Terms: Newly generated random HPO terms based on the scramble factor. The method determines the count of terms for each category and combines them to form a final list of randomised HPO terms to be used in the phenotypic features. \"\"\" number_of_scrambled_terms = self . scramble_factor_proportions ( phenotypic_features ) retained_patient_terms = self . retain_real_patient_terms ( phenotypic_features , number_of_scrambled_terms ) return ( retained_patient_terms + self . convert_patient_terms_to_parent ( phenotypic_features , retained_patient_terms , number_of_scrambled_terms ) + self . create_random_hpo_terms ( number_of_scrambled_terms ) ) def add_noise_to_phenotypic_profile ( self , phenopacket : Union [ Phenopacket , Family ], ) -> Union [ Phenopacket , Family ]: \"\"\" Randomise the phenotypic profile of a Phenopacket or Family. Args: phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family to be randomised. Returns: Union[Phenopacket, Family]: The randomised Phenopacket or Family. \"\"\" phenotypic_features = PhenopacketUtil ( phenopacket ) . observed_phenotypic_features () random_phenotypes = self . randomise_hpo_terms ( phenotypic_features ) randomised_phenopacket = PhenopacketRebuilder ( phenopacket ) . add_randomised_hpo ( random_phenotypes ) return randomised_phenopacket def create_scrambled_phenopacket ( self , output_dir : Path , phenopacket_path : Path , ) -> None : \"\"\" Create a scrambled version of a Phenopacket. 
Args: output_dir (Path): The directory to store the output scrambled Phenopacket. phenopacket_path (Path): The path to the original Phenopacket file. \"\"\" phenopacket = phenopacket_reader ( phenopacket_path ) created_noisy_phenopacket = self . add_noise_to_phenotypic_profile ( phenopacket , ) write_phenopacket ( created_noisy_phenopacket , output_dir . joinpath ( phenopacket_path . name ), ) def create_scrambled_phenopackets ( self , output_dir : Path , phenopacket_dir : Path , ) -> None : \"\"\" Create scrambled versions of Phenopackets within a directory. Args: output_dir (Path): The directory to store the output scrambled Phenopackets. phenopacket_dir (Path): The directory containing the original Phenopacket files. \"\"\" phenopacket_files = files_with_suffix ( phenopacket_dir , \".json\" ) for phenopacket_path in phenopacket_files : phenopacket = phenopacket_reader ( phenopacket_path ) created_noisy_phenopacket = self . add_noise_to_phenotypic_profile ( phenopacket ) write_phenopacket ( created_noisy_phenopacket , output_dir . joinpath ( phenopacket_path . name , ), ) __init__ ( hpo_ontology , scramble_factor ) Initialise the HpoRandomiser. Parameters: Name Type Description Default hpo_ontology ProntoImplementation The instance of the HPO ontology. required scramble_factor float A factor for scrambling phenotypic features. required Source code in src/pheval/prepare/create_noisy_phenopackets.py 37 38 39 40 41 42 43 44 45 46 47 def __init__ ( self , hpo_ontology : ProntoImplementation , scramble_factor : float ): \"\"\" Initialise the HpoRandomiser. Args: hpo_ontology (ProntoImplementation): The instance of the HPO ontology. scramble_factor (float): A factor for scrambling phenotypic features. \"\"\" self . hpo_ontology = hpo_ontology self . phenotypic_abnormalities = set ( hpo_ontology . roots ( predicates = [ \"HP:0000118\" ])) self . 
scramble_factor = scramble_factor add_noise_to_phenotypic_profile ( phenopacket ) Randomise the phenotypic profile of a Phenopacket or Family. Parameters: Name Type Description Default phenopacket Union [ Phenopacket , Family ] The Phenopacket or Family to be randomised. required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family]: The randomised Phenopacket or Family. Source code in src/pheval/prepare/create_noisy_phenopackets.py 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 def add_noise_to_phenotypic_profile ( self , phenopacket : Union [ Phenopacket , Family ], ) -> Union [ Phenopacket , Family ]: \"\"\" Randomise the phenotypic profile of a Phenopacket or Family. Args: phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family to be randomised. Returns: Union[Phenopacket, Family]: The randomised Phenopacket or Family. \"\"\" phenotypic_features = PhenopacketUtil ( phenopacket ) . observed_phenotypic_features () random_phenotypes = self . randomise_hpo_terms ( phenotypic_features ) randomised_phenopacket = PhenopacketRebuilder ( phenopacket ) . add_randomised_hpo ( random_phenotypes ) return randomised_phenopacket convert_patient_terms_to_parent ( phenotypic_features , retained_phenotypic_features , number_of_scrambled_terms ) Convert a subset of patient HPO terms to their respective parent terms. Parameters: Name Type Description Default phenotypic_features List [ PhenotypicFeature ] List of all phenotypic features. required retained_phenotypic_features List [ PhenotypicFeature ] List of retained non-scrambled phenotypic features. required number_of_scrambled_terms int The count of scrambled HPO terms. required Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: A list of HPO terms converted to their parent terms. 
Note This method identifies a subset of patient HPO terms that are not retained among the non-scrambled phenotypic features and converts them to their respective parent terms. It then returns a list of parent HPO terms based on the provided scrambled terms count. If no remaining HPO terms are available for conversion, no parent terms are returned. Source code in src/pheval/prepare/create_noisy_phenopackets.py 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 def convert_patient_terms_to_parent ( self , phenotypic_features : List [ PhenotypicFeature ], retained_phenotypic_features : List [ PhenotypicFeature ], number_of_scrambled_terms : int , ) -> List [ PhenotypicFeature ]: \"\"\" Convert a subset of patient HPO terms to their respective parent terms. Args: phenotypic_features (List[PhenotypicFeature]): List of all phenotypic features. retained_phenotypic_features (List[PhenotypicFeature]): List of retained non-scrambled phenotypic features. number_of_scrambled_terms (int): The count of scrambled HPO terms. Returns: List[PhenotypicFeature]: A list of HPO terms converted to their parent terms. Note: This method identifies a subset of patient HPO terms that are not retained among the non-scrambled phenotypic features and converts them to their respective parent terms. It then returns a list of parent HPO terms based on the provided scrambled terms count. If no remaining HPO terms are available for conversion, no parent terms are returned. \"\"\" remaining_hpo = [ i for i in phenotypic_features if i not in retained_phenotypic_features ] if len ( remaining_hpo ) == 0 : number_of_scrambled_terms = 0 hpo_terms_to_be_changed = list ( random . sample ( remaining_hpo , number_of_scrambled_terms )) parent_terms = [] for term in hpo_terms_to_be_changed : if self . hpo_ontology . label ( term . type . id ) . startswith ( \"obsolete\" ): obsolete_term = self . 
hpo_ontology . entity_metadata_map ( term . type . id ) updated_term = list ( obsolete_term . values ())[ 0 ][ 0 ] parents = self . hpo_ontology . hierarchical_parents ( updated_term ) else : parents = self . hpo_ontology . hierarchical_parents ( term . type . id ) if not parents : parent_terms . append ( term ) else : parent_terms . append ( self . retrieve_hpo_term ( random . choice ( parents ))) return parent_terms create_random_hpo_terms ( number_of_scrambled_terms ) Generate a list of random HPO terms. Parameters: Name Type Description Default number_of_scrambled_terms int The count of random HPO terms to be generated. required Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: A list of randomly selected HPO terms. Source code in src/pheval/prepare/create_noisy_phenopackets.py 140 141 142 143 144 145 146 147 148 149 150 151 152 153 def create_random_hpo_terms ( self , number_of_scrambled_terms : int ) -> List [ PhenotypicFeature ]: \"\"\" Generate a list of random HPO terms. Args: number_of_scrambled_terms (int): The count of random HPO terms to be generated. Returns: List[PhenotypicFeature]: A list of randomly selected HPO terms. \"\"\" random_ids = list ( random . sample ( sorted ( self . phenotypic_abnormalities ), number_of_scrambled_terms ) ) return [ self . retrieve_hpo_term ( random_id ) for random_id in random_ids ] create_scrambled_phenopacket ( output_dir , phenopacket_path ) Create a scrambled version of a Phenopacket. Parameters: Name Type Description Default output_dir Path The directory to store the output scrambled Phenopacket. required phenopacket_path Path The path to the original Phenopacket file. required Source code in src/pheval/prepare/create_noisy_phenopackets.py 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 def create_scrambled_phenopacket ( self , output_dir : Path , phenopacket_path : Path , ) -> None : \"\"\" Create a scrambled version of a Phenopacket. 
Args: output_dir (Path): The directory to store the output scrambled Phenopacket. phenopacket_path (Path): The path to the original Phenopacket file. \"\"\" phenopacket = phenopacket_reader ( phenopacket_path ) created_noisy_phenopacket = self . add_noise_to_phenotypic_profile ( phenopacket , ) write_phenopacket ( created_noisy_phenopacket , output_dir . joinpath ( phenopacket_path . name ), ) create_scrambled_phenopackets ( output_dir , phenopacket_dir ) Create scrambled versions of Phenopackets within a directory. Parameters: Name Type Description Default output_dir Path The directory to store the output scrambled Phenopackets. required phenopacket_dir Path The directory containing the original Phenopacket files. required Source code in src/pheval/prepare/create_noisy_phenopackets.py 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 def create_scrambled_phenopackets ( self , output_dir : Path , phenopacket_dir : Path , ) -> None : \"\"\" Create scrambled versions of Phenopackets within a directory. Args: output_dir (Path): The directory to store the output scrambled Phenopackets. phenopacket_dir (Path): The directory containing the original Phenopacket files. \"\"\" phenopacket_files = files_with_suffix ( phenopacket_dir , \".json\" ) for phenopacket_path in phenopacket_files : phenopacket = phenopacket_reader ( phenopacket_path ) created_noisy_phenopacket = self . add_noise_to_phenotypic_profile ( phenopacket ) write_phenopacket ( created_noisy_phenopacket , output_dir . joinpath ( phenopacket_path . name , ), ) randomise_hpo_terms ( phenotypic_features ) Randomise the provided phenotypic features by combining retained, parent-converted, and random HPO terms. Parameters: Name Type Description Default phenotypic_features List [ PhenotypicFeature ] List of phenotypic features to be randomised. required Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: A list of randomised HPO terms. 
Note This method randomises the provided phenotypic features by incorporating three types of HPO terms: 1. Retained Patient Terms: Non-scrambled (real patient) HPO terms retained based on the scramble factor. 2. Converted to Parent Terms: Subset of HPO terms converted to their respective parent terms. 3. Random HPO Terms: Newly generated random HPO terms based on the scramble factor. The method determines the count of terms for each category and combines them to form a final list of randomised HPO terms to be used in the phenotypic features. Source code in src/pheval/prepare/create_noisy_phenopackets.py 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 def randomise_hpo_terms ( self , phenotypic_features : List [ PhenotypicFeature ], ) -> List [ PhenotypicFeature ]: \"\"\" Randomise the provided phenotypic features by combining retained, parent-converted, and random HPO terms. Args: phenotypic_features (List[PhenotypicFeature]): List of phenotypic features to be randomised. Returns: List[PhenotypicFeature]: A list of randomised HPO terms. Note: This method randomises the provided phenotypic features by incorporating three types of HPO terms: 1. Retained Patient Terms: Non-scrambled (real patient) HPO terms retained based on the scramble factor. 2. Converted to Parent Terms: Subset of HPO terms converted to their respective parent terms. 3. Random HPO Terms: Newly generated random HPO terms based on the scramble factor. The method determines the count of terms for each category and combines them to form a final list of randomised HPO terms to be used in the phenotypic features. \"\"\" number_of_scrambled_terms = self . scramble_factor_proportions ( phenotypic_features ) retained_patient_terms = self . retain_real_patient_terms ( phenotypic_features , number_of_scrambled_terms ) return ( retained_patient_terms + self . 
convert_patient_terms_to_parent ( phenotypic_features , retained_patient_terms , number_of_scrambled_terms ) + self . create_random_hpo_terms ( number_of_scrambled_terms ) ) retain_real_patient_terms ( phenotypic_features , number_of_scrambled_terms ) staticmethod Return a list of real patient HPO terms, retaining a specific number of non-scrambled terms. Parameters: Name Type Description Default phenotypic_features List [ PhenotypicFeature ] List of phenotypic features. required number_of_scrambled_terms int The count of scrambled HPO terms. required Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: A list of non-scrambled (real patient) HPO terms. Source code in src/pheval/prepare/create_noisy_phenopackets.py 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 @staticmethod def retain_real_patient_terms ( phenotypic_features : List [ PhenotypicFeature ], number_of_scrambled_terms : int , ) -> List [ PhenotypicFeature ]: \"\"\" Return a list of real patient HPO terms, retaining a specific number of non-scrambled terms. Args: phenotypic_features (List[PhenotypicFeature]): List of phenotypic features. number_of_scrambled_terms (int): The count of scrambled HPO terms. Returns: List[PhenotypicFeature]: A list of non-scrambled (real patient) HPO terms. \"\"\" if len ( phenotypic_features ) > 1 : number_of_real_id = len ( phenotypic_features ) - number_of_scrambled_terms else : number_of_real_id = 1 return random . sample ( phenotypic_features , number_of_real_id ) retrieve_hpo_term ( hpo_id ) Retrieve an HPO term based on the provided HPO ID. Parameters: Name Type Description Default hpo_id str The HPO ID of the term to retrieve. required Returns: Name Type Description PhenotypicFeature PhenotypicFeature The PhenotypicFeature object representing the retrieved HPO term. 
Source code in src/pheval/prepare/create_noisy_phenopackets.py 64 65 66 67 68 69 70 71 72 73 74 75 76 def retrieve_hpo_term ( self , hpo_id : str ) -> PhenotypicFeature : \"\"\" Retrieve an HPO term based on the provided HPO ID. Args: hpo_id (str): The HPO ID of the term to retrieve. Returns: PhenotypicFeature: The PhenotypicFeature object representing the retrieved HPO term. \"\"\" rels = self . hpo_ontology . entity_alias_map ( hpo_id ) hpo_term = \"\" . join ( rels [( list ( rels . keys ())[ 0 ])]) return PhenotypicFeature ( type = OntologyClass ( id = hpo_id , label = hpo_term )) scramble_factor_proportions ( phenotypic_features ) Calculate the proportion of scrambled HPO terms based on the scramble factor. Parameters: Name Type Description Default phenotypic_features list [ PhenotypicFeature ] List of phenotypic features. required Returns: Name Type Description int int The calculated number of phenotypic features to be scrambled. Source code in src/pheval/prepare/create_noisy_phenopackets.py 49 50 51 52 53 54 55 56 57 58 59 60 61 62 def scramble_factor_proportions ( self , phenotypic_features : list [ PhenotypicFeature ]) -> int : \"\"\" Calculate the proportion of scrambled HPO terms based on the scramble factor. Args: phenotypic_features (list[PhenotypicFeature]): List of phenotypic features. Returns: int: The calculated number of phenotypic features to be scrambled. \"\"\" if len ( phenotypic_features ) == 1 : return 1 else : return int ( round ( len ( phenotypic_features ) * self . scramble_factor , 0 )) load_ontology ( local_cached_ontology = None ) Load the Human Phenotype Ontology (HPO). Args: local_cached_ontology(Path): Path to the local cached ontology. Returns: ProntoImplementation: An instance of ProntoImplementation containing the loaded HPO. 
Source code in src/pheval/prepare/create_noisy_phenopackets.py 18 19 20 21 22 23 24 25 26 27 28 29 30 31 def load_ontology ( local_cached_ontology : Path = None ) -> ProntoImplementation : \"\"\" Load the Human Phenotype Ontology (HPO). Args: local_cached_ontology(Path): Path to the local cached ontology. Returns: ProntoImplementation: An instance of ProntoImplementation containing the loaded HPO. \"\"\" if local_cached_ontology is None : resource = OntologyResource ( slug = \"hp.obo\" , local = False ) return ProntoImplementation ( resource ) else : resource = OntologyResource ( slug = local_cached_ontology , local = True ) return ProntoImplementation ( resource ) scramble_phenopackets ( output_dir , phenopacket_path , phenopacket_dir , scramble_factor , local_cached_ontology ) Create scrambled phenopackets from either a single phenopacket or a directory of phenopackets. Parameters: Name Type Description Default output_dir Path The directory to store the output scrambled Phenopackets. required phenopacket_path Path The path to a single Phenopacket file (if applicable). required phenopacket_dir Path The directory containing multiple Phenopacket files (if applicable). required scramble_factor float A factor determining the level of scrambling for phenotypic features. required local_cached_ontology Path The path to the local cached ontology. required Source code in src/pheval/prepare/create_noisy_phenopackets.py 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 def scramble_phenopackets ( output_dir : Path , phenopacket_path : Path , phenopacket_dir : Path , scramble_factor : float , local_cached_ontology : Path , ) -> None : \"\"\" Create scrambled phenopackets from either a single phenopacket or a directory of phenopackets. Args: output_dir (Path): The directory to store the output scrambled Phenopackets. phenopacket_path (Path): The path to a single Phenopacket file (if applicable). 
phenopacket_dir (Path): The directory containing multiple Phenopacket files (if applicable). scramble_factor (float): A factor determining the level of scrambling for phenotypic features. local_cached_ontology (Path): The path to the local cached ontology. \"\"\" output_dir . mkdir ( exist_ok = True ) ontology = load_ontology ( local_cached_ontology ) if phenopacket_path is not None : HpoRandomiser ( ontology , scramble_factor ) . create_scrambled_phenopacket ( output_dir , phenopacket_path ) elif phenopacket_dir is not None : HpoRandomiser ( ontology , scramble_factor ) . create_scrambled_phenopackets ( output_dir , phenopacket_dir , )","title":"Create noisy phenopackets"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser","text":"Class for randomising phenopacket phenotypic features using Human Phenotype Ontology (HPO). Source code in src/pheval/prepare/create_noisy_phenopackets.py 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 class HpoRandomiser : \"\"\"Class for randomising phenopacket phenotypic features using Human Phenotype Ontology (HPO).\"\"\" def __init__ ( self , hpo_ontology : ProntoImplementation , scramble_factor : float ): \"\"\" 
Initialise the HpoRandomiser. Args: hpo_ontology (ProntoImplementation): The instance of the HPO ontology. scramble_factor (float): A factor for scrambling phenotypic features. \"\"\" self . hpo_ontology = hpo_ontology self . phenotypic_abnormalities = set ( hpo_ontology . roots ( predicates = [ \"HP:0000118\" ])) self . scramble_factor = scramble_factor def scramble_factor_proportions ( self , phenotypic_features : list [ PhenotypicFeature ]) -> int : \"\"\" Calculate the proportion of scrambled HPO terms based on the scramble factor. Args: phenotypic_features (list[PhenotypicFeature]): List of phenotypic features. Returns: int: The calculated number of phenotypic features to be scrambled. \"\"\" if len ( phenotypic_features ) == 1 : return 1 else : return int ( round ( len ( phenotypic_features ) * self . scramble_factor , 0 )) def retrieve_hpo_term ( self , hpo_id : str ) -> PhenotypicFeature : \"\"\" Retrieve an HPO term based on the provided HPO ID. Args: hpo_id (str): The HPO ID of the term to retrieve. Returns: PhenotypicFeature: The PhenotypicFeature object representing the retrieved HPO term. \"\"\" rels = self . hpo_ontology . entity_alias_map ( hpo_id ) hpo_term = \"\" . join ( rels [( list ( rels . keys ())[ 0 ])]) return PhenotypicFeature ( type = OntologyClass ( id = hpo_id , label = hpo_term )) @staticmethod def retain_real_patient_terms ( phenotypic_features : List [ PhenotypicFeature ], number_of_scrambled_terms : int , ) -> List [ PhenotypicFeature ]: \"\"\" Return a list of real patient HPO terms, retaining a specific number of non-scrambled terms. Args: phenotypic_features (List[PhenotypicFeature]): List of phenotypic features. number_of_scrambled_terms (int): The count of scrambled HPO terms. Returns: List[PhenotypicFeature]: A list of non-scrambled (real patient) HPO terms. \"\"\" if len ( phenotypic_features ) > 1 : number_of_real_id = len ( phenotypic_features ) - number_of_scrambled_terms else : number_of_real_id = 1 return random . 
sample ( phenotypic_features , number_of_real_id ) def convert_patient_terms_to_parent ( self , phenotypic_features : List [ PhenotypicFeature ], retained_phenotypic_features : List [ PhenotypicFeature ], number_of_scrambled_terms : int , ) -> List [ PhenotypicFeature ]: \"\"\" Convert a subset of patient HPO terms to their respective parent terms. Args: phenotypic_features (List[PhenotypicFeature]): List of all phenotypic features. retained_phenotypic_features (List[PhenotypicFeature]): List of retained non-scrambled phenotypic features. number_of_scrambled_terms (int): The count of scrambled HPO terms. Returns: List[PhenotypicFeature]: A list of HPO terms converted to their parent terms. Note: This method identifies a subset of patient HPO terms that are not retained among the non-scrambled phenotypic features and converts them to their respective parent terms. It then returns a list of parent HPO terms based on the provided scrambled terms count. If no remaining HPO terms are available for conversion, no parent terms are returned. \"\"\" remaining_hpo = [ i for i in phenotypic_features if i not in retained_phenotypic_features ] if len ( remaining_hpo ) == 0 : number_of_scrambled_terms = 0 hpo_terms_to_be_changed = list ( random . sample ( remaining_hpo , number_of_scrambled_terms )) parent_terms = [] for term in hpo_terms_to_be_changed : if self . hpo_ontology . label ( term . type . id ) . startswith ( \"obsolete\" ): obsolete_term = self . hpo_ontology . entity_metadata_map ( term . type . id ) updated_term = list ( obsolete_term . values ())[ 0 ][ 0 ] parents = self . hpo_ontology . hierarchical_parents ( updated_term ) else : parents = self . hpo_ontology . hierarchical_parents ( term . type . id ) if not parents : parent_terms . append ( term ) else : parent_terms . append ( self . retrieve_hpo_term ( random . 
choice ( parents ))) return parent_terms def create_random_hpo_terms ( self , number_of_scrambled_terms : int ) -> List [ PhenotypicFeature ]: \"\"\" Generate a list of random HPO terms. Args: number_of_scrambled_terms (int): The count of random HPO terms to be generated. Returns: List[PhenotypicFeature]: A list of randomly selected HPO terms. \"\"\" random_ids = list ( random . sample ( sorted ( self . phenotypic_abnormalities ), number_of_scrambled_terms ) ) return [ self . retrieve_hpo_term ( random_id ) for random_id in random_ids ] def randomise_hpo_terms ( self , phenotypic_features : List [ PhenotypicFeature ], ) -> List [ PhenotypicFeature ]: \"\"\" Randomise the provided phenotypic features by combining retained, parent-converted, and random HPO terms. Args: phenotypic_features (List[PhenotypicFeature]): List of phenotypic features to be randomised. Returns: List[PhenotypicFeature]: A list of randomised HPO terms. Note: This method randomises the provided phenotypic features by incorporating three types of HPO terms: 1. Retained Patient Terms: Non-scrambled (real patient) HPO terms retained based on the scramble factor. 2. Converted to Parent Terms: Subset of HPO terms converted to their respective parent terms. 3. Random HPO Terms: Newly generated random HPO terms based on the scramble factor. The method determines the count of terms for each category and combines them to form a final list of randomised HPO terms to be used in the phenotypic features. \"\"\" number_of_scrambled_terms = self . scramble_factor_proportions ( phenotypic_features ) retained_patient_terms = self . retain_real_patient_terms ( phenotypic_features , number_of_scrambled_terms ) return ( retained_patient_terms + self . convert_patient_terms_to_parent ( phenotypic_features , retained_patient_terms , number_of_scrambled_terms ) + self . 
create_random_hpo_terms ( number_of_scrambled_terms ) ) def add_noise_to_phenotypic_profile ( self , phenopacket : Union [ Phenopacket , Family ], ) -> Union [ Phenopacket , Family ]: \"\"\" Randomise the phenotypic profile of a Phenopacket or Family. Args: phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family to be randomised. Returns: Union[Phenopacket, Family]: The randomised Phenopacket or Family. \"\"\" phenotypic_features = PhenopacketUtil ( phenopacket ) . observed_phenotypic_features () random_phenotypes = self . randomise_hpo_terms ( phenotypic_features ) randomised_phenopacket = PhenopacketRebuilder ( phenopacket ) . add_randomised_hpo ( random_phenotypes ) return randomised_phenopacket def create_scrambled_phenopacket ( self , output_dir : Path , phenopacket_path : Path , ) -> None : \"\"\" Create a scrambled version of a Phenopacket. Args: output_dir (Path): The directory to store the output scrambled Phenopacket. phenopacket_path (Path): The path to the original Phenopacket file. \"\"\" phenopacket = phenopacket_reader ( phenopacket_path ) created_noisy_phenopacket = self . add_noise_to_phenotypic_profile ( phenopacket , ) write_phenopacket ( created_noisy_phenopacket , output_dir . joinpath ( phenopacket_path . name ), ) def create_scrambled_phenopackets ( self , output_dir : Path , phenopacket_dir : Path , ) -> None : \"\"\" Create scrambled versions of Phenopackets within a directory. Args: output_dir (Path): The directory to store the output scrambled Phenopackets. phenopacket_dir (Path): The directory containing the original Phenopacket files. \"\"\" phenopacket_files = files_with_suffix ( phenopacket_dir , \".json\" ) for phenopacket_path in phenopacket_files : phenopacket = phenopacket_reader ( phenopacket_path ) created_noisy_phenopacket = self . add_noise_to_phenotypic_profile ( phenopacket ) write_phenopacket ( created_noisy_phenopacket , output_dir . joinpath ( phenopacket_path . 
name , ), )","title":"HpoRandomiser"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.__init__","text":"Initialise the HpoRandomiser. Parameters: Name Type Description Default hpo_ontology ProntoImplementation The instance of the HPO ontology. required scramble_factor float A factor for scrambling phenotypic features. required Source code in src/pheval/prepare/create_noisy_phenopackets.py 37 38 39 40 41 42 43 44 45 46 47 def __init__ ( self , hpo_ontology : ProntoImplementation , scramble_factor : float ): \"\"\" Initialise the HpoRandomiser. Args: hpo_ontology (ProntoImplementation): The instance of the HPO ontology. scramble_factor (float): A factor for scrambling phenotypic features. \"\"\" self . hpo_ontology = hpo_ontology self . phenotypic_abnormalities = set ( hpo_ontology . roots ( predicates = [ \"HP:0000118\" ])) self . scramble_factor = scramble_factor","title":"__init__"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.add_noise_to_phenotypic_profile","text":"Randomise the phenotypic profile of a Phenopacket or Family. Parameters: Name Type Description Default phenopacket Union [ Phenopacket , Family ] The Phenopacket or Family to be randomised. required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family]: The randomised Phenopacket or Family. Source code in src/pheval/prepare/create_noisy_phenopackets.py 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 def add_noise_to_phenotypic_profile ( self , phenopacket : Union [ Phenopacket , Family ], ) -> Union [ Phenopacket , Family ]: \"\"\" Randomise the phenotypic profile of a Phenopacket or Family. Args: phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family to be randomised. Returns: Union[Phenopacket, Family]: The randomised Phenopacket or Family. 
\"\"\" phenotypic_features = PhenopacketUtil ( phenopacket ) . observed_phenotypic_features () random_phenotypes = self . randomise_hpo_terms ( phenotypic_features ) randomised_phenopacket = PhenopacketRebuilder ( phenopacket ) . add_randomised_hpo ( random_phenotypes ) return randomised_phenopacket","title":"add_noise_to_phenotypic_profile"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.convert_patient_terms_to_parent","text":"Convert a subset of patient HPO terms to their respective parent terms. Parameters: Name Type Description Default phenotypic_features List [ PhenotypicFeature ] List of all phenotypic features. required retained_phenotypic_features List [ PhenotypicFeature ] List of retained non-scrambled phenotypic features. required number_of_scrambled_terms int The count of scrambled HPO terms. required Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: A list of HPO terms converted to their parent terms. Note This method identifies a subset of patient HPO terms that are not retained among the non-scrambled phenotypic features and converts them to their respective parent terms. It then returns a list of parent HPO terms based on the provided scrambled terms count. If no remaining HPO terms are available for conversion, no parent terms are returned. Source code in src/pheval/prepare/create_noisy_phenopackets.py 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 def convert_patient_terms_to_parent ( self , phenotypic_features : List [ PhenotypicFeature ], retained_phenotypic_features : List [ PhenotypicFeature ], number_of_scrambled_terms : int , ) -> List [ PhenotypicFeature ]: \"\"\" Convert a subset of patient HPO terms to their respective parent terms. Args: phenotypic_features (List[PhenotypicFeature]): List of all phenotypic features. 
retained_phenotypic_features (List[PhenotypicFeature]): List of retained non-scrambled phenotypic features. number_of_scrambled_terms (int): The count of scrambled HPO terms. Returns: List[PhenotypicFeature]: A list of HPO terms converted to their parent terms. Note: This method identifies a subset of patient HPO terms that are not retained among the non-scrambled phenotypic features and converts them to their respective parent terms. It then returns a list of parent HPO terms based on the provided scrambled terms count. If no remaining HPO terms are available for conversion, no parent terms are returned. \"\"\" remaining_hpo = [ i for i in phenotypic_features if i not in retained_phenotypic_features ] if len ( remaining_hpo ) == 0 : number_of_scrambled_terms = 0 hpo_terms_to_be_changed = list ( random . sample ( remaining_hpo , number_of_scrambled_terms )) parent_terms = [] for term in hpo_terms_to_be_changed : if self . hpo_ontology . label ( term . type . id ) . startswith ( \"obsolete\" ): obsolete_term = self . hpo_ontology . entity_metadata_map ( term . type . id ) updated_term = list ( obsolete_term . values ())[ 0 ][ 0 ] parents = self . hpo_ontology . hierarchical_parents ( updated_term ) else : parents = self . hpo_ontology . hierarchical_parents ( term . type . id ) if not parents : parent_terms . append ( term ) else : parent_terms . append ( self . retrieve_hpo_term ( random . choice ( parents ))) return parent_terms","title":"convert_patient_terms_to_parent"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.create_random_hpo_terms","text":"Generate a list of random HPO terms. Parameters: Name Type Description Default number_of_scrambled_terms int The count of random HPO terms to be generated. required Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: A list of randomly selected HPO terms. 
Source code in src/pheval/prepare/create_noisy_phenopackets.py 140 141 142 143 144 145 146 147 148 149 150 151 152 153 def create_random_hpo_terms ( self , number_of_scrambled_terms : int ) -> List [ PhenotypicFeature ]: \"\"\" Generate a list of random HPO terms. Args: number_of_scrambled_terms (int): The count of random HPO terms to be generated. Returns: List[PhenotypicFeature]: A list of randomly selected HPO terms. \"\"\" random_ids = list ( random . sample ( sorted ( self . phenotypic_abnormalities ), number_of_scrambled_terms ) ) return [ self . retrieve_hpo_term ( random_id ) for random_id in random_ids ]","title":"create_random_hpo_terms"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.create_scrambled_phenopacket","text":"Create a scrambled version of a Phenopacket. Parameters: Name Type Description Default output_dir Path The directory to store the output scrambled Phenopacket. required phenopacket_path Path The path to the original Phenopacket file. required Source code in src/pheval/prepare/create_noisy_phenopackets.py 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 def create_scrambled_phenopacket ( self , output_dir : Path , phenopacket_path : Path , ) -> None : \"\"\" Create a scrambled version of a Phenopacket. Args: output_dir (Path): The directory to store the output scrambled Phenopacket. phenopacket_path (Path): The path to the original Phenopacket file. \"\"\" phenopacket = phenopacket_reader ( phenopacket_path ) created_noisy_phenopacket = self . add_noise_to_phenotypic_profile ( phenopacket , ) write_phenopacket ( created_noisy_phenopacket , output_dir . joinpath ( phenopacket_path . 
name ), )","title":"create_scrambled_phenopacket"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.create_scrambled_phenopackets","text":"Create scrambled versions of Phenopackets within a directory. Parameters: Name Type Description Default output_dir Path The directory to store the output scrambled Phenopackets. required phenopacket_dir Path The directory containing the original Phenopacket files. required Source code in src/pheval/prepare/create_noisy_phenopackets.py 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 def create_scrambled_phenopackets ( self , output_dir : Path , phenopacket_dir : Path , ) -> None : \"\"\" Create scrambled versions of Phenopackets within a directory. Args: output_dir (Path): The directory to store the output scrambled Phenopackets. phenopacket_dir (Path): The directory containing the original Phenopacket files. \"\"\" phenopacket_files = files_with_suffix ( phenopacket_dir , \".json\" ) for phenopacket_path in phenopacket_files : phenopacket = phenopacket_reader ( phenopacket_path ) created_noisy_phenopacket = self . add_noise_to_phenotypic_profile ( phenopacket ) write_phenopacket ( created_noisy_phenopacket , output_dir . joinpath ( phenopacket_path . name , ), )","title":"create_scrambled_phenopackets"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.randomise_hpo_terms","text":"Randomise the provided phenotypic features by combining retained, parent-converted, and random HPO terms. Parameters: Name Type Description Default phenotypic_features List [ PhenotypicFeature ] List of phenotypic features to be randomised. required Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: A list of randomised HPO terms. Note This method randomises the provided phenotypic features by incorporating three types of HPO terms: 1. 
Retained Patient Terms: Non-scrambled (real patient) HPO terms retained based on the scramble factor. 2. Converted to Parent Terms: Subset of HPO terms converted to their respective parent terms. 3. Random HPO Terms: Newly generated random HPO terms based on the scramble factor. The method determines the count of terms for each category and combines them to form a final list of randomised HPO terms to be used in the phenotypic features. Source code in src/pheval/prepare/create_noisy_phenopackets.py 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 def randomise_hpo_terms ( self , phenotypic_features : List [ PhenotypicFeature ], ) -> List [ PhenotypicFeature ]: \"\"\" Randomise the provided phenotypic features by combining retained, parent-converted, and random HPO terms. Args: phenotypic_features (List[PhenotypicFeature]): List of phenotypic features to be randomised. Returns: List[PhenotypicFeature]: A list of randomised HPO terms. Note: This method randomises the provided phenotypic features by incorporating three types of HPO terms: 1. Retained Patient Terms: Non-scrambled (real patient) HPO terms retained based on the scramble factor. 2. Converted to Parent Terms: Subset of HPO terms converted to their respective parent terms. 3. Random HPO Terms: Newly generated random HPO terms based on the scramble factor. The method determines the count of terms for each category and combines them to form a final list of randomised HPO terms to be used in the phenotypic features. \"\"\" number_of_scrambled_terms = self . scramble_factor_proportions ( phenotypic_features ) retained_patient_terms = self . retain_real_patient_terms ( phenotypic_features , number_of_scrambled_terms ) return ( retained_patient_terms + self . convert_patient_terms_to_parent ( phenotypic_features , retained_patient_terms , number_of_scrambled_terms ) + self . 
create_random_hpo_terms ( number_of_scrambled_terms ) )","title":"randomise_hpo_terms"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.retain_real_patient_terms","text":"Return a list of real patient HPO terms, retaining a specific number of non-scrambled terms. Parameters: Name Type Description Default phenotypic_features List [ PhenotypicFeature ] List of phenotypic features. required number_of_scrambled_terms int The count of scrambled HPO terms. required Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: A list of non-scrambled (real patient) HPO terms. Source code in src/pheval/prepare/create_noisy_phenopackets.py 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 @staticmethod def retain_real_patient_terms ( phenotypic_features : List [ PhenotypicFeature ], number_of_scrambled_terms : int , ) -> List [ PhenotypicFeature ]: \"\"\" Return a list of real patient HPO terms, retaining a specific number of non-scrambled terms. Args: phenotypic_features (List[PhenotypicFeature]): List of phenotypic features. number_of_scrambled_terms (int): The count of scrambled HPO terms. Returns: List[PhenotypicFeature]: A list of non-scrambled (real patient) HPO terms. \"\"\" if len ( phenotypic_features ) > 1 : number_of_real_id = len ( phenotypic_features ) - number_of_scrambled_terms else : number_of_real_id = 1 return random . sample ( phenotypic_features , number_of_real_id )","title":"retain_real_patient_terms"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.retrieve_hpo_term","text":"Retrieve an HPO term based on the provided HPO ID. Parameters: Name Type Description Default hpo_id str The HPO ID of the term to retrieve. required Returns: Name Type Description PhenotypicFeature PhenotypicFeature The PhenotypicFeature object representing the retrieved HPO term. 
Source code in src/pheval/prepare/create_noisy_phenopackets.py 64 65 66 67 68 69 70 71 72 73 74 75 76 def retrieve_hpo_term ( self , hpo_id : str ) -> PhenotypicFeature : \"\"\" Retrieve an HPO term based on the provided HPO ID. Args: hpo_id (str): The HPO ID of the term to retrieve. Returns: PhenotypicFeature: The PhenotypicFeature object representing the retrieved HPO term. \"\"\" rels = self . hpo_ontology . entity_alias_map ( hpo_id ) hpo_term = \"\" . join ( rels [( list ( rels . keys ())[ 0 ])]) return PhenotypicFeature ( type = OntologyClass ( id = hpo_id , label = hpo_term ))","title":"retrieve_hpo_term"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.scramble_factor_proportions","text":"Calculate the proportion of scrambled HPO terms based on the scramble factor. Parameters: Name Type Description Default phenotypic_features list [ PhenotypicFeature ] List of phenotypic features. required Returns: Name Type Description int int The calculated number of phenotypic features to be scrambled. Source code in src/pheval/prepare/create_noisy_phenopackets.py 49 50 51 52 53 54 55 56 57 58 59 60 61 62 def scramble_factor_proportions ( self , phenotypic_features : list [ PhenotypicFeature ]) -> int : \"\"\" Calculate the proportion of scrambled HPO terms based on the scramble factor. Args: phenotypic_features (list[PhenotypicFeature]): List of phenotypic features. Returns: int: The calculated number of phenotypic features to be scrambled. \"\"\" if len ( phenotypic_features ) == 1 : return 1 else : return int ( round ( len ( phenotypic_features ) * self . scramble_factor , 0 ))","title":"scramble_factor_proportions"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.load_ontology","text":"Load the Human Phenotype Ontology (HPO). Args: local_cached_ontology(Path): Path to the local cached ontology. 
Returns: ProntoImplementation: An instance of ProntoImplementation containing the loaded HPO. Source code in src/pheval/prepare/create_noisy_phenopackets.py 18 19 20 21 22 23 24 25 26 27 28 29 30 31 def load_ontology ( local_cached_ontology : Path = None ) -> ProntoImplementation : \"\"\" Load the Human Phenotype Ontology (HPO). Args: local_cached_ontology(Path): Path to the local cached ontology. Returns: ProntoImplementation: An instance of ProntoImplementation containing the loaded HPO. \"\"\" if local_cached_ontology is None : resource = OntologyResource ( slug = \"hp.obo\" , local = False ) return ProntoImplementation ( resource ) else : resource = OntologyResource ( slug = local_cached_ontology , local = True ) return ProntoImplementation ( resource )","title":"load_ontology"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.scramble_phenopackets","text":"Create scrambled phenopackets from either a single phenopacket or a directory of phenopackets. Parameters: Name Type Description Default output_dir Path The directory to store the output scrambled Phenopackets. required phenopacket_path Path The path to a single Phenopacket file (if applicable). required phenopacket_dir Path The directory containing multiple Phenopacket files (if applicable). required scramble_factor float A factor determining the level of scrambling for phenotypic features. required local_cached_ontology Path The path to the local cached ontology. required Source code in src/pheval/prepare/create_noisy_phenopackets.py 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 def scramble_phenopackets ( output_dir : Path , phenopacket_path : Path , phenopacket_dir : Path , scramble_factor : float , local_cached_ontology : Path , ) -> None : \"\"\" Create scrambled phenopackets from either a single phenopacket or a directory of phenopackets. 
Args: output_dir (Path): The directory to store the output scrambled Phenopackets. phenopacket_path (Path): The path to a single Phenopacket file (if applicable). phenopacket_dir (Path): The directory containing multiple Phenopacket files (if applicable). scramble_factor (float): A factor determining the level of scrambling for phenotypic features. local_cached_ontology (Path): The path to the local cached ontology. \"\"\" output_dir . mkdir ( exist_ok = True ) ontology = load_ontology ( local_cached_ontology ) if phenopacket_path is not None : HpoRandomiser ( ontology , scramble_factor ) . create_scrambled_phenopacket ( output_dir , phenopacket_path ) elif phenopacket_dir is not None : HpoRandomiser ( ontology , scramble_factor ) . create_scrambled_phenopackets ( output_dir , phenopacket_dir , )","title":"scramble_phenopackets"},{"location":"api/pheval/prepare/create_spiked_vcf/","text":"VcfFile dataclass Represents a VCF file with its name, contents, and header information. Attributes: Name Type Description vcf_file_name str The name of the VCF file. vcf_contents List [ str ] The contents of the VCF file. vcf_header VcfHeader The parsed header information of the VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 @dataclass class VcfFile : \"\"\" Represents a VCF file with its name, contents, and header information. Attributes: vcf_file_name (str): The name of the VCF file. vcf_contents (List[str]): The contents of the VCF file. vcf_header (VcfHeader): The parsed header information of the VCF file. \"\"\" vcf_file_name : str = None vcf_contents : List [ str ] = None vcf_header : VcfHeader = None @staticmethod def populate_fields ( template_vcf : Path ): \"\"\" Populate the fields of the VcfFile instance using the contents of a template VCF file. Args: template_vcf (Path): The path to the template VCF file. 
Returns: VcfFile: An instance of VcfFile with populated fields. \"\"\" contents = read_vcf ( template_vcf ) return VcfFile ( template_vcf . name , contents , VcfHeaderParser ( contents ) . parse_vcf_header ()) populate_fields ( template_vcf ) staticmethod Populate the fields of the VcfFile instance using the contents of a template VCF file. Parameters: Name Type Description Default template_vcf Path The path to the template VCF file. required Returns: Name Type Description VcfFile An instance of VcfFile with populated fields. Source code in src/pheval/prepare/create_spiked_vcf.py 190 191 192 193 194 195 196 197 198 199 200 201 202 203 @staticmethod def populate_fields ( template_vcf : Path ): \"\"\" Populate the fields of the VcfFile instance using the contents of a template VCF file. Args: template_vcf (Path): The path to the template VCF file. Returns: VcfFile: An instance of VcfFile with populated fields. \"\"\" contents = read_vcf ( template_vcf ) return VcfFile ( template_vcf . name , contents , VcfHeaderParser ( contents ) . parse_vcf_header ()) VcfHeader dataclass Data obtained from VCF header. Parameters: Name Type Description Default sample_id str The sample identifier from the VCF header. required assembly str The assembly information obtained from the VCF header. required chr_status bool A boolean indicating whether the VCF denotes chromosomes as chr or not. required Source code in src/pheval/prepare/create_spiked_vcf.py 78 79 80 81 82 83 84 85 86 87 88 89 90 @dataclass class VcfHeader : \"\"\"Data obtained from VCF header. Args: sample_id (str): The sample identifier from the VCF header. assembly (str): The assembly information obtained from the VCF header. chr_status (bool): A boolean indicating whether the VCF denotes chromosomes as chr or not. \"\"\" sample_id : str assembly : str chr_status : bool VcfHeaderParser Class for parsing the header of a VCF file. 
Source code in src/pheval/prepare/create_spiked_vcf.py 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 class VcfHeaderParser : \"\"\"Class for parsing the header of a VCF file.\"\"\" def __init__ ( self , vcf_contents : list [ str ]): \"\"\" Initialise the VcfHeaderParser. Args: vcf_contents (list[str]): The contents of the VCF file as a list of strings. \"\"\" self . vcf_contents = vcf_contents def parse_assembly ( self ) -> tuple [ str , bool ]: \"\"\" Parse the genome assembly and format of vcf_records. Returns: Tuple[str, bool]: A tuple containing the assembly and chromosome status (True/False). \"\"\" vcf_assembly = {} chr_status = False for line in self . vcf_contents : if line . startswith ( \"##contig= str : \"\"\" Parse the sample ID of the VCF. Returns: str: The sample ID extracted from the VCF header. \"\"\" for line in self . vcf_contents : if line . startswith ( \"#CHROM\" ): return line . split ( \" \\t \" )[ 9 ] . rstrip () def parse_vcf_header ( self ) -> VcfHeader : \"\"\" Parse the header of the VCF. Returns: VcfHeader: An instance of VcfHeader containing sample ID, assembly, and chromosome status. \"\"\" assembly , chr_status = self . parse_assembly () sample_id = self . parse_sample_id () return VcfHeader ( sample_id , assembly , chr_status ) __init__ ( vcf_contents ) Initialise the VcfHeaderParser. Parameters: Name Type Description Default vcf_contents list [ str ] The contents of the VCF file as a list of strings. required Source code in src/pheval/prepare/create_spiked_vcf.py 115 116 117 118 119 120 121 122 def __init__ ( self , vcf_contents : list [ str ]): \"\"\" Initialise the VcfHeaderParser. Args: vcf_contents (list[str]): The contents of the VCF file as a list of strings. \"\"\" self . 
vcf_contents = vcf_contents parse_assembly () Parse the genome assembly and format of vcf_records. Returns: Type Description tuple [ str , bool ] Tuple[str, bool]: A tuple containing the assembly and chromosome status (True/False). Source code in src/pheval/prepare/create_spiked_vcf.py 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 def parse_assembly ( self ) -> tuple [ str , bool ]: \"\"\" Parse the genome assembly and format of vcf_records. Returns: Tuple[str, bool]: A tuple containing the assembly and chromosome status (True/False). \"\"\" vcf_assembly = {} chr_status = False for line in self . vcf_contents : if line . startswith ( \"##contig= str : \"\"\" Parse the sample ID of the VCF. Returns: str: The sample ID extracted from the VCF header. \"\"\" for line in self . vcf_contents : if line . startswith ( \"#CHROM\" ): return line . split ( \" \\t \" )[ 9 ] . rstrip () parse_vcf_header () Parse the header of the VCF. Returns: Name Type Description VcfHeader VcfHeader An instance of VcfHeader containing sample ID, assembly, and chromosome status. Source code in src/pheval/prepare/create_spiked_vcf.py 163 164 165 166 167 168 169 170 171 172 def parse_vcf_header ( self ) -> VcfHeader : \"\"\" Parse the header of the VCF. Returns: VcfHeader: An instance of VcfHeader containing sample ID, assembly, and chromosome status. \"\"\" assembly , chr_status = self . parse_assembly () sample_id = self . parse_sample_id () return VcfHeader ( sample_id , assembly , chr_status ) VcfSpiker Class for spiking proband variants into template VCF file contents. 
Source code in src/pheval/prepare/create_spiked_vcf.py 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 class VcfSpiker : \"\"\"Class for spiking proband variants into template VCF file contents.\"\"\" def __init__ ( self , vcf_contents : list [ str ], proband_causative_variants : list [ ProbandCausativeVariant ], vcf_header : VcfHeader , ): \"\"\" Initialise the VcfSpiker. Args: vcf_contents (List[str]): Contents of the template VCF file. proband_causative_variants (List[ProbandCausativeVariant]): List of proband causative variants. vcf_header (VcfHeader): The VCF header information. \"\"\" self . vcf_contents = vcf_contents self . proband_causative_variants = proband_causative_variants self . vcf_header = vcf_header def construct_variant_entry ( self , proband_variant_data : ProbandCausativeVariant ) -> List [ str ]: \"\"\" Construct variant entries. Args: proband_variant_data (ProbandCausativeVariant): Data for the proband variant. Returns: List[str]: Constructed variant entry as a list of strings. \"\"\" genotype_codes = { \"hemizygous\" : \"0/1\" , \"homozygous\" : \"1/1\" , \"heterozygous\" : \"0/1\" , \"compound heterozygous\" : \"0/1\" , } if self . vcf_header . chr_status is True and \"chr\" not in proband_variant_data . variant . chrom : proband_variant_data . variant . chrom = \"chr\" + proband_variant_data . variant . chrom return [ proband_variant_data . variant . chrom , str ( proband_variant_data . variant . pos ), \".\" , proband_variant_data . variant . ref , ( f \"< { proband_variant_data . variant . 
alt } >\" if proband_variant_data . variant . ref == \"N\" else proband_variant_data . variant . alt ), \"100\" , \"PASS\" , proband_variant_data . info if proband_variant_data . info else \".\" , \"GT\" , genotype_codes [ proband_variant_data . genotype . lower ()] + \" \\n \" , ] def construct_vcf_records ( self , template_vcf_name : str ) -> List [ str ]: \"\"\" Construct updated VCF records by inserting spiked variants into the correct positions within the VCF. Args: template_vcf_name (str): Name of the template VCF file. Returns: List[str]: Updated VCF records containing the spiked variants. \"\"\" updated_vcf_records = copy ( self . vcf_contents ) for variant in self . proband_causative_variants : variant_entry = self . construct_variant_entry ( variant ) matching_indices = [ i for i , val in enumerate ( updated_vcf_records ) if val . split ( \" \\t \" )[ 0 ] == variant_entry [ 0 ] and int ( val . split ( \" \\t \" )[ 1 ]) < int ( variant_entry [ 1 ]) ] if matching_indices : variant_entry_position = matching_indices [ - 1 ] + 1 else : info_log . warning ( f \"Could not find entry position for { variant . variant . chrom } - { variant . variant . pos } -\" f \" { variant . variant . ref } - { variant . variant . alt } in { template_vcf_name } , \" \"inserting at end of VCF contents.\" ) variant_entry_position = len ( updated_vcf_records ) updated_vcf_records . insert ( variant_entry_position , \" \\t \" . join ( variant_entry )) return updated_vcf_records def construct_header ( self , updated_vcf_records : List [ str ]) -> List [ str ]: \"\"\" Construct the header of the VCF. Args: updated_vcf_records (List[str]): Updated VCF records. Returns: List[str]: Constructed header as a list of strings. \"\"\" updated_vcf_file = [] for line in updated_vcf_records : if line . startswith ( \"#\" ): text = line . replace ( self . vcf_header . sample_id , self . proband_causative_variants [ 0 ] . proband_id , ) else : text = line updated_vcf_file . 
append ( text ) return updated_vcf_file def construct_vcf ( self , template_vcf_name : str ) -> List [ str ]: \"\"\" Construct the entire spiked VCF file by incorporating the spiked variants into the VCF. Args: template_vcf_name (str): Name of the template VCF file. Returns: List[str]: The complete spiked VCF file content as a list of strings. \"\"\" return self . construct_header ( self . construct_vcf_records ( template_vcf_name )) __init__ ( vcf_contents , proband_causative_variants , vcf_header ) Initialise the VcfSpiker. Parameters: Name Type Description Default vcf_contents List [ str ] Contents of the template VCF file. required proband_causative_variants List [ ProbandCausativeVariant ] List of proband causative variants. required vcf_header VcfHeader The VCF header information. required Source code in src/pheval/prepare/create_spiked_vcf.py 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 def __init__ ( self , vcf_contents : list [ str ], proband_causative_variants : list [ ProbandCausativeVariant ], vcf_header : VcfHeader , ): \"\"\" Initialise the VcfSpiker. Args: vcf_contents (List[str]): Contents of the template VCF file. proband_causative_variants (List[ProbandCausativeVariant]): List of proband causative variants. vcf_header (VcfHeader): The VCF header information. \"\"\" self . vcf_contents = vcf_contents self . proband_causative_variants = proband_causative_variants self . vcf_header = vcf_header construct_header ( updated_vcf_records ) Construct the header of the VCF. Parameters: Name Type Description Default updated_vcf_records List [ str ] Updated VCF records. required Returns: Type Description List [ str ] List[str]: Constructed header as a list of strings. Source code in src/pheval/prepare/create_spiked_vcf.py 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 def construct_header ( self , updated_vcf_records : List [ str ]) -> List [ str ]: \"\"\" Construct the header of the VCF. 
Args: updated_vcf_records (List[str]): Updated VCF records. Returns: List[str]: Constructed header as a list of strings. \"\"\" updated_vcf_file = [] for line in updated_vcf_records : if line . startswith ( \"#\" ): text = line . replace ( self . vcf_header . sample_id , self . proband_causative_variants [ 0 ] . proband_id , ) else : text = line updated_vcf_file . append ( text ) return updated_vcf_file construct_variant_entry ( proband_variant_data ) Construct variant entries. Parameters: Name Type Description Default proband_variant_data ProbandCausativeVariant Data for the proband variant. required Returns: Type Description List [ str ] List[str]: Constructed variant entry as a list of strings. Source code in src/pheval/prepare/create_spiked_vcf.py 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 def construct_variant_entry ( self , proband_variant_data : ProbandCausativeVariant ) -> List [ str ]: \"\"\" Construct variant entries. Args: proband_variant_data (ProbandCausativeVariant): Data for the proband variant. Returns: List[str]: Constructed variant entry as a list of strings. \"\"\" genotype_codes = { \"hemizygous\" : \"0/1\" , \"homozygous\" : \"1/1\" , \"heterozygous\" : \"0/1\" , \"compound heterozygous\" : \"0/1\" , } if self . vcf_header . chr_status is True and \"chr\" not in proband_variant_data . variant . chrom : proband_variant_data . variant . chrom = \"chr\" + proband_variant_data . variant . chrom return [ proband_variant_data . variant . chrom , str ( proband_variant_data . variant . pos ), \".\" , proband_variant_data . variant . ref , ( f \"< { proband_variant_data . variant . alt } >\" if proband_variant_data . variant . ref == \"N\" else proband_variant_data . variant . alt ), \"100\" , \"PASS\" , proband_variant_data . info if proband_variant_data . info else \".\" , \"GT\" , genotype_codes [ proband_variant_data . genotype . 
lower ()] + \" \\n \" , ] construct_vcf ( template_vcf_name ) Construct the entire spiked VCF file by incorporating the spiked variants into the VCF. Parameters: Name Type Description Default template_vcf_name str Name of the template VCF file. required Returns: Type Description List [ str ] List[str]: The complete spiked VCF file content as a list of strings. Source code in src/pheval/prepare/create_spiked_vcf.py 393 394 395 396 397 398 399 400 401 402 403 def construct_vcf ( self , template_vcf_name : str ) -> List [ str ]: \"\"\" Construct the entire spiked VCF file by incorporating the spiked variants into the VCF. Args: template_vcf_name (str): Name of the template VCF file. Returns: List[str]: The complete spiked VCF file content as a list of strings. \"\"\" return self . construct_header ( self . construct_vcf_records ( template_vcf_name )) construct_vcf_records ( template_vcf_name ) Construct updated VCF records by inserting spiked variants into the correct positions within the VCF. Parameters: Name Type Description Default template_vcf_name str Name of the template VCF file. required Returns: Type Description List [ str ] List[str]: Updated VCF records containing the spiked variants. Source code in src/pheval/prepare/create_spiked_vcf.py 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 def construct_vcf_records ( self , template_vcf_name : str ) -> List [ str ]: \"\"\" Construct updated VCF records by inserting spiked variants into the correct positions within the VCF. Args: template_vcf_name (str): Name of the template VCF file. Returns: List[str]: Updated VCF records containing the spiked variants. \"\"\" updated_vcf_records = copy ( self . vcf_contents ) for variant in self . proband_causative_variants : variant_entry = self . construct_variant_entry ( variant ) matching_indices = [ i for i , val in enumerate ( updated_vcf_records ) if val . 
split ( \" \\t \" )[ 0 ] == variant_entry [ 0 ] and int ( val . split ( \" \\t \" )[ 1 ]) < int ( variant_entry [ 1 ]) ] if matching_indices : variant_entry_position = matching_indices [ - 1 ] + 1 else : info_log . warning ( f \"Could not find entry position for { variant . variant . chrom } - { variant . variant . pos } -\" f \" { variant . variant . ref } - { variant . variant . alt } in { template_vcf_name } , \" \"inserting at end of VCF contents.\" ) variant_entry_position = len ( updated_vcf_records ) updated_vcf_records . insert ( variant_entry_position , \" \\t \" . join ( variant_entry )) return updated_vcf_records VcfWriter Class for writing VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 class VcfWriter : \"\"\"Class for writing VCF file.\"\"\" def __init__ ( self , vcf_contents : List [ str ], spiked_vcf_file_path : Path , ): \"\"\" Initialise the VcfWriter class. Args: vcf_contents (List[str]): Contents of the VCF file to be written. spiked_vcf_file_path (Path): Path to the spiked VCF file to be created. \"\"\" self . vcf_contents = vcf_contents self . spiked_vcf_file_path = spiked_vcf_file_path def write_gzip ( self ) -> None : \"\"\" Write the VCF contents to a gzipped VCF file. \"\"\" encoded_contents = [ line . encode () for line in self . vcf_contents ] with gzip . open ( self . spiked_vcf_file_path , \"wb\" ) as f : for line in encoded_contents : f . write ( line ) f . close () def write_uncompressed ( self ) -> None : \"\"\" Write the VCF contents to an uncompressed VCF file. \"\"\" with open ( self . spiked_vcf_file_path , \"w\" ) as file : file . writelines ( self . vcf_contents ) file . close () def write_vcf_file ( self ) -> None : \"\"\" Write the VCF file based on compression type. 
Determines the file writing method based on the compression type of the spiked VCF file path. Writes the VCF contents to the corresponding file format (gzip or uncompressed). \"\"\" self . write_gzip () if is_gzipped ( self . spiked_vcf_file_path ) else self . write_uncompressed () __init__ ( vcf_contents , spiked_vcf_file_path ) Initialise the VcfWriter class. Parameters: Name Type Description Default vcf_contents List [ str ] Contents of the VCF file to be written. required spiked_vcf_file_path Path Path to the spiked VCF file to be created. required Source code in src/pheval/prepare/create_spiked_vcf.py 409 410 411 412 413 414 415 416 417 418 419 420 421 422 def __init__ ( self , vcf_contents : List [ str ], spiked_vcf_file_path : Path , ): \"\"\" Initialise the VcfWriter class. Args: vcf_contents (List[str]): Contents of the VCF file to be written. spiked_vcf_file_path (Path): Path to the spiked VCF file to be created. \"\"\" self . vcf_contents = vcf_contents self . spiked_vcf_file_path = spiked_vcf_file_path write_gzip () Write the VCF contents to a gzipped VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 424 425 426 427 428 429 430 431 432 def write_gzip ( self ) -> None : \"\"\" Write the VCF contents to a gzipped VCF file. \"\"\" encoded_contents = [ line . encode () for line in self . vcf_contents ] with gzip . open ( self . spiked_vcf_file_path , \"wb\" ) as f : for line in encoded_contents : f . write ( line ) f . close () write_uncompressed () Write the VCF contents to an uncompressed VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 434 435 436 437 438 439 440 def write_uncompressed ( self ) -> None : \"\"\" Write the VCF contents to an uncompressed VCF file. \"\"\" with open ( self . spiked_vcf_file_path , \"w\" ) as file : file . writelines ( self . vcf_contents ) file . close () write_vcf_file () Write the VCF file based on compression type. 
Determines the file writing method based on the compression type of the spiked VCF file path. Writes the VCF contents to the corresponding file format (gzip or uncompressed). Source code in src/pheval/prepare/create_spiked_vcf.py 442 443 444 445 446 447 448 449 def write_vcf_file ( self ) -> None : \"\"\" Write the VCF file based on compression type. Determines the file writing method based on the compression type of the spiked VCF file path. Writes the VCF contents to the corresponding file format (gzip or uncompressed). \"\"\" self . write_gzip () if is_gzipped ( self . spiked_vcf_file_path ) else self . write_uncompressed () check_variant_assembly ( proband_causative_variants , vcf_header , phenopacket_path ) Check the assembly of the variant assembly against the VCF. Parameters: Name Type Description Default proband_causative_variants List [ ProbandCausativeVariant ] A list of causative variants from the proband. required vcf_header VcfHeader An instance of VcfHeader representing the VCF file's header. required phenopacket_path Path The path to the Phenopacket file. required Raises: Type Description ValueError If there are too many or incompatible genome assemblies found. IncompatibleGenomeAssemblyError If the assembly in the Phenopacket does not match the VCF assembly. Source code in src/pheval/prepare/create_spiked_vcf.py 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 def check_variant_assembly ( proband_causative_variants : list [ ProbandCausativeVariant ], vcf_header : VcfHeader , phenopacket_path : Path , ) -> None : \"\"\" Check the assembly of the variant assembly against the VCF. Args: proband_causative_variants (List[ProbandCausativeVariant]): A list of causative variants from the proband. vcf_header (VcfHeader): An instance of VcfHeader representing the VCF file's header. phenopacket_path (Path): The path to the Phenopacket file. 
Raises: ValueError: If there are too many or incompatible genome assemblies found. IncompatibleGenomeAssemblyError: If the assembly in the Phenopacket does not match the VCF assembly. \"\"\" compatible_genome_assembly = { \"GRCh37\" , \"hg19\" , \"GRCh38\" , \"hg38\" } phenopacket_assembly = list ({ variant . assembly for variant in proband_causative_variants }) if len ( phenopacket_assembly ) > 1 : raise ValueError ( \"Too many genome assemblies!\" ) if phenopacket_assembly [ 0 ] not in compatible_genome_assembly : raise IncompatibleGenomeAssemblyError ( phenopacket_assembly , phenopacket_path ) if ( phenopacket_assembly [ 0 ] in { \"hg19\" , \"GRCh37\" } and vcf_header . assembly not in { \"hg19\" , \"GRCh37\" } ) or ( phenopacket_assembly [ 0 ] in { \"hg38\" , \"GRCh38\" } and vcf_header . assembly not in { \"hg38\" , \"GRCh38\" } ): raise IncompatibleGenomeAssemblyError ( assembly = phenopacket_assembly , phenopacket = phenopacket_path ) create_spiked_vcf ( output_dir , phenopacket_path , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir ) Create a spiked VCF for a Phenopacket. Parameters: Name Type Description Default output_dir Path The directory to store the generated spiked VCF file. required phenopacket_path Path Path to the Phenopacket file. required hg19_template_vcf Path Path to the hg19 template VCF file (optional). required hg38_template_vcf Path Path to the hg38 template VCF file (optional). required hg19_vcf_dir Path The directory containing the hg19 VCF files (optional). required hg38_vcf_dir Path The directory containing the hg38 VCF files (optional). required Raises: Type Description InputError If both hg19_template_vcf and hg38_template_vcf are None. 
Source code in src/pheval/prepare/create_spiked_vcf.py 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 def create_spiked_vcf ( output_dir : Path , phenopacket_path : Path , hg19_template_vcf : Path , hg38_template_vcf : Path , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> None : \"\"\" Create a spiked VCF for a Phenopacket. Args: output_dir (Path): The directory to store the generated spiked VCF file. phenopacket_path (Path): Path to the Phenopacket file. hg19_template_vcf (Path): Path to the hg19 template VCF file (optional). hg38_template_vcf (Path): Path to the hg38 template VCF file (optional). hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional). Raises: InputError: If both hg19_template_vcf and hg38_template_vcf are None. \"\"\" if hg19_template_vcf is None and hg38_template_vcf is None : raise InputError ( \"Either a hg19 template vcf or hg38 template vcf must be specified\" ) hg19_vcf_info = VcfFile . populate_fields ( hg19_template_vcf ) if hg19_template_vcf else None hg38_vcf_info = VcfFile . populate_fields ( hg38_template_vcf ) if hg38_template_vcf else None spike_and_update_phenopacket ( hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir , output_dir , phenopacket_path ) create_spiked_vcfs ( output_dir , phenopacket_dir , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir ) Create a spiked VCF for a directory of Phenopackets. Parameters: Name Type Description Default output_dir Path The directory to store the generated spiked VCF file. required phenopacket_dir Path Path to the Phenopacket directory. required hg19_template_vcf Path Path to the template hg19 VCF file (optional). required hg38_template_vcf Path Path to the template hg19 VCF file (optional). required hg19_vcf_dir Path The directory containing the hg19 VCF files (optional). 
required hg38_vcf_dir Path The directory containing the hg38 VCF files (optional). required Raises: Type Description InputError If both hg19_template_vcf and hg38_template_vcf are None. Source code in src/pheval/prepare/create_spiked_vcf.py 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 def create_spiked_vcfs ( output_dir : Path , phenopacket_dir : Path , hg19_template_vcf : Path , hg38_template_vcf : Path , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> None : \"\"\" Create a spiked VCF for a directory of Phenopackets. Args: output_dir (Path): The directory to store the generated spiked VCF file. phenopacket_dir (Path): Path to the Phenopacket directory. hg19_template_vcf (Path): Path to the template hg19 VCF file (optional). hg38_template_vcf (Path): Path to the template hg19 VCF file (optional). hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional). Raises: InputError: If both hg19_template_vcf and hg38_template_vcf are None. \"\"\" if ( hg19_template_vcf is None and hg38_template_vcf is None and hg19_vcf_dir is None and hg38_vcf_dir is None ): raise InputError ( \"Need to specify a VCF!\" ) hg19_vcf_info = VcfFile . populate_fields ( hg19_template_vcf ) if hg19_template_vcf else None hg38_vcf_info = VcfFile . populate_fields ( hg38_template_vcf ) if hg38_template_vcf else None for phenopacket_path in files_with_suffix ( phenopacket_dir , \".json\" ): spike_and_update_phenopacket ( hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir , output_dir , phenopacket_path ) generate_spiked_vcf_file ( output_dir , phenopacket , phenopacket_path , hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir ) Write spiked VCF contents to a new file. Parameters: Name Type Description Default output_dir Path Path to the directory to store the generated file. 
required phenopacket Union [ Phenopacket , Family ] Phenopacket or Family containing causative variants. required phenopacket_path Path Path to the Phenopacket file. required hg19_vcf_info VcfFile VCF file info for hg19 template vcf. required hg38_vcf_info VcfFile VCF file info for hg38 template vcf. required hg19_vcf_dir Path The directory containing the hg19 VCF files. required hg38_vcf_dir Path The directory containing the hg38 VCF files. required Returns: File: The generated File object representing the newly created spiked VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 def generate_spiked_vcf_file ( output_dir : Path , phenopacket : Union [ Phenopacket , Family ], phenopacket_path : Path , hg19_vcf_info : VcfFile , hg38_vcf_info : VcfFile , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> File : \"\"\" Write spiked VCF contents to a new file. Args: output_dir (Path): Path to the directory to store the generated file. phenopacket (Union[Phenopacket, Family]): Phenopacket or Family containing causative variants. phenopacket_path (Path): Path to the Phenopacket file. hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf. hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf. hg19_vcf_dir (Path): The directory containing the hg19 VCF files. hg38_vcf_dir (Path): The directory containing the hg38 VCF files. Returns: File: The generated File object representing the newly created spiked VCF file. \"\"\" output_dir . mkdir ( exist_ok = True ) info_log . info ( f \" Created a directory { output_dir } \" ) vcf_assembly , spiked_vcf = spike_vcf_contents ( phenopacket , phenopacket_path , hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir ) spiked_vcf_path = output_dir . joinpath ( phenopacket_path . name . replace ( \".json\" , \".vcf.gz\" )) VcfWriter ( spiked_vcf , spiked_vcf_path ) . 
write_vcf_file () return File ( uri = urllib . parse . unquote ( spiked_vcf_path . as_uri ()), file_attributes = { \"fileFormat\" : \"vcf\" , \"genomeAssembly\" : vcf_assembly }, ) read_vcf ( vcf_file ) Read the contents of a VCF file into memory, handling both uncompressed and gzipped files. Parameters: Name Type Description Default vcf_file Path The path to the VCF file to be read. required Returns: Type Description List [ str ] List[str]: A list containing the lines of the VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 def read_vcf ( vcf_file : Path ) -> List [ str ]: \"\"\" Read the contents of a VCF file into memory, handling both uncompressed and gzipped files. Args: vcf_file (Path): The path to the VCF file to be read. Returns: List[str]: A list containing the lines of the VCF file. \"\"\" open_fn = gzip . open if is_gzipped ( vcf_file ) else open vcf = open_fn ( vcf_file ) vcf_contents = ( [ line . decode () for line in vcf . readlines ()] if is_gzipped ( vcf_file ) else vcf . readlines () ) vcf . close () return vcf_contents select_vcf_template ( phenopacket_path , proband_causative_variants , hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir ) Select the appropriate VCF template based on the assembly information of the proband causative variants. Parameters: Name Type Description Default phenopacket_path Path The path to the Phenopacket file. required proband_causative_variants List [ ProbandCausativeVariant ] A list of causative variants from the proband. required hg19_vcf_info VcfFile VCF file info for hg19 template vcf. required hg38_vcf_info VcfFile CF file info for hg38 template vcf. required hg19_vcf_dir Path The directory containing the hg19 VCF files. required hg38_vcf_dir Path The directory containing the hg38 VCF files. 
required Returns: Name Type Description VcfFile VcfFile The selected VCF template file based on the assembly information of the proband causative variants. Source code in src/pheval/prepare/create_spiked_vcf.py 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 def select_vcf_template ( phenopacket_path : Path , proband_causative_variants : List [ ProbandCausativeVariant ], hg19_vcf_info : VcfFile , hg38_vcf_info : VcfFile , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> VcfFile : \"\"\" Select the appropriate VCF template based on the assembly information of the proband causative variants. Args: phenopacket_path (Path): The path to the Phenopacket file. proband_causative_variants (List[ProbandCausativeVariant]): A list of causative variants from the proband. hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf. hg38_vcf_info (VcfFile): CF file info for hg38 template vcf. hg19_vcf_dir (Path): The directory containing the hg19 VCF files. hg38_vcf_dir (Path): The directory containing the hg38 VCF files. Returns: VcfFile: The selected VCF template file based on the assembly information of the proband causative variants. \"\"\" if proband_causative_variants [ 0 ] . assembly in [ \"hg19\" , \"GRCh37\" ]: if hg19_vcf_info : return hg19_vcf_info elif hg19_vcf_dir : return VcfFile . populate_fields ( random . choice ( all_files ( hg19_vcf_dir ))) else : raise InputError ( \"Must specify hg19 template VCF!\" ) elif proband_causative_variants [ 0 ] . assembly in [ \"hg38\" , \"GRCh38\" ]: if hg38_vcf_info : return hg38_vcf_info elif hg38_vcf_dir : return VcfFile . populate_fields ( random . choice ( all_files ( hg38_vcf_dir ))) else : raise InputError ( \"Must specify hg38 template VCF!\" ) else : raise IncompatibleGenomeAssemblyError ( proband_causative_variants [ 0 ] . 
assembly , phenopacket_path ) spike_and_update_phenopacket ( hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir , output_dir , phenopacket_path ) Spike the VCF files with genetic variants relevant to the provided Phenopacket, update the Phenopacket accordingly, and write the updated Phenopacket to the specified output directory. Parameters: Name Type Description Default hg19_vcf_info VcfFile VCF file info for hg19 template vcf. required hg38_vcf_info VcfFile VCF file info for hg38 template vcf. required hg19_vcf_dir Path The directory containing the hg19 VCF files. required hg38_vcf_dir Path The directory containing the hg38 VCF files. required output_dir Path Directory where the updated Phenopacket will be saved. required phenopacket_path Path Path to the original Phenopacket file. required Returns: Type Description None None Source code in src/pheval/prepare/create_spiked_vcf.py 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 def spike_and_update_phenopacket ( hg19_vcf_info : VcfFile , hg38_vcf_info : VcfFile , hg19_vcf_dir : Path , hg38_vcf_dir : Path , output_dir : Path , phenopacket_path : Path , ) -> None : \"\"\" Spike the VCF files with genetic variants relevant to the provided Phenopacket, update the Phenopacket accordingly, and write the updated Phenopacket to the specified output directory. Args: hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf. hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf. hg19_vcf_dir (Path): The directory containing the hg19 VCF files. hg38_vcf_dir (Path): The directory containing the hg38 VCF files. output_dir (Path): Directory where the updated Phenopacket will be saved. phenopacket_path (Path): Path to the original Phenopacket file. 
Returns: None \"\"\" phenopacket = phenopacket_reader ( phenopacket_path ) spiked_vcf_file_message = generate_spiked_vcf_file ( output_dir , phenopacket , phenopacket_path , hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir , ) updated_phenopacket = PhenopacketRebuilder ( phenopacket ) . add_spiked_vcf_path ( spiked_vcf_file_message ) write_phenopacket ( updated_phenopacket , phenopacket_path ) spike_vcf_contents ( phenopacket , phenopacket_path , hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir ) Spike VCF records with variants obtained from a Phenopacket or Family. Parameters: Name Type Description Default phenopacket Union [ Phenopacket , Family ] Phenopacket or Family containing causative variants. required phenopacket_path Path Path to the Phenopacket file. required hg19_vcf_info VcfFile VCF file info for hg19 template vcf. required hg38_vcf_info VcfFile VCF file info for hg38 template vcf. required hg19_vcf_dir Path The directory containing the hg19 VCF files. required hg38_vcf_dir Path The directory containing the hg38 VCF files. required Returns: Type Description tuple [ str , List [ str ]] A tuple containing: assembly (str): The genome assembly information extracted from VCF header. modified_vcf_contents (List[str]): Modified VCF records with spiked variants. Source code in src/pheval/prepare/create_spiked_vcf.py 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 def spike_vcf_contents ( phenopacket : Union [ Phenopacket , Family ], phenopacket_path : Path , hg19_vcf_info : VcfFile , hg38_vcf_info : VcfFile , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> tuple [ str , List [ str ]]: \"\"\" Spike VCF records with variants obtained from a Phenopacket or Family. Args: phenopacket (Union[Phenopacket, Family]): Phenopacket or Family containing causative variants. 
phenopacket_path (Path): Path to the Phenopacket file. hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf. hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf. hg19_vcf_dir (Path): The directory containing the hg19 VCF files. hg38_vcf_dir (Path): The directory containing the hg38 VCF files. Returns: A tuple containing: assembly (str): The genome assembly information extracted from VCF header. modified_vcf_contents (List[str]): Modified VCF records with spiked variants. \"\"\" phenopacket_causative_variants = PhenopacketUtil ( phenopacket ) . causative_variants () chosen_template_vcf = select_vcf_template ( phenopacket_path , phenopacket_causative_variants , hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir , ) check_variant_assembly ( phenopacket_causative_variants , chosen_template_vcf . vcf_header , phenopacket_path ) return ( chosen_template_vcf . vcf_header . assembly , VcfSpiker ( chosen_template_vcf . vcf_contents , phenopacket_causative_variants , chosen_template_vcf . vcf_header , ) . construct_vcf ( chosen_template_vcf . vcf_file_name ), ) spike_vcfs ( output_dir , phenopacket_path , phenopacket_dir , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir ) Create spiked VCF from either a Phenopacket or a Phenopacket directory. Parameters: Name Type Description Default output_dir Path The directory to store the generated spiked VCF file(s). required phenopacket_path Path Path to a single Phenopacket file (optional). required phenopacket_dir Path Path to a directory containing Phenopacket files (optional). required hg19_template_vcf Path Path to the hg19 template VCF file (optional). required hg38_template_vcf Path Path to the hg38 template VCF file (optional). required hg19_vcf_dir Path The directory containing the hg19 VCF files (optional). required hg38_vcf_dir Path The directory containing the hg38 VCF files (optional). 
required Source code in src/pheval/prepare/create_spiked_vcf.py 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 def spike_vcfs ( output_dir : Path , phenopacket_path : Path , phenopacket_dir : Path , hg19_template_vcf : Path , hg38_template_vcf : Path , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> None : \"\"\" Create spiked VCF from either a Phenopacket or a Phenopacket directory. Args: output_dir (Path): The directory to store the generated spiked VCF file(s). phenopacket_path (Path): Path to a single Phenopacket file (optional). phenopacket_dir (Path): Path to a directory containing Phenopacket files (optional). hg19_template_vcf (Path): Path to the hg19 template VCF file (optional). hg38_template_vcf (Path): Path to the hg38 template VCF file (optional). hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional). \"\"\" if phenopacket_path is not None : create_spiked_vcf ( output_dir , phenopacket_path , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir , ) elif phenopacket_dir is not None : create_spiked_vcfs ( output_dir , phenopacket_dir , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir , )","title":"Create spiked vcf"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfFile","text":"Represents a VCF file with its name, contents, and header information. Attributes: Name Type Description vcf_file_name str The name of the VCF file. vcf_contents List [ str ] The contents of the VCF file. vcf_header VcfHeader The parsed header information of the VCF file. 
Source code in src/pheval/prepare/create_spiked_vcf.py 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 @dataclass class VcfFile : \"\"\" Represents a VCF file with its name, contents, and header information. Attributes: vcf_file_name (str): The name of the VCF file. vcf_contents (List[str]): The contents of the VCF file. vcf_header (VcfHeader): The parsed header information of the VCF file. \"\"\" vcf_file_name : str = None vcf_contents : List [ str ] = None vcf_header : VcfHeader = None @staticmethod def populate_fields ( template_vcf : Path ): \"\"\" Populate the fields of the VcfFile instance using the contents of a template VCF file. Args: template_vcf (Path): The path to the template VCF file. Returns: VcfFile: An instance of VcfFile with populated fields. \"\"\" contents = read_vcf ( template_vcf ) return VcfFile ( template_vcf . name , contents , VcfHeaderParser ( contents ) . parse_vcf_header ())","title":"VcfFile"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfFile.populate_fields","text":"Populate the fields of the VcfFile instance using the contents of a template VCF file. Parameters: Name Type Description Default template_vcf Path The path to the template VCF file. required Returns: Name Type Description VcfFile An instance of VcfFile with populated fields. Source code in src/pheval/prepare/create_spiked_vcf.py 190 191 192 193 194 195 196 197 198 199 200 201 202 203 @staticmethod def populate_fields ( template_vcf : Path ): \"\"\" Populate the fields of the VcfFile instance using the contents of a template VCF file. Args: template_vcf (Path): The path to the template VCF file. Returns: VcfFile: An instance of VcfFile with populated fields. \"\"\" contents = read_vcf ( template_vcf ) return VcfFile ( template_vcf . name , contents , VcfHeaderParser ( contents ) . 
parse_vcf_header ())","title":"populate_fields"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfHeader","text":"Data obtained from VCF header. Parameters: Name Type Description Default sample_id str The sample identifier from the VCF header. required assembly str The assembly information obtained from the VCF header. required chr_status bool A boolean indicating whether the VCF denotes chromosomes as chr or not. required Source code in src/pheval/prepare/create_spiked_vcf.py 78 79 80 81 82 83 84 85 86 87 88 89 90 @dataclass class VcfHeader : \"\"\"Data obtained from VCF header. Args: sample_id (str): The sample identifier from the VCF header. assembly (str): The assembly information obtained from the VCF header. chr_status (bool): A boolean indicating whether the VCF denotes chromosomes as chr or not. \"\"\" sample_id : str assembly : str chr_status : bool","title":"VcfHeader"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfHeaderParser","text":"Class for parsing the header of a VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 class VcfHeaderParser : \"\"\"Class for parsing the header of a VCF file.\"\"\" def __init__ ( self , vcf_contents : list [ str ]): \"\"\" Initialise the VcfHeaderParser. Args: vcf_contents (list[str]): The contents of the VCF file as a list of strings. \"\"\" self . vcf_contents = vcf_contents def parse_assembly ( self ) -> tuple [ str , bool ]: \"\"\" Parse the genome assembly and format of vcf_records. Returns: Tuple[str, bool]: A tuple containing the assembly and chromosome status (True/False). \"\"\" vcf_assembly = {} chr_status = False for line in self . vcf_contents : if line . 
startswith ( \"##contig= str : \"\"\" Parse the sample ID of the VCF. Returns: str: The sample ID extracted from the VCF header. \"\"\" for line in self . vcf_contents : if line . startswith ( \"#CHROM\" ): return line . split ( \" \\t \" )[ 9 ] . rstrip () def parse_vcf_header ( self ) -> VcfHeader : \"\"\" Parse the header of the VCF. Returns: VcfHeader: An instance of VcfHeader containing sample ID, assembly, and chromosome status. \"\"\" assembly , chr_status = self . parse_assembly () sample_id = self . parse_sample_id () return VcfHeader ( sample_id , assembly , chr_status )","title":"VcfHeaderParser"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfHeaderParser.__init__","text":"Initialise the VcfHeaderParser. Parameters: Name Type Description Default vcf_contents list [ str ] The contents of the VCF file as a list of strings. required Source code in src/pheval/prepare/create_spiked_vcf.py 115 116 117 118 119 120 121 122 def __init__ ( self , vcf_contents : list [ str ]): \"\"\" Initialise the VcfHeaderParser. Args: vcf_contents (list[str]): The contents of the VCF file as a list of strings. \"\"\" self . vcf_contents = vcf_contents","title":"__init__"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfHeaderParser.parse_assembly","text":"Parse the genome assembly and format of vcf_records. Returns: Type Description tuple [ str , bool ] Tuple[str, bool]: A tuple containing the assembly and chromosome status (True/False). Source code in src/pheval/prepare/create_spiked_vcf.py 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 def parse_assembly ( self ) -> tuple [ str , bool ]: \"\"\" Parse the genome assembly and format of vcf_records. Returns: Tuple[str, bool]: A tuple containing the assembly and chromosome status (True/False). \"\"\" vcf_assembly = {} chr_status = False for line in self . vcf_contents : if line . 
startswith ( \"##contig= str : \"\"\" Parse the sample ID of the VCF. Returns: str: The sample ID extracted from the VCF header. \"\"\" for line in self . vcf_contents : if line . startswith ( \"#CHROM\" ): return line . split ( \" \\t \" )[ 9 ] . rstrip ()","title":"parse_sample_id"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfHeaderParser.parse_vcf_header","text":"Parse the header of the VCF. Returns: Name Type Description VcfHeader VcfHeader An instance of VcfHeader containing sample ID, assembly, and chromosome status. Source code in src/pheval/prepare/create_spiked_vcf.py 163 164 165 166 167 168 169 170 171 172 def parse_vcf_header ( self ) -> VcfHeader : \"\"\" Parse the header of the VCF. Returns: VcfHeader: An instance of VcfHeader containing sample ID, assembly, and chromosome status. \"\"\" assembly , chr_status = self . parse_assembly () sample_id = self . parse_sample_id () return VcfHeader ( sample_id , assembly , chr_status )","title":"parse_vcf_header"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfSpiker","text":"Class for spiking proband variants into template VCF file contents. 
Source code in src/pheval/prepare/create_spiked_vcf.py 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 class VcfSpiker : \"\"\"Class for spiking proband variants into template VCF file contents.\"\"\" def __init__ ( self , vcf_contents : list [ str ], proband_causative_variants : list [ ProbandCausativeVariant ], vcf_header : VcfHeader , ): \"\"\" Initialise the VcfSpiker. Args: vcf_contents (List[str]): Contents of the template VCF file. proband_causative_variants (List[ProbandCausativeVariant]): List of proband causative variants. vcf_header (VcfHeader): The VCF header information. \"\"\" self . vcf_contents = vcf_contents self . proband_causative_variants = proband_causative_variants self . vcf_header = vcf_header def construct_variant_entry ( self , proband_variant_data : ProbandCausativeVariant ) -> List [ str ]: \"\"\" Construct variant entries. Args: proband_variant_data (ProbandCausativeVariant): Data for the proband variant. Returns: List[str]: Constructed variant entry as a list of strings. \"\"\" genotype_codes = { \"hemizygous\" : \"0/1\" , \"homozygous\" : \"1/1\" , \"heterozygous\" : \"0/1\" , \"compound heterozygous\" : \"0/1\" , } if self . vcf_header . chr_status is True and \"chr\" not in proband_variant_data . variant . chrom : proband_variant_data . variant . chrom = \"chr\" + proband_variant_data . variant . chrom return [ proband_variant_data . variant . chrom , str ( proband_variant_data . variant . pos ), \".\" , proband_variant_data . variant . ref , ( f \"< { proband_variant_data . variant . 
alt } >\" if proband_variant_data . variant . ref == \"N\" else proband_variant_data . variant . alt ), \"100\" , \"PASS\" , proband_variant_data . info if proband_variant_data . info else \".\" , \"GT\" , genotype_codes [ proband_variant_data . genotype . lower ()] + \" \\n \" , ] def construct_vcf_records ( self , template_vcf_name : str ) -> List [ str ]: \"\"\" Construct updated VCF records by inserting spiked variants into the correct positions within the VCF. Args: template_vcf_name (str): Name of the template VCF file. Returns: List[str]: Updated VCF records containing the spiked variants. \"\"\" updated_vcf_records = copy ( self . vcf_contents ) for variant in self . proband_causative_variants : variant_entry = self . construct_variant_entry ( variant ) matching_indices = [ i for i , val in enumerate ( updated_vcf_records ) if val . split ( \" \\t \" )[ 0 ] == variant_entry [ 0 ] and int ( val . split ( \" \\t \" )[ 1 ]) < int ( variant_entry [ 1 ]) ] if matching_indices : variant_entry_position = matching_indices [ - 1 ] + 1 else : info_log . warning ( f \"Could not find entry position for { variant . variant . chrom } - { variant . variant . pos } -\" f \" { variant . variant . ref } - { variant . variant . alt } in { template_vcf_name } , \" \"inserting at end of VCF contents.\" ) variant_entry_position = len ( updated_vcf_records ) updated_vcf_records . insert ( variant_entry_position , \" \\t \" . join ( variant_entry )) return updated_vcf_records def construct_header ( self , updated_vcf_records : List [ str ]) -> List [ str ]: \"\"\" Construct the header of the VCF. Args: updated_vcf_records (List[str]): Updated VCF records. Returns: List[str]: Constructed header as a list of strings. \"\"\" updated_vcf_file = [] for line in updated_vcf_records : if line . startswith ( \"#\" ): text = line . replace ( self . vcf_header . sample_id , self . proband_causative_variants [ 0 ] . proband_id , ) else : text = line updated_vcf_file . 
append ( text ) return updated_vcf_file def construct_vcf ( self , template_vcf_name : str ) -> List [ str ]: \"\"\" Construct the entire spiked VCF file by incorporating the spiked variants into the VCF. Args: template_vcf_name (str): Name of the template VCF file. Returns: List[str]: The complete spiked VCF file content as a list of strings. \"\"\" return self . construct_header ( self . construct_vcf_records ( template_vcf_name ))","title":"VcfSpiker"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfSpiker.__init__","text":"Initialise the VcfSpiker. Parameters: Name Type Description Default vcf_contents List [ str ] Contents of the template VCF file. required proband_causative_variants List [ ProbandCausativeVariant ] List of proband causative variants. required vcf_header VcfHeader The VCF header information. required Source code in src/pheval/prepare/create_spiked_vcf.py 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 def __init__ ( self , vcf_contents : list [ str ], proband_causative_variants : list [ ProbandCausativeVariant ], vcf_header : VcfHeader , ): \"\"\" Initialise the VcfSpiker. Args: vcf_contents (List[str]): Contents of the template VCF file. proband_causative_variants (List[ProbandCausativeVariant]): List of proband causative variants. vcf_header (VcfHeader): The VCF header information. \"\"\" self . vcf_contents = vcf_contents self . proband_causative_variants = proband_causative_variants self . vcf_header = vcf_header","title":"__init__"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfSpiker.construct_header","text":"Construct the header of the VCF. Parameters: Name Type Description Default updated_vcf_records List [ str ] Updated VCF records. required Returns: Type Description List [ str ] List[str]: Constructed header as a list of strings. 
Source code in src/pheval/prepare/create_spiked_vcf.py 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 def construct_header ( self , updated_vcf_records : List [ str ]) -> List [ str ]: \"\"\" Construct the header of the VCF. Args: updated_vcf_records (List[str]): Updated VCF records. Returns: List[str]: Constructed header as a list of strings. \"\"\" updated_vcf_file = [] for line in updated_vcf_records : if line . startswith ( \"#\" ): text = line . replace ( self . vcf_header . sample_id , self . proband_causative_variants [ 0 ] . proband_id , ) else : text = line updated_vcf_file . append ( text ) return updated_vcf_file","title":"construct_header"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfSpiker.construct_variant_entry","text":"Construct variant entries. Parameters: Name Type Description Default proband_variant_data ProbandCausativeVariant Data for the proband variant. required Returns: Type Description List [ str ] List[str]: Constructed variant entry as a list of strings. Source code in src/pheval/prepare/create_spiked_vcf.py 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 def construct_variant_entry ( self , proband_variant_data : ProbandCausativeVariant ) -> List [ str ]: \"\"\" Construct variant entries. Args: proband_variant_data (ProbandCausativeVariant): Data for the proband variant. Returns: List[str]: Constructed variant entry as a list of strings. \"\"\" genotype_codes = { \"hemizygous\" : \"0/1\" , \"homozygous\" : \"1/1\" , \"heterozygous\" : \"0/1\" , \"compound heterozygous\" : \"0/1\" , } if self . vcf_header . chr_status is True and \"chr\" not in proband_variant_data . variant . chrom : proband_variant_data . variant . chrom = \"chr\" + proband_variant_data . variant . chrom return [ proband_variant_data . variant . chrom , str ( proband_variant_data . variant . 
pos ), \".\" , proband_variant_data . variant . ref , ( f \"< { proband_variant_data . variant . alt } >\" if proband_variant_data . variant . ref == \"N\" else proband_variant_data . variant . alt ), \"100\" , \"PASS\" , proband_variant_data . info if proband_variant_data . info else \".\" , \"GT\" , genotype_codes [ proband_variant_data . genotype . lower ()] + \" \\n \" , ]","title":"construct_variant_entry"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfSpiker.construct_vcf","text":"Construct the entire spiked VCF file by incorporating the spiked variants into the VCF. Parameters: Name Type Description Default template_vcf_name str Name of the template VCF file. required Returns: Type Description List [ str ] List[str]: The complete spiked VCF file content as a list of strings. Source code in src/pheval/prepare/create_spiked_vcf.py 393 394 395 396 397 398 399 400 401 402 403 def construct_vcf ( self , template_vcf_name : str ) -> List [ str ]: \"\"\" Construct the entire spiked VCF file by incorporating the spiked variants into the VCF. Args: template_vcf_name (str): Name of the template VCF file. Returns: List[str]: The complete spiked VCF file content as a list of strings. \"\"\" return self . construct_header ( self . construct_vcf_records ( template_vcf_name ))","title":"construct_vcf"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfSpiker.construct_vcf_records","text":"Construct updated VCF records by inserting spiked variants into the correct positions within the VCF. Parameters: Name Type Description Default template_vcf_name str Name of the template VCF file. required Returns: Type Description List [ str ] List[str]: Updated VCF records containing the spiked variants. 
Source code in src/pheval/prepare/create_spiked_vcf.py 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 def construct_vcf_records ( self , template_vcf_name : str ) -> List [ str ]: \"\"\" Construct updated VCF records by inserting spiked variants into the correct positions within the VCF. Args: template_vcf_name (str): Name of the template VCF file. Returns: List[str]: Updated VCF records containing the spiked variants. \"\"\" updated_vcf_records = copy ( self . vcf_contents ) for variant in self . proband_causative_variants : variant_entry = self . construct_variant_entry ( variant ) matching_indices = [ i for i , val in enumerate ( updated_vcf_records ) if val . split ( \" \\t \" )[ 0 ] == variant_entry [ 0 ] and int ( val . split ( \" \\t \" )[ 1 ]) < int ( variant_entry [ 1 ]) ] if matching_indices : variant_entry_position = matching_indices [ - 1 ] + 1 else : info_log . warning ( f \"Could not find entry position for { variant . variant . chrom } - { variant . variant . pos } -\" f \" { variant . variant . ref } - { variant . variant . alt } in { template_vcf_name } , \" \"inserting at end of VCF contents.\" ) variant_entry_position = len ( updated_vcf_records ) updated_vcf_records . insert ( variant_entry_position , \" \\t \" . join ( variant_entry )) return updated_vcf_records","title":"construct_vcf_records"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfWriter","text":"Class for writing VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 class VcfWriter : \"\"\"Class for writing VCF file.\"\"\" def __init__ ( self , vcf_contents : List [ str ], spiked_vcf_file_path : Path , ): \"\"\" Initialise the VcfWriter class. 
Args: vcf_contents (List[str]): Contents of the VCF file to be written. spiked_vcf_file_path (Path): Path to the spiked VCF file to be created. \"\"\" self . vcf_contents = vcf_contents self . spiked_vcf_file_path = spiked_vcf_file_path def write_gzip ( self ) -> None : \"\"\" Write the VCF contents to a gzipped VCF file. \"\"\" encoded_contents = [ line . encode () for line in self . vcf_contents ] with gzip . open ( self . spiked_vcf_file_path , \"wb\" ) as f : for line in encoded_contents : f . write ( line ) f . close () def write_uncompressed ( self ) -> None : \"\"\" Write the VCF contents to an uncompressed VCF file. \"\"\" with open ( self . spiked_vcf_file_path , \"w\" ) as file : file . writelines ( self . vcf_contents ) file . close () def write_vcf_file ( self ) -> None : \"\"\" Write the VCF file based on compression type. Determines the file writing method based on the compression type of the spiked VCF file path. Writes the VCF contents to the corresponding file format (gzip or uncompressed). \"\"\" self . write_gzip () if is_gzipped ( self . spiked_vcf_file_path ) else self . write_uncompressed ()","title":"VcfWriter"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfWriter.__init__","text":"Initialise the VcfWriter class. Parameters: Name Type Description Default vcf_contents List [ str ] Contents of the VCF file to be written. required spiked_vcf_file_path Path Path to the spiked VCF file to be created. required Source code in src/pheval/prepare/create_spiked_vcf.py 409 410 411 412 413 414 415 416 417 418 419 420 421 422 def __init__ ( self , vcf_contents : List [ str ], spiked_vcf_file_path : Path , ): \"\"\" Initialise the VcfWriter class. Args: vcf_contents (List[str]): Contents of the VCF file to be written. spiked_vcf_file_path (Path): Path to the spiked VCF file to be created. \"\"\" self . vcf_contents = vcf_contents self . 
spiked_vcf_file_path = spiked_vcf_file_path","title":"__init__"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfWriter.write_gzip","text":"Write the VCF contents to a gzipped VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 424 425 426 427 428 429 430 431 432 def write_gzip ( self ) -> None : \"\"\" Write the VCF contents to a gzipped VCF file. \"\"\" encoded_contents = [ line . encode () for line in self . vcf_contents ] with gzip . open ( self . spiked_vcf_file_path , \"wb\" ) as f : for line in encoded_contents : f . write ( line ) f . close ()","title":"write_gzip"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfWriter.write_uncompressed","text":"Write the VCF contents to an uncompressed VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 434 435 436 437 438 439 440 def write_uncompressed ( self ) -> None : \"\"\" Write the VCF contents to an uncompressed VCF file. \"\"\" with open ( self . spiked_vcf_file_path , \"w\" ) as file : file . writelines ( self . vcf_contents ) file . close ()","title":"write_uncompressed"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfWriter.write_vcf_file","text":"Write the VCF file based on compression type. Determines the file writing method based on the compression type of the spiked VCF file path. Writes the VCF contents to the corresponding file format (gzip or uncompressed). Source code in src/pheval/prepare/create_spiked_vcf.py 442 443 444 445 446 447 448 449 def write_vcf_file ( self ) -> None : \"\"\" Write the VCF file based on compression type. Determines the file writing method based on the compression type of the spiked VCF file path. Writes the VCF contents to the corresponding file format (gzip or uncompressed). \"\"\" self . write_gzip () if is_gzipped ( self . spiked_vcf_file_path ) else self . 
write_uncompressed ()","title":"write_vcf_file"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.check_variant_assembly","text":"Check the assembly of the variant assembly against the VCF. Parameters: Name Type Description Default proband_causative_variants List [ ProbandCausativeVariant ] A list of causative variants from the proband. required vcf_header VcfHeader An instance of VcfHeader representing the VCF file's header. required phenopacket_path Path The path to the Phenopacket file. required Raises: Type Description ValueError If there are too many or incompatible genome assemblies found. IncompatibleGenomeAssemblyError If the assembly in the Phenopacket does not match the VCF assembly. Source code in src/pheval/prepare/create_spiked_vcf.py 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 def check_variant_assembly ( proband_causative_variants : list [ ProbandCausativeVariant ], vcf_header : VcfHeader , phenopacket_path : Path , ) -> None : \"\"\" Check the assembly of the variant assembly against the VCF. Args: proband_causative_variants (List[ProbandCausativeVariant]): A list of causative variants from the proband. vcf_header (VcfHeader): An instance of VcfHeader representing the VCF file's header. phenopacket_path (Path): The path to the Phenopacket file. Raises: ValueError: If there are too many or incompatible genome assemblies found. IncompatibleGenomeAssemblyError: If the assembly in the Phenopacket does not match the VCF assembly. \"\"\" compatible_genome_assembly = { \"GRCh37\" , \"hg19\" , \"GRCh38\" , \"hg38\" } phenopacket_assembly = list ({ variant . 
assembly for variant in proband_causative_variants }) if len ( phenopacket_assembly ) > 1 : raise ValueError ( \"Too many genome assemblies!\" ) if phenopacket_assembly [ 0 ] not in compatible_genome_assembly : raise IncompatibleGenomeAssemblyError ( phenopacket_assembly , phenopacket_path ) if ( phenopacket_assembly [ 0 ] in { \"hg19\" , \"GRCh37\" } and vcf_header . assembly not in { \"hg19\" , \"GRCh37\" } ) or ( phenopacket_assembly [ 0 ] in { \"hg38\" , \"GRCh38\" } and vcf_header . assembly not in { \"hg38\" , \"GRCh38\" } ): raise IncompatibleGenomeAssemblyError ( assembly = phenopacket_assembly , phenopacket = phenopacket_path )","title":"check_variant_assembly"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.create_spiked_vcf","text":"Create a spiked VCF for a Phenopacket. Parameters: Name Type Description Default output_dir Path The directory to store the generated spiked VCF file. required phenopacket_path Path Path to the Phenopacket file. required hg19_template_vcf Path Path to the hg19 template VCF file (optional). required hg38_template_vcf Path Path to the hg38 template VCF file (optional). required hg19_vcf_dir Path The directory containing the hg19 VCF files (optional). required hg38_vcf_dir Path The directory containing the hg38 VCF files (optional). required Raises: Type Description InputError If both hg19_template_vcf and hg38_template_vcf are None. Source code in src/pheval/prepare/create_spiked_vcf.py 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 def create_spiked_vcf ( output_dir : Path , phenopacket_path : Path , hg19_template_vcf : Path , hg38_template_vcf : Path , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> None : \"\"\" Create a spiked VCF for a Phenopacket. Args: output_dir (Path): The directory to store the generated spiked VCF file. phenopacket_path (Path): Path to the Phenopacket file. 
hg19_template_vcf (Path): Path to the hg19 template VCF file (optional). hg38_template_vcf (Path): Path to the hg38 template VCF file (optional). hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional). Raises: InputError: If both hg19_template_vcf and hg38_template_vcf are None. \"\"\" if hg19_template_vcf is None and hg38_template_vcf is None : raise InputError ( \"Either a hg19 template vcf or hg38 template vcf must be specified\" ) hg19_vcf_info = VcfFile . populate_fields ( hg19_template_vcf ) if hg19_template_vcf else None hg38_vcf_info = VcfFile . populate_fields ( hg38_template_vcf ) if hg38_template_vcf else None spike_and_update_phenopacket ( hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir , output_dir , phenopacket_path )","title":"create_spiked_vcf"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.create_spiked_vcfs","text":"Create a spiked VCF for a directory of Phenopackets. Parameters: Name Type Description Default output_dir Path The directory to store the generated spiked VCF file. required phenopacket_dir Path Path to the Phenopacket directory. required hg19_template_vcf Path Path to the template hg19 VCF file (optional). required hg38_template_vcf Path Path to the template hg19 VCF file (optional). required hg19_vcf_dir Path The directory containing the hg19 VCF files (optional). required hg38_vcf_dir Path The directory containing the hg38 VCF files (optional). required Raises: Type Description InputError If both hg19_template_vcf and hg38_template_vcf are None. 
Source code in src/pheval/prepare/create_spiked_vcf.py 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 def create_spiked_vcfs ( output_dir : Path , phenopacket_dir : Path , hg19_template_vcf : Path , hg38_template_vcf : Path , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> None : \"\"\" Create a spiked VCF for a directory of Phenopackets. Args: output_dir (Path): The directory to store the generated spiked VCF file. phenopacket_dir (Path): Path to the Phenopacket directory. hg19_template_vcf (Path): Path to the template hg19 VCF file (optional). hg38_template_vcf (Path): Path to the template hg19 VCF file (optional). hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional). Raises: InputError: If both hg19_template_vcf and hg38_template_vcf are None. \"\"\" if ( hg19_template_vcf is None and hg38_template_vcf is None and hg19_vcf_dir is None and hg38_vcf_dir is None ): raise InputError ( \"Need to specify a VCF!\" ) hg19_vcf_info = VcfFile . populate_fields ( hg19_template_vcf ) if hg19_template_vcf else None hg38_vcf_info = VcfFile . populate_fields ( hg38_template_vcf ) if hg38_template_vcf else None for phenopacket_path in files_with_suffix ( phenopacket_dir , \".json\" ): spike_and_update_phenopacket ( hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir , output_dir , phenopacket_path )","title":"create_spiked_vcfs"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.generate_spiked_vcf_file","text":"Write spiked VCF contents to a new file. Parameters: Name Type Description Default output_dir Path Path to the directory to store the generated file. required phenopacket Union [ Phenopacket , Family ] Phenopacket or Family containing causative variants. required phenopacket_path Path Path to the Phenopacket file. 
required hg19_vcf_info VcfFile VCF file info for hg19 template vcf. required hg38_vcf_info VcfFile VCF file info for hg38 template vcf. required hg19_vcf_dir Path The directory containing the hg19 VCF files. required hg38_vcf_dir Path The directory containing the hg38 VCF files. required Returns: File: The generated File object representing the newly created spiked VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 def generate_spiked_vcf_file ( output_dir : Path , phenopacket : Union [ Phenopacket , Family ], phenopacket_path : Path , hg19_vcf_info : VcfFile , hg38_vcf_info : VcfFile , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> File : \"\"\" Write spiked VCF contents to a new file. Args: output_dir (Path): Path to the directory to store the generated file. phenopacket (Union[Phenopacket, Family]): Phenopacket or Family containing causative variants. phenopacket_path (Path): Path to the Phenopacket file. hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf. hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf. hg19_vcf_dir (Path): The directory containing the hg19 VCF files. hg38_vcf_dir (Path): The directory containing the hg38 VCF files. Returns: File: The generated File object representing the newly created spiked VCF file. \"\"\" output_dir . mkdir ( exist_ok = True ) info_log . info ( f \" Created a directory { output_dir } \" ) vcf_assembly , spiked_vcf = spike_vcf_contents ( phenopacket , phenopacket_path , hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir ) spiked_vcf_path = output_dir . joinpath ( phenopacket_path . name . replace ( \".json\" , \".vcf.gz\" )) VcfWriter ( spiked_vcf , spiked_vcf_path ) . write_vcf_file () return File ( uri = urllib . parse . unquote ( spiked_vcf_path . 
as_uri ()), file_attributes = { \"fileFormat\" : \"vcf\" , \"genomeAssembly\" : vcf_assembly }, )","title":"generate_spiked_vcf_file"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.read_vcf","text":"Read the contents of a VCF file into memory, handling both uncompressed and gzipped files. Parameters: Name Type Description Default vcf_file Path The path to the VCF file to be read. required Returns: Type Description List [ str ] List[str]: A list containing the lines of the VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 def read_vcf ( vcf_file : Path ) -> List [ str ]: \"\"\" Read the contents of a VCF file into memory, handling both uncompressed and gzipped files. Args: vcf_file (Path): The path to the VCF file to be read. Returns: List[str]: A list containing the lines of the VCF file. \"\"\" open_fn = gzip . open if is_gzipped ( vcf_file ) else open vcf = open_fn ( vcf_file ) vcf_contents = ( [ line . decode () for line in vcf . readlines ()] if is_gzipped ( vcf_file ) else vcf . readlines () ) vcf . close () return vcf_contents","title":"read_vcf"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.select_vcf_template","text":"Select the appropriate VCF template based on the assembly information of the proband causative variants. Parameters: Name Type Description Default phenopacket_path Path The path to the Phenopacket file. required proband_causative_variants List [ ProbandCausativeVariant ] A list of causative variants from the proband. required hg19_vcf_info VcfFile VCF file info for hg19 template vcf. required hg38_vcf_info VcfFile CF file info for hg38 template vcf. required hg19_vcf_dir Path The directory containing the hg19 VCF files. required hg38_vcf_dir Path The directory containing the hg38 VCF files. 
required Returns: Name Type Description VcfFile VcfFile The selected VCF template file based on the assembly information of the proband causative variants. Source code in src/pheval/prepare/create_spiked_vcf.py 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 def select_vcf_template ( phenopacket_path : Path , proband_causative_variants : List [ ProbandCausativeVariant ], hg19_vcf_info : VcfFile , hg38_vcf_info : VcfFile , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> VcfFile : \"\"\" Select the appropriate VCF template based on the assembly information of the proband causative variants. Args: phenopacket_path (Path): The path to the Phenopacket file. proband_causative_variants (List[ProbandCausativeVariant]): A list of causative variants from the proband. hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf. hg38_vcf_info (VcfFile): CF file info for hg38 template vcf. hg19_vcf_dir (Path): The directory containing the hg19 VCF files. hg38_vcf_dir (Path): The directory containing the hg38 VCF files. Returns: VcfFile: The selected VCF template file based on the assembly information of the proband causative variants. \"\"\" if proband_causative_variants [ 0 ] . assembly in [ \"hg19\" , \"GRCh37\" ]: if hg19_vcf_info : return hg19_vcf_info elif hg19_vcf_dir : return VcfFile . populate_fields ( random . choice ( all_files ( hg19_vcf_dir ))) else : raise InputError ( \"Must specify hg19 template VCF!\" ) elif proband_causative_variants [ 0 ] . assembly in [ \"hg38\" , \"GRCh38\" ]: if hg38_vcf_info : return hg38_vcf_info elif hg38_vcf_dir : return VcfFile . populate_fields ( random . choice ( all_files ( hg38_vcf_dir ))) else : raise InputError ( \"Must specify hg38 template VCF!\" ) else : raise IncompatibleGenomeAssemblyError ( proband_causative_variants [ 0 ] . 
assembly , phenopacket_path )","title":"select_vcf_template"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.spike_and_update_phenopacket","text":"Spike the VCF files with genetic variants relevant to the provided Phenopacket, update the Phenopacket accordingly, and write the updated Phenopacket to the specified output directory. Parameters: Name Type Description Default hg19_vcf_info VcfFile VCF file info for hg19 template vcf. required hg38_vcf_info VcfFile VCF file info for hg38 template vcf. required hg19_vcf_dir Path The directory containing the hg19 VCF files. required hg38_vcf_dir Path The directory containing the hg38 VCF files. required output_dir Path Directory where the updated Phenopacket will be saved. required phenopacket_path Path Path to the original Phenopacket file. required Returns: Type Description None None Source code in src/pheval/prepare/create_spiked_vcf.py 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 def spike_and_update_phenopacket ( hg19_vcf_info : VcfFile , hg38_vcf_info : VcfFile , hg19_vcf_dir : Path , hg38_vcf_dir : Path , output_dir : Path , phenopacket_path : Path , ) -> None : \"\"\" Spike the VCF files with genetic variants relevant to the provided Phenopacket, update the Phenopacket accordingly, and write the updated Phenopacket to the specified output directory. Args: hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf. hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf. hg19_vcf_dir (Path): The directory containing the hg19 VCF files. hg38_vcf_dir (Path): The directory containing the hg38 VCF files. output_dir (Path): Directory where the updated Phenopacket will be saved. phenopacket_path (Path): Path to the original Phenopacket file. 
Returns: None \"\"\" phenopacket = phenopacket_reader ( phenopacket_path ) spiked_vcf_file_message = generate_spiked_vcf_file ( output_dir , phenopacket , phenopacket_path , hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir , ) updated_phenopacket = PhenopacketRebuilder ( phenopacket ) . add_spiked_vcf_path ( spiked_vcf_file_message ) write_phenopacket ( updated_phenopacket , phenopacket_path )","title":"spike_and_update_phenopacket"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.spike_vcf_contents","text":"Spike VCF records with variants obtained from a Phenopacket or Family. Parameters: Name Type Description Default phenopacket Union [ Phenopacket , Family ] Phenopacket or Family containing causative variants. required phenopacket_path Path Path to the Phenopacket file. required hg19_vcf_info VcfFile VCF file info for hg19 template vcf. required hg38_vcf_info VcfFile VCF file info for hg38 template vcf. required hg19_vcf_dir Path The directory containing the hg19 VCF files. required hg38_vcf_dir Path The directory containing the hg38 VCF files. required Returns: Type Description tuple [ str , List [ str ]] A tuple containing: assembly (str): The genome assembly information extracted from VCF header. modified_vcf_contents (List[str]): Modified VCF records with spiked variants. Source code in src/pheval/prepare/create_spiked_vcf.py 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 def spike_vcf_contents ( phenopacket : Union [ Phenopacket , Family ], phenopacket_path : Path , hg19_vcf_info : VcfFile , hg38_vcf_info : VcfFile , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> tuple [ str , List [ str ]]: \"\"\" Spike VCF records with variants obtained from a Phenopacket or Family. Args: phenopacket (Union[Phenopacket, Family]): Phenopacket or Family containing causative variants. 
phenopacket_path (Path): Path to the Phenopacket file. hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf. hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf. hg19_vcf_dir (Path): The directory containing the hg19 VCF files. hg38_vcf_dir (Path): The directory containing the hg38 VCF files. Returns: A tuple containing: assembly (str): The genome assembly information extracted from VCF header. modified_vcf_contents (List[str]): Modified VCF records with spiked variants. \"\"\" phenopacket_causative_variants = PhenopacketUtil ( phenopacket ) . causative_variants () chosen_template_vcf = select_vcf_template ( phenopacket_path , phenopacket_causative_variants , hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir , ) check_variant_assembly ( phenopacket_causative_variants , chosen_template_vcf . vcf_header , phenopacket_path ) return ( chosen_template_vcf . vcf_header . assembly , VcfSpiker ( chosen_template_vcf . vcf_contents , phenopacket_causative_variants , chosen_template_vcf . vcf_header , ) . construct_vcf ( chosen_template_vcf . vcf_file_name ), )","title":"spike_vcf_contents"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.spike_vcfs","text":"Create spiked VCF from either a Phenopacket or a Phenopacket directory. Parameters: Name Type Description Default output_dir Path The directory to store the generated spiked VCF file(s). required phenopacket_path Path Path to a single Phenopacket file (optional). required phenopacket_dir Path Path to a directory containing Phenopacket files (optional). required hg19_template_vcf Path Path to the hg19 template VCF file (optional). required hg38_template_vcf Path Path to the hg38 template VCF file (optional). required hg19_vcf_dir Path The directory containing the hg19 VCF files (optional). required hg38_vcf_dir Path The directory containing the hg38 VCF files (optional). 
required Source code in src/pheval/prepare/create_spiked_vcf.py 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 def spike_vcfs ( output_dir : Path , phenopacket_path : Path , phenopacket_dir : Path , hg19_template_vcf : Path , hg38_template_vcf : Path , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> None : \"\"\" Create spiked VCF from either a Phenopacket or a Phenopacket directory. Args: output_dir (Path): The directory to store the generated spiked VCF file(s). phenopacket_path (Path): Path to a single Phenopacket file (optional). phenopacket_dir (Path): Path to a directory containing Phenopacket files (optional). hg19_template_vcf (Path): Path to the hg19 template VCF file (optional). hg38_template_vcf (Path): Path to the hg38 template VCF file (optional). hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional). \"\"\" if phenopacket_path is not None : create_spiked_vcf ( output_dir , phenopacket_path , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir , ) elif phenopacket_dir is not None : create_spiked_vcfs ( output_dir , phenopacket_dir , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir , )","title":"spike_vcfs"},{"location":"api/pheval/prepare/custom_exceptions/","text":"InputError Bases: Exception Exception raised for missing required inputs. Source code in src/pheval/prepare/custom_exceptions.py 4 5 6 7 8 9 10 11 12 13 class InputError ( Exception ): \"\"\"Exception raised for missing required inputs.\"\"\" def __init__ ( self , file , message = \"Missing required input\" ): self . file : str = file self . message : str = message super () . __init__ ( self . message ) def __str__ ( self ): return f \" { self . message } -> { self . 
file } \" MutuallyExclusiveOptionError Bases: Option Exception raised for when Source code in src/pheval/prepare/custom_exceptions.py 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 class MutuallyExclusiveOptionError ( Option ): \"\"\"Exception raised for when\"\"\" def __init__ ( self , * args , ** kwargs ): self . mutually_exclusive = set ( kwargs . pop ( \"mutually_exclusive\" , [])) help_ = kwargs . get ( \"help\" , \"\" ) if self . mutually_exclusive : ex_str = \", \" . join ( self . mutually_exclusive ) kwargs [ \"help\" ] = help_ + ( \" NOTE: This argument is mutually exclusive with \" \" arguments: [\" + ex_str + \"].\" ) super ( MutuallyExclusiveOptionError , self ) . __init__ ( * args , ** kwargs ) def handle_parse_result ( self , ctx , opts , args ): if self . mutually_exclusive . intersection ( opts ) and self . name in opts : raise UsageError ( \"Illegal usage: ` {} ` is mutually exclusive with \" \"arguments ` {} `.\" . format ( self . name , \", \" . join ( self . mutually_exclusive )) ) return super ( MutuallyExclusiveOptionError , self ) . handle_parse_result ( ctx , opts , args )","title":"Custom exceptions"},{"location":"api/pheval/prepare/custom_exceptions/#src.pheval.prepare.custom_exceptions.InputError","text":"Bases: Exception Exception raised for missing required inputs. Source code in src/pheval/prepare/custom_exceptions.py 4 5 6 7 8 9 10 11 12 13 class InputError ( Exception ): \"\"\"Exception raised for missing required inputs.\"\"\" def __init__ ( self , file , message = \"Missing required input\" ): self . file : str = file self . message : str = message super () . __init__ ( self . message ) def __str__ ( self ): return f \" { self . message } -> { self . 
file } \"","title":"InputError"},{"location":"api/pheval/prepare/custom_exceptions/#src.pheval.prepare.custom_exceptions.MutuallyExclusiveOptionError","text":"Bases: Option Exception raised for when Source code in src/pheval/prepare/custom_exceptions.py 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 class MutuallyExclusiveOptionError ( Option ): \"\"\"Exception raised for when\"\"\" def __init__ ( self , * args , ** kwargs ): self . mutually_exclusive = set ( kwargs . pop ( \"mutually_exclusive\" , [])) help_ = kwargs . get ( \"help\" , \"\" ) if self . mutually_exclusive : ex_str = \", \" . join ( self . mutually_exclusive ) kwargs [ \"help\" ] = help_ + ( \" NOTE: This argument is mutually exclusive with \" \" arguments: [\" + ex_str + \"].\" ) super ( MutuallyExclusiveOptionError , self ) . __init__ ( * args , ** kwargs ) def handle_parse_result ( self , ctx , opts , args ): if self . mutually_exclusive . intersection ( opts ) and self . name in opts : raise UsageError ( \"Illegal usage: ` {} ` is mutually exclusive with \" \"arguments ` {} `.\" . format ( self . name , \", \" . join ( self . mutually_exclusive )) ) return super ( MutuallyExclusiveOptionError , self ) . handle_parse_result ( ctx , opts , args )","title":"MutuallyExclusiveOptionError"},{"location":"api/pheval/prepare/prepare_corpus/","text":"prepare_corpus ( phenopacket_dir , variant_analysis , gene_analysis , disease_analysis , gene_identifier , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir , output_dir ) Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating gene identifiers. Parameters: Name Type Description Default phenopacket_dir Path The path to the directory containing Phenopackets. required variant_analysis bool If True, check for complete variant records in the Phenopackets. required gene_analysis bool If True, check for complete gene records in the Phenopackets. 
required disease_analysis bool If True, check for complete disease records in the Phenopackets. required gene_identifier str Identifier for updating gene identifiers, if applicable. required hg19_template_vcf Path Path to the hg19 template VCF file (optional), to spike variants into required hg38_template_vcf Path Path to the hg38 template VCF file (optional), to spike variants into required hg19_vcf_dir Path Path to the directory containing hg19 template VCF files (optional). required hg38_vcf_dir Path Path to the directory containing hg38 template VCF files (optional). required output_dir Path The directory to save the prepared Phenopackets and, optionally, VCF files. required Notes: To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir or hg38_vcf_dir is required. Source code in src/pheval/prepare/prepare_corpus.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 def prepare_corpus ( phenopacket_dir : Path , variant_analysis : bool , gene_analysis : bool , disease_analysis : bool , gene_identifier : str , hg19_template_vcf : Path , hg38_template_vcf : Path , hg19_vcf_dir : Path , hg38_vcf_dir : Path , output_dir : Path , ) -> None : \"\"\" Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating gene identifiers. Args: phenopacket_dir (Path): The path to the directory containing Phenopackets. variant_analysis (bool): If True, check for complete variant records in the Phenopackets. gene_analysis (bool): If True, check for complete gene records in the Phenopackets. disease_analysis (bool): If True, check for complete disease records in the Phenopackets. gene_identifier (str): Identifier for updating gene identifiers, if applicable. 
hg19_template_vcf (Path): Path to the hg19 template VCF file (optional), to spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required. hg38_template_vcf (Path): Path to the hg38 template VCF file (optional), to spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required. hg19_vcf_dir (Path): Path to the directory containing hg19 template VCF files (optional). hg38_vcf_dir (Path): Path to the directory containing hg38 template VCF files (optional). output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files. Notes: To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir or hg38_vcf_dir is required. \"\"\" output_dir . joinpath ( \"phenopackets\" ) . mkdir ( exist_ok = True , parents = True ) for phenopacket_path in all_files ( phenopacket_dir ): phenopacket_util = PhenopacketUtil ( phenopacket_reader ( phenopacket_path )) if not phenopacket_util . observed_phenotypic_features (): info_log . warning ( f \"Removed { phenopacket_path . name } from the corpus due to no observed phenotypic features.\" ) continue if variant_analysis : if phenopacket_util . check_incomplete_variant_record (): info_log . warning ( f \"Removed { phenopacket_path . name } from the corpus due to missing variant fields.\" ) continue if gene_analysis : if phenopacket_util . check_incomplete_gene_record (): info_log . warning ( f \"Removed { phenopacket_path . name } from the corpus due to missing gene fields.\" ) continue if disease_analysis : if phenopacket_util . check_incomplete_disease_record (): info_log . warning ( f \"Removed { phenopacket_path . name } from the corpus due to missing disease fields.\" ) continue if hg19_template_vcf or hg38_template_vcf : output_dir . joinpath ( \"vcf\" ) . mkdir ( exist_ok = True ) create_spiked_vcf ( output_dir . 
joinpath ( \"vcf\" ), phenopacket_path , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir , ) if gene_identifier : create_updated_phenopacket ( gene_identifier , phenopacket_path , output_dir . joinpath ( \"phenopackets\" ) ) else : # if not updating phenopacket gene identifiers then copy phenopacket as is to output directory shutil . copy ( phenopacket_path , output_dir . joinpath ( f \"phenopackets/ { phenopacket_path . name } \" ) )","title":"Prepare corpus"},{"location":"api/pheval/prepare/prepare_corpus/#src.pheval.prepare.prepare_corpus.prepare_corpus","text":"Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating gene identifiers. Parameters: Name Type Description Default phenopacket_dir Path The path to the directory containing Phenopackets. required variant_analysis bool If True, check for complete variant records in the Phenopackets. required gene_analysis bool If True, check for complete gene records in the Phenopackets. required disease_analysis bool If True, check for complete disease records in the Phenopackets. required gene_identifier str Identifier for updating gene identifiers, if applicable. required hg19_template_vcf Path Path to the hg19 template VCF file (optional), to spike variants into required hg38_template_vcf Path Path to the hg38 template VCF file (optional), to spike variants into required hg19_vcf_dir Path Path to the directory containing hg19 template VCF files (optional). required hg38_vcf_dir Path Path to the directory containing hg38 template VCF files (optional). required output_dir Path The directory to save the prepared Phenopackets and, optionally, VCF files. required Notes: To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir or hg38_vcf_dir is required. 
Source code in src/pheval/prepare/prepare_corpus.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 def prepare_corpus ( phenopacket_dir : Path , variant_analysis : bool , gene_analysis : bool , disease_analysis : bool , gene_identifier : str , hg19_template_vcf : Path , hg38_template_vcf : Path , hg19_vcf_dir : Path , hg38_vcf_dir : Path , output_dir : Path , ) -> None : \"\"\" Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating gene identifiers. Args: phenopacket_dir (Path): The path to the directory containing Phenopackets. variant_analysis (bool): If True, check for complete variant records in the Phenopackets. gene_analysis (bool): If True, check for complete gene records in the Phenopackets. disease_analysis (bool): If True, check for complete disease records in the Phenopackets. gene_identifier (str): Identifier for updating gene identifiers, if applicable. hg19_template_vcf (Path): Path to the hg19 template VCF file (optional), to spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required. hg38_template_vcf (Path): Path to the hg38 template VCF file (optional), to spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required. hg19_vcf_dir (Path): Path to the directory containing hg19 template VCF files (optional). hg38_vcf_dir (Path): Path to the directory containing hg38 template VCF files (optional). output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files. Notes: To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir or hg38_vcf_dir is required. \"\"\" output_dir . joinpath ( \"phenopackets\" ) . 
mkdir ( exist_ok = True , parents = True ) for phenopacket_path in all_files ( phenopacket_dir ): phenopacket_util = PhenopacketUtil ( phenopacket_reader ( phenopacket_path )) if not phenopacket_util . observed_phenotypic_features (): info_log . warning ( f \"Removed { phenopacket_path . name } from the corpus due to no observed phenotypic features.\" ) continue if variant_analysis : if phenopacket_util . check_incomplete_variant_record (): info_log . warning ( f \"Removed { phenopacket_path . name } from the corpus due to missing variant fields.\" ) continue if gene_analysis : if phenopacket_util . check_incomplete_gene_record (): info_log . warning ( f \"Removed { phenopacket_path . name } from the corpus due to missing gene fields.\" ) continue if disease_analysis : if phenopacket_util . check_incomplete_disease_record (): info_log . warning ( f \"Removed { phenopacket_path . name } from the corpus due to missing disease fields.\" ) continue if hg19_template_vcf or hg38_template_vcf : output_dir . joinpath ( \"vcf\" ) . mkdir ( exist_ok = True ) create_spiked_vcf ( output_dir . joinpath ( \"vcf\" ), phenopacket_path , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir , ) if gene_identifier : create_updated_phenopacket ( gene_identifier , phenopacket_path , output_dir . joinpath ( \"phenopackets\" ) ) else : # if not updating phenopacket gene identifiers then copy phenopacket as is to output directory shutil . copy ( phenopacket_path , output_dir . joinpath ( f \"phenopackets/ { phenopacket_path . name } \" ) )","title":"prepare_corpus"},{"location":"api/pheval/prepare/update_phenopacket/","text":"create_updated_phenopacket ( gene_identifier , phenopacket_path , output_dir ) Update the gene context within the interpretations for a Phenopacket and writes the updated Phenopacket. Parameters: Name Type Description Default gene_identifier str Identifier used to update the gene context. 
required phenopacket_path Path The path to the input Phenopacket file. required output_dir Path The directory where the updated Phenopacket will be written. required Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. Source code in src/pheval/prepare/update_phenopacket.py 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 def create_updated_phenopacket ( gene_identifier : str , phenopacket_path : Path , output_dir : Path ) -> None : \"\"\" Update the gene context within the interpretations for a Phenopacket and writes the updated Phenopacket. Args: gene_identifier (str): Identifier used to update the gene context. phenopacket_path (Path): The path to the input Phenopacket file. output_dir (Path): The directory where the updated Phenopacket will be written. Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. \"\"\" hgnc_data = create_hgnc_dict () updated_phenopacket = update_outdated_gene_context ( phenopacket_path , gene_identifier , hgnc_data ) write_phenopacket ( updated_phenopacket , output_dir . joinpath ( phenopacket_path . name )) create_updated_phenopackets ( gene_identifier , phenopacket_dir , output_dir ) Update the gene context within the interpretations for a directory of Phenopackets and writes the updated Phenopackets. Parameters: Name Type Description Default gene_identifier str Identifier used to update the gene context. required phenopacket_dir Path The path to the input Phenopacket directory. required output_dir Path The directory where the updated Phenopackets will be written. 
required Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. Source code in src/pheval/prepare/update_phenopacket.py 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 def create_updated_phenopackets ( gene_identifier : str , phenopacket_dir : Path , output_dir : Path ) -> None : \"\"\" Update the gene context within the interpretations for a directory of Phenopackets and writes the updated Phenopackets. Args: gene_identifier (str): Identifier used to update the gene context. phenopacket_dir (Path): The path to the input Phenopacket directory. output_dir (Path): The directory where the updated Phenopackets will be written. Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. \"\"\" hgnc_data = create_hgnc_dict () for phenopacket_path in all_files ( phenopacket_dir ): updated_phenopacket = update_outdated_gene_context ( phenopacket_path , gene_identifier , hgnc_data ) write_phenopacket ( updated_phenopacket , output_dir . joinpath ( phenopacket_path . name )) update_outdated_gene_context ( phenopacket_path , gene_identifier , hgnc_data ) Update the gene context of the Phenopacket. Parameters: Name Type Description Default phenopacket_path Path The path to the Phenopacket file. required gene_identifier str Identifier to update the gene context. required hgnc_data defaultdict The HGNC data used for updating. required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family]: The updated Phenopacket or Family. Notes: This function updates the gene context within the Phenopacket or Family instance. 
The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. Source code in src/pheval/prepare/update_phenopacket.py 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 def update_outdated_gene_context ( phenopacket_path : Path , gene_identifier : str , hgnc_data : defaultdict ) -> Union [ Phenopacket , Family ]: \"\"\" Update the gene context of the Phenopacket. Args: phenopacket_path (Path): The path to the Phenopacket file. gene_identifier (str): Identifier to update the gene context. hgnc_data (defaultdict): The HGNC data used for updating. Returns: Union[Phenopacket, Family]: The updated Phenopacket or Family. Notes: This function updates the gene context within the Phenopacket or Family instance. The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. \"\"\" phenopacket = phenopacket_reader ( phenopacket_path ) interpretations = PhenopacketUtil ( phenopacket ) . interpretations () updated_interpretations = GeneIdentifierUpdater ( hgnc_data = hgnc_data , gene_identifier = gene_identifier ) . update_genomic_interpretations_gene_identifier ( interpretations , phenopacket_path ) return PhenopacketRebuilder ( phenopacket ) . update_interpretations ( updated_interpretations ) update_phenopackets ( gene_identifier , phenopacket_path , phenopacket_dir , output_dir ) Update the gene identifiers in either a single phenopacket or a directory of phenopackets. Parameters: Name Type Description Default gene_identifier str The gene identifier to be updated. required phenopacket_path Path The path to a single Phenopacket file. required phenopacket_dir Path The directory containing multiple Phenopacket files. 
required output_dir Path The output directory to save the updated Phenopacket files. required Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. Source code in src/pheval/prepare/update_phenopacket.py 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 def update_phenopackets ( gene_identifier : str , phenopacket_path : Path , phenopacket_dir : Path , output_dir : Path ) -> None : \"\"\" Update the gene identifiers in either a single phenopacket or a directory of phenopackets. Args: gene_identifier (str): The gene identifier to be updated. phenopacket_path (Path): The path to a single Phenopacket file. phenopacket_dir (Path): The directory containing multiple Phenopacket files. output_dir (Path): The output directory to save the updated Phenopacket files. Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. \"\"\" output_dir . mkdir ( exist_ok = True ) if phenopacket_path is not None : create_updated_phenopacket ( gene_identifier , phenopacket_path , output_dir ) elif phenopacket_dir is not None : create_updated_phenopackets ( gene_identifier , phenopacket_dir , output_dir )","title":"Update phenopacket"},{"location":"api/pheval/prepare/update_phenopacket/#src.pheval.prepare.update_phenopacket.create_updated_phenopacket","text":"Update the gene context within the interpretations for a Phenopacket and writes the updated Phenopacket. Parameters: Name Type Description Default gene_identifier str Identifier used to update the gene context. required phenopacket_path Path The path to the input Phenopacket file. required output_dir Path The directory where the updated Phenopacket will be written. 
required Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. Source code in src/pheval/prepare/update_phenopacket.py 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 def create_updated_phenopacket ( gene_identifier : str , phenopacket_path : Path , output_dir : Path ) -> None : \"\"\" Update the gene context within the interpretations for a Phenopacket and writes the updated Phenopacket. Args: gene_identifier (str): Identifier used to update the gene context. phenopacket_path (Path): The path to the input Phenopacket file. output_dir (Path): The directory where the updated Phenopacket will be written. Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. \"\"\" hgnc_data = create_hgnc_dict () updated_phenopacket = update_outdated_gene_context ( phenopacket_path , gene_identifier , hgnc_data ) write_phenopacket ( updated_phenopacket , output_dir . joinpath ( phenopacket_path . name ))","title":"create_updated_phenopacket"},{"location":"api/pheval/prepare/update_phenopacket/#src.pheval.prepare.update_phenopacket.create_updated_phenopackets","text":"Update the gene context within the interpretations for a directory of Phenopackets and writes the updated Phenopackets. Parameters: Name Type Description Default gene_identifier str Identifier used to update the gene context. required phenopacket_dir Path The path to the input Phenopacket directory. required output_dir Path The directory where the updated Phenopackets will be written. required Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. 
We recommend using the ENSEMBL namespace to describe the gene identifiers. Source code in src/pheval/prepare/update_phenopacket.py 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 def create_updated_phenopackets ( gene_identifier : str , phenopacket_dir : Path , output_dir : Path ) -> None : \"\"\" Update the gene context within the interpretations for a directory of Phenopackets and writes the updated Phenopackets. Args: gene_identifier (str): Identifier used to update the gene context. phenopacket_dir (Path): The path to the input Phenopacket directory. output_dir (Path): The directory where the updated Phenopackets will be written. Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. \"\"\" hgnc_data = create_hgnc_dict () for phenopacket_path in all_files ( phenopacket_dir ): updated_phenopacket = update_outdated_gene_context ( phenopacket_path , gene_identifier , hgnc_data ) write_phenopacket ( updated_phenopacket , output_dir . joinpath ( phenopacket_path . name ))","title":"create_updated_phenopackets"},{"location":"api/pheval/prepare/update_phenopacket/#src.pheval.prepare.update_phenopacket.update_outdated_gene_context","text":"Update the gene context of the Phenopacket. Parameters: Name Type Description Default phenopacket_path Path The path to the Phenopacket file. required gene_identifier str Identifier to update the gene context. required hgnc_data defaultdict The HGNC data used for updating. required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family]: The updated Phenopacket or Family. Notes: This function updates the gene context within the Phenopacket or Family instance. The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. 
We recommend using the ENSEMBL namespace to describe the gene identifiers. Source code in src/pheval/prepare/update_phenopacket.py 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 def update_outdated_gene_context ( phenopacket_path : Path , gene_identifier : str , hgnc_data : defaultdict ) -> Union [ Phenopacket , Family ]: \"\"\" Update the gene context of the Phenopacket. Args: phenopacket_path (Path): The path to the Phenopacket file. gene_identifier (str): Identifier to update the gene context. hgnc_data (defaultdict): The HGNC data used for updating. Returns: Union[Phenopacket, Family]: The updated Phenopacket or Family. Notes: This function updates the gene context within the Phenopacket or Family instance. The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. \"\"\" phenopacket = phenopacket_reader ( phenopacket_path ) interpretations = PhenopacketUtil ( phenopacket ) . interpretations () updated_interpretations = GeneIdentifierUpdater ( hgnc_data = hgnc_data , gene_identifier = gene_identifier ) . update_genomic_interpretations_gene_identifier ( interpretations , phenopacket_path ) return PhenopacketRebuilder ( phenopacket ) . update_interpretations ( updated_interpretations )","title":"update_outdated_gene_context"},{"location":"api/pheval/prepare/update_phenopacket/#src.pheval.prepare.update_phenopacket.update_phenopackets","text":"Update the gene identifiers in either a single phenopacket or a directory of phenopackets. Parameters: Name Type Description Default gene_identifier str The gene identifier to be updated. required phenopacket_path Path The path to a single Phenopacket file. required phenopacket_dir Path The directory containing multiple Phenopacket files. required output_dir Path The output directory to save the updated Phenopacket files. 
required Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. Source code in src/pheval/prepare/update_phenopacket.py 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 def update_phenopackets ( gene_identifier : str , phenopacket_path : Path , phenopacket_dir : Path , output_dir : Path ) -> None : \"\"\" Update the gene identifiers in either a single phenopacket or a directory of phenopackets. Args: gene_identifier (str): The gene identifier to be updated. phenopacket_path (Path): The path to a single Phenopacket file. phenopacket_dir (Path): The directory containing multiple Phenopacket files. output_dir (Path): The output directory to save the updated Phenopacket files. Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. \"\"\" output_dir . 
mkdir ( exist_ok = True ) if phenopacket_path is not None : create_updated_phenopacket ( gene_identifier , phenopacket_path , output_dir ) elif phenopacket_dir is not None : create_updated_phenopackets ( gene_identifier , phenopacket_dir , output_dir )","title":"update_phenopackets"},{"location":"api/pheval/runners/runner/","text":"Runners Module DefaultPhEvalRunner Bases: PhEvalRunner DefaultPhEvalRunner Parameters: Name Type Description Default PhEvalRunner PhEvalRunner Abstract PhEvalRunnerClass required Source code in src/pheval/runners/runner.py 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 class DefaultPhEvalRunner ( PhEvalRunner ): \"\"\"DefaultPhEvalRunner Args: PhEvalRunner (PhEvalRunner): Abstract PhEvalRunnerClass \"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str def prepare ( self ): print ( \"preparing\" ) def run ( self ): print ( \"running\" ) def post_process ( self ): print ( \"post processing\" ) PhEvalRunner dataclass Bases: ABC PhEvalRunner Class Source code in src/pheval/runners/runner.py 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 @dataclass class PhEvalRunner ( ABC ): \"\"\"PhEvalRunner Class\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str directory_path = None input_dir_config = None _meta_data = None __raw_results_dir = \"raw_results/\" __pheval_gene_results_dir = \"pheval_gene_results/\" __pheval_variant_results_dir = \"pheval_variant_results/\" __pheval_disease_results_dir = \"pheval_disease_results/\" __tool_input_commands_dir = 
\"tool_input_commands/\" __run_meta_data_file = \"results.yml\" def __post_init__ ( self ): self . input_dir_config = parse_input_dir_config ( self . input_dir ) def _get_tool ( self ): return self . input_dir_config . tool def _get_variant_analysis ( self ): return self . input_dir_config . variant_analysis def _get_gene_analysis ( self ): return self . input_dir_config . gene_analysis def _get_disease_analysis ( self ): return self . input_dir_config . disease_analysis @property def tool_input_commands_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __tool_input_commands_dir ) @tool_input_commands_dir . setter def tool_input_commands_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def raw_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __raw_results_dir ) @raw_results_dir . setter def raw_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_gene_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_gene_results_dir ) @pheval_gene_results_dir . setter def pheval_gene_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_variant_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_variant_results_dir ) @pheval_variant_results_dir . setter def pheval_variant_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_disease_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_disease_results_dir ) @pheval_disease_results_dir . setter def pheval_disease_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) def build_output_directory_structure ( self ): \"\"\"build output directory structure\"\"\" self . tool_input_commands_dir . mkdir ( exist_ok = True ) self . raw_results_dir . 
mkdir ( exist_ok = True ) if self . _get_variant_analysis (): self . pheval_variant_results_dir . mkdir ( exist_ok = True ) if self . _get_gene_analysis (): self . pheval_gene_results_dir . mkdir ( exist_ok = True ) if self . _get_disease_analysis (): self . pheval_disease_results_dir . mkdir ( exist_ok = True ) @property def meta_data ( self ): self . _meta_data = BasicOutputRunMetaData ( tool = self . input_dir_config . tool , tool_version = self . version , config = f \" { Path ( self . input_dir ) . parent . name } / { Path ( self . input_dir ) . name } \" , run_timestamp = datetime . now () . timestamp (), corpus = f \" { Path ( self . testdata_dir ) . parent . name } / { Path ( self . testdata_dir ) . name } \" , ) return self . _meta_data @meta_data . setter def meta_data ( self , meta_data ): self . _meta_data = meta_data @abstractmethod def prepare ( self ) -> str : \"\"\"prepare\"\"\" @abstractmethod def run ( self ): \"\"\"run\"\"\" @abstractmethod def post_process ( self ): \"\"\"post_process\"\"\" def construct_meta_data ( self ): \"\"\"Construct run output meta data\"\"\" return self . meta_data build_output_directory_structure () build output directory structure Source code in src/pheval/runners/runner.py 87 88 89 90 91 92 93 94 95 96 def build_output_directory_structure ( self ): \"\"\"build output directory structure\"\"\" self . tool_input_commands_dir . mkdir ( exist_ok = True ) self . raw_results_dir . mkdir ( exist_ok = True ) if self . _get_variant_analysis (): self . pheval_variant_results_dir . mkdir ( exist_ok = True ) if self . _get_gene_analysis (): self . pheval_gene_results_dir . mkdir ( exist_ok = True ) if self . _get_disease_analysis (): self . pheval_disease_results_dir . mkdir ( exist_ok = True ) construct_meta_data () Construct run output meta data Source code in src/pheval/runners/runner.py 125 126 127 def construct_meta_data ( self ): \"\"\"Construct run output meta data\"\"\" return self . 
meta_data post_process () abstractmethod post_process Source code in src/pheval/runners/runner.py 121 122 123 @abstractmethod def post_process ( self ): \"\"\"post_process\"\"\" prepare () abstractmethod prepare Source code in src/pheval/runners/runner.py 113 114 115 @abstractmethod def prepare ( self ) -> str : \"\"\"prepare\"\"\" run () abstractmethod run Source code in src/pheval/runners/runner.py 117 118 119 @abstractmethod def run ( self ): \"\"\"run\"\"\"","title":"Runner"},{"location":"api/pheval/runners/runner/#src.pheval.runners.runner.DefaultPhEvalRunner","text":"Bases: PhEvalRunner DefaultPhEvalRunner Parameters: Name Type Description Default PhEvalRunner PhEvalRunner Abstract PhEvalRunnerClass required Source code in src/pheval/runners/runner.py 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 class DefaultPhEvalRunner ( PhEvalRunner ): \"\"\"DefaultPhEvalRunner Args: PhEvalRunner (PhEvalRunner): Abstract PhEvalRunnerClass \"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str def prepare ( self ): print ( \"preparing\" ) def run ( self ): print ( \"running\" ) def post_process ( self ): print ( \"post processing\" )","title":"DefaultPhEvalRunner"},{"location":"api/pheval/runners/runner/#src.pheval.runners.runner.PhEvalRunner","text":"Bases: ABC PhEvalRunner Class Source code in src/pheval/runners/runner.py 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 @dataclass class PhEvalRunner ( ABC ): \"\"\"PhEvalRunner Class\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str directory_path 
= None input_dir_config = None _meta_data = None __raw_results_dir = \"raw_results/\" __pheval_gene_results_dir = \"pheval_gene_results/\" __pheval_variant_results_dir = \"pheval_variant_results/\" __pheval_disease_results_dir = \"pheval_disease_results/\" __tool_input_commands_dir = \"tool_input_commands/\" __run_meta_data_file = \"results.yml\" def __post_init__ ( self ): self . input_dir_config = parse_input_dir_config ( self . input_dir ) def _get_tool ( self ): return self . input_dir_config . tool def _get_variant_analysis ( self ): return self . input_dir_config . variant_analysis def _get_gene_analysis ( self ): return self . input_dir_config . gene_analysis def _get_disease_analysis ( self ): return self . input_dir_config . disease_analysis @property def tool_input_commands_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __tool_input_commands_dir ) @tool_input_commands_dir . setter def tool_input_commands_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def raw_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __raw_results_dir ) @raw_results_dir . setter def raw_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_gene_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_gene_results_dir ) @pheval_gene_results_dir . setter def pheval_gene_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_variant_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_variant_results_dir ) @pheval_variant_results_dir . setter def pheval_variant_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_disease_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_disease_results_dir ) @pheval_disease_results_dir . 
setter def pheval_disease_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) def build_output_directory_structure ( self ): \"\"\"build output directory structure\"\"\" self . tool_input_commands_dir . mkdir ( exist_ok = True ) self . raw_results_dir . mkdir ( exist_ok = True ) if self . _get_variant_analysis (): self . pheval_variant_results_dir . mkdir ( exist_ok = True ) if self . _get_gene_analysis (): self . pheval_gene_results_dir . mkdir ( exist_ok = True ) if self . _get_disease_analysis (): self . pheval_disease_results_dir . mkdir ( exist_ok = True ) @property def meta_data ( self ): self . _meta_data = BasicOutputRunMetaData ( tool = self . input_dir_config . tool , tool_version = self . version , config = f \" { Path ( self . input_dir ) . parent . name } / { Path ( self . input_dir ) . name } \" , run_timestamp = datetime . now () . timestamp (), corpus = f \" { Path ( self . testdata_dir ) . parent . name } / { Path ( self . testdata_dir ) . name } \" , ) return self . _meta_data @meta_data . setter def meta_data ( self , meta_data ): self . _meta_data = meta_data @abstractmethod def prepare ( self ) -> str : \"\"\"prepare\"\"\" @abstractmethod def run ( self ): \"\"\"run\"\"\" @abstractmethod def post_process ( self ): \"\"\"post_process\"\"\" def construct_meta_data ( self ): \"\"\"Construct run output meta data\"\"\" return self . meta_data","title":"PhEvalRunner"},{"location":"api/pheval/runners/runner/#src.pheval.runners.runner.PhEvalRunner.build_output_directory_structure","text":"build output directory structure Source code in src/pheval/runners/runner.py 87 88 89 90 91 92 93 94 95 96 def build_output_directory_structure ( self ): \"\"\"build output directory structure\"\"\" self . tool_input_commands_dir . mkdir ( exist_ok = True ) self . raw_results_dir . mkdir ( exist_ok = True ) if self . _get_variant_analysis (): self . pheval_variant_results_dir . mkdir ( exist_ok = True ) if self . 
_get_gene_analysis (): self . pheval_gene_results_dir . mkdir ( exist_ok = True ) if self . _get_disease_analysis (): self . pheval_disease_results_dir . mkdir ( exist_ok = True )","title":"build_output_directory_structure"},{"location":"api/pheval/runners/runner/#src.pheval.runners.runner.PhEvalRunner.construct_meta_data","text":"Construct run output meta data Source code in src/pheval/runners/runner.py 125 126 127 def construct_meta_data ( self ): \"\"\"Construct run output meta data\"\"\" return self . meta_data","title":"construct_meta_data"},{"location":"api/pheval/runners/runner/#src.pheval.runners.runner.PhEvalRunner.post_process","text":"post_process Source code in src/pheval/runners/runner.py 121 122 123 @abstractmethod def post_process ( self ): \"\"\"post_process\"\"\"","title":"post_process"},{"location":"api/pheval/runners/runner/#src.pheval.runners.runner.PhEvalRunner.prepare","text":"prepare Source code in src/pheval/runners/runner.py 113 114 115 @abstractmethod def prepare ( self ) -> str : \"\"\"prepare\"\"\"","title":"prepare"},{"location":"api/pheval/runners/runner/#src.pheval.runners.runner.PhEvalRunner.run","text":"run Source code in src/pheval/runners/runner.py 117 118 119 @abstractmethod def run ( self ): \"\"\"run\"\"\"","title":"run"},{"location":"api/pheval/utils/exomiser/","text":"semsim_to_exomiserdb ( input_path , object_prefix , subject_prefix , db_path ) ingests semsim file into exomiser phenotypic database Parameters: Name Type Description Default input_path Path semsim input file. e.g phenio-plus-hp-mp.0.semsimian.tsv required object_prefix str object prefix. e.g. MP required subject_prefix str subject prefix e.g HP required db_path Path Exomiser Phenotypic Database Folder Path. (e.g. 
/exomiser_folder/2209_phenotype/2209_phenotype/) required Source code in src/pheval/utils/exomiser.py 6 7 8 9 10 11 12 13 14 15 16 def semsim_to_exomiserdb ( input_path : Path , object_prefix : str , subject_prefix : str , db_path : Path ): \"\"\"ingests semsim file into exomiser phenotypic database Args: input_path (Path): semsim input file. e.g phenio-plus-hp-mp.0.semsimian.tsv object_prefix (str): object prefix. e.g. MP subject_prefix (str): subject prefix e.g HP db_path (Path): Exomiser Phenotypic Database Folder Path. (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/) \"\"\" exomiserdb = ExomiserDB ( db_path ) exomiserdb . import_from_semsim_file ( input_path , object_prefix , subject_prefix )","title":"Exomiser"},{"location":"api/pheval/utils/exomiser/#src.pheval.utils.exomiser.semsim_to_exomiserdb","text":"ingests semsim file into exomiser phenotypic database Parameters: Name Type Description Default input_path Path semsim input file. e.g phenio-plus-hp-mp.0.semsimian.tsv required object_prefix str object prefix. e.g. MP required subject_prefix str subject prefix e.g HP required db_path Path Exomiser Phenotypic Database Folder Path. (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/) required Source code in src/pheval/utils/exomiser.py 6 7 8 9 10 11 12 13 14 15 16 def semsim_to_exomiserdb ( input_path : Path , object_prefix : str , subject_prefix : str , db_path : Path ): \"\"\"ingests semsim file into exomiser phenotypic database Args: input_path (Path): semsim input file. e.g phenio-plus-hp-mp.0.semsimian.tsv object_prefix (str): object prefix. e.g. MP subject_prefix (str): subject prefix e.g HP db_path (Path): Exomiser Phenotypic Database Folder Path. (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/) \"\"\" exomiserdb = ExomiserDB ( db_path ) exomiserdb . 
import_from_semsim_file ( input_path , object_prefix , subject_prefix )","title":"semsim_to_exomiserdb"},{"location":"api/pheval/utils/file_utils/","text":"all_files ( directory ) Obtains all files from a given directory. Parameters: Name Type Description Default directory Path The directory path. required Returns: Type Description list [ Path ] list[Path]: A list of Path objects representing all files in the directory. Source code in src/pheval/utils/file_utils.py 31 32 33 34 35 36 37 38 39 40 41 42 43 def all_files ( directory : Path ) -> list [ Path ]: \"\"\" Obtains all files from a given directory. Args: directory (Path): The directory path. Returns: list[Path]: A list of Path objects representing all files in the directory. \"\"\" files = [ file_path for file_path in directory . iterdir ()] files . sort () return files ensure_columns_exists ( cols , dataframes , err_message = '' ) Ensures the columns exist in dataframes passed as argument (e.g) \" ensure_columns_exists( cols=['column_a', 'column_b, 'column_c'], err_message=\"Custom error message if any column doesn't exist in any dataframe passed as argument\", dataframes=[data_frame1, data_frame2], ) \" Source code in src/pheval/utils/file_utils.py 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 def ensure_columns_exists ( cols : list , dataframes : List [ pd . DataFrame ], err_message : str = \"\" ): \"\"\"Ensures the columns exist in dataframes passed as argument (e.g) \" ensure_columns_exists( cols=['column_a', 'column_b, 'column_c'], err_message=\"Custom error message if any column doesn't exist in any dataframe passed as argument\", dataframes=[data_frame1, data_frame2], ) \" \"\"\" flat_cols = list ( itertools . chain ( cols )) if not dataframes or not flat_cols : return if err_message : err_msg = f \"\"\"columns: { \", \" . join ( flat_cols [: - 1 ]) } and { flat_cols [ - 1 ] } { err_message } \"\"\" else : err_msg = f \"\"\"columns: { \", \" . 
join ( flat_cols [: - 1 ]) } and { flat_cols [ - 1 ] } \\ - must be present in both left and right files\"\"\" for dataframe in dataframes : if not all ( x in dataframe . columns for x in flat_cols ): raise ValueError ( err_msg ) ensure_file_exists ( * files ) Ensures the existence of files passed as parameter Raises: FileNotFoundError: If any file passed as a parameter doesn't exist a FileNotFound Exception will be raised Source code in src/pheval/utils/file_utils.py 73 74 75 76 77 78 79 80 def ensure_file_exists ( * files : str ): \"\"\"Ensures the existence of files passed as parameter Raises: FileNotFoundError: If any file passed as a parameter doesn't exist a FileNotFound Exception will be raised \"\"\" for file in files : if not path . isfile ( file ): raise FileNotFoundError ( f \"File { file } not found\" ) files_with_suffix ( directory , suffix ) Obtains all files ending in a specified suffix from a given directory. Parameters: Name Type Description Default directory Path The directory path. required suffix str The specified suffix to filter files. required Returns: Type Description list [ Path ] list[Path]: A list of Path objects representing files with the specified suffix. Source code in src/pheval/utils/file_utils.py 15 16 17 18 19 20 21 22 23 24 25 26 27 28 def files_with_suffix ( directory : Path , suffix : str ) -> list [ Path ]: \"\"\" Obtains all files ending in a specified suffix from a given directory. Args: directory (Path): The directory path. suffix (str): The specified suffix to filter files. Returns: list[Path]: A list of Path objects representing files with the specified suffix. \"\"\" files = [ file_path for file_path in directory . iterdir () if file_path . suffix == suffix ] files . sort () return files is_gzipped ( file_path ) Confirms whether a file is gzipped. Parameters: Name Type Description Default file_path Path The path to the file. required Returns: Name Type Description bool bool True if the file is gzipped, False otherwise. 
Source code in src/pheval/utils/file_utils.py 46 47 48 49 50 51 52 53 54 55 56 def is_gzipped ( file_path : Path ) -> bool : \"\"\" Confirms whether a file is gzipped. Args: file_path (Path): The path to the file. Returns: bool: True if the file is gzipped, False otherwise. \"\"\" return file_path . name . endswith ( \".gz\" ) normalise_file_name ( file_path ) Normalises the file name by removing diacritical marks (accents) from Unicode characters. Parameters: Name Type Description Default file_path Path The path to the file. required Returns: Name Type Description str str The normalised file name without diacritical marks. Source code in src/pheval/utils/file_utils.py 59 60 61 62 63 64 65 66 67 68 69 70 def normalise_file_name ( file_path : Path ) -> str : \"\"\" Normalises the file name by removing diacritical marks (accents) from Unicode characters. Args: file_path (Path): The path to the file. Returns: str: The normalised file name without diacritical marks. \"\"\" normalised_file_name = unicodedata . normalize ( \"NFD\" , str ( file_path )) return re . sub ( \"[ \\u0300 - \\u036f ]\" , \"\" , normalised_file_name ) write_metadata ( output_dir , meta_data ) Write the metadata for a run to a YAML file. Parameters: Name Type Description Default output_dir Path The directory where the metadata file will be saved. required meta_data BasicOutputRunMetaData The metadata to be written. required Source code in src/pheval/utils/file_utils.py 108 109 110 111 112 113 114 115 116 117 118 def write_metadata ( output_dir : Path , meta_data : BasicOutputRunMetaData ) -> None : \"\"\" Write the metadata for a run to a YAML file. Args: output_dir (Path): The directory where the metadata file will be saved. meta_data (BasicOutputRunMetaData): The metadata to be written. \"\"\" with open ( Path ( output_dir ) . joinpath ( \"results.yml\" ), \"w\" ) as metadata_file : yaml . dump ( to_dict ( meta_data ), metadata_file , sort_keys = False , default_style = \"\" ) metadata_file . 
close ()","title":"File utils"},{"location":"api/pheval/utils/file_utils/#src.pheval.utils.file_utils.all_files","text":"Obtains all files from a given directory. Parameters: Name Type Description Default directory Path The directory path. required Returns: Type Description list [ Path ] list[Path]: A list of Path objects representing all files in the directory. Source code in src/pheval/utils/file_utils.py 31 32 33 34 35 36 37 38 39 40 41 42 43 def all_files ( directory : Path ) -> list [ Path ]: \"\"\" Obtains all files from a given directory. Args: directory (Path): The directory path. Returns: list[Path]: A list of Path objects representing all files in the directory. \"\"\" files = [ file_path for file_path in directory . iterdir ()] files . sort () return files","title":"all_files"},{"location":"api/pheval/utils/file_utils/#src.pheval.utils.file_utils.ensure_columns_exists","text":"Ensures the columns exist in dataframes passed as argument (e.g) \" ensure_columns_exists( cols=['column_a', 'column_b, 'column_c'], err_message=\"Custom error message if any column doesn't exist in any dataframe passed as argument\", dataframes=[data_frame1, data_frame2], ) \" Source code in src/pheval/utils/file_utils.py 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 def ensure_columns_exists ( cols : list , dataframes : List [ pd . DataFrame ], err_message : str = \"\" ): \"\"\"Ensures the columns exist in dataframes passed as argument (e.g) \" ensure_columns_exists( cols=['column_a', 'column_b, 'column_c'], err_message=\"Custom error message if any column doesn't exist in any dataframe passed as argument\", dataframes=[data_frame1, data_frame2], ) \" \"\"\" flat_cols = list ( itertools . chain ( cols )) if not dataframes or not flat_cols : return if err_message : err_msg = f \"\"\"columns: { \", \" . join ( flat_cols [: - 1 ]) } and { flat_cols [ - 1 ] } { err_message } \"\"\" else : err_msg = f \"\"\"columns: { \", \" . 
join ( flat_cols [: - 1 ]) } and { flat_cols [ - 1 ] } \\ - must be present in both left and right files\"\"\" for dataframe in dataframes : if not all ( x in dataframe . columns for x in flat_cols ): raise ValueError ( err_msg )","title":"ensure_columns_exists"},{"location":"api/pheval/utils/file_utils/#src.pheval.utils.file_utils.ensure_file_exists","text":"Ensures the existence of files passed as parameter Raises: FileNotFoundError: If any file passed as a parameter doesn't exist a FileNotFound Exception will be raised Source code in src/pheval/utils/file_utils.py 73 74 75 76 77 78 79 80 def ensure_file_exists ( * files : str ): \"\"\"Ensures the existence of files passed as parameter Raises: FileNotFoundError: If any file passed as a parameter doesn't exist a FileNotFound Exception will be raised \"\"\" for file in files : if not path . isfile ( file ): raise FileNotFoundError ( f \"File { file } not found\" )","title":"ensure_file_exists"},{"location":"api/pheval/utils/file_utils/#src.pheval.utils.file_utils.files_with_suffix","text":"Obtains all files ending in a specified suffix from a given directory. Parameters: Name Type Description Default directory Path The directory path. required suffix str The specified suffix to filter files. required Returns: Type Description list [ Path ] list[Path]: A list of Path objects representing files with the specified suffix. Source code in src/pheval/utils/file_utils.py 15 16 17 18 19 20 21 22 23 24 25 26 27 28 def files_with_suffix ( directory : Path , suffix : str ) -> list [ Path ]: \"\"\" Obtains all files ending in a specified suffix from a given directory. Args: directory (Path): The directory path. suffix (str): The specified suffix to filter files. Returns: list[Path]: A list of Path objects representing files with the specified suffix. \"\"\" files = [ file_path for file_path in directory . iterdir () if file_path . suffix == suffix ] files . 
sort () return files","title":"files_with_suffix"},{"location":"api/pheval/utils/file_utils/#src.pheval.utils.file_utils.is_gzipped","text":"Confirms whether a file is gzipped. Parameters: Name Type Description Default file_path Path The path to the file. required Returns: Name Type Description bool bool True if the file is gzipped, False otherwise. Source code in src/pheval/utils/file_utils.py 46 47 48 49 50 51 52 53 54 55 56 def is_gzipped ( file_path : Path ) -> bool : \"\"\" Confirms whether a file is gzipped. Args: file_path (Path): The path to the file. Returns: bool: True if the file is gzipped, False otherwise. \"\"\" return file_path . name . endswith ( \".gz\" )","title":"is_gzipped"},{"location":"api/pheval/utils/file_utils/#src.pheval.utils.file_utils.normalise_file_name","text":"Normalises the file name by removing diacritical marks (accents) from Unicode characters. Parameters: Name Type Description Default file_path Path The path to the file. required Returns: Name Type Description str str The normalised file name without diacritical marks. Source code in src/pheval/utils/file_utils.py 59 60 61 62 63 64 65 66 67 68 69 70 def normalise_file_name ( file_path : Path ) -> str : \"\"\" Normalises the file name by removing diacritical marks (accents) from Unicode characters. Args: file_path (Path): The path to the file. Returns: str: The normalised file name without diacritical marks. \"\"\" normalised_file_name = unicodedata . normalize ( \"NFD\" , str ( file_path )) return re . sub ( \"[ \\u0300 - \\u036f ]\" , \"\" , normalised_file_name )","title":"normalise_file_name"},{"location":"api/pheval/utils/file_utils/#src.pheval.utils.file_utils.write_metadata","text":"Write the metadata for a run to a YAML file. Parameters: Name Type Description Default output_dir Path The directory where the metadata file will be saved. required meta_data BasicOutputRunMetaData The metadata to be written. 
required Source code in src/pheval/utils/file_utils.py 108 109 110 111 112 113 114 115 116 117 118 def write_metadata ( output_dir : Path , meta_data : BasicOutputRunMetaData ) -> None : \"\"\" Write the metadata for a run to a YAML file. Args: output_dir (Path): The directory where the metadata file will be saved. meta_data (BasicOutputRunMetaData): The metadata to be written. \"\"\" with open ( Path ( output_dir ) . joinpath ( \"results.yml\" ), \"w\" ) as metadata_file : yaml . dump ( to_dict ( meta_data ), metadata_file , sort_keys = False , default_style = \"\" ) metadata_file . close ()","title":"write_metadata"},{"location":"api/pheval/utils/phenopacket_utils/","text":"GeneIdentifierUpdater Class for updating gene identifiers within genomic interpretations. Source code in src/pheval/utils/phenopacket_utils.py 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 class GeneIdentifierUpdater : \"\"\"Class for updating gene identifiers within genomic interpretations.\"\"\" def __init__ ( self , gene_identifier : str , hgnc_data : dict = None , identifier_map : dict = None ): \"\"\" Initialise the GeneIdentifierUpdater. Args: gene_identifier (str): The gene identifier to update to. hgnc_data (dict): A dictionary containing HGNC data (default: None). identifier_map (dict): A dictionary mapping gene identifiers (default: None). \"\"\" self . hgnc_data = hgnc_data self . gene_identifier = gene_identifier self . identifier_map = identifier_map def find_identifier ( self , gene_symbol : str ) -> str : \"\"\" Find the specified gene identifier for a gene symbol. 
Args: gene_symbol (str): The gene symbol to find the identifier for. Returns: str: The identified gene identifier. \"\"\" if gene_symbol in self . hgnc_data . keys (): return self . hgnc_data [ gene_symbol ][ self . gene_identifier ] else : for _symbol , data in self . hgnc_data . items (): for prev_symbol in data [ \"previous_symbol\" ]: if prev_symbol == gene_symbol : return data [ self . gene_identifier ] def obtain_gene_symbol_from_identifier ( self , query_gene_identifier : str ) -> str : \"\"\" Obtain gene symbol from a gene identifier. Args: query_gene_identifier (str): The gene identifier. Returns: str: The gene symbol corresponding to the identifier. \"\"\" return self . identifier_map [ query_gene_identifier ] def _find_alternate_ids ( self , gene_symbol : str ) -> List [ str ]: \"\"\" Find the alternate IDs for a gene symbol. Args: gene_symbol (str): The gene symbol to find alternate IDs for. Returns: List[str]: List of alternate IDs for the gene symbol. \"\"\" if gene_symbol in self . hgnc_data . keys (): return [ self . hgnc_data [ gene_symbol ][ \"hgnc_id\" ], \"ncbigene:\" + self . hgnc_data [ gene_symbol ][ \"entrez_id\" ], \"ensembl:\" + self . hgnc_data [ gene_symbol ][ \"ensembl_id\" ], \"symbol:\" + gene_symbol , ] else : for symbol , data in self . hgnc_data . items (): for prev_symbol in data [ \"previous_symbol\" ]: if prev_symbol == gene_symbol : return [ data [ \"hgnc_id\" ], \"ncbigene:\" + data [ \"entrez_id\" ], \"ensembl:\" + data [ \"ensembl_id\" ], \"symbol:\" + symbol , ] def update_genomic_interpretations_gene_identifier ( self , interpretations : List [ Interpretation ], phenopacket_path : Path ) -> List [ Interpretation ]: \"\"\" Update the genomic interpretations of a Phenopacket. Args: interpretations (List[Interpretation]): List of Interpretation objects. Returns: List[Interpretation]: Updated list of Interpretation objects. 
\"\"\" updated_interpretations = copy ( list ( interpretations )) for updated_interpretation in updated_interpretations : for g in updated_interpretation . diagnosis . genomic_interpretations : updated_gene_identifier = self . find_identifier ( g . variant_interpretation . variation_descriptor . gene_context . symbol ) info_log . info ( f \"Updating gene identifier in { phenopacket_path } from \" f \" { g . variant_interpretation . variation_descriptor . gene_context . value_id } \" f \"to { updated_gene_identifier } \" ) g . variant_interpretation . variation_descriptor . gene_context . value_id = ( updated_gene_identifier ) del g . variant_interpretation . variation_descriptor . gene_context . alternate_ids [:] g . variant_interpretation . variation_descriptor . gene_context . alternate_ids . extend ( self . _find_alternate_ids ( g . variant_interpretation . variation_descriptor . gene_context . symbol ) ) return updated_interpretations __init__ ( gene_identifier , hgnc_data = None , identifier_map = None ) Initialise the GeneIdentifierUpdater. Parameters: Name Type Description Default gene_identifier str The gene identifier to update to. required hgnc_data dict A dictionary containing HGNC data (default: None). None identifier_map dict A dictionary mapping gene identifiers (default: None). None Source code in src/pheval/utils/phenopacket_utils.py 641 642 643 644 645 646 647 648 649 650 651 652 653 def __init__ ( self , gene_identifier : str , hgnc_data : dict = None , identifier_map : dict = None ): \"\"\" Initialise the GeneIdentifierUpdater. Args: gene_identifier (str): The gene identifier to update to. hgnc_data (dict): A dictionary containing HGNC data (default: None). identifier_map (dict): A dictionary mapping gene identifiers (default: None). \"\"\" self . hgnc_data = hgnc_data self . gene_identifier = gene_identifier self . identifier_map = identifier_map find_identifier ( gene_symbol ) Find the specified gene identifier for a gene symbol. 
Parameters: Name Type Description Default gene_symbol str The gene symbol to find the identifier for. required Returns: Name Type Description str str The identified gene identifier. Source code in src/pheval/utils/phenopacket_utils.py 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 def find_identifier ( self , gene_symbol : str ) -> str : \"\"\" Find the specified gene identifier for a gene symbol. Args: gene_symbol (str): The gene symbol to find the identifier for. Returns: str: The identified gene identifier. \"\"\" if gene_symbol in self . hgnc_data . keys (): return self . hgnc_data [ gene_symbol ][ self . gene_identifier ] else : for _symbol , data in self . hgnc_data . items (): for prev_symbol in data [ \"previous_symbol\" ]: if prev_symbol == gene_symbol : return data [ self . gene_identifier ] obtain_gene_symbol_from_identifier ( query_gene_identifier ) Obtain gene symbol from a gene identifier. Parameters: Name Type Description Default query_gene_identifier str The gene identifier. required Returns: Name Type Description str str The gene symbol corresponding to the identifier. Source code in src/pheval/utils/phenopacket_utils.py 673 674 675 676 677 678 679 680 681 682 683 def obtain_gene_symbol_from_identifier ( self , query_gene_identifier : str ) -> str : \"\"\" Obtain gene symbol from a gene identifier. Args: query_gene_identifier (str): The gene identifier. Returns: str: The gene symbol corresponding to the identifier. \"\"\" return self . identifier_map [ query_gene_identifier ] update_genomic_interpretations_gene_identifier ( interpretations , phenopacket_path ) Update the genomic interpretations of a Phenopacket. Parameters: Name Type Description Default interpretations List [ Interpretation ] List of Interpretation objects. required Returns: Type Description List [ Interpretation ] List[Interpretation]: Updated list of Interpretation objects. 
Source code in src/pheval/utils/phenopacket_utils.py 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 def update_genomic_interpretations_gene_identifier ( self , interpretations : List [ Interpretation ], phenopacket_path : Path ) -> List [ Interpretation ]: \"\"\" Update the genomic interpretations of a Phenopacket. Args: interpretations (List[Interpretation]): List of Interpretation objects. Returns: List[Interpretation]: Updated list of Interpretation objects. \"\"\" updated_interpretations = copy ( list ( interpretations )) for updated_interpretation in updated_interpretations : for g in updated_interpretation . diagnosis . genomic_interpretations : updated_gene_identifier = self . find_identifier ( g . variant_interpretation . variation_descriptor . gene_context . symbol ) info_log . info ( f \"Updating gene identifier in { phenopacket_path } from \" f \" { g . variant_interpretation . variation_descriptor . gene_context . value_id } \" f \"to { updated_gene_identifier } \" ) g . variant_interpretation . variation_descriptor . gene_context . value_id = ( updated_gene_identifier ) del g . variant_interpretation . variation_descriptor . gene_context . alternate_ids [:] g . variant_interpretation . variation_descriptor . gene_context . alternate_ids . extend ( self . _find_alternate_ids ( g . variant_interpretation . variation_descriptor . gene_context . symbol ) ) return updated_interpretations GenomicVariant dataclass Represents a genomic variant. Parameters: Name Type Description Default chrom str The chromosome position of the variant recommended to be provided in the following format. required pos int Position of the variant following VCF convention. required ref str Reference allele following VCF convention. required alt str Alternate allele following VCF convention. 
required Source code in src/pheval/utils/phenopacket_utils.py 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 @dataclass class GenomicVariant : \"\"\" Represents a genomic variant. Args: chrom (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. pos (int): Position of the variant following VCF convention. ref (str): Reference allele following VCF convention. alt (str): Alternate allele following VCF convention. \"\"\" chrom : str pos : int ref : str alt : str IncompatibleGenomeAssemblyError Bases: Exception Exception raised for incompatible genome assembly. Source code in src/pheval/utils/phenopacket_utils.py 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 class IncompatibleGenomeAssemblyError ( Exception ): \"\"\"Exception raised for incompatible genome assembly.\"\"\" def __init__ ( self , assembly , phenopacket , message = \"Incompatible Genome Assembly\" ): \"\"\" Initialise IncompatibleGenomeAssemblyError. Attributes: assembly (str): Incompatible genome assembly encountered. phenopacket (Path): Path to the Phenopacket associated with the error. message (str, optional): Custom error message (default is \"Incompatible Genome Assembly\"). \"\"\" self . assembly : str = assembly self . phenopacket : Path = phenopacket self . message : str = message super () . __init__ ( self . message ) def __str__ ( self ): return f \" { self . message } -> { self . assembly } in { self . phenopacket } \" __init__ ( assembly , phenopacket , message = 'Incompatible Genome Assembly' ) Initialise IncompatibleGenomeAssemblyError. Attributes: Name Type Description assembly str Incompatible genome assembly encountered. phenopacket Path Path to the Phenopacket associated with the error. message str Custom error message (default is \"Incompatible Genome Assembly\"). 
Source code in src/pheval/utils/phenopacket_utils.py 30 31 32 33 34 35 36 37 38 39 40 41 42 def __init__ ( self , assembly , phenopacket , message = \"Incompatible Genome Assembly\" ): \"\"\" Initialise IncompatibleGenomeAssemblyError. Attributes: assembly (str): Incompatible genome assembly encountered. phenopacket (Path): Path to the Phenopacket associated with the error. message (str, optional): Custom error message (default is \"Incompatible Genome Assembly\"). \"\"\" self . assembly : str = assembly self . phenopacket : Path = phenopacket self . message : str = message super () . __init__ ( self . message ) PhenopacketRebuilder Class for rebuilding a Phenopacket Source code in src/pheval/utils/phenopacket_utils.py 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 class PhenopacketRebuilder : \"\"\"Class for rebuilding a Phenopacket\"\"\" def __init__ ( self , phenopacket : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Attributes: phenopacket (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . phenopacket = phenopacket def update_interpretations ( self , interpretations : [ Interpretation ] ) -> Union [ Phenopacket , Family ]: \"\"\" Add the updated interpretations to a Phenopacket or Family. Args: interpretations (List[Interpretation]): The updated interpretations to be added. Returns: Union[Phenopacket, Family]: The Phenopacket or Family object with updated interpretations. \"\"\" phenopacket = copy ( self . phenopacket ) if hasattr ( phenopacket , \"proband\" ): del phenopacket . proband . interpretations [:] phenopacket . proband . interpretations . extend ( interpretations ) else : del phenopacket . interpretations [:] phenopacket . interpretations . 
extend ( interpretations ) return phenopacket def add_randomised_hpo ( self , randomised_hpo : [ PhenotypicFeature ]) -> Union [ Phenopacket , Family ]: \"\"\" Add randomised phenotypic profiles to a Phenopacket or Family. Args: randomised_hpo: The randomised phenotypic profiles to be added. Returns: Union[Phenopacket, Family] The Phenopacket or Family object with added randomised profiles. \"\"\" phenopacket = copy ( self . phenopacket ) if hasattr ( phenopacket , \"proband\" ): del phenopacket . proband . phenotypic_features [:] phenopacket . proband . phenotypic_features . extend ( randomised_hpo ) else : del phenopacket . phenotypic_features [:] phenopacket . phenotypic_features . extend ( randomised_hpo ) return phenopacket def add_spiked_vcf_path ( self , spiked_vcf_file_data : File ) -> Union [ Phenopacket , Family ]: \"\"\" Add a spiked VCF path to a Phenopacket or Family. Args: - spiked_vcf_file_data (File): The VCF file data to be added. Returns: - Phenopacket or Family: The Phenopacket or Family object with the added spiked VCF path. \"\"\" phenopacket = copy ( self . phenopacket ) phenopacket_files = [ file for file in phenopacket . files if file . file_attributes [ \"fileFormat\" ] != \"vcf\" ] phenopacket_files . append ( spiked_vcf_file_data ) del phenopacket . files [:] phenopacket . files . extend ( phenopacket_files ) return phenopacket __init__ ( phenopacket ) Initialise PhenopacketUtil Attributes: Name Type Description phenopacket Union [ Phenopacket , Family ] Phenopacket or Family object Source code in src/pheval/utils/phenopacket_utils.py 540 541 542 543 544 545 546 def __init__ ( self , phenopacket : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Attributes: phenopacket (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . phenopacket = phenopacket add_randomised_hpo ( randomised_hpo ) Add randomised phenotypic profiles to a Phenopacket or Family. 
Parameters: Name Type Description Default randomised_hpo [ PhenotypicFeature ] The randomised phenotypic profiles to be added. required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family] The Phenopacket or Family object with added randomised profiles. Source code in src/pheval/utils/phenopacket_utils.py 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 def add_randomised_hpo ( self , randomised_hpo : [ PhenotypicFeature ]) -> Union [ Phenopacket , Family ]: \"\"\" Add randomised phenotypic profiles to a Phenopacket or Family. Args: randomised_hpo: The randomised phenotypic profiles to be added. Returns: Union[Phenopacket, Family] The Phenopacket or Family object with added randomised profiles. \"\"\" phenopacket = copy ( self . phenopacket ) if hasattr ( phenopacket , \"proband\" ): del phenopacket . proband . phenotypic_features [:] phenopacket . proband . phenotypic_features . extend ( randomised_hpo ) else : del phenopacket . phenotypic_features [:] phenopacket . phenotypic_features . extend ( randomised_hpo ) return phenopacket add_spiked_vcf_path ( spiked_vcf_file_data ) Add a spiked VCF path to a Phenopacket or Family. Args: - spiked_vcf_file_data (File): The VCF file data to be added. Returns: - Phenopacket or Family: The Phenopacket or Family object with the added spiked VCF path. Source code in src/pheval/utils/phenopacket_utils.py 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 def add_spiked_vcf_path ( self , spiked_vcf_file_data : File ) -> Union [ Phenopacket , Family ]: \"\"\" Add a spiked VCF path to a Phenopacket or Family. Args: - spiked_vcf_file_data (File): The VCF file data to be added. Returns: - Phenopacket or Family: The Phenopacket or Family object with the added spiked VCF path. \"\"\" phenopacket = copy ( self . phenopacket ) phenopacket_files = [ file for file in phenopacket . files if file . file_attributes [ \"fileFormat\" ] != \"vcf\" ] phenopacket_files . 
append ( spiked_vcf_file_data ) del phenopacket . files [:] phenopacket . files . extend ( phenopacket_files ) return phenopacket update_interpretations ( interpretations ) Add the updated interpretations to a Phenopacket or Family. Parameters: Name Type Description Default interpretations List [ Interpretation ] The updated interpretations to be added. required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family]: The Phenopacket or Family object with updated interpretations. Source code in src/pheval/utils/phenopacket_utils.py 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 def update_interpretations ( self , interpretations : [ Interpretation ] ) -> Union [ Phenopacket , Family ]: \"\"\" Add the updated interpretations to a Phenopacket or Family. Args: interpretations (List[Interpretation]): The updated interpretations to be added. Returns: Union[Phenopacket, Family]: The Phenopacket or Family object with updated interpretations. \"\"\" phenopacket = copy ( self . phenopacket ) if hasattr ( phenopacket , \"proband\" ): del phenopacket . proband . interpretations [:] phenopacket . proband . interpretations . extend ( interpretations ) else : del phenopacket . interpretations [:] phenopacket . interpretations . 
extend ( interpretations ) return phenopacket PhenopacketUtil Class for retrieving data from a Phenopacket or Family object Source code in src/pheval/utils/phenopacket_utils.py 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 class PhenopacketUtil : \"\"\"Class for retrieving data from a Phenopacket or Family object\"\"\" def __init__ ( self , phenopacket_contents : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Args: phenopacket_contents (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . phenopacket_contents = phenopacket_contents def sample_id ( self ) -> str : \"\"\" Retrieve the sample ID from a Phenopacket or proband of a Family Returns: str: Sample ID \"\"\" if hasattr ( self . 
phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . subject . id else : return self . phenopacket_contents . subject . id def phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all HPO terms Returns: List[PhenotypicFeature]: List of HPO terms \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . phenotypic_features else : return self . phenopacket_contents . phenotypic_features def observed_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all observed HPO terms Returns: List[PhenotypicFeature]: List of observed HPO terms \"\"\" phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : continue phenotypic_features . append ( p ) return phenotypic_features def negated_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all negated HPO terms Returns: List[PhenotypicFeature]: List of negated HPO terms \"\"\" negated_phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : negated_phenotypic_features . append ( p ) return negated_phenotypic_features def diseases ( self ) -> List [ Disease ]: \"\"\" Retrieve a list of Diseases associated with the proband Returns: List[Disease]: List of diseases \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . diseases else : return self . phenopacket_contents . diseases def _diagnosis_from_interpretations ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a list of disease diagnoses associated with the proband from the interpretations object Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" diagnoses = [] interpretation = self . interpretations () for i in interpretation : ( diagnoses . 
append ( ProbandDisease ( disease_name = i . diagnosis . disease . label , disease_identifier = i . diagnosis . disease . id , ) ) if i . diagnosis . disease . label != \"\" and i . diagnosis . disease . id != \"\" else None ) return diagnoses def _diagnosis_from_disease ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a list of disease diagnoses associated with the proband from the diseases object Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" diagnoses = [] for disease in self . diseases (): diagnoses . append ( ProbandDisease ( disease_name = disease . term . label , disease_identifier = disease . term . id ) ) return diagnoses def diagnoses ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" return list ( set ( self . _diagnosis_from_interpretations () + self . _diagnosis_from_disease ())) def interpretations ( self ) -> List [ Interpretation ]: \"\"\" Retrieve a list of interpretations from a Phenopacket Returns: List[Interpretation]: List of interpretations \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . interpretations else : return self . phenopacket_contents . interpretations def causative_variants ( self ) -> List [ ProbandCausativeVariant ]: \"\"\" Retrieve a list of causative variants listed in a Phenopacket Returns: List[ProbandCausativeVariant]: List of proband causative variants \"\"\" all_variants = [] interpretation = self . interpretations () for i in interpretation : for g in i . diagnosis . genomic_interpretations : vcf_record = g . variant_interpretation . variation_descriptor . vcf_record genotype = g . variant_interpretation . variation_descriptor . allelic_state variant_data = ProbandCausativeVariant ( self . phenopacket_contents . subject . id , vcf_record . genome_assembly , GenomicVariant ( vcf_record . 
chrom , vcf_record . pos , vcf_record . ref , vcf_record . alt , ), genotype . label , vcf_record . info , ) all_variants . append ( variant_data ) return all_variants def files ( self ) -> List [ File ]: \"\"\" Retrieve a list of files associated with a phenopacket Returns: List[File]: List of files associated with a phenopacket \"\"\" return self . phenopacket_contents . files def vcf_file_data ( self , phenopacket_path : Path , vcf_dir : Path ) -> File : \"\"\" Retrieve the genome assembly and VCF file name from a phenopacket. Args: phenopacket_path (Path): The path to the phenopacket file. vcf_dir (Path): The directory path where the VCF file is stored. Returns: File: The VCF file with updated URI pointing to the specified directory. Raises: IncorrectFileFormatError: If the provided file is not in .vcf or .vcf.gz format. IncompatibleGenomeAssemblyError: If the genome assembly of the VCF file is not compatible. Note: This function searches for a VCF file within the provided list of files, validates its format, and checks if the genome assembly is compatible. If the conditions are met, it updates the URI of the VCF file to the specified directory and returns the modified file object. \"\"\" compatible_genome_assembly = [ \"GRCh37\" , \"hg19\" , \"GRCh38\" , \"hg38\" ] vcf_data = [ file for file in self . files () if file . file_attributes [ \"fileFormat\" ] == \"vcf\" ][ 0 ] if not Path ( vcf_data . uri ) . name . endswith ( \".vcf\" ) and not Path ( vcf_data . uri ) . name . endswith ( \".vcf.gz\" ): raise IncorrectFileFormatError ( Path ( vcf_data . uri ), \".vcf or .vcf.gz file\" ) if vcf_data . file_attributes [ \"genomeAssembly\" ] not in compatible_genome_assembly : raise IncompatibleGenomeAssemblyError ( vcf_data . file_attributes [ \"genomeAssembly\" ], phenopacket_path ) vcf_data . uri = str ( vcf_dir . joinpath ( Path ( vcf_data . uri ) . 
name )) return vcf_data @staticmethod def _extract_diagnosed_gene ( genomic_interpretation : GenomicInterpretation , ) -> ProbandCausativeGene : \"\"\" Retrieve the disease causing genes from the variant descriptor field if not empty, otherwise, retrieves from the gene descriptor from a phenopacket. Args: genomic_interpretation (GenomicInterpretation): A genomic interpretation from a Phenopacket Returns: ProbandCausativeGene: The disease causing gene \"\"\" if genomic_interpretation . variant_interpretation . ByteSize () != 0 : return ProbandCausativeGene ( genomic_interpretation . variant_interpretation . variation_descriptor . gene_context . symbol , genomic_interpretation . variant_interpretation . variation_descriptor . gene_context . value_id , ) else : return ProbandCausativeGene ( gene_symbol = genomic_interpretation . gene . symbol , gene_identifier = genomic_interpretation . gene . value_id , ) def diagnosed_genes ( self ) -> List [ ProbandCausativeGene ]: \"\"\" Retrieve the disease causing genes from a phenopacket. Returns: List[ProbandCausativeGene]: List of causative genes \"\"\" pheno_interpretation = self . interpretations () genes = [] for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : genes . append ( self . _extract_diagnosed_gene ( g )) genes = list ({ gene . gene_symbol : gene for gene in genes } . values ()) return genes def diagnosed_variants ( self ) -> List [ GenomicVariant ]: \"\"\" Retrieve a list of all known causative variants from a phenopacket. Returns: List[GenomicVariant]: List of causative variants \"\"\" variants = [] pheno_interpretation = self . interpretations () for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : variant = GenomicVariant ( chrom = str ( g . variant_interpretation . variation_descriptor . vcf_record . chrom . replace ( \"chr\" , \"\" ) ), pos = int ( g . variant_interpretation . variation_descriptor . vcf_record . pos ), ref = g . 
variant_interpretation . variation_descriptor . vcf_record . ref , alt = g . variant_interpretation . variation_descriptor . vcf_record . alt , ) variants . append ( variant ) return variants def check_incomplete_variant_record ( self ) -> bool : \"\"\" Check if any variant record in the phenopacket has incomplete information. This method iterates through the diagnosed variant records and checks if any of them have missing or incomplete information such as empty chromosome, position, reference, or alternate allele. Returns: bool: True if any variant record is incomplete, False otherwise. \"\"\" variants = self . diagnosed_variants () for variant in variants : if ( variant . chrom == \"\" or variant . pos == 0 or variant . pos == \"\" or variant . ref == \"\" or variant . alt == \"\" ): return True return False def check_incomplete_gene_record ( self ) -> bool : \"\"\" Check if any gene record in the phenopacket has incomplete information. This method iterates through the diagnosed gene records and checks if any of them have missing or incomplete information such as gene name, or gene identifier. Returns: bool: True if any gene record is incomplete, False otherwise. \"\"\" genes = self . diagnosed_genes () for gene in genes : if gene . gene_symbol == \"\" or gene . gene_identifier == \"\" : return True return False def check_incomplete_disease_record ( self ) -> bool : \"\"\" Check if any disease record in the phenopacket has incomplete information. This method iterates through the diagnosed disease records and checks if any of them have missing or incomplete information such as empty disease name, or disease identifier. Returns: bool: True if any disease record is incomplete, False otherwise. \"\"\" if len ( self . 
diagnoses ()) == 0 : return True return False __init__ ( phenopacket_contents ) Initialise PhenopacketUtil Parameters: Name Type Description Default phenopacket_contents Union [ Phenopacket , Family ] Phenopacket or Family object required Source code in src/pheval/utils/phenopacket_utils.py 222 223 224 225 226 227 228 def __init__ ( self , phenopacket_contents : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Args: phenopacket_contents (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . phenopacket_contents = phenopacket_contents causative_variants () Retrieve a list of causative variants listed in a Phenopacket Returns: Type Description List [ ProbandCausativeVariant ] List[ProbandCausativeVariant]: List of proband causative variants Source code in src/pheval/utils/phenopacket_utils.py 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 def causative_variants ( self ) -> List [ ProbandCausativeVariant ]: \"\"\" Retrieve a list of causative variants listed in a Phenopacket Returns: List[ProbandCausativeVariant]: List of proband causative variants \"\"\" all_variants = [] interpretation = self . interpretations () for i in interpretation : for g in i . diagnosis . genomic_interpretations : vcf_record = g . variant_interpretation . variation_descriptor . vcf_record genotype = g . variant_interpretation . variation_descriptor . allelic_state variant_data = ProbandCausativeVariant ( self . phenopacket_contents . subject . id , vcf_record . genome_assembly , GenomicVariant ( vcf_record . chrom , vcf_record . pos , vcf_record . ref , vcf_record . alt , ), genotype . label , vcf_record . info , ) all_variants . append ( variant_data ) return all_variants check_incomplete_disease_record () Check if any disease record in the phenopacket has incomplete information. 
This method iterates through the diagnosed disease records and checks if any of them have missing or incomplete information such as empty disease name, or disease identifier. Returns: Name Type Description bool bool True if any disease record is incomplete, False otherwise. Source code in src/pheval/utils/phenopacket_utils.py 522 523 524 525 526 527 528 529 530 531 532 533 534 def check_incomplete_disease_record ( self ) -> bool : \"\"\" Check if any disease record in the phenopacket has incomplete information. This method iterates through the diagnosed disease records and checks if any of them have missing or incomplete information such as empty disease name, or disease identifier. Returns: bool: True if any disease record is incomplete, False otherwise. \"\"\" if len ( self . diagnoses ()) == 0 : return True return False check_incomplete_gene_record () Check if any gene record in the phenopacket has incomplete information. This method iterates through the diagnosed gene records and checks if any of them have missing or incomplete information such as gene name, or gene identifier. Returns: Name Type Description bool bool True if any gene record is incomplete, False otherwise. Source code in src/pheval/utils/phenopacket_utils.py 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 def check_incomplete_gene_record ( self ) -> bool : \"\"\" Check if any gene record in the phenopacket has incomplete information. This method iterates through the diagnosed gene records and checks if any of them have missing or incomplete information such as gene name, or gene identifier. Returns: bool: True if any gene record is incomplete, False otherwise. \"\"\" genes = self . diagnosed_genes () for gene in genes : if gene . gene_symbol == \"\" or gene . gene_identifier == \"\" : return True return False check_incomplete_variant_record () Check if any variant record in the phenopacket has incomplete information. 
This method iterates through the diagnosed variant records and checks if any of them have missing or incomplete information such as empty chromosome, position, reference, or alternate allele. Returns: Name Type Description bool bool True if any variant record is incomplete, False otherwise. Source code in src/pheval/utils/phenopacket_utils.py 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 def check_incomplete_variant_record ( self ) -> bool : \"\"\" Check if any variant record in the phenopacket has incomplete information. This method iterates through the diagnosed variant records and checks if any of them have missing or incomplete information such as empty chromosome, position, reference, or alternate allele. Returns: bool: True if any variant record is incomplete, False otherwise. \"\"\" variants = self . diagnosed_variants () for variant in variants : if ( variant . chrom == \"\" or variant . pos == 0 or variant . pos == \"\" or variant . ref == \"\" or variant . alt == \"\" ): return True return False diagnosed_genes () Retrieve the disease causing genes from a phenopacket. Returns: List[ProbandCausativeGene]: List of causative genes Source code in src/pheval/utils/phenopacket_utils.py 446 447 448 449 450 451 452 453 454 455 456 457 458 def diagnosed_genes ( self ) -> List [ ProbandCausativeGene ]: \"\"\" Retrieve the disease causing genes from a phenopacket. Returns: List[ProbandCausativeGene]: List of causative genes \"\"\" pheno_interpretation = self . interpretations () genes = [] for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : genes . append ( self . _extract_diagnosed_gene ( g )) genes = list ({ gene . gene_symbol : gene for gene in genes } . values ()) return genes diagnosed_variants () Retrieve a list of all known causative variants from a phenopacket. 
Returns: List[GenomicVariant]: List of causative variants Source code in src/pheval/utils/phenopacket_utils.py 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 def diagnosed_variants ( self ) -> List [ GenomicVariant ]: \"\"\" Retrieve a list of all known causative variants from a phenopacket. Returns: List[GenomicVariant]: List of causative variants \"\"\" variants = [] pheno_interpretation = self . interpretations () for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : variant = GenomicVariant ( chrom = str ( g . variant_interpretation . variation_descriptor . vcf_record . chrom . replace ( \"chr\" , \"\" ) ), pos = int ( g . variant_interpretation . variation_descriptor . vcf_record . pos ), ref = g . variant_interpretation . variation_descriptor . vcf_record . ref , alt = g . variant_interpretation . variation_descriptor . vcf_record . alt , ) variants . append ( variant ) return variants diagnoses () Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket Returns: Type Description List [ ProbandDisease ] List[ProbandDisease]: List of diagnosed diseases Source code in src/pheval/utils/phenopacket_utils.py 331 332 333 334 335 336 337 338 def diagnoses ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" return list ( set ( self . _diagnosis_from_interpretations () + self . _diagnosis_from_disease ())) diseases () Retrieve a list of Diseases associated with the proband Returns: Type Description List [ Disease ] List[Disease]: List of diseases Source code in src/pheval/utils/phenopacket_utils.py 283 284 285 286 287 288 289 290 291 292 293 def diseases ( self ) -> List [ Disease ]: \"\"\" Retrieve a list of Diseases associated with the proband Returns: List[Disease]: List of diseases \"\"\" if hasattr ( self . 
phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . diseases else : return self . phenopacket_contents . diseases files () Retrieve a list of files associated with a phenopacket Returns: Type Description List [ File ] List[File]: List of files associated with a phenopacket Source code in src/pheval/utils/phenopacket_utils.py 380 381 382 383 384 385 386 387 def files ( self ) -> List [ File ]: \"\"\" Retrieve a list of files associated with a phenopacket Returns: List[File]: List of files associated with a phenopacket \"\"\" return self . phenopacket_contents . files interpretations () Retrieve a list of interpretations from a Phenopacket Returns: Type Description List [ Interpretation ] List[Interpretation]: List of interpretations Source code in src/pheval/utils/phenopacket_utils.py 340 341 342 343 344 345 346 347 348 349 350 def interpretations ( self ) -> List [ Interpretation ]: \"\"\" Retrieve a list of interpretations from a Phenopacket Returns: List[Interpretation]: List of interpretations \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . interpretations else : return self . phenopacket_contents . interpretations negated_phenotypic_features () Retrieve a list of all negated HPO terms Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: List of negated HPO terms Source code in src/pheval/utils/phenopacket_utils.py 269 270 271 272 273 274 275 276 277 278 279 280 281 def negated_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all negated HPO terms Returns: List[PhenotypicFeature]: List of negated HPO terms \"\"\" negated_phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : negated_phenotypic_features . 
append ( p ) return negated_phenotypic_features observed_phenotypic_features () Retrieve a list of all observed HPO terms Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: List of observed HPO terms Source code in src/pheval/utils/phenopacket_utils.py 254 255 256 257 258 259 260 261 262 263 264 265 266 267 def observed_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all observed HPO terms Returns: List[PhenotypicFeature]: List of observed HPO terms \"\"\" phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : continue phenotypic_features . append ( p ) return phenotypic_features phenotypic_features () Retrieve a list of all HPO terms Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: List of HPO terms Source code in src/pheval/utils/phenopacket_utils.py 242 243 244 245 246 247 248 249 250 251 252 def phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all HPO terms Returns: List[PhenotypicFeature]: List of HPO terms \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . phenotypic_features else : return self . phenopacket_contents . phenotypic_features sample_id () Retrieve the sample ID from a Phenopacket or proband of a Family Returns: Name Type Description str str Sample ID Source code in src/pheval/utils/phenopacket_utils.py 230 231 232 233 234 235 236 237 238 239 240 def sample_id ( self ) -> str : \"\"\" Retrieve the sample ID from a Phenopacket or proband of a Family Returns: str: Sample ID \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . subject . id else : return self . phenopacket_contents . subject . id vcf_file_data ( phenopacket_path , vcf_dir ) Retrieve the genome assembly and VCF file name from a phenopacket. 
Parameters: Name Type Description Default phenopacket_path Path The path to the phenopacket file. required vcf_dir Path The directory path where the VCF file is stored. required Returns: Name Type Description File File The VCF file with updated URI pointing to the specified directory. Raises: Type Description IncorrectFileFormatError If the provided file is not in .vcf or .vcf.gz format. IncompatibleGenomeAssemblyError If the genome assembly of the VCF file is not compatible. Note This function searches for a VCF file within the provided list of files, validates its format, and checks if the genome assembly is compatible. If the conditions are met, it updates the URI of the VCF file to the specified directory and returns the modified file object. Source code in src/pheval/utils/phenopacket_utils.py 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 def vcf_file_data ( self , phenopacket_path : Path , vcf_dir : Path ) -> File : \"\"\" Retrieve the genome assembly and VCF file name from a phenopacket. Args: phenopacket_path (Path): The path to the phenopacket file. vcf_dir (Path): The directory path where the VCF file is stored. Returns: File: The VCF file with updated URI pointing to the specified directory. Raises: IncorrectFileFormatError: If the provided file is not in .vcf or .vcf.gz format. IncompatibleGenomeAssemblyError: If the genome assembly of the VCF file is not compatible. Note: This function searches for a VCF file within the provided list of files, validates its format, and checks if the genome assembly is compatible. If the conditions are met, it updates the URI of the VCF file to the specified directory and returns the modified file object. \"\"\" compatible_genome_assembly = [ \"GRCh37\" , \"hg19\" , \"GRCh38\" , \"hg38\" ] vcf_data = [ file for file in self . files () if file . file_attributes [ \"fileFormat\" ] == \"vcf\" ][ 0 ] if not Path ( vcf_data . uri ) . name . 
endswith ( \".vcf\" ) and not Path ( vcf_data . uri ) . name . endswith ( \".vcf.gz\" ): raise IncorrectFileFormatError ( Path ( vcf_data . uri ), \".vcf or .vcf.gz file\" ) if vcf_data . file_attributes [ \"genomeAssembly\" ] not in compatible_genome_assembly : raise IncompatibleGenomeAssemblyError ( vcf_data . file_attributes [ \"genomeAssembly\" ], phenopacket_path ) vcf_data . uri = str ( vcf_dir . joinpath ( Path ( vcf_data . uri ) . name )) return vcf_data ProbandCausativeGene dataclass Represents a causative gene associated with a proband Parameters: Name Type Description Default gene_symbol str Symbol representing the gene required gene_identifier str The ENSEMBL gene identifier for the result entry required Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations and result output is acceptable for result matching purposes in the analysis. Source code in src/pheval/utils/phenopacket_utils.py 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 @dataclass class ProbandCausativeGene : \"\"\" Represents a causative gene associated with a proband Args: gene_symbol (str): Symbol representing the gene gene_identifier (str): The ENSEMBL gene identifier for the result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations and result output is acceptable for result matching purposes in the analysis. 
\"\"\" gene_symbol : str gene_identifier : str ProbandCausativeVariant dataclass Represents a causative variant associated with a proband Parameters: Name Type Description Default proband_id str ID of the proband required assembly str Genome assembly required variant GenomicVariant Genomic variant associated with the proband required genotype str Genotype information for the variant required info str Additional information about the variant (default is an empty string) '' Source code in src/pheval/utils/phenopacket_utils.py 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 @dataclass class ProbandCausativeVariant : \"\"\" Represents a causative variant associated with a proband Args: proband_id (str): ID of the proband assembly (str): Genome assembly variant (GenomicVariant): Genomic variant associated with the proband genotype (str): Genotype information for the variant info (str, optional): Additional information about the variant (default is an empty string) \"\"\" proband_id : str assembly : str variant : GenomicVariant genotype : str info : str = \"\" ProbandDisease dataclass Represents a disease associated with a proband Parameters: Name Type Description Default disease_name str Name of the disease required disease_identifier str Identifier for the disease result entry in the OMIM namespace required Notes While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations and result output is acceptable for result matching purposes in the analysis. 
Source code in src/pheval/utils/phenopacket_utils.py 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 @dataclass ( frozen = True , eq = True ) class ProbandDisease : \"\"\" Represents a disease associated with a proband Args: disease_name (str): Name of the disease disease_identifier (str): Identifier for the disease result entry in the OMIM namespace Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations and result output is acceptable for result matching purposes in the analysis. \"\"\" disease_name : str disease_identifier : str create_gene_identifier_map () Create a mapping of gene identifiers to gene symbols using HGNC data. Returns: Name Type Description dict dict A mapping of gene identifiers to gene symbols. Notes The dictionary structure: { 'identifier': 'gene_symbol', ... } Source code in src/pheval/utils/phenopacket_utils.py 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 def create_gene_identifier_map () -> dict : \"\"\" Create a mapping of gene identifiers to gene symbols using HGNC data. Returns: dict: A mapping of gene identifiers to gene symbols. Notes: The dictionary structure: { 'identifier': 'gene_symbol', ... } \"\"\" hgnc_df = read_hgnc_data () identifier_map = {} for _index , row in hgnc_df . iterrows (): identifier_map [ row [ \"ensembl_gene_id\" ]] = row [ \"symbol\" ] identifier_map [ row [ \"hgnc_id\" ]] = row [ \"symbol\" ] identifier_map [ row [ \"entrez_id\" ]] = row [ \"symbol\" ] identifier_map [ row [ \"refseq_accession\" ]] = row [ \"symbol\" ] return identifier_map create_hgnc_dict () Create a dictionary as a reference for updating gene symbols and identifiers based on HGNC data. Returns: Name Type Description defaultdict defaultdict A dictionary containing gene symbols as keys and their associated gene information. 
Notes The dictionary structure: { 'gene_symbol': { 'ensembl_id': str, 'hgnc_id': str, 'entrez_id': str, 'refseq_accession': str, 'previous_symbol': [str, ...] }, ... } Source code in src/pheval/utils/phenopacket_utils.py 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 def create_hgnc_dict () -> defaultdict : \"\"\" Create a dictionary as a reference for updating gene symbols and identifiers based on HGNC data. Returns: defaultdict: A dictionary containing gene symbols as keys and their associated gene information. Notes: The dictionary structure: { 'gene_symbol': { 'ensembl_id': str, 'hgnc_id': str, 'entrez_id': str, 'refseq_accession': str, 'previous_symbol': [str, ...] }, ... } \"\"\" hgnc_df = read_hgnc_data () hgnc_data = defaultdict ( dict ) for _index , row in hgnc_df . iterrows (): previous_names = [] hgnc_data [ row [ \"symbol\" ]][ \"ensembl_id\" ] = row [ \"ensembl_gene_id\" ] hgnc_data [ row [ \"symbol\" ]][ \"hgnc_id\" ] = row [ \"hgnc_id\" ] hgnc_data [ row [ \"symbol\" ]][ \"entrez_id\" ] = row [ \"entrez_id\" ] hgnc_data [ row [ \"symbol\" ]][ \"refseq_accession\" ] = row [ \"refseq_accession\" ] previous = str ( row [ \"prev_symbol\" ]) . split ( \"|\" ) for p in previous : previous_names . append ( p . strip ( '\"' )) hgnc_data [ row [ \"symbol\" ]][ \"previous_symbol\" ] = previous_names return hgnc_data create_json_message ( phenopacket ) Create a JSON message for writing to a file. Args: - phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family object to convert to JSON. Returns: - str: A JSON-formatted string representation of the Phenopacket or Family object. Source code in src/pheval/utils/phenopacket_utils.py 608 609 610 611 612 613 614 615 616 617 618 def create_json_message ( phenopacket : Union [ Phenopacket , Family ]) -> str : \"\"\" Create a JSON message for writing to a file. 
Args: - phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family object to convert to JSON. Returns: - str: A JSON-formatted string representation of the Phenopacket or Family object. \"\"\" return MessageToJson ( phenopacket ) phenopacket_reader ( file ) Read a Phenopacket file and returns its contents as a Phenopacket or Family object Parameters: Name Type Description Default file Path Path to the Phenopacket file required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family]: Contents of the Phenopacket file as a Phenopacket or Family object Source code in src/pheval/utils/phenopacket_utils.py 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 def phenopacket_reader ( file : Path ) -> Union [ Phenopacket , Family ]: \"\"\" Read a Phenopacket file and returns its contents as a Phenopacket or Family object Args: file (Path): Path to the Phenopacket file Returns: Union[Phenopacket, Family]: Contents of the Phenopacket file as a Phenopacket or Family object \"\"\" file = open ( file , \"r\" ) phenopacket = json . load ( file ) file . close () if \"proband\" in phenopacket : return Parse ( json . dumps ( phenopacket ), Family ()) else : return Parse ( json . dumps ( phenopacket ), Phenopacket ()) read_hgnc_data () Read HGNC data from a file and return it as a Pandas DataFrame. Returns: Type Description DataFrame pd.DataFrame: DataFrame containing the HGNC data. Source code in src/pheval/utils/phenopacket_utils.py 125 126 127 128 129 130 131 132 133 134 135 136 def read_hgnc_data () -> pd . DataFrame : \"\"\" Read HGNC data from a file and return it as a Pandas DataFrame. Returns: pd.DataFrame: DataFrame containing the HGNC data. \"\"\" return pd . read_csv ( os . path . dirname ( __file__ ) . replace ( \"utils\" , \"resources/hgnc_complete_set.txt\" ), delimiter = \" \\t \" , dtype = str , ) write_phenopacket ( phenopacket , output_file ) Write a Phenopacket or Family object to a file in JSON format. 
Parameters: Name Type Description Default phenopacket Phenopacket or Family The Phenopacket or Family object to be written. required output_file Path The Path object representing the file to write the Phenopacket data. required Returns: Type Description None None Source code in src/pheval/utils/phenopacket_utils.py 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 def write_phenopacket ( phenopacket : Union [ Phenopacket , Family ], output_file : Path ) -> None : \"\"\" Write a Phenopacket or Family object to a file in JSON format. Args: phenopacket (Phenopacket or Family): The Phenopacket or Family object to be written. output_file (Path): The Path object representing the file to write the Phenopacket data. Returns: None \"\"\" phenopacket_json = create_json_message ( phenopacket ) with open ( output_file , \"w\" ) as outfile : outfile . write ( phenopacket_json ) outfile . close ()","title":"Phenopacket utils"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.GeneIdentifierUpdater","text":"Class for updating gene identifiers within genomic interpretations. Source code in src/pheval/utils/phenopacket_utils.py 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 class GeneIdentifierUpdater : \"\"\"Class for updating gene identifiers within genomic interpretations.\"\"\" def __init__ ( self , gene_identifier : str , hgnc_data : dict = None , identifier_map : dict = None ): \"\"\" Initialise the GeneIdentifierUpdater. Args: gene_identifier (str): The gene identifier to update to. hgnc_data (dict): A dictionary containing HGNC data (default: None). 
identifier_map (dict): A dictionary mapping gene identifiers (default: None). \"\"\" self . hgnc_data = hgnc_data self . gene_identifier = gene_identifier self . identifier_map = identifier_map def find_identifier ( self , gene_symbol : str ) -> str : \"\"\" Find the specified gene identifier for a gene symbol. Args: gene_symbol (str): The gene symbol to find the identifier for. Returns: str: The identified gene identifier. \"\"\" if gene_symbol in self . hgnc_data . keys (): return self . hgnc_data [ gene_symbol ][ self . gene_identifier ] else : for _symbol , data in self . hgnc_data . items (): for prev_symbol in data [ \"previous_symbol\" ]: if prev_symbol == gene_symbol : return data [ self . gene_identifier ] def obtain_gene_symbol_from_identifier ( self , query_gene_identifier : str ) -> str : \"\"\" Obtain gene symbol from a gene identifier. Args: query_gene_identifier (str): The gene identifier. Returns: str: The gene symbol corresponding to the identifier. \"\"\" return self . identifier_map [ query_gene_identifier ] def _find_alternate_ids ( self , gene_symbol : str ) -> List [ str ]: \"\"\" Find the alternate IDs for a gene symbol. Args: gene_symbol (str): The gene symbol to find alternate IDs for. Returns: List[str]: List of alternate IDs for the gene symbol. \"\"\" if gene_symbol in self . hgnc_data . keys (): return [ self . hgnc_data [ gene_symbol ][ \"hgnc_id\" ], \"ncbigene:\" + self . hgnc_data [ gene_symbol ][ \"entrez_id\" ], \"ensembl:\" + self . hgnc_data [ gene_symbol ][ \"ensembl_id\" ], \"symbol:\" + gene_symbol , ] else : for symbol , data in self . hgnc_data . 
items (): for prev_symbol in data [ \"previous_symbol\" ]: if prev_symbol == gene_symbol : return [ data [ \"hgnc_id\" ], \"ncbigene:\" + data [ \"entrez_id\" ], \"ensembl:\" + data [ \"ensembl_id\" ], \"symbol:\" + symbol , ] def update_genomic_interpretations_gene_identifier ( self , interpretations : List [ Interpretation ], phenopacket_path : Path ) -> List [ Interpretation ]: \"\"\" Update the genomic interpretations of a Phenopacket. Args: interpretations (List[Interpretation]): List of Interpretation objects. Returns: List[Interpretation]: Updated list of Interpretation objects. \"\"\" updated_interpretations = copy ( list ( interpretations )) for updated_interpretation in updated_interpretations : for g in updated_interpretation . diagnosis . genomic_interpretations : updated_gene_identifier = self . find_identifier ( g . variant_interpretation . variation_descriptor . gene_context . symbol ) info_log . info ( f \"Updating gene identifier in { phenopacket_path } from \" f \" { g . variant_interpretation . variation_descriptor . gene_context . value_id } \" f \"to { updated_gene_identifier } \" ) g . variant_interpretation . variation_descriptor . gene_context . value_id = ( updated_gene_identifier ) del g . variant_interpretation . variation_descriptor . gene_context . alternate_ids [:] g . variant_interpretation . variation_descriptor . gene_context . alternate_ids . extend ( self . _find_alternate_ids ( g . variant_interpretation . variation_descriptor . gene_context . symbol ) ) return updated_interpretations","title":"GeneIdentifierUpdater"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.GeneIdentifierUpdater.__init__","text":"Initialise the GeneIdentifierUpdater. Parameters: Name Type Description Default gene_identifier str The gene identifier to update to. required hgnc_data dict A dictionary containing HGNC data (default: None). None identifier_map dict A dictionary mapping gene identifiers (default: None). 
None Source code in src/pheval/utils/phenopacket_utils.py 641 642 643 644 645 646 647 648 649 650 651 652 653 def __init__ ( self , gene_identifier : str , hgnc_data : dict = None , identifier_map : dict = None ): \"\"\" Initialise the GeneIdentifierUpdater. Args: gene_identifier (str): The gene identifier to update to. hgnc_data (dict): A dictionary containing HGNC data (default: None). identifier_map (dict): A dictionary mapping gene identifiers (default: None). \"\"\" self . hgnc_data = hgnc_data self . gene_identifier = gene_identifier self . identifier_map = identifier_map","title":"__init__"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.GeneIdentifierUpdater.find_identifier","text":"Find the specified gene identifier for a gene symbol. Parameters: Name Type Description Default gene_symbol str The gene symbol to find the identifier for. required Returns: Name Type Description str str The identified gene identifier. Source code in src/pheval/utils/phenopacket_utils.py 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 def find_identifier ( self , gene_symbol : str ) -> str : \"\"\" Find the specified gene identifier for a gene symbol. Args: gene_symbol (str): The gene symbol to find the identifier for. Returns: str: The identified gene identifier. \"\"\" if gene_symbol in self . hgnc_data . keys (): return self . hgnc_data [ gene_symbol ][ self . gene_identifier ] else : for _symbol , data in self . hgnc_data . items (): for prev_symbol in data [ \"previous_symbol\" ]: if prev_symbol == gene_symbol : return data [ self . gene_identifier ]","title":"find_identifier"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.GeneIdentifierUpdater.obtain_gene_symbol_from_identifier","text":"Obtain gene symbol from a gene identifier. Parameters: Name Type Description Default query_gene_identifier str The gene identifier. 
required Returns: Name Type Description str str The gene symbol corresponding to the identifier. Source code in src/pheval/utils/phenopacket_utils.py 673 674 675 676 677 678 679 680 681 682 683 def obtain_gene_symbol_from_identifier ( self , query_gene_identifier : str ) -> str : \"\"\" Obtain gene symbol from a gene identifier. Args: query_gene_identifier (str): The gene identifier. Returns: str: The gene symbol corresponding to the identifier. \"\"\" return self . identifier_map [ query_gene_identifier ]","title":"obtain_gene_symbol_from_identifier"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.GeneIdentifierUpdater.update_genomic_interpretations_gene_identifier","text":"Update the genomic interpretations of a Phenopacket. Parameters: Name Type Description Default interpretations List [ Interpretation ] List of Interpretation objects. required Returns: Type Description List [ Interpretation ] List[Interpretation]: Updated list of Interpretation objects. Source code in src/pheval/utils/phenopacket_utils.py 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 def update_genomic_interpretations_gene_identifier ( self , interpretations : List [ Interpretation ], phenopacket_path : Path ) -> List [ Interpretation ]: \"\"\" Update the genomic interpretations of a Phenopacket. Args: interpretations (List[Interpretation]): List of Interpretation objects. Returns: List[Interpretation]: Updated list of Interpretation objects. \"\"\" updated_interpretations = copy ( list ( interpretations )) for updated_interpretation in updated_interpretations : for g in updated_interpretation . diagnosis . genomic_interpretations : updated_gene_identifier = self . find_identifier ( g . variant_interpretation . variation_descriptor . gene_context . symbol ) info_log . info ( f \"Updating gene identifier in { phenopacket_path } from \" f \" { g . variant_interpretation . 
variation_descriptor . gene_context . value_id } \" f \"to { updated_gene_identifier } \" ) g . variant_interpretation . variation_descriptor . gene_context . value_id = ( updated_gene_identifier ) del g . variant_interpretation . variation_descriptor . gene_context . alternate_ids [:] g . variant_interpretation . variation_descriptor . gene_context . alternate_ids . extend ( self . _find_alternate_ids ( g . variant_interpretation . variation_descriptor . gene_context . symbol ) ) return updated_interpretations","title":"update_genomic_interpretations_gene_identifier"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.GenomicVariant","text":"Represents a genomic variant. Parameters: Name Type Description Default chrom str The chromosome position of the variant recommended to be provided in the following format. required pos int Position of the variant following VCF convention. required ref str Reference allele following VCF convention. required alt str Alternate allele following VCF convention. required Source code in src/pheval/utils/phenopacket_utils.py 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 @dataclass class GenomicVariant : \"\"\" Represents a genomic variant. Args: chrom (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. pos (int): Position of the variant following VCF convention. ref (str): Reference allele following VCF convention. alt (str): Alternate allele following VCF convention. \"\"\" chrom : str pos : int ref : str alt : str","title":"GenomicVariant"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.IncompatibleGenomeAssemblyError","text":"Bases: Exception Exception raised for incompatible genome assembly. 
Source code in src/pheval/utils/phenopacket_utils.py 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 class IncompatibleGenomeAssemblyError ( Exception ): \"\"\"Exception raised for incompatible genome assembly.\"\"\" def __init__ ( self , assembly , phenopacket , message = \"Incompatible Genome Assembly\" ): \"\"\" Initialise IncompatibleGenomeAssemblyError. Attributes: assembly (str): Incompatible genome assembly encountered. phenopacket (Path): Path to the Phenopacket associated with the error. message (str, optional): Custom error message (default is \"Incompatible Genome Assembly\"). \"\"\" self . assembly : str = assembly self . phenopacket : Path = phenopacket self . message : str = message super () . __init__ ( self . message ) def __str__ ( self ): return f \" { self . message } -> { self . assembly } in { self . phenopacket } \"","title":"IncompatibleGenomeAssemblyError"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.IncompatibleGenomeAssemblyError.__init__","text":"Initialise IncompatibleGenomeAssemblyError. Attributes: Name Type Description assembly str Incompatible genome assembly encountered. phenopacket Path Path to the Phenopacket associated with the error. message str Custom error message (default is \"Incompatible Genome Assembly\"). Source code in src/pheval/utils/phenopacket_utils.py 30 31 32 33 34 35 36 37 38 39 40 41 42 def __init__ ( self , assembly , phenopacket , message = \"Incompatible Genome Assembly\" ): \"\"\" Initialise IncompatibleGenomeAssemblyError. Attributes: assembly (str): Incompatible genome assembly encountered. phenopacket (Path): Path to the Phenopacket associated with the error. message (str, optional): Custom error message (default is \"Incompatible Genome Assembly\"). \"\"\" self . assembly : str = assembly self . phenopacket : Path = phenopacket self . message : str = message super () . __init__ ( self . 
message )","title":"__init__"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketRebuilder","text":"Class for rebuilding a Phenopacket Source code in src/pheval/utils/phenopacket_utils.py 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 class PhenopacketRebuilder : \"\"\"Class for rebuilding a Phenopacket\"\"\" def __init__ ( self , phenopacket : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Attributes: phenopacket (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . phenopacket = phenopacket def update_interpretations ( self , interpretations : [ Interpretation ] ) -> Union [ Phenopacket , Family ]: \"\"\" Add the updated interpretations to a Phenopacket or Family. Args: interpretations (List[Interpretation]): The updated interpretations to be added. Returns: Union[Phenopacket, Family]: The Phenopacket or Family object with updated interpretations. \"\"\" phenopacket = copy ( self . phenopacket ) if hasattr ( phenopacket , \"proband\" ): del phenopacket . proband . interpretations [:] phenopacket . proband . interpretations . extend ( interpretations ) else : del phenopacket . interpretations [:] phenopacket . interpretations . extend ( interpretations ) return phenopacket def add_randomised_hpo ( self , randomised_hpo : [ PhenotypicFeature ]) -> Union [ Phenopacket , Family ]: \"\"\" Add randomised phenotypic profiles to a Phenopacket or Family. Args: randomised_hpo: The randomised phenotypic profiles to be added. Returns: Union[Phenopacket, Family] The Phenopacket or Family object with added randomised profiles. \"\"\" phenopacket = copy ( self . phenopacket ) if hasattr ( phenopacket , \"proband\" ): del phenopacket . proband . 
phenotypic_features [:] phenopacket . proband . phenotypic_features . extend ( randomised_hpo ) else : del phenopacket . phenotypic_features [:] phenopacket . phenotypic_features . extend ( randomised_hpo ) return phenopacket def add_spiked_vcf_path ( self , spiked_vcf_file_data : File ) -> Union [ Phenopacket , Family ]: \"\"\" Add a spiked VCF path to a Phenopacket or Family. Args: - spiked_vcf_file_data (File): The VCF file data to be added. Returns: - Phenopacket or Family: The Phenopacket or Family object with the added spiked VCF path. \"\"\" phenopacket = copy ( self . phenopacket ) phenopacket_files = [ file for file in phenopacket . files if file . file_attributes [ \"fileFormat\" ] != \"vcf\" ] phenopacket_files . append ( spiked_vcf_file_data ) del phenopacket . files [:] phenopacket . files . extend ( phenopacket_files ) return phenopacket","title":"PhenopacketRebuilder"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketRebuilder.__init__","text":"Initialise PhenopacketUtil Attributes: Name Type Description phenopacket Union [ Phenopacket , Family ] Phenopacket or Family object Source code in src/pheval/utils/phenopacket_utils.py 540 541 542 543 544 545 546 def __init__ ( self , phenopacket : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Attributes: phenopacket (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . phenopacket = phenopacket","title":"__init__"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketRebuilder.add_randomised_hpo","text":"Add randomised phenotypic profiles to a Phenopacket or Family. Parameters: Name Type Description Default randomised_hpo [ PhenotypicFeature ] The randomised phenotypic profiles to be added. required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family] The Phenopacket or Family object with added randomised profiles. 
Source code in src/pheval/utils/phenopacket_utils.py 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 def add_randomised_hpo ( self , randomised_hpo : [ PhenotypicFeature ]) -> Union [ Phenopacket , Family ]: \"\"\" Add randomised phenotypic profiles to a Phenopacket or Family. Args: randomised_hpo: The randomised phenotypic profiles to be added. Returns: Union[Phenopacket, Family] The Phenopacket or Family object with added randomised profiles. \"\"\" phenopacket = copy ( self . phenopacket ) if hasattr ( phenopacket , \"proband\" ): del phenopacket . proband . phenotypic_features [:] phenopacket . proband . phenotypic_features . extend ( randomised_hpo ) else : del phenopacket . phenotypic_features [:] phenopacket . phenotypic_features . extend ( randomised_hpo ) return phenopacket","title":"add_randomised_hpo"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketRebuilder.add_spiked_vcf_path","text":"Add a spiked VCF path to a Phenopacket or Family. Args: - spiked_vcf_file_data (File): The VCF file data to be added. Returns: - Phenopacket or Family: The Phenopacket or Family object with the added spiked VCF path. Source code in src/pheval/utils/phenopacket_utils.py 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 def add_spiked_vcf_path ( self , spiked_vcf_file_data : File ) -> Union [ Phenopacket , Family ]: \"\"\" Add a spiked VCF path to a Phenopacket or Family. Args: - spiked_vcf_file_data (File): The VCF file data to be added. Returns: - Phenopacket or Family: The Phenopacket or Family object with the added spiked VCF path. \"\"\" phenopacket = copy ( self . phenopacket ) phenopacket_files = [ file for file in phenopacket . files if file . file_attributes [ \"fileFormat\" ] != \"vcf\" ] phenopacket_files . append ( spiked_vcf_file_data ) del phenopacket . files [:] phenopacket . files . 
extend ( phenopacket_files ) return phenopacket","title":"add_spiked_vcf_path"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketRebuilder.update_interpretations","text":"Add the updated interpretations to a Phenopacket or Family. Parameters: Name Type Description Default interpretations List [ Interpretation ] The updated interpretations to be added. required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family]: The Phenopacket or Family object with updated interpretations. Source code in src/pheval/utils/phenopacket_utils.py 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 def update_interpretations ( self , interpretations : [ Interpretation ] ) -> Union [ Phenopacket , Family ]: \"\"\" Add the updated interpretations to a Phenopacket or Family. Args: interpretations (List[Interpretation]): The updated interpretations to be added. Returns: Union[Phenopacket, Family]: The Phenopacket or Family object with updated interpretations. \"\"\" phenopacket = copy ( self . phenopacket ) if hasattr ( phenopacket , \"proband\" ): del phenopacket . proband . interpretations [:] phenopacket . proband . interpretations . extend ( interpretations ) else : del phenopacket . interpretations [:] phenopacket . interpretations . 
extend ( interpretations ) return phenopacket","title":"update_interpretations"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil","text":"Class for retrieving data from a Phenopacket or Family object Source code in src/pheval/utils/phenopacket_utils.py 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 class PhenopacketUtil : \"\"\"Class for retrieving data from a Phenopacket or Family object\"\"\" def __init__ ( self , phenopacket_contents : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Args: phenopacket_contents (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . 
phenopacket_contents = phenopacket_contents def sample_id ( self ) -> str : \"\"\" Retrieve the sample ID from a Phenopacket or proband of a Family Returns: str: Sample ID \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . subject . id else : return self . phenopacket_contents . subject . id def phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all HPO terms Returns: List[PhenotypicFeature]: List of HPO terms \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . phenotypic_features else : return self . phenopacket_contents . phenotypic_features def observed_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all observed HPO terms Returns: List[PhenotypicFeature]: List of observed HPO terms \"\"\" phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : continue phenotypic_features . append ( p ) return phenotypic_features def negated_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all negated HPO terms Returns: List[PhenotypicFeature]: List of negated HPO terms \"\"\" negated_phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : negated_phenotypic_features . append ( p ) return negated_phenotypic_features def diseases ( self ) -> List [ Disease ]: \"\"\" Retrieve a list of Diseases associated with the proband Returns: List[Disease]: List of diseases \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . diseases else : return self . phenopacket_contents . 
diseases def _diagnosis_from_interpretations ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a list of disease diagnoses associated with the proband from the interpretations object Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" diagnoses = [] interpretation = self . interpretations () for i in interpretation : ( diagnoses . append ( ProbandDisease ( disease_name = i . diagnosis . disease . label , disease_identifier = i . diagnosis . disease . id , ) ) if i . diagnosis . disease . label != \"\" and i . diagnosis . disease . id != \"\" else None ) return diagnoses def _diagnosis_from_disease ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a list of disease diagnoses associated with the proband from the diseases object Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" diagnoses = [] for disease in self . diseases (): diagnoses . append ( ProbandDisease ( disease_name = disease . term . label , disease_identifier = disease . term . id ) ) return diagnoses def diagnoses ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" return list ( set ( self . _diagnosis_from_interpretations () + self . _diagnosis_from_disease ())) def interpretations ( self ) -> List [ Interpretation ]: \"\"\" Retrieve a list of interpretations from a Phenopacket Returns: List[Interpretation]: List of interpretations \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . interpretations else : return self . phenopacket_contents . interpretations def causative_variants ( self ) -> List [ ProbandCausativeVariant ]: \"\"\" Retrieve a list of causative variants listed in a Phenopacket Returns: List[ProbandCausativeVariant]: List of proband causative variants \"\"\" all_variants = [] interpretation = self . 
interpretations () for i in interpretation : for g in i . diagnosis . genomic_interpretations : vcf_record = g . variant_interpretation . variation_descriptor . vcf_record genotype = g . variant_interpretation . variation_descriptor . allelic_state variant_data = ProbandCausativeVariant ( self . phenopacket_contents . subject . id , vcf_record . genome_assembly , GenomicVariant ( vcf_record . chrom , vcf_record . pos , vcf_record . ref , vcf_record . alt , ), genotype . label , vcf_record . info , ) all_variants . append ( variant_data ) return all_variants def files ( self ) -> List [ File ]: \"\"\" Retrieve a list of files associated with a phenopacket Returns: List[File]: List of files associated with a phenopacket \"\"\" return self . phenopacket_contents . files def vcf_file_data ( self , phenopacket_path : Path , vcf_dir : Path ) -> File : \"\"\" Retrieve the genome assembly and VCF file name from a phenopacket. Args: phenopacket_path (Path): The path to the phenopacket file. vcf_dir (Path): The directory path where the VCF file is stored. Returns: File: The VCF file with updated URI pointing to the specified directory. Raises: IncorrectFileFormatError: If the provided file is not in .vcf or .vcf.gz format. IncompatibleGenomeAssemblyError: If the genome assembly of the VCF file is not compatible. Note: This function searches for a VCF file within the provided list of files, validates its format, and checks if the genome assembly is compatible. If the conditions are met, it updates the URI of the VCF file to the specified directory and returns the modified file object. \"\"\" compatible_genome_assembly = [ \"GRCh37\" , \"hg19\" , \"GRCh38\" , \"hg38\" ] vcf_data = [ file for file in self . files () if file . file_attributes [ \"fileFormat\" ] == \"vcf\" ][ 0 ] if not Path ( vcf_data . uri ) . name . endswith ( \".vcf\" ) and not Path ( vcf_data . uri ) . name . endswith ( \".vcf.gz\" ): raise IncorrectFileFormatError ( Path ( vcf_data . 
uri ), \".vcf or .vcf.gz file\" ) if vcf_data . file_attributes [ \"genomeAssembly\" ] not in compatible_genome_assembly : raise IncompatibleGenomeAssemblyError ( vcf_data . file_attributes [ \"genomeAssembly\" ], phenopacket_path ) vcf_data . uri = str ( vcf_dir . joinpath ( Path ( vcf_data . uri ) . name )) return vcf_data @staticmethod def _extract_diagnosed_gene ( genomic_interpretation : GenomicInterpretation , ) -> ProbandCausativeGene : \"\"\" Retrieve the disease causing genes from the variant descriptor field if not empty, otherwise, retrieves from the gene descriptor from a phenopacket. Args: genomic_interpretation (GenomicInterpretation): A genomic interpretation from a Phenopacket Returns: ProbandCausativeGene: The disease causing gene \"\"\" if genomic_interpretation . variant_interpretation . ByteSize () != 0 : return ProbandCausativeGene ( genomic_interpretation . variant_interpretation . variation_descriptor . gene_context . symbol , genomic_interpretation . variant_interpretation . variation_descriptor . gene_context . value_id , ) else : return ProbandCausativeGene ( gene_symbol = genomic_interpretation . gene . symbol , gene_identifier = genomic_interpretation . gene . value_id , ) def diagnosed_genes ( self ) -> List [ ProbandCausativeGene ]: \"\"\" Retrieve the disease causing genes from a phenopacket. Returns: List[ProbandCausativeGene]: List of causative genes \"\"\" pheno_interpretation = self . interpretations () genes = [] for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : genes . append ( self . _extract_diagnosed_gene ( g )) genes = list ({ gene . gene_symbol : gene for gene in genes } . values ()) return genes def diagnosed_variants ( self ) -> List [ GenomicVariant ]: \"\"\" Retrieve a list of all known causative variants from a phenopacket. Returns: List[GenomicVariant]: List of causative variants \"\"\" variants = [] pheno_interpretation = self . 
interpretations () for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : variant = GenomicVariant ( chrom = str ( g . variant_interpretation . variation_descriptor . vcf_record . chrom . replace ( \"chr\" , \"\" ) ), pos = int ( g . variant_interpretation . variation_descriptor . vcf_record . pos ), ref = g . variant_interpretation . variation_descriptor . vcf_record . ref , alt = g . variant_interpretation . variation_descriptor . vcf_record . alt , ) variants . append ( variant ) return variants def check_incomplete_variant_record ( self ) -> bool : \"\"\" Check if any variant record in the phenopacket has incomplete information. This method iterates through the diagnosed variant records and checks if any of them have missing or incomplete information such as empty chromosome, position, reference, or alternate allele. Returns: bool: True if any variant record is incomplete, False otherwise. \"\"\" variants = self . diagnosed_variants () for variant in variants : if ( variant . chrom == \"\" or variant . pos == 0 or variant . pos == \"\" or variant . ref == \"\" or variant . alt == \"\" ): return True return False def check_incomplete_gene_record ( self ) -> bool : \"\"\" Check if any gene record in the phenopacket has incomplete information. This method iterates through the diagnosed gene records and checks if any of them have missing or incomplete information such as gene name, or gene identifier. Returns: bool: True if any gene record is incomplete, False otherwise. \"\"\" genes = self . diagnosed_genes () for gene in genes : if gene . gene_symbol == \"\" or gene . gene_identifier == \"\" : return True return False def check_incomplete_disease_record ( self ) -> bool : \"\"\" Check if any disease record in the phenopacket has incomplete information. This method iterates through the diagnosed disease records and checks if any of them have missing or incomplete information such as empty disease name, or disease identifier. 
Returns: bool: True if any disease record is incomplete, False otherwise. \"\"\" if len ( self . diagnoses ()) == 0 : return True return False","title":"PhenopacketUtil"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.__init__","text":"Initialise PhenopacketUtil Parameters: Name Type Description Default phenopacket_contents Union [ Phenopacket , Family ] Phenopacket or Family object required Source code in src/pheval/utils/phenopacket_utils.py 222 223 224 225 226 227 228 def __init__ ( self , phenopacket_contents : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Args: phenopacket_contents (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . phenopacket_contents = phenopacket_contents","title":"__init__"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.causative_variants","text":"Retrieve a list of causative variants listed in a Phenopacket Returns: Type Description List [ ProbandCausativeVariant ] List[ProbandCausativeVariant]: List of proband causative variants Source code in src/pheval/utils/phenopacket_utils.py 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 def causative_variants ( self ) -> List [ ProbandCausativeVariant ]: \"\"\" Retrieve a list of causative variants listed in a Phenopacket Returns: List[ProbandCausativeVariant]: List of proband causative variants \"\"\" all_variants = [] interpretation = self . interpretations () for i in interpretation : for g in i . diagnosis . genomic_interpretations : vcf_record = g . variant_interpretation . variation_descriptor . vcf_record genotype = g . variant_interpretation . variation_descriptor . allelic_state variant_data = ProbandCausativeVariant ( self . phenopacket_contents . subject . id , vcf_record . genome_assembly , GenomicVariant ( vcf_record . chrom , vcf_record . pos , vcf_record . ref , vcf_record . 
alt , ), genotype . label , vcf_record . info , ) all_variants . append ( variant_data ) return all_variants","title":"causative_variants"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.check_incomplete_disease_record","text":"Check if any disease record in the phenopacket has incomplete information. This method iterates through the diagnosed disease records and checks if any of them have missing or incomplete information such as empty disease name, or disease identifier. Returns: Name Type Description bool bool True if any disease record is incomplete, False otherwise. Source code in src/pheval/utils/phenopacket_utils.py 522 523 524 525 526 527 528 529 530 531 532 533 534 def check_incomplete_disease_record ( self ) -> bool : \"\"\" Check if any disease record in the phenopacket has incomplete information. This method iterates through the diagnosed disease records and checks if any of them have missing or incomplete information such as empty disease name, or disease identifier. Returns: bool: True if any disease record is incomplete, False otherwise. \"\"\" if len ( self . diagnoses ()) == 0 : return True return False","title":"check_incomplete_disease_record"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.check_incomplete_gene_record","text":"Check if any gene record in the phenopacket has incomplete information. This method iterates through the diagnosed gene records and checks if any of them have missing or incomplete information such as gene name, or gene identifier. Returns: Name Type Description bool bool True if any gene record is incomplete, False otherwise. Source code in src/pheval/utils/phenopacket_utils.py 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 def check_incomplete_gene_record ( self ) -> bool : \"\"\" Check if any gene record in the phenopacket has incomplete information. 
This method iterates through the diagnosed gene records and checks if any of them have missing or incomplete information such as gene name, or gene identifier. Returns: bool: True if any gene record is incomplete, False otherwise. \"\"\" genes = self . diagnosed_genes () for gene in genes : if gene . gene_symbol == \"\" or gene . gene_identifier == \"\" : return True return False","title":"check_incomplete_gene_record"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.check_incomplete_variant_record","text":"Check if any variant record in the phenopacket has incomplete information. This method iterates through the diagnosed variant records and checks if any of them have missing or incomplete information such as empty chromosome, position, reference, or alternate allele. Returns: Name Type Description bool bool True if any variant record is incomplete, False otherwise. Source code in src/pheval/utils/phenopacket_utils.py 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 def check_incomplete_variant_record ( self ) -> bool : \"\"\" Check if any variant record in the phenopacket has incomplete information. This method iterates through the diagnosed variant records and checks if any of them have missing or incomplete information such as empty chromosome, position, reference, or alternate allele. Returns: bool: True if any variant record is incomplete, False otherwise. \"\"\" variants = self . diagnosed_variants () for variant in variants : if ( variant . chrom == \"\" or variant . pos == 0 or variant . pos == \"\" or variant . ref == \"\" or variant . alt == \"\" ): return True return False","title":"check_incomplete_variant_record"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.diagnosed_genes","text":"Retrieve the disease causing genes from a phenopacket. 
Returns: List[ProbandCausativeGene]: List of causative genes Source code in src/pheval/utils/phenopacket_utils.py 446 447 448 449 450 451 452 453 454 455 456 457 458 def diagnosed_genes ( self ) -> List [ ProbandCausativeGene ]: \"\"\" Retrieve the disease causing genes from a phenopacket. Returns: List[ProbandCausativeGene]: List of causative genes \"\"\" pheno_interpretation = self . interpretations () genes = [] for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : genes . append ( self . _extract_diagnosed_gene ( g )) genes = list ({ gene . gene_symbol : gene for gene in genes } . values ()) return genes","title":"diagnosed_genes"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.diagnosed_variants","text":"Retrieve a list of all known causative variants from a phenopacket. Returns: List[GenomicVariant]: List of causative variants Source code in src/pheval/utils/phenopacket_utils.py 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 def diagnosed_variants ( self ) -> List [ GenomicVariant ]: \"\"\" Retrieve a list of all known causative variants from a phenopacket. Returns: List[GenomicVariant]: List of causative variants \"\"\" variants = [] pheno_interpretation = self . interpretations () for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : variant = GenomicVariant ( chrom = str ( g . variant_interpretation . variation_descriptor . vcf_record . chrom . replace ( \"chr\" , \"\" ) ), pos = int ( g . variant_interpretation . variation_descriptor . vcf_record . pos ), ref = g . variant_interpretation . variation_descriptor . vcf_record . ref , alt = g . variant_interpretation . variation_descriptor . vcf_record . alt , ) variants . 
append ( variant ) return variants","title":"diagnosed_variants"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.diagnoses","text":"Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket Returns: Type Description List [ ProbandDisease ] List[ProbandDisease]: List of diagnosed diseases Source code in src/pheval/utils/phenopacket_utils.py 331 332 333 334 335 336 337 338 def diagnoses ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" return list ( set ( self . _diagnosis_from_interpretations () + self . _diagnosis_from_disease ()))","title":"diagnoses"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.diseases","text":"Retrieve a list of Diseases associated with the proband Returns: Type Description List [ Disease ] List[Disease]: List of diseases Source code in src/pheval/utils/phenopacket_utils.py 283 284 285 286 287 288 289 290 291 292 293 def diseases ( self ) -> List [ Disease ]: \"\"\" Retrieve a list of Diseases associated with the proband Returns: List[Disease]: List of diseases \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . diseases else : return self . phenopacket_contents . 
diseases","title":"diseases"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.files","text":"Retrieve a list of files associated with a phenopacket Returns: Type Description List [ File ] List[File]: List of files associated with a phenopacket Source code in src/pheval/utils/phenopacket_utils.py 380 381 382 383 384 385 386 387 def files ( self ) -> List [ File ]: \"\"\" Retrieve a list of files associated with a phenopacket Returns: List[File]: List of files associated with a phenopacket \"\"\" return self . phenopacket_contents . files","title":"files"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.interpretations","text":"Retrieve a list of interpretations from a Phenopacket Returns: Type Description List [ Interpretation ] List[Interpretation]: List of interpretations Source code in src/pheval/utils/phenopacket_utils.py 340 341 342 343 344 345 346 347 348 349 350 def interpretations ( self ) -> List [ Interpretation ]: \"\"\" Retrieve a list of interpretations from a Phenopacket Returns: List[Interpretation]: List of interpretations \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . interpretations else : return self . phenopacket_contents . 
interpretations","title":"interpretations"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.negated_phenotypic_features","text":"Retrieve a list of all negated HPO terms Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: List of negated HPO terms Source code in src/pheval/utils/phenopacket_utils.py 269 270 271 272 273 274 275 276 277 278 279 280 281 def negated_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all negated HPO terms Returns: List[PhenotypicFeature]: List of negated HPO terms \"\"\" negated_phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : negated_phenotypic_features . append ( p ) return negated_phenotypic_features","title":"negated_phenotypic_features"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.observed_phenotypic_features","text":"Retrieve a list of all observed HPO terms Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: List of observed HPO terms Source code in src/pheval/utils/phenopacket_utils.py 254 255 256 257 258 259 260 261 262 263 264 265 266 267 def observed_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all observed HPO terms Returns: List[PhenotypicFeature]: List of observed HPO terms \"\"\" phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : continue phenotypic_features . 
append ( p ) return phenotypic_features","title":"observed_phenotypic_features"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.phenotypic_features","text":"Retrieve a list of all HPO terms Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: List of HPO terms Source code in src/pheval/utils/phenopacket_utils.py 242 243 244 245 246 247 248 249 250 251 252 def phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all HPO terms Returns: List[PhenotypicFeature]: List of HPO terms \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . phenotypic_features else : return self . phenopacket_contents . phenotypic_features","title":"phenotypic_features"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.sample_id","text":"Retrieve the sample ID from a Phenopacket or proband of a Family Returns: Name Type Description str str Sample ID Source code in src/pheval/utils/phenopacket_utils.py 230 231 232 233 234 235 236 237 238 239 240 def sample_id ( self ) -> str : \"\"\" Retrieve the sample ID from a Phenopacket or proband of a Family Returns: str: Sample ID \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . subject . id else : return self . phenopacket_contents . subject . id","title":"sample_id"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.vcf_file_data","text":"Retrieve the genome assembly and VCF file name from a phenopacket. Parameters: Name Type Description Default phenopacket_path Path The path to the phenopacket file. required vcf_dir Path The directory path where the VCF file is stored. required Returns: Name Type Description File File The VCF file with updated URI pointing to the specified directory. 
Raises: Type Description IncorrectFileFormatError If the provided file is not in .vcf or .vcf.gz format. IncompatibleGenomeAssemblyError If the genome assembly of the VCF file is not compatible. Note This function searches for a VCF file within the provided list of files, validates its format, and checks if the genome assembly is compatible. If the conditions are met, it updates the URI of the VCF file to the specified directory and returns the modified file object. Source code in src/pheval/utils/phenopacket_utils.py 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 def vcf_file_data ( self , phenopacket_path : Path , vcf_dir : Path ) -> File : \"\"\" Retrieve the genome assembly and VCF file name from a phenopacket. Args: phenopacket_path (Path): The path to the phenopacket file. vcf_dir (Path): The directory path where the VCF file is stored. Returns: File: The VCF file with updated URI pointing to the specified directory. Raises: IncorrectFileFormatError: If the provided file is not in .vcf or .vcf.gz format. IncompatibleGenomeAssemblyError: If the genome assembly of the VCF file is not compatible. Note: This function searches for a VCF file within the provided list of files, validates its format, and checks if the genome assembly is compatible. If the conditions are met, it updates the URI of the VCF file to the specified directory and returns the modified file object. \"\"\" compatible_genome_assembly = [ \"GRCh37\" , \"hg19\" , \"GRCh38\" , \"hg38\" ] vcf_data = [ file for file in self . files () if file . file_attributes [ \"fileFormat\" ] == \"vcf\" ][ 0 ] if not Path ( vcf_data . uri ) . name . endswith ( \".vcf\" ) and not Path ( vcf_data . uri ) . name . endswith ( \".vcf.gz\" ): raise IncorrectFileFormatError ( Path ( vcf_data . uri ), \".vcf or .vcf.gz file\" ) if vcf_data . 
file_attributes [ \"genomeAssembly\" ] not in compatible_genome_assembly : raise IncompatibleGenomeAssemblyError ( vcf_data . file_attributes [ \"genomeAssembly\" ], phenopacket_path ) vcf_data . uri = str ( vcf_dir . joinpath ( Path ( vcf_data . uri ) . name )) return vcf_data","title":"vcf_file_data"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.ProbandCausativeGene","text":"Represents a causative gene associated with a proband Parameters: Name Type Description Default gene_symbol str Symbol representing the gene required gene_identifier str The ENSEMBL gene identifier for the result entry required Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations and result output is acceptable for result matching purposes in the analysis. Source code in src/pheval/utils/phenopacket_utils.py 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 @dataclass class ProbandCausativeGene : \"\"\" Represents a causative gene associated with a proband Args: gene_symbol (str): Symbol representing the gene gene_identifier (str): The ENSEMBL gene identifier for the result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations and result output is acceptable for result matching purposes in the analysis. 
\"\"\" gene_symbol : str gene_identifier : str","title":"ProbandCausativeGene"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.ProbandCausativeVariant","text":"Represents a causative variant associated with a proband Parameters: Name Type Description Default proband_id str ID of the proband required assembly str Genome assembly required variant GenomicVariant Genomic variant associated with the proband required genotype str Genotype information for the variant required info str Additional information about the variant (default is an empty string) '' Source code in src/pheval/utils/phenopacket_utils.py 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 @dataclass class ProbandCausativeVariant : \"\"\" Represents a causative variant associated with a proband Args: proband_id (str): ID of the proband assembly (str): Genome assembly variant (GenomicVariant): Genomic variant associated with the proband genotype (str): Genotype information for the variant info (str, optional): Additional information about the variant (default is an empty string) \"\"\" proband_id : str assembly : str variant : GenomicVariant genotype : str info : str = \"\"","title":"ProbandCausativeVariant"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.ProbandDisease","text":"Represents a disease associated with a proband Parameters: Name Type Description Default disease_name str Name of the disease required disease_identifier str Identifier for the disease result entry in the OMIM namespace required Notes While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations and result output is acceptable for result matching purposes in the analysis. 
Source code in src/pheval/utils/phenopacket_utils.py 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 @dataclass ( frozen = True , eq = True ) class ProbandDisease : \"\"\" Represents a disease associated with a proband Args: disease_name (str): Name of the disease disease_identifier (str): Identifier for the disease result entry in the OMIM namespace Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations and result output is acceptable for result matching purposes in the analysis. \"\"\" disease_name : str disease_identifier : str","title":"ProbandDisease"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.create_gene_identifier_map","text":"Create a mapping of gene identifiers to gene symbols using HGNC data. Returns: Name Type Description dict dict A mapping of gene identifiers to gene symbols. Notes The dictionary structure: { 'identifier': 'gene_symbol', ... } Source code in src/pheval/utils/phenopacket_utils.py 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 def create_gene_identifier_map () -> dict : \"\"\" Create a mapping of gene identifiers to gene symbols using HGNC data. Returns: dict: A mapping of gene identifiers to gene symbols. Notes: The dictionary structure: { 'identifier': 'gene_symbol', ... } \"\"\" hgnc_df = read_hgnc_data () identifier_map = {} for _index , row in hgnc_df . 
iterrows (): identifier_map [ row [ \"ensembl_gene_id\" ]] = row [ \"symbol\" ] identifier_map [ row [ \"hgnc_id\" ]] = row [ \"symbol\" ] identifier_map [ row [ \"entrez_id\" ]] = row [ \"symbol\" ] identifier_map [ row [ \"refseq_accession\" ]] = row [ \"symbol\" ] return identifier_map","title":"create_gene_identifier_map"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.create_hgnc_dict","text":"Create a dictionary as a reference for updating gene symbols and identifiers based on HGNC data. Returns: Name Type Description defaultdict defaultdict A dictionary containing gene symbols as keys and their associated gene information. Notes The dictionary structure: { 'gene_symbol': { 'ensembl_id': str, 'hgnc_id': str, 'entrez_id': str, 'refseq_accession': str, 'previous_symbol': [str, ...] }, ... } Source code in src/pheval/utils/phenopacket_utils.py 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 def create_hgnc_dict () -> defaultdict : \"\"\" Create a dictionary as a reference for updating gene symbols and identifiers based on HGNC data. Returns: defaultdict: A dictionary containing gene symbols as keys and their associated gene information. Notes: The dictionary structure: { 'gene_symbol': { 'ensembl_id': str, 'hgnc_id': str, 'entrez_id': str, 'refseq_accession': str, 'previous_symbol': [str, ...] }, ... } \"\"\" hgnc_df = read_hgnc_data () hgnc_data = defaultdict ( dict ) for _index , row in hgnc_df . iterrows (): previous_names = [] hgnc_data [ row [ \"symbol\" ]][ \"ensembl_id\" ] = row [ \"ensembl_gene_id\" ] hgnc_data [ row [ \"symbol\" ]][ \"hgnc_id\" ] = row [ \"hgnc_id\" ] hgnc_data [ row [ \"symbol\" ]][ \"entrez_id\" ] = row [ \"entrez_id\" ] hgnc_data [ row [ \"symbol\" ]][ \"refseq_accession\" ] = row [ \"refseq_accession\" ] previous = str ( row [ \"prev_symbol\" ]) . split ( \"|\" ) for p in previous : previous_names . 
append ( p . strip ( '\"' )) hgnc_data [ row [ \"symbol\" ]][ \"previous_symbol\" ] = previous_names return hgnc_data","title":"create_hgnc_dict"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.create_json_message","text":"Create a JSON message for writing to a file. Args: - phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family object to convert to JSON. Returns: - str: A JSON-formatted string representation of the Phenopacket or Family object. Source code in src/pheval/utils/phenopacket_utils.py 608 609 610 611 612 613 614 615 616 617 618 def create_json_message ( phenopacket : Union [ Phenopacket , Family ]) -> str : \"\"\" Create a JSON message for writing to a file. Args: - phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family object to convert to JSON. Returns: - str: A JSON-formatted string representation of the Phenopacket or Family object. \"\"\" return MessageToJson ( phenopacket )","title":"create_json_message"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.phenopacket_reader","text":"Read a Phenopacket file and returns its contents as a Phenopacket or Family object Parameters: Name Type Description Default file Path Path to the Phenopacket file required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family]: Contents of the Phenopacket file as a Phenopacket or Family object Source code in src/pheval/utils/phenopacket_utils.py 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 def phenopacket_reader ( file : Path ) -> Union [ Phenopacket , Family ]: \"\"\" Read a Phenopacket file and returns its contents as a Phenopacket or Family object Args: file (Path): Path to the Phenopacket file Returns: Union[Phenopacket, Family]: Contents of the Phenopacket file as a Phenopacket or Family object \"\"\" file = open ( file , \"r\" ) phenopacket = json . load ( file ) file . 
close () if \"proband\" in phenopacket : return Parse ( json . dumps ( phenopacket ), Family ()) else : return Parse ( json . dumps ( phenopacket ), Phenopacket ())","title":"phenopacket_reader"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.read_hgnc_data","text":"Read HGNC data from a file and return it as a Pandas DataFrame. Returns: Type Description DataFrame pd.DataFrame: DataFrame containing the HGNC data. Source code in src/pheval/utils/phenopacket_utils.py 125 126 127 128 129 130 131 132 133 134 135 136 def read_hgnc_data () -> pd . DataFrame : \"\"\" Read HGNC data from a file and return it as a Pandas DataFrame. Returns: pd.DataFrame: DataFrame containing the HGNC data. \"\"\" return pd . read_csv ( os . path . dirname ( __file__ ) . replace ( \"utils\" , \"resources/hgnc_complete_set.txt\" ), delimiter = \" \\t \" , dtype = str , )","title":"read_hgnc_data"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.write_phenopacket","text":"Write a Phenopacket or Family object to a file in JSON format. Parameters: Name Type Description Default phenopacket Phenopacket or Family The Phenopacket or Family object to be written. required output_file Path The Path object representing the file to write the Phenopacket data. required Returns: Type Description None None Source code in src/pheval/utils/phenopacket_utils.py 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 def write_phenopacket ( phenopacket : Union [ Phenopacket , Family ], output_file : Path ) -> None : \"\"\" Write a Phenopacket or Family object to a file in JSON format. Args: phenopacket (Phenopacket or Family): The Phenopacket or Family object to be written. output_file (Path): The Path object representing the file to write the Phenopacket data. Returns: None \"\"\" phenopacket_json = create_json_message ( phenopacket ) with open ( output_file , \"w\" ) as outfile : outfile . write ( phenopacket_json ) outfile . 
close ()","title":"write_phenopacket"},{"location":"api/pheval/utils/semsim_utils/","text":"Contains all pheval utility methods diff_semsim ( semsim_left , semsim_right , score_column , absolute_diff ) Calculates score difference between two semantic similarity profiles Parameters: Name Type Description Default semsim_left DataFrame first semantic similarity dataframe required semsim_right DataFrame second semantic similarity dataframe required score_column str Score column that will be computed (e.g. jaccard_similarity) required absolute_diff bool Whether the difference is absolute (True) or percentage (False). required Returns: Type Description DataFrame pd.DataFrame: A dataframe with terms and its scores differences Source code in src/pheval/utils/semsim_utils.py 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 def diff_semsim ( semsim_left : pd . DataFrame , semsim_right : pd . DataFrame , score_column : str , absolute_diff : bool ) -> pd . DataFrame : \"\"\"Calculates score difference between two semantic similarity profiles Args: semsim_left (pd.DataFrame): first semantic similarity dataframe semsim_right (pd.DataFrame): second semantic similarity dataframe score_column (str): Score column that will be computed (e.g. jaccard_similarity) absolute_diff (bool, optional): Whether the difference is absolute (True) or percentage (False). Defaults to True. Returns: pd.DataFrame: A dataframe with terms and its scores differences \"\"\" df = pd . merge ( semsim_left , semsim_right , on = [ \"subject_id\" , \"object_id\" ], how = \"outer\" ) if absolute_diff : df [ \"diff\" ] = df [ f \" { score_column } _x\" ] - df [ f \" { score_column } _y\" ] return df [[ \"subject_id\" , \"object_id\" , \"diff\" ]] df [ \"diff\" ] = df . 
apply ( lambda row : get_percentage_diff ( row [ f \" { score_column } _x\" ], row [ f \" { score_column } _y\" ]), axis = 1 ) return df [[ \"subject_id\" , \"object_id\" , f \" { score_column } _x\" , f \" { score_column } _y\" , \"diff\" ]] filter_non_0_score ( data , col ) Removes rows that have value equal to 0 based on the given column passed by col parameter Parameters: Name Type Description Default data DataFrame Dirty dataframe required col str Column to be filtered required Returns: Type Description DataFrame pd.DataFrame: Filtered dataframe Source code in src/pheval/utils/semsim_utils.py 14 15 16 17 18 19 20 21 22 23 24 def filter_non_0_score ( data : pd . DataFrame , col : str ) -> pd . DataFrame : \"\"\"Removes rows that have value equal to 0 based on the given column passed by col parameter Args: data (pd.DataFrame): Dirty dataframe col (str): Column to be filtered Returns: pd.DataFrame: Filtered dataframe \"\"\" return data [ data [ col ] != 0 ] get_percentage_diff ( current_number , previous_number ) Gets the percentage difference between two numbers Parameters: Name Type Description Default current_number float second number in comparison required previous_number float first number in comparison required Returns: Name Type Description float float percentage difference between two numbers Source code in src/pheval/utils/semsim_utils.py 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 def get_percentage_diff ( current_number : float , previous_number : float ) -> float : \"\"\"Gets the percentage difference between two numbers Args: current_number (float): second number in comparison previous_number (float): first number in comparison Returns: float: percentage difference between two numbers \"\"\" try : if current_number == previous_number : return \" {:.2%} \" . 
format ( 0 ) if current_number > previous_number : number = ( 1 - (( current_number / previous_number ))) * 100 else : number = ( 100 - (( previous_number / current_number ) * 100 )) * - 1 return \" {:.2%} \" . format ( number / 100 ) except ZeroDivisionError : return None parse_semsim ( df , cols ) Parses semantic similarity profiles converting the score column as a numeric value and dropping the null ones Parameters: Name Type Description Default df DataFrame semantic similarity profile dataframe required cols list list of columns that will be selected on semsim data required Returns: Type Description DataFrame pd.Dataframe: parsed semantic similarity dataframe Source code in src/pheval/utils/semsim_utils.py 27 28 29 30 31 32 33 34 35 36 37 38 39 def parse_semsim ( df : pd . DataFrame , cols : list ) -> pd . DataFrame : \"\"\"Parses semantic similarity profiles converting the score column as a numeric value and dropping the null ones Args: df (pd.DataFrame): semantic similarity profile dataframe cols (list): list of columns that will be selected on semsim data Returns: pd.Dataframe: parsed semantic similarity dataframe \"\"\" df [ cols [ - 1 ]] = pd . to_numeric ( df [ cols [ - 1 ]], errors = \"coerce\" ) df . replace ( \"None\" , numpy . nan ) . dropna ( subset = cols [ - 1 ], inplace = True ) return df percentage_diff ( semsim_left , semsim_right , score_column , output ) Compares two semantic similarity profiles Parameters: Name Type Description Default semsim_left Path File path of the first semantic similarity profile required semsim_right Path File path of the second semantic similarity profile required score_column str Score column that will be computed (e.g. 
jaccard_similarity) required output Path Output path for the difference tsv file required Source code in src/pheval/utils/semsim_utils.py 67 68 69 70 71 72 73 74 75 76 77 def percentage_diff ( semsim_left : Path , semsim_right : Path , score_column : str , output : Path ): \"\"\"Compares two semantic similarity profiles Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile score_column (str): Score column that will be computed (e.g. jaccard_similarity) output (Path): Output path for the difference tsv file \"\"\" clean_df = semsim_analysis ( semsim_left , semsim_right , score_column , absolute_diff = False ) clean_df . sort_values ( by = \"diff\" , ascending = False ) . to_csv ( output , sep = \" \\t \" , index = False ) semsim_analysis ( semsim_left , semsim_right , score_column , absolute_diff = True ) semsim_analysis Parameters: Name Type Description Default semsim_left Path File path of the first semantic similarity profile required semsim_right Path File path of the second semantic similarity profile required score_column str Score column that will be computed (e.g. jaccard_similarity) required absolute_diff bool Whether the difference is absolute (True) or percentage (False). True Returns: Type Description DataFrame [pd.DataFrame]: DataFrame with the differences between two semantic similarity profiles Source code in src/pheval/utils/semsim_utils.py 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 def semsim_analysis ( semsim_left : Path , semsim_right : Path , score_column : str , absolute_diff = True ) -> pd . DataFrame : \"\"\"semsim_analysis Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile score_column (str): Score column that will be computed (e.g. 
jaccard_similarity) absolute_diff (bool, optional): Whether the difference is absolute (True) or percentage (False). Defaults to True. Returns: [pd.DataFrame]: DataFrame with the differences between two semantic similarity profiles \"\"\" validate_semsim_file_comparison ( semsim_left , semsim_right ) cols = [ \"subject_id\" , \"object_id\" , score_column ] semsim_left = pd . read_csv ( semsim_left , sep = \" \\t \" ) semsim_right = pd . read_csv ( semsim_right , sep = \" \\t \" ) file_utils . ensure_columns_exists ( cols = cols , err_message = \"must exist in semsim dataframes\" , dataframes = [ semsim_left , semsim_right ], ) semsim_left = parse_semsim ( semsim_left , cols ) semsim_right = parse_semsim ( semsim_right , cols ) diff_df = diff_semsim ( semsim_left , semsim_right , score_column , absolute_diff ) return filter_non_0_score ( diff_df , \"diff\" ) semsim_heatmap_plot ( semsim_left , semsim_right , score_column ) Plots semantic similarity profiles heatmap Parameters: Name Type Description Default semsim_left Path File path of the first semantic similarity profile required semsim_right Path File path of the second semantic similarity profile required score_column str Score column that will be computed (e.g. jaccard_similarity) required Source code in src/pheval/utils/semsim_utils.py 80 81 82 83 84 85 86 87 88 89 90 91 def semsim_heatmap_plot ( semsim_left : Path , semsim_right : Path , score_column : str ): \"\"\"Plots semantic similarity profiles heatmap Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile score_column (str): Score column that will be computed (e.g. jaccard_similarity) \"\"\" clean_df = semsim_analysis ( semsim_left , semsim_right , score_column ) df = clean_df . pivot ( index = \"subject_id\" , columns = \"object_id\" , values = \"diff\" ) fig = px . imshow ( df , text_auto = True ) fig . 
show () validate_semsim_file_comparison ( semsim_left , semsim_right ) Checks if files exist and whether they're different Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile Raises: Exception: FileNotFoundException Source code in src/pheval/utils/semsim_utils.py 124 125 126 127 128 129 130 131 132 133 134 135 def validate_semsim_file_comparison ( semsim_left : Path , semsim_right : Path ): \"\"\"Checks if files exist and whether they're different Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile Raises: Exception: FileNotFoundException \"\"\" if semsim_left == semsim_right : errmsg = \"Semantic similarity profiles are equal. Make sure you have selected different files to analyze\" raise Exception ( errmsg ) file_utils . ensure_file_exists ( semsim_left , semsim_right )","title":"Semsim utils"},{"location":"api/pheval/utils/semsim_utils/#src.pheval.utils.semsim_utils.diff_semsim","text":"Calculates score difference between two semantic similarity profiles Parameters: Name Type Description Default semsim_left DataFrame first semantic similarity dataframe required semsim_right DataFrame second semantic similarity dataframe required score_column str Score column that will be computed (e.g. jaccard_similarity) required absolute_diff bool Whether the difference is absolute (True) or percentage (False). required Returns: Type Description DataFrame pd.DataFrame: A dataframe with terms and its scores differences Source code in src/pheval/utils/semsim_utils.py 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 def diff_semsim ( semsim_left : pd . DataFrame , semsim_right : pd . DataFrame , score_column : str , absolute_diff : bool ) -> pd . 
DataFrame : \"\"\"Calculates score difference between two semantic similarity profiles Args: semsim_left (pd.DataFrame): first semantic similarity dataframe semsim_right (pd.DataFrame): second semantic similarity dataframe score_column (str): Score column that will be computed (e.g. jaccard_similarity) absolute_diff (bool, optional): Whether the difference is absolute (True) or percentage (False). Defaults to True. Returns: pd.DataFrame: A dataframe with terms and its scores differences \"\"\" df = pd . merge ( semsim_left , semsim_right , on = [ \"subject_id\" , \"object_id\" ], how = \"outer\" ) if absolute_diff : df [ \"diff\" ] = df [ f \" { score_column } _x\" ] - df [ f \" { score_column } _y\" ] return df [[ \"subject_id\" , \"object_id\" , \"diff\" ]] df [ \"diff\" ] = df . apply ( lambda row : get_percentage_diff ( row [ f \" { score_column } _x\" ], row [ f \" { score_column } _y\" ]), axis = 1 ) return df [[ \"subject_id\" , \"object_id\" , f \" { score_column } _x\" , f \" { score_column } _y\" , \"diff\" ]]","title":"diff_semsim"},{"location":"api/pheval/utils/semsim_utils/#src.pheval.utils.semsim_utils.filter_non_0_score","text":"Removes rows that have value equal to 0 based on the given column passed by col parameter Parameters: Name Type Description Default data DataFrame Dirty dataframe required col str Column to be filtered required Returns: Type Description DataFrame pd.DataFrame: Filtered dataframe Source code in src/pheval/utils/semsim_utils.py 14 15 16 17 18 19 20 21 22 23 24 def filter_non_0_score ( data : pd . DataFrame , col : str ) -> pd . 
DataFrame : \"\"\"Removes rows that have value equal to 0 based on the given column passed by col parameter Args: data (pd.DataFrame): Dirty dataframe col (str): Column to be filtered Returns: pd.DataFrame: Filtered dataframe \"\"\" return data [ data [ col ] != 0 ]","title":"filter_non_0_score"},{"location":"api/pheval/utils/semsim_utils/#src.pheval.utils.semsim_utils.get_percentage_diff","text":"Gets the percentage difference between two numbers Parameters: Name Type Description Default current_number float second number in comparison required previous_number float first number in comparison required Returns: Name Type Description float float percentage difference between two numbers Source code in src/pheval/utils/semsim_utils.py 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 def get_percentage_diff ( current_number : float , previous_number : float ) -> float : \"\"\"Gets the percentage difference between two numbers Args: current_number (float): second number in comparison previous_number (float): first number in comparison Returns: float: percentage difference between two numbers \"\"\" try : if current_number == previous_number : return \" {:.2%} \" . format ( 0 ) if current_number > previous_number : number = ( 1 - (( current_number / previous_number ))) * 100 else : number = ( 100 - (( previous_number / current_number ) * 100 )) * - 1 return \" {:.2%} \" . 
format ( number / 100 ) except ZeroDivisionError : return None","title":"get_percentage_diff"},{"location":"api/pheval/utils/semsim_utils/#src.pheval.utils.semsim_utils.parse_semsim","text":"Parses semantic similarity profiles converting the score column as a numeric value and dropping the null ones Parameters: Name Type Description Default df DataFrame semantic similarity profile dataframe required cols list list of columns that will be selected on semsim data required Returns: Type Description DataFrame pd.Dataframe: parsed semantic similarity dataframe Source code in src/pheval/utils/semsim_utils.py 27 28 29 30 31 32 33 34 35 36 37 38 39 def parse_semsim ( df : pd . DataFrame , cols : list ) -> pd . DataFrame : \"\"\"Parses semantic similarity profiles converting the score column as a numeric value and dropping the null ones Args: df (pd.DataFrame): semantic similarity profile dataframe cols (list): list of columns that will be selected on semsim data Returns: pd.Dataframe: parsed semantic similarity dataframe \"\"\" df [ cols [ - 1 ]] = pd . to_numeric ( df [ cols [ - 1 ]], errors = \"coerce\" ) df . replace ( \"None\" , numpy . nan ) . dropna ( subset = cols [ - 1 ], inplace = True ) return df","title":"parse_semsim"},{"location":"api/pheval/utils/semsim_utils/#src.pheval.utils.semsim_utils.percentage_diff","text":"Compares two semantic similarity profiles Parameters: Name Type Description Default semsim_left Path File path of the first semantic similarity profile required semsim_right Path File path of the second semantic similarity profile required score_column str Score column that will be computed (e.g. 
jaccard_similarity) required output Path Output path for the difference tsv file required Source code in src/pheval/utils/semsim_utils.py 67 68 69 70 71 72 73 74 75 76 77 def percentage_diff ( semsim_left : Path , semsim_right : Path , score_column : str , output : Path ): \"\"\"Compares two semantic similarity profiles Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile score_column (str): Score column that will be computed (e.g. jaccard_similarity) output (Path): Output path for the difference tsv file \"\"\" clean_df = semsim_analysis ( semsim_left , semsim_right , score_column , absolute_diff = False ) clean_df . sort_values ( by = \"diff\" , ascending = False ) . to_csv ( output , sep = \" \\t \" , index = False )","title":"percentage_diff"},{"location":"api/pheval/utils/semsim_utils/#src.pheval.utils.semsim_utils.semsim_analysis","text":"semsim_analysis Parameters: Name Type Description Default semsim_left Path File path of the first semantic similarity profile required semsim_right Path File path of the second semantic similarity profile required score_column str Score column that will be computed (e.g. jaccard_similarity) required absolute_diff bool Whether the difference is absolute (True) or percentage (False). True Returns: Type Description DataFrame [pd.DataFrame]: DataFrame with the differences between two semantic similarity profiles Source code in src/pheval/utils/semsim_utils.py 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 def semsim_analysis ( semsim_left : Path , semsim_right : Path , score_column : str , absolute_diff = True ) -> pd . DataFrame : \"\"\"semsim_analysis Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile score_column (str): Score column that will be computed (e.g. 
jaccard_similarity) absolute_diff (bool, optional): Whether the difference is absolute (True) or percentage (False). Defaults to True. Returns: [pd.DataFrame]: DataFrame with the differences between two semantic similarity profiles \"\"\" validate_semsim_file_comparison ( semsim_left , semsim_right ) cols = [ \"subject_id\" , \"object_id\" , score_column ] semsim_left = pd . read_csv ( semsim_left , sep = \" \\t \" ) semsim_right = pd . read_csv ( semsim_right , sep = \" \\t \" ) file_utils . ensure_columns_exists ( cols = cols , err_message = \"must exist in semsim dataframes\" , dataframes = [ semsim_left , semsim_right ], ) semsim_left = parse_semsim ( semsim_left , cols ) semsim_right = parse_semsim ( semsim_right , cols ) diff_df = diff_semsim ( semsim_left , semsim_right , score_column , absolute_diff ) return filter_non_0_score ( diff_df , \"diff\" )","title":"semsim_analysis"},{"location":"api/pheval/utils/semsim_utils/#src.pheval.utils.semsim_utils.semsim_heatmap_plot","text":"Plots semantic similarity profiles heatmap Parameters: Name Type Description Default semsim_left Path File path of the first semantic similarity profile required semsim_right Path File path of the second semantic similarity profile required score_column str Score column that will be computed (e.g. jaccard_similarity) required Source code in src/pheval/utils/semsim_utils.py 80 81 82 83 84 85 86 87 88 89 90 91 def semsim_heatmap_plot ( semsim_left : Path , semsim_right : Path , score_column : str ): \"\"\"Plots semantic similarity profiles heatmap Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile score_column (str): Score column that will be computed (e.g. jaccard_similarity) \"\"\" clean_df = semsim_analysis ( semsim_left , semsim_right , score_column ) df = clean_df . pivot ( index = \"subject_id\" , columns = \"object_id\" , values = \"diff\" ) fig = px . 
imshow ( df , text_auto = True ) fig . show ()","title":"semsim_heatmap_plot"},{"location":"api/pheval/utils/semsim_utils/#src.pheval.utils.semsim_utils.validate_semsim_file_comparison","text":"Checks if files exist and whether they're different Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile Raises: Exception: FileNotFoundException Source code in src/pheval/utils/semsim_utils.py 124 125 126 127 128 129 130 131 132 133 134 135 def validate_semsim_file_comparison ( semsim_left : Path , semsim_right : Path ): \"\"\"Checks if files exist and whether they're different Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile Raises: Exception: FileNotFoundException \"\"\" if semsim_left == semsim_right : errmsg = \"Semantic similarity profiles are equal. Make sure you have selected different files to analyze\" raise Exception ( errmsg ) file_utils . ensure_file_exists ( semsim_left , semsim_right )","title":"validate_semsim_file_comparison"},{"location":"api/pheval/utils/utils/","text":"Contains all pheval utility methods rand ( df , min_num , max_num , scramble_factor ) Numeric scrambling Args: df (pd.DataFrame): dataframe records min_num (int): min value from this records max_num (int): max value from this records scramble_factor (float): scramble factor scalar Returns: float: randomized number Source code in src/pheval/utils/utils.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 def rand ( df : pd . DataFrame , min_num : int , max_num : int , scramble_factor : float ) -> float : \"\"\" Numeric scrambling Args: df (pd.DataFrame): dataframe records min_num (int): min value from this records max_num (int): max value from this records scramble_factor (float): scramble factor scalar Returns: float: randomized number \"\"\" try : return df + ( random . 
uniform ( min_num , max_num ) * scramble_factor ) except TypeError as err : info_log . error ( df , exc_info = err ) return df semsim_scramble ( input , output , columns_to_be_scrambled , scramble_factor = 0.5 ) Scrambles semantic similarity profile with a magnitude between 0 and 1 (scramble_factor: 0 means no scrambling and 1 means complete randomisation). It then randomises the above scores with a degree of the scramble_factor and returns a scrambles pandas dataframe. Args: input (Path): scramble_factor (float) scalar scramble factor columns_to_be_scrambled (List[str]): columns that will be scrambled in semsim file (e.g. jaccard_similarity). output (Path) Returns: pd.Dataframe: scrambled dataframe Source code in src/pheval/utils/utils.py 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 def semsim_scramble ( input : Path , output : Path , columns_to_be_scrambled : List [ str ], scramble_factor : float = 0.5 , ) -> pd . DataFrame : \"\"\" Scrambles semantic similarity profile with a magnitude between 0 and 1 (scramble_factor: 0 means no scrambling and 1 means complete randomisation). It then randomises the above scores with a degree of the scramble_factor and returns a scrambles pandas dataframe. Args: input (Path): scramble_factor (float) scalar scramble factor columns_to_be_scrambled (List[str]): columns that will be scrambled in semsim file (e.g. jaccard_similarity). output (Path) Returns: pd.Dataframe: scrambled dataframe \"\"\" semsim = pd . read_csv ( input , sep = \" \\t \" ) dataframe = semsim_scramble_df ( semsim , columns_to_be_scrambled , scramble_factor ) dataframe . 
to_csv ( output , sep = \" \\t \" , index = False ) semsim_scramble_df ( dataframe , columns_to_be_scrambled , scramble_factor ) scramble_semsim_df Args: dataframe (pd.DataFrame): dataframe that contains semsim profile scramble_factor (float) scalar scramble factor columns_to_be_scrambled (List[str]): Returns: pd.Dataframe: scrambled dataframe Source code in src/pheval/utils/utils.py 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 def semsim_scramble_df ( dataframe : pd . DataFrame , columns_to_be_scrambled : List [ str ], scramble_factor : float , ) -> pd . DataFrame : \"\"\"scramble_semsim_df Args: dataframe (pd.DataFrame): dataframe that contains semsim profile scramble_factor (float) scalar scramble factor columns_to_be_scrambled (List[str]): Returns: pd.Dataframe: scrambled dataframe \"\"\" for col in columns_to_be_scrambled : min_num = dataframe [ col ] . min () max_num = dataframe [ col ] . max () dataframe [ col ] = dataframe [ col ] . apply ( rand , args = ( min_num , max_num , scramble_factor )) return dataframe","title":"Utils"},{"location":"api/pheval/utils/utils/#src.pheval.utils.utils.rand","text":"Numeric scrambling Args: df (pd.DataFrame): dataframe records min_num (int): min value from this records max_num (int): max value from this records scramble_factor (float): scramble factor scalar Returns: float: randomized number Source code in src/pheval/utils/utils.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 def rand ( df : pd . DataFrame , min_num : int , max_num : int , scramble_factor : float ) -> float : \"\"\" Numeric scrambling Args: df (pd.DataFrame): dataframe records min_num (int): min value from this records max_num (int): max value from this records scramble_factor (float): scramble factor scalar Returns: float: randomized number \"\"\" try : return df + ( random . uniform ( min_num , max_num ) * scramble_factor ) except TypeError as err : info_log . 
error ( df , exc_info = err ) return df","title":"rand"},{"location":"api/pheval/utils/utils/#src.pheval.utils.utils.semsim_scramble","text":"Scrambles semantic similarity profile with a magnitude between 0 and 1 (scramble_factor: 0 means no scrambling and 1 means complete randomisation). It then randomises the above scores with a degree of the scramble_factor and returns a scrambles pandas dataframe. Args: input (Path): scramble_factor (float) scalar scramble factor columns_to_be_scrambled (List[str]): columns that will be scrambled in semsim file (e.g. jaccard_similarity). output (Path) Returns: pd.Dataframe: scrambled dataframe Source code in src/pheval/utils/utils.py 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 def semsim_scramble ( input : Path , output : Path , columns_to_be_scrambled : List [ str ], scramble_factor : float = 0.5 , ) -> pd . DataFrame : \"\"\" Scrambles semantic similarity profile with a magnitude between 0 and 1 (scramble_factor: 0 means no scrambling and 1 means complete randomisation). It then randomises the above scores with a degree of the scramble_factor and returns a scrambles pandas dataframe. Args: input (Path): scramble_factor (float) scalar scramble factor columns_to_be_scrambled (List[str]): columns that will be scrambled in semsim file (e.g. jaccard_similarity). output (Path) Returns: pd.Dataframe: scrambled dataframe \"\"\" semsim = pd . read_csv ( input , sep = \" \\t \" ) dataframe = semsim_scramble_df ( semsim , columns_to_be_scrambled , scramble_factor ) dataframe . 
to_csv ( output , sep = \" \\t \" , index = False )","title":"semsim_scramble"},{"location":"api/pheval/utils/utils/#src.pheval.utils.utils.semsim_scramble_df","text":"scramble_semsim_df Args: dataframe (pd.DataFrame): dataframe that contains semsim profile scramble_factor (float) scalar scramble factor columns_to_be_scrambled (List[str]): Returns: pd.Dataframe: scrambled dataframe Source code in src/pheval/utils/utils.py 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 def semsim_scramble_df ( dataframe : pd . DataFrame , columns_to_be_scrambled : List [ str ], scramble_factor : float , ) -> pd . DataFrame : \"\"\"scramble_semsim_df Args: dataframe (pd.DataFrame): dataframe that contains semsim profile scramble_factor (float) scalar scramble factor columns_to_be_scrambled (List[str]): Returns: pd.Dataframe: scrambled dataframe \"\"\" for col in columns_to_be_scrambled : min_num = dataframe [ col ] . min () max_num = dataframe [ col ] . max () dataframe [ col ] = dataframe [ col ] . apply ( rand , args = ( min_num , max_num , scramble_factor )) return dataframe","title":"semsim_scramble_df"}]}
\ No newline at end of file
+{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Home Introduction PhEval - Phenotypic Inference Evaluation Framework PhEval: Tool-specific processing (VP pipeline) flowchart LR PC-->DP PC[(Phenopackets Corpus)] SSSOM[Semantic Similarity Profiles Mapping Commons]-->|OAK-SEMSIM|DP[Data Prepare] KG[Source data KG - Monarch KG]-->|KGX-BIOLINK|DP[Data Prepare] ONT[Ontologies - Phenio]-->|OAK-ONTO|DP[Data Prepare] DP-->RP[Run Prepare] RP-->PR[PhEval Runner] PR-->DP2[Data Process] ER[Exomiser Runner]-->PR EDP[Exomiser Data Prepare]-->DP ERP[Exomiser Run Prepare]-->RP PPP[Disease-profile similarity prediction Post-process]-->DP2 PV[Phenotype/Variant]-->DP2 GVP[Gene VP Post-process]-->DP2 EPP[Exomiser Post Process]-->GVP GVP-->VPR[VP Report] Quick links: GitHub page","title":"Home"},{"location":"#home","text":"","title":"Home"},{"location":"#introduction","text":"PhEval - Phenotypic Inference Evaluation Framework","title":"Introduction"},{"location":"#pheval-tool-specific-processing-vp-pipeline","text":"flowchart LR PC-->DP PC[(Phenopackets Corpus)] SSSOM[Semantic Similarity Profiles Mapping Commons]-->|OAK-SEMSIM|DP[Data Prepare] KG[Source data KG - Monarch KG]-->|KGX-BIOLINK|DP[Data Prepare] ONT[Ontologies - Phenio]-->|OAK-ONTO|DP[Data Prepare] DP-->RP[Run Prepare] RP-->PR[PhEval Runner] PR-->DP2[Data Process] ER[Exomiser Runner]-->PR EDP[Exomiser Data Prepare]-->DP ERP[Exomiser Run Prepare]-->RP PPP[Disease-profile similarity prediction Post-process]-->DP2 PV[Phenotype/Variant]-->DP2 GVP[Gene VP Post-process]-->DP2 EPP[Exomiser Post Process]-->GVP GVP-->VPR[VP Report] Quick links: GitHub page","title":"PhEval: Tool-specific processing (VP pipeline)"},{"location":"CODE_OF_CONDUCT/","text":"Contributor Covenant Code of Conduct Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project 
and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. Our Standards Examples of behavior that contributes to creating a positive environment include: Using welcoming and inclusive language Being respectful of differing viewpoints and experiences Gracefully accepting constructive criticism Focusing on what is best for the community Showing empathy towards other community members Examples of unacceptable behavior by participants include: The use of sexualized language or imagery and unwelcome sexual attention or advances Trolling, insulting/derogatory comments, and personal or political attacks Public or private harassment Publishing others' private information, such as a physical or electronic address, without explicit permission Other conduct which could reasonably be considered inappropriate in a professional setting Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. Scope This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 
Representation of a project may be further defined and clarified by project maintainers. Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. Attribution This code of conduct has been derived from the excellent code of conduct of the ATOM project which in turn is adapted from the Contributor Covenant , version 1.4, available at https://contributor-covenant.org/version/1/4","title":"Contributor Covenant Code of Conduct"},{"location":"CODE_OF_CONDUCT/#contributor-covenant-code-of-conduct","text":"","title":"Contributor Covenant Code of Conduct"},{"location":"CODE_OF_CONDUCT/#our-pledge","text":"In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.","title":"Our Pledge"},{"location":"CODE_OF_CONDUCT/#our-standards","text":"Examples of behavior that contributes to creating a positive environment include: Using welcoming and inclusive language Being respectful of differing viewpoints and experiences Gracefully accepting constructive criticism Focusing on what is best for the community Showing empathy towards other community members Examples of unacceptable behavior by participants include: The use of 
sexualized language or imagery and unwelcome sexual attention or advances Trolling, insulting/derogatory comments, and personal or political attacks Public or private harassment Publishing others' private information, such as a physical or electronic address, without explicit permission Other conduct which could reasonably be considered inappropriate in a professional setting","title":"Our Standards"},{"location":"CODE_OF_CONDUCT/#our-responsibilities","text":"Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.","title":"Our Responsibilities"},{"location":"CODE_OF_CONDUCT/#scope","text":"This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.","title":"Scope"},{"location":"CODE_OF_CONDUCT/#enforcement","text":"Instances of abusive, harassing, or otherwise unacceptable behavior. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 
Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.","title":"Enforcement"},{"location":"CODE_OF_CONDUCT/#attribution","text":"This code of conduct has been derived from the excellent code of conduct of the ATOM project which in turn is adapted from the Contributor Covenant , version 1.4, available at https://contributor-covenant.org/version/1/4","title":"Attribution"},{"location":"about/","text":"PhEval - Phenotypic Inference Evaluation Framework Many variant prioritization tools (such as Exomiser and other computational approaches) rely on ontologies and phenotype matching, sometimes involving complex processes such as cross-species inference. The performance of such tools is exceedingly hard to evaluate because of the many factors involved: changes to the structure of the ontology, cross-species mappings, and semantic similarity algorithms can have significant consequences. Furthermore, the lack of suitable real-world problems/corpora leads to the situation that many algorithms are evaluated using simulations, which may fail to capture real-world scenarios. The lack of an evaluation framework that enables studying effects on data and knowledge inputs on real-world problems makes it difficult to optimize algorithms. To this end, we are developing a modular Phenotypic Inference Evaluation Framework (PhEval), which is delivered as a community resource.","title":"About"},{"location":"about/#pheval-phenotypic-inference-evaluation-framework","text":"Many variant prioritization tools (such as Exomiser and other computational approaches) rely on ontologies and phenotype matching, sometimes involving complex processes such as cross-species inference. 
The performance of such tools is exceedingly hard to evaluate because of the many factors involved: changes to the structure of the ontology, cross-species mappings, and semantic similarity algorithms can have significant consequences. Furthermore, the lack of suitable real-world problems/corpora leads to the situation that many algorithms are evaluated using simulations, which may fail to capture real-world scenarios. The lack of an evaluation framework that enables studying effects on data and knowledge inputs on real-world problems makes it difficult to optimize algorithms. To this end, we are developing a modular Phenotypic Inference Evaluation Framework (PhEval), which is delivered as a community resource.","title":"PhEval - Phenotypic Inference Evaluation Framework"},{"location":"contact/","text":"Contact The preferred way to contact the PhEval team is through the issue tracker (for problems with PhEval) or the GitHub discussions (for general questions). You can find any of the members of the PhEval core team on GitHub: https://github.com/orgs/monarch-initiative/teams/pheval-team Their GitHub profiles usually also provide email addresses.","title":"Contact Us"},{"location":"contact/#contact","text":"The preferred way to contact the PhEval team is through the issue tracker (for problems with PhEval) or the GitHub discussions (for general questions). You can find any of the members of the PhEval core team on GitHub: https://github.com/orgs/monarch-initiative/teams/pheval-team Their GitHub profiles usually also provide email addresses.","title":"Contact"},{"location":"contributing/","text":"Contributions First of all: Thank you for taking the time to contribute! The following is a set of guidelines for contributing to the PhEval framework. These guidelines are not strict rules. Use your best judgment, and feel free to propose changes to this document in a pull request. 
Table Of Contents Contributions Table Of Contents Code of Conduct Guidelines for Contributions and Requests Reporting problems with the data model Code of Conduct The monarch-technical-documentation team strives to create a welcoming environment for editors, users and other contributors. Please carefully read our Code of Conduct . Guidelines for Contributions and Requests Reporting problems with the data model Please use our Issue Tracker for reporting problems with the ontology.","title":"Contributions"},{"location":"contributing/#contributions","text":"First of all: Thank you for taking the time to contribute! The following is a set of guidelines for contributing to the PhEval framework. These guidelines are not strict rules. Use your best judgment, and feel free to propose changes to this document in a pull request.","title":"Contributions"},{"location":"contributing/#table-of-contents","text":"Contributions Table Of Contents Code of Conduct Guidelines for Contributions and Requests Reporting problems with the data model","title":"Table Of Contents"},{"location":"contributing/#code-of-conduct","text":"The monarch-technical-documentation team strives to create a welcoming environment for editors, users and other contributors. Please carefully read our Code of Conduct .","title":"Code of Conduct"},{"location":"contributing/#guidelines-for-contributions-and-requests","text":"","title":"Guidelines for Contributions and Requests"},{"location":"contributing/#reporting-problems-with-the-data-model","text":"Please use our Issue Tracker for reporting problems with the ontology.","title":"Reporting problems with the data model"},{"location":"developing_a_pheval_plugin/","text":"Developing a PhEval Plugin Description Plugin development allows PhEval to be extensible, as we have designed it. The plugin goal is to be flexible through custom runner implementations. This plugin development enhances the PhEval functionality. 
You can build one quickly using this step-by-step process. All custom Runners implementations must implement all PhevalRunner methods Bases: ABC PhEvalRunner Class Source code in src/pheval/runners/runner.py 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 @dataclass class PhEvalRunner ( ABC ): \"\"\"PhEvalRunner Class\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str directory_path = None input_dir_config = None _meta_data = None __raw_results_dir = \"raw_results/\" __pheval_gene_results_dir = \"pheval_gene_results/\" __pheval_variant_results_dir = \"pheval_variant_results/\" __pheval_disease_results_dir = \"pheval_disease_results/\" __tool_input_commands_dir = \"tool_input_commands/\" __run_meta_data_file = \"results.yml\" def __post_init__ ( self ): self . input_dir_config = parse_input_dir_config ( self . input_dir ) def _get_tool ( self ): return self . input_dir_config . tool def _get_variant_analysis ( self ): return self . input_dir_config . variant_analysis def _get_gene_analysis ( self ): return self . input_dir_config . gene_analysis def _get_disease_analysis ( self ): return self . input_dir_config . disease_analysis @property def tool_input_commands_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __tool_input_commands_dir ) @tool_input_commands_dir . setter def tool_input_commands_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def raw_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __raw_results_dir ) @raw_results_dir . setter def raw_results_dir ( self , directory_path ): self . 
directory_path = Path ( directory_path ) @property def pheval_gene_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_gene_results_dir ) @pheval_gene_results_dir . setter def pheval_gene_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_variant_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_variant_results_dir ) @pheval_variant_results_dir . setter def pheval_variant_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_disease_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_disease_results_dir ) @pheval_disease_results_dir . setter def pheval_disease_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) def build_output_directory_structure ( self ): \"\"\"build output directory structure\"\"\" self . tool_input_commands_dir . mkdir ( exist_ok = True ) self . raw_results_dir . mkdir ( exist_ok = True ) if self . _get_variant_analysis (): self . pheval_variant_results_dir . mkdir ( exist_ok = True ) if self . _get_gene_analysis (): self . pheval_gene_results_dir . mkdir ( exist_ok = True ) if self . _get_disease_analysis (): self . pheval_disease_results_dir . mkdir ( exist_ok = True ) @property def meta_data ( self ): self . _meta_data = BasicOutputRunMetaData ( tool = self . input_dir_config . tool , tool_version = self . version , config = f \" { Path ( self . input_dir ) . parent . name } / { Path ( self . input_dir ) . name } \" , run_timestamp = datetime . now () . timestamp (), corpus = f \" { Path ( self . testdata_dir ) . parent . name } / { Path ( self . testdata_dir ) . name } \" , ) return self . _meta_data @meta_data . setter def meta_data ( self , meta_data ): self . 
_meta_data = meta_data @abstractmethod def prepare ( self ) -> str : \"\"\"prepare\"\"\" @abstractmethod def run ( self ): \"\"\"run\"\"\" @abstractmethod def post_process ( self ): \"\"\"post_process\"\"\" def construct_meta_data ( self ): \"\"\"Construct run output meta data\"\"\" return self . meta_data Step-by-Step Plugin Development Process The plugin structure is derived from a cookiecutter template, Sphintoxetry-cookiecutter , and it uses Sphinx , tox and poetry as core dependencies. This allows PhEval extensibility to be standardized in terms of documentation and dependency management. 1. Sphintoxetry-cookiecutter scaffold First, install the cruft package. Cruft enables keeping projects up-to-date with future updates made to this original template. Install the latest release of cruft from pip pip install cruft NOTE: You may encounter an error with the naming of the project layout if using an older release of cruft. To avoid this, make sure you have installed the latest release version. Next, create a project using the sphintoxetry-cookiecutter template. cruft create https://github.com/monarch-initiative/monarch-project-template 2. Further setup Install poetry if you haven't already. pip install poetry Install dependencies poetry install Add PhEval dependency poetry add pheval Run tox to see if the setup works poetry run tox 3. 
Implement PhEval Custom Runner The runner name is arbitrary; the custom Runner name was chosen for demonstrative purposes. Create a runner file inside the plugin project, e.g.: \"\"\"Custom Pheval Runner.\"\"\" from dataclasses import dataclass from pathlib import Path from pheval.runners.runner import PhEvalRunner @dataclass class CustomPhevalRunner ( PhEvalRunner ): \"\"\"CustomPhevalRunner Class.\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str def prepare ( self ): \"\"\"prepare method.\"\"\" print ( \"preparing\" ) def run ( self ): \"\"\"run method.\"\"\" print ( \"running with custom pheval runner\" ) def post_process ( self ): \"\"\"post_process method.\"\"\" print ( \"post processing\" ) 4. Add PhEval Plugins section to the pyproject.toml file [tool.poetry.plugins. \"pheval.plugins\" ] customrunner = \"pheval_plugin_example.runner:CustomPhevalRunner\" Replace the value above with the path to your custom runner plugin 5. Implementing PhEval helper methods Streamlining the creation of your custom PhEval runner can be facilitated by leveraging PhEval's versatile helper methods, where applicable. Within PhEval, numerous public methods have been designed to assist in your runner methods. The utilisation of these helper methods is optional, yet they are crafted to enhance the overall implementation process. Utility methods The PhenopacketUtil class is designed to aid in the collection of specific data from a Phenopacket. 
Class for retrieving data from a Phenopacket or Family object Source code in src/pheval/utils/phenopacket_utils.py 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 class PhenopacketUtil : \"\"\"Class for retrieving data from a Phenopacket or Family object\"\"\" def __init__ ( self , phenopacket_contents : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Args: phenopacket_contents (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . phenopacket_contents = phenopacket_contents def sample_id ( self ) -> str : \"\"\" Retrieve the sample ID from a Phenopacket or proband of a Family Returns: str: Sample ID \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . 
phenopacket_contents . proband . subject . id else : return self . phenopacket_contents . subject . id def phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all HPO terms Returns: List[PhenotypicFeature]: List of HPO terms \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . phenotypic_features else : return self . phenopacket_contents . phenotypic_features def observed_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all observed HPO terms Returns: List[PhenotypicFeature]: List of observed HPO terms \"\"\" phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : continue phenotypic_features . append ( p ) return phenotypic_features def negated_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all negated HPO terms Returns: List[PhenotypicFeature]: List of negated HPO terms \"\"\" negated_phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : negated_phenotypic_features . append ( p ) return negated_phenotypic_features def diseases ( self ) -> List [ Disease ]: \"\"\" Retrieve a list of Diseases associated with the proband Returns: List[Disease]: List of diseases \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . diseases else : return self . phenopacket_contents . diseases def _diagnosis_from_interpretations ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a list of disease diagnoses associated with the proband from the interpretations object Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" diagnoses = [] interpretation = self . interpretations () for i in interpretation : ( diagnoses . append ( ProbandDisease ( disease_name = i . diagnosis . disease . 
label , disease_identifier = i . diagnosis . disease . id , ) ) if i . diagnosis . disease . label != \"\" and i . diagnosis . disease . id != \"\" else None ) return diagnoses def _diagnosis_from_disease ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a list of disease diagnoses associated with the proband from the diseases object Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" diagnoses = [] for disease in self . diseases (): diagnoses . append ( ProbandDisease ( disease_name = disease . term . label , disease_identifier = disease . term . id ) ) return diagnoses def diagnoses ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" return list ( set ( self . _diagnosis_from_interpretations () + self . _diagnosis_from_disease ())) def interpretations ( self ) -> List [ Interpretation ]: \"\"\" Retrieve a list of interpretations from a Phenopacket Returns: List[Interpretation]: List of interpretations \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . interpretations else : return self . phenopacket_contents . interpretations def causative_variants ( self ) -> List [ ProbandCausativeVariant ]: \"\"\" Retrieve a list of causative variants listed in a Phenopacket Returns: List[ProbandCausativeVariant]: List of proband causative variants \"\"\" all_variants = [] interpretation = self . interpretations () for i in interpretation : for g in i . diagnosis . genomic_interpretations : vcf_record = g . variant_interpretation . variation_descriptor . vcf_record genotype = g . variant_interpretation . variation_descriptor . allelic_state variant_data = ProbandCausativeVariant ( self . phenopacket_contents . subject . id , vcf_record . genome_assembly , GenomicVariant ( vcf_record . chrom , vcf_record . pos , vcf_record . ref , vcf_record . 
alt , ), genotype . label , vcf_record . info , ) all_variants . append ( variant_data ) return all_variants def files ( self ) -> List [ File ]: \"\"\" Retrieve a list of files associated with a phenopacket Returns: List[File]: List of files associated with a phenopacket \"\"\" return self . phenopacket_contents . files def vcf_file_data ( self , phenopacket_path : Path , vcf_dir : Path ) -> File : \"\"\" Retrieve the genome assembly and VCF file name from a phenopacket. Args: phenopacket_path (Path): The path to the phenopacket file. vcf_dir (Path): The directory path where the VCF file is stored. Returns: File: The VCF file with updated URI pointing to the specified directory. Raises: IncorrectFileFormatError: If the provided file is not in .vcf or .vcf.gz format. IncompatibleGenomeAssemblyError: If the genome assembly of the VCF file is not compatible. Note: This function searches for a VCF file within the provided list of files, validates its format, and checks if the genome assembly is compatible. If the conditions are met, it updates the URI of the VCF file to the specified directory and returns the modified file object. \"\"\" compatible_genome_assembly = [ \"GRCh37\" , \"hg19\" , \"GRCh38\" , \"hg38\" ] vcf_data = [ file for file in self . files () if file . file_attributes [ \"fileFormat\" ] == \"vcf\" ][ 0 ] if not Path ( vcf_data . uri ) . name . endswith ( \".vcf\" ) and not Path ( vcf_data . uri ) . name . endswith ( \".vcf.gz\" ): raise IncorrectFileFormatError ( Path ( vcf_data . uri ), \".vcf or .vcf.gz file\" ) if vcf_data . file_attributes [ \"genomeAssembly\" ] not in compatible_genome_assembly : raise IncompatibleGenomeAssemblyError ( vcf_data . file_attributes [ \"genomeAssembly\" ], phenopacket_path ) vcf_data . uri = str ( vcf_dir . joinpath ( Path ( vcf_data . uri ) . 
name )) return vcf_data @staticmethod def _extract_diagnosed_gene ( genomic_interpretation : GenomicInterpretation , ) -> ProbandCausativeGene : \"\"\" Retrieve the disease causing genes from the variant descriptor field if not empty, otherwise, retrieves from the gene descriptor from a phenopacket. Args: genomic_interpretation (GenomicInterpretation): A genomic interpretation from a Phenopacket Returns: ProbandCausativeGene: The disease causing gene \"\"\" if genomic_interpretation . variant_interpretation . ByteSize () != 0 : return ProbandCausativeGene ( genomic_interpretation . variant_interpretation . variation_descriptor . gene_context . symbol , genomic_interpretation . variant_interpretation . variation_descriptor . gene_context . value_id , ) else : return ProbandCausativeGene ( gene_symbol = genomic_interpretation . gene . symbol , gene_identifier = genomic_interpretation . gene . value_id , ) def diagnosed_genes ( self ) -> List [ ProbandCausativeGene ]: \"\"\" Retrieve the disease causing genes from a phenopacket. Returns: List[ProbandCausativeGene]: List of causative genes \"\"\" pheno_interpretation = self . interpretations () genes = [] for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : genes . append ( self . _extract_diagnosed_gene ( g )) genes = list ({ gene . gene_symbol : gene for gene in genes } . values ()) return genes def diagnosed_variants ( self ) -> List [ GenomicVariant ]: \"\"\" Retrieve a list of all known causative variants from a phenopacket. Returns: List[GenomicVariant]: List of causative variants \"\"\" variants = [] pheno_interpretation = self . interpretations () for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : variant = GenomicVariant ( chrom = str ( g . variant_interpretation . variation_descriptor . vcf_record . chrom . replace ( \"chr\" , \"\" ) ), pos = int ( g . variant_interpretation . variation_descriptor . vcf_record . pos ), ref = g . 
variant_interpretation . variation_descriptor . vcf_record . ref , alt = g . variant_interpretation . variation_descriptor . vcf_record . alt , ) variants . append ( variant ) return variants def check_incomplete_variant_record ( self ) -> bool : \"\"\" Check if any variant record in the phenopacket has incomplete information. This method iterates through the diagnosed variant records and checks if any of them have missing or incomplete information such as empty chromosome, position, reference, or alternate allele. Returns: bool: True if any variant record is incomplete, False otherwise. \"\"\" variants = self . diagnosed_variants () for variant in variants : if ( variant . chrom == \"\" or variant . pos == 0 or variant . pos == \"\" or variant . ref == \"\" or variant . alt == \"\" ): return True return False def check_variant_alleles ( self ) -> bool : \"\"\" Check if any variant record in the phenopacket has identical reference and alternate alleles. Returns: bool: True if the reference and alternate alleles are identical, False otherwise. \"\"\" variants = self . diagnosed_variants () for variant in variants : if variant . ref == variant . alt : return True return False def check_incomplete_gene_record ( self ) -> bool : \"\"\" Check if any gene record in the phenopacket has incomplete information. This method iterates through the diagnosed gene records and checks if any of them have missing or incomplete information such as gene name, or gene identifier. Returns: bool: True if any gene record is incomplete, False otherwise. \"\"\" genes = self . diagnosed_genes () for gene in genes : if gene . gene_symbol == \"\" or gene . gene_identifier == \"\" : return True return False def check_incomplete_disease_record ( self ) -> bool : \"\"\" Check if any disease record in the phenopacket has incomplete information. 
This method iterates through the diagnosed disease records and checks if any of them have missing or incomplete information such as empty disease name, or disease identifier. Returns: bool: True if any disease record is incomplete, False otherwise. \"\"\" if len ( self . diagnoses ()) == 0 : return True return False PhenopacketUtil proves particularly beneficial in scenarios where the tool for which you're crafting a runner implementation does not directly accept Phenopackets as inputs. Instead, it might require elements\u2014such as HPO IDs\u2014 via the command-line interface (CLI). In this context, leveraging PhenopacketUtil within the runner's preparation phase enables the extraction of observed phenotypic features from the Phenopacket input, facilitating seamless processing. An example of how this could be implemented is outlined here: from pheval.utils.phenopacket_utils import phenopacket_reader from pheval.utils.phenopacket_utils import PhenopacketUtil phenopacket = phenopacket_reader ( \"/path/to/phenopacket.json\" ) phenopacket_util = PhenopacketUtil ( phenopacket ) # To return a list of all observed phenotypes for a phenopacket observed_phenotypes = phenopacket_util . observed_phenotypic_features () # To extract just the HPO ID as a list observed_phenotypes_hpo_ids = [ observed_phenotype . id for observed_phenotype in observed_phenotypes ] Additional tool-specific configurations For the pheval run command to execute successfully, a config.yaml should be found within the input directory supplied on the CLI. tool : tool_version : variant_analysis : gene_analysis : disease_analysis : tool_specific_configuration_options : The tool_specific_configuration_options is an optional field that can be populated with any variables specific to your runner implementation that is required for the running of your tool. All other fields are required to be filled in. 
The variant_analysis , gene_analysis , and disease_analysis are set as booleans and are for specifying what type of analysis/prioritisation the tool outputs. To populate the tool_specific_configuration_options with customised data, we suggest using the pydantic package as it can easily parse the data from the yaml structure. e.g., Define a BaseModel class with the fields that will populate the tool_specific_configuration_options from pydantic import BaseModel , Field class CustomisedConfigurations ( BaseModel ): \"\"\" Class for defining the customised configurations in tool_specific_configurations field, within the input_dir config.yaml Args: environment (str): Environment to run \"\"\" environment : str = Field ( ... ) Within your runner parse the field into an object. from dataclasses import dataclass from pheval.runners.runner import PhEvalRunner from pathlib import Path @dataclass class CustomPhevalRunner ( PhEvalRunner ): \"\"\"CustomPhevalRunner Class.\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str def prepare ( self ): \"\"\"prepare method.\"\"\" print ( \"preparing\" ) config = CustomisedConfigurations . parse_obj ( self . input_dir_config . tool_specific_configuration_options ) environment = config . environment def run ( self ): \"\"\"run method.\"\"\" print ( \"running with custom pheval runner\" ) def post_process ( self ): \"\"\"post_process method.\"\"\" print ( \"post processing\" ) Post-processing methods PhEval currently supports the benchmarking of gene, variant, and disease prioritisation results. To benchmark these result types, PhEval TSV result files need to be generated. PhEval can deal with the ranking and generation of these files to the correct location. However, the runner implementation must handle the extraction of essential data from the tool-specific raw results. 
This involves transforming them into a list comprising PhEval data classes, with each instance representing a result entry. The dataclasses representing essential information extracted from tool-specific output for gene, variant, and disease prioritisation are defined as follows: Bases: PhEvalResult Minimal data required from tool-specific output for gene prioritisation result Args: gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry score (float): The score for the gene result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. Source code in src/pheval/post_processing/post_processing.py 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 @dataclass class PhEvalGeneResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for gene prioritisation result Args: gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry score (float): The score for the gene result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" gene_symbol : Union [ List [ str ], str ] gene_identifier : Union [ List [ str ], str ] score : float Bases: PhEvalResult Minimal data required from tool-specific output for variant prioritisation Args: chromosome (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. 
start (int): The start position of the variant end (int): The end position of the variant ref (str): The reference allele of the variant alt (str): The alternate allele of the variant score (float): The score for the variant result entry Notes: While we recommend providing the variant's chromosome in the specified format, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. Source code in src/pheval/post_processing/post_processing.py 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 @dataclass class PhEvalVariantResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for variant prioritisation Args: chromosome (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. start (int): The start position of the variant end (int): The end position of the variant ref (str): The reference allele of the variant alt (str): The alternate allele of the variant score (float): The score for the variant result entry Notes: While we recommend providing the variant's chromosome in the specified format, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" chromosome : str start : int end : int ref : str alt : str score : float Bases: PhEvalResult Minimal data required from tool-specific output for disease prioritisation Args: disease_name (str): Disease name for the result entry disease_identifier (str): Identifier for the disease result entry in the OMIM namespace score (str): Score for the disease result entry Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. 
Source code in src/pheval/post_processing/post_processing.py 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 @dataclass class PhEvalDiseaseResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for disease prioritisation Args: disease_name (str): Disease name for the result entry disease_identifier (str): Identifier for the disease result entry in the OMIM namespace score (str): Score for the disease result entry Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" disease_name : str disease_identifier : str score : float The generate_pheval_result() can be implemented in your runner to write out the PhEval TSV results. An example of how the method can be called is outlined here: from pheval.post_processing.post_processing import generate_pheval_result generate_pheval_result ( pheval_result = pheval_gene_result , # this is the list of extracted PhEval result requirements sort_order_str = \"descending\" , # or can be ascending - this determines in which order the scores will be ranked output_dir = output_directory , # this can be accessed from the runner instance e.g., self.output_dir tool_result_path = tool_result_json # this is the path to the tool-specific raw results file ) Adding metadata to the results.yml By default, PhEval will write a results.yml to the output directory supplied on the CLI. The results.yml contains basic metadata regarding the run configuration, however, there is also the option to add customised run metadata to the results.yml in the tool_specific_configuration_options field. To achieve this, you'll need to create a construct_meta_data() method within your runner implementation. This method is responsible for appending customised metadata to the metadata object in the form of a defined dataclass. 
It should return the entire metadata object once the addition is completed. e.g., Defined customised metadata dataclass: from dataclasses import dataclass @dataclass class CustomisedMetaData : customised_field : str Example of implementation in the runner. from dataclasses import dataclass from pheval.runners.runner import PhEvalRunner from pathlib import Path @dataclass class CustomPhevalRunner ( PhEvalRunner ): \"\"\"CustomPhevalRunner Class.\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str def prepare ( self ): \"\"\"prepare method.\"\"\" print ( \"preparing\" ) def run ( self ): \"\"\"run method.\"\"\" print ( \"running with custom pheval runner\" ) def post_process ( self ): \"\"\"post_process method.\"\"\" print ( \"post processing\" ) def construct_meta_data ( self ): \"\"\"Add metadata.\"\"\" self . meta_data . tool_specific_configuration_options = CustomisedMetaData ( customised_field = \"customised_value\" ) return self . meta_data 6. Test it. To update your custom pheval runner implementation, you must first install the package poetry install Now you have to be able to run PhEval passing your custom runner as parameter. e.g., pheval run -i ./input_dir -t ./test_data_dir -r 'customphevalrunner' -o output_dir The -r parameter stands for your plugin runner class name, and it must be entirely lowercase. Output: preparing running with custom pheval Runner post processing Pay attention to \" running with custom pheval Runner \" line, this is exactly what we had implemented in the CustomPhevalRunner Example","title":"Developing a PhEval Plugin"},{"location":"developing_a_pheval_plugin/#developing-a-pheval-plugin","text":"","title":"Developing a PhEval Plugin"},{"location":"developing_a_pheval_plugin/#description","text":"Plugin development allows PhEval to be extensible, as we have designed it. The plugin goal is to be flexible through custom runner implementations. 
This plugin development enhances the PhEval functionality. You can build one quickly using this step-by-step process. All custom Runners implementations must implement all PhevalRunner methods Bases: ABC PhEvalRunner Class Source code in src/pheval/runners/runner.py 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 @dataclass class PhEvalRunner ( ABC ): \"\"\"PhEvalRunner Class\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str directory_path = None input_dir_config = None _meta_data = None __raw_results_dir = \"raw_results/\" __pheval_gene_results_dir = \"pheval_gene_results/\" __pheval_variant_results_dir = \"pheval_variant_results/\" __pheval_disease_results_dir = \"pheval_disease_results/\" __tool_input_commands_dir = \"tool_input_commands/\" __run_meta_data_file = \"results.yml\" def __post_init__ ( self ): self . input_dir_config = parse_input_dir_config ( self . input_dir ) def _get_tool ( self ): return self . input_dir_config . tool def _get_variant_analysis ( self ): return self . input_dir_config . variant_analysis def _get_gene_analysis ( self ): return self . input_dir_config . gene_analysis def _get_disease_analysis ( self ): return self . input_dir_config . disease_analysis @property def tool_input_commands_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __tool_input_commands_dir ) @tool_input_commands_dir . setter def tool_input_commands_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def raw_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __raw_results_dir ) @raw_results_dir . 
setter def raw_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_gene_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_gene_results_dir ) @pheval_gene_results_dir . setter def pheval_gene_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_variant_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_variant_results_dir ) @pheval_variant_results_dir . setter def pheval_variant_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_disease_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_disease_results_dir ) @pheval_disease_results_dir . setter def pheval_disease_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) def build_output_directory_structure ( self ): \"\"\"build output directory structure\"\"\" self . tool_input_commands_dir . mkdir ( exist_ok = True ) self . raw_results_dir . mkdir ( exist_ok = True ) if self . _get_variant_analysis (): self . pheval_variant_results_dir . mkdir ( exist_ok = True ) if self . _get_gene_analysis (): self . pheval_gene_results_dir . mkdir ( exist_ok = True ) if self . _get_disease_analysis (): self . pheval_disease_results_dir . mkdir ( exist_ok = True ) @property def meta_data ( self ): self . _meta_data = BasicOutputRunMetaData ( tool = self . input_dir_config . tool , tool_version = self . version , config = f \" { Path ( self . input_dir ) . parent . name } / { Path ( self . input_dir ) . name } \" , run_timestamp = datetime . now () . timestamp (), corpus = f \" { Path ( self . testdata_dir ) . parent . name } / { Path ( self . testdata_dir ) . name } \" , ) return self . _meta_data @meta_data . setter def meta_data ( self , meta_data ): self . 
_meta_data = meta_data @abstractmethod def prepare ( self ) -> str : \"\"\"prepare\"\"\" @abstractmethod def run ( self ): \"\"\"run\"\"\" @abstractmethod def post_process ( self ): \"\"\"post_process\"\"\" def construct_meta_data ( self ): \"\"\"Construct run output meta data\"\"\" return self . meta_data","title":"Description"},{"location":"developing_a_pheval_plugin/#step-by-step-plugin-development-process","text":"The plugin structure is derived from a cookiecutter template, Sphintoxetry-cookiecutter , and it uses Sphinx , tox and poetry as core dependencies. This allows PhEval extensibility to be standardized in terms of documentation and dependency management.","title":"Step-by-Step Plugin Development Process"},{"location":"developing_a_pheval_plugin/#1-sphintoxetry-cookiecutter-scaffold","text":"First, install the cruft package. Cruft enables keeping projects up-to-date with future updates made to this original template. Install the latest release of cruft from pip pip install cruft NOTE: You may encounter an error with the naming of the project layout if using an older release of cruft. To avoid this, make sure you have installed the latest release version. Next, create a project using the sphintoxetry-cookiecutter template. cruft create https://github.com/monarch-initiative/monarch-project-template","title":"1. Sphintoxetry-cookiecutter scaffold"},{"location":"developing_a_pheval_plugin/#2-further-setup","text":"","title":"2. 
Further setup"},{"location":"developing_a_pheval_plugin/#install-poetry-if-you-havent-already","text":"pip install poetry","title":"Install poetry if you haven't already."},{"location":"developing_a_pheval_plugin/#install-dependencies","text":"poetry install","title":"Install dependencies"},{"location":"developing_a_pheval_plugin/#add-pheval-dependency","text":"poetry add pheval","title":"Add PhEval dependency"},{"location":"developing_a_pheval_plugin/#run-tox-to-see-if-the-setup-works","text":"poetry run tox","title":"Run tox to see if the setup works"},{"location":"developing_a_pheval_plugin/#3-implement-pheval-custom-runner","text":"The runner name is arbitrary and custom Runner name was chose by demonstrative purposes Create a runner file inside the plugin project, e.g: \"\"\"Custom Pheval Runner.\"\"\" from dataclasses import dataclass from pathlib import Path from pheval.runners.runner import PhEvalRunner @dataclass class CustomPhevalRunner ( PhEvalRunner ): \"\"\"CustomPhevalRunner Class.\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str def prepare ( self ): \"\"\"prepare method.\"\"\" print ( \"preparing\" ) def run ( self ): \"\"\"run method.\"\"\" print ( \"running with custom pheval runner\" ) def post_process ( self ): \"\"\"post_process method.\"\"\" print ( \"post processing\" )","title":"3. Implement PhEval Custom Runner"},{"location":"developing_a_pheval_plugin/#4-add-pheval-plugins-section-to-the-pyprojecttoml-file","text":"[tool.poetry.plugins. \"pheval.plugins\" ] customrunner = \"pheval_plugin_example.runner:CustomPhevalRunner\" Replace the value above with the path to your custom runner plugin","title":"4. Add PhEval Plugins section to the pyproject.toml file"},{"location":"developing_a_pheval_plugin/#5-implementing-pheval-helper-methods","text":"Streamlining the creation of your custom PhEval runner can be facilitated by leveraging PhEval's versatile helper methods, where applicable. 
Within PhEval, numerous public methods have been designed to assist in your runner methods. The utilisation of these helper methods is optional, yet they are crafted to enhance the overall implementation process.","title":"5. Implementing PhEval helper methods"},{"location":"developing_a_pheval_plugin/#utility-methods","text":"The PhenopacketUtil class is designed to aid in the collection of specific data from a Phenopacket. Class for retrieving data from a Phenopacket or Family object Source code in src/pheval/utils/phenopacket_utils.py 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 class PhenopacketUtil : \"\"\"Class for retrieving data from a Phenopacket or Family object\"\"\" def __init__ ( self , 
phenopacket_contents : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Args: phenopacket_contents (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . phenopacket_contents = phenopacket_contents def sample_id ( self ) -> str : \"\"\" Retrieve the sample ID from a Phenopacket or proband of a Family Returns: str: Sample ID \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . subject . id else : return self . phenopacket_contents . subject . id def phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all HPO terms Returns: List[PhenotypicFeature]: List of HPO terms \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . phenotypic_features else : return self . phenopacket_contents . phenotypic_features def observed_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all observed HPO terms Returns: List[PhenotypicFeature]: List of observed HPO terms \"\"\" phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : continue phenotypic_features . append ( p ) return phenotypic_features def negated_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all negated HPO terms Returns: List[PhenotypicFeature]: List of negated HPO terms \"\"\" negated_phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : negated_phenotypic_features . append ( p ) return negated_phenotypic_features def diseases ( self ) -> List [ Disease ]: \"\"\" Retrieve a list of Diseases associated with the proband Returns: List[Disease]: List of diseases \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . diseases else : return self . 
phenopacket_contents . diseases def _diagnosis_from_interpretations ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a list of disease diagnoses associated with the proband from the interpretations object Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" diagnoses = [] interpretation = self . interpretations () for i in interpretation : ( diagnoses . append ( ProbandDisease ( disease_name = i . diagnosis . disease . label , disease_identifier = i . diagnosis . disease . id , ) ) if i . diagnosis . disease . label != \"\" and i . diagnosis . disease . id != \"\" else None ) return diagnoses def _diagnosis_from_disease ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a list of disease diagnoses associated with the proband from the diseases object Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" diagnoses = [] for disease in self . diseases (): diagnoses . append ( ProbandDisease ( disease_name = disease . term . label , disease_identifier = disease . term . id ) ) return diagnoses def diagnoses ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" return list ( set ( self . _diagnosis_from_interpretations () + self . _diagnosis_from_disease ())) def interpretations ( self ) -> List [ Interpretation ]: \"\"\" Retrieve a list of interpretations from a Phenopacket Returns: List[Interpretation]: List of interpretations \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . interpretations else : return self . phenopacket_contents . interpretations def causative_variants ( self ) -> List [ ProbandCausativeVariant ]: \"\"\" Retrieve a list of causative variants listed in a Phenopacket Returns: List[ProbandCausativeVariant]: List of proband causative variants \"\"\" all_variants = [] interpretation = self . 
interpretations () for i in interpretation : for g in i . diagnosis . genomic_interpretations : vcf_record = g . variant_interpretation . variation_descriptor . vcf_record genotype = g . variant_interpretation . variation_descriptor . allelic_state variant_data = ProbandCausativeVariant ( self . phenopacket_contents . subject . id , vcf_record . genome_assembly , GenomicVariant ( vcf_record . chrom , vcf_record . pos , vcf_record . ref , vcf_record . alt , ), genotype . label , vcf_record . info , ) all_variants . append ( variant_data ) return all_variants def files ( self ) -> List [ File ]: \"\"\" Retrieve a list of files associated with a phenopacket Returns: List[File]: List of files associated with a phenopacket \"\"\" return self . phenopacket_contents . files def vcf_file_data ( self , phenopacket_path : Path , vcf_dir : Path ) -> File : \"\"\" Retrieve the genome assembly and VCF file name from a phenopacket. Args: phenopacket_path (Path): The path to the phenopacket file. vcf_dir (Path): The directory path where the VCF file is stored. Returns: File: The VCF file with updated URI pointing to the specified directory. Raises: IncorrectFileFormatError: If the provided file is not in .vcf or .vcf.gz format. IncompatibleGenomeAssemblyError: If the genome assembly of the VCF file is not compatible. Note: This function searches for a VCF file within the provided list of files, validates its format, and checks if the genome assembly is compatible. If the conditions are met, it updates the URI of the VCF file to the specified directory and returns the modified file object. \"\"\" compatible_genome_assembly = [ \"GRCh37\" , \"hg19\" , \"GRCh38\" , \"hg38\" ] vcf_data = [ file for file in self . files () if file . file_attributes [ \"fileFormat\" ] == \"vcf\" ][ 0 ] if not Path ( vcf_data . uri ) . name . endswith ( \".vcf\" ) and not Path ( vcf_data . uri ) . name . endswith ( \".vcf.gz\" ): raise IncorrectFileFormatError ( Path ( vcf_data . 
uri ), \".vcf or .vcf.gz file\" ) if vcf_data . file_attributes [ \"genomeAssembly\" ] not in compatible_genome_assembly : raise IncompatibleGenomeAssemblyError ( vcf_data . file_attributes [ \"genomeAssembly\" ], phenopacket_path ) vcf_data . uri = str ( vcf_dir . joinpath ( Path ( vcf_data . uri ) . name )) return vcf_data @staticmethod def _extract_diagnosed_gene ( genomic_interpretation : GenomicInterpretation , ) -> ProbandCausativeGene : \"\"\" Retrieve the disease causing genes from the variant descriptor field if not empty, otherwise, retrieves from the gene descriptor from a phenopacket. Args: genomic_interpretation (GenomicInterpretation): A genomic interpretation from a Phenopacket Returns: ProbandCausativeGene: The disease causing gene \"\"\" if genomic_interpretation . variant_interpretation . ByteSize () != 0 : return ProbandCausativeGene ( genomic_interpretation . variant_interpretation . variation_descriptor . gene_context . symbol , genomic_interpretation . variant_interpretation . variation_descriptor . gene_context . value_id , ) else : return ProbandCausativeGene ( gene_symbol = genomic_interpretation . gene . symbol , gene_identifier = genomic_interpretation . gene . value_id , ) def diagnosed_genes ( self ) -> List [ ProbandCausativeGene ]: \"\"\" Retrieve the disease causing genes from a phenopacket. Returns: List[ProbandCausativeGene]: List of causative genes \"\"\" pheno_interpretation = self . interpretations () genes = [] for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : genes . append ( self . _extract_diagnosed_gene ( g )) genes = list ({ gene . gene_symbol : gene for gene in genes } . values ()) return genes def diagnosed_variants ( self ) -> List [ GenomicVariant ]: \"\"\" Retrieve a list of all known causative variants from a phenopacket. Returns: List[GenomicVariant]: List of causative variants \"\"\" variants = [] pheno_interpretation = self . 
interpretations () for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : variant = GenomicVariant ( chrom = str ( g . variant_interpretation . variation_descriptor . vcf_record . chrom . replace ( \"chr\" , \"\" ) ), pos = int ( g . variant_interpretation . variation_descriptor . vcf_record . pos ), ref = g . variant_interpretation . variation_descriptor . vcf_record . ref , alt = g . variant_interpretation . variation_descriptor . vcf_record . alt , ) variants . append ( variant ) return variants def check_incomplete_variant_record ( self ) -> bool : \"\"\" Check if any variant record in the phenopacket has incomplete information. This method iterates through the diagnosed variant records and checks if any of them have missing or incomplete information such as empty chromosome, position, reference, or alternate allele. Returns: bool: True if any variant record is incomplete, False otherwise. \"\"\" variants = self . diagnosed_variants () for variant in variants : if ( variant . chrom == \"\" or variant . pos == 0 or variant . pos == \"\" or variant . ref == \"\" or variant . alt == \"\" ): return True return False def check_variant_alleles ( self ) -> bool : \"\"\" Check if any variant record in the phenopacket has identical reference and alternate alleles. Returns: bool: True if the reference and alternate alleles are identical, False otherwise. \"\"\" variants = self . diagnosed_variants () for variant in variants : if variant . ref == variant . alt : return True return False def check_incomplete_gene_record ( self ) -> bool : \"\"\" Check if any gene record in the phenopacket has incomplete information. This method iterates through the diagnosed gene records and checks if any of them have missing or incomplete information such as gene name, or gene identifier. Returns: bool: True if any gene record is incomplete, False otherwise. \"\"\" genes = self . diagnosed_genes () for gene in genes : if gene . gene_symbol == \"\" or gene . 
gene_identifier == \"\" : return True return False def check_incomplete_disease_record ( self ) -> bool : \"\"\" Check if any disease record in the phenopacket has incomplete information. This method iterates through the diagnosed disease records and checks if any of them have missing or incomplete information such as empty disease name, or disease identifier. Returns: bool: True if any disease record is incomplete, False otherwise. \"\"\" if len ( self . diagnoses ()) == 0 : return True return False PhenopacketUtil proves particularly beneficial in scenarios where the tool for which you're crafting a runner implementation does not directly accept Phenopackets as inputs. Instead, it might require elements\u2014such as HPO IDs\u2014 via the command-line interface (CLI). In this context, leveraging PhenopacketUtil within the runner's preparation phase enables the extraction of observed phenotypic features from the Phenopacket input, facilitating seamless processing. An example of how this could be implemented is outlined here: from pheval.utils.phenopacket_utils import phenopacket_reader from pheval.utils.phenopacket_utils import PhenopacketUtil phenopacket = phenopacket_reader ( \"/path/to/phenopacket.json\" ) phenopacket_util = PhenopacketUtil ( phenopacket ) # To return a list of all observed phenotypes for a phenopacket observed_phenotypes = phenopacket_util . observed_phenotypic_features () # To extract just the HPO ID as a list observed_phenotypes_hpo_ids = [ observed_phenotype . id for observed_phenotype in observed_phenotypes ]","title":"Utility methods"},{"location":"developing_a_pheval_plugin/#additional-tool-specific-configurations","text":"For the pheval run command to execute successfully, a config.yaml should be found within the input directory supplied on the CLI. 
tool : tool_version : variant_analysis : gene_analysis : disease_analysis : tool_specific_configuration_options : The tool_specific_configuration_options is an optional field that can be populated with any variables specific to your runner implementation that is required for the running of your tool. All other fields are required to be filled in. The variant_analysis , gene_analysis , and disease_analysis are set as booleans and are for specifying what type of analysis/prioritisation the tool outputs. To populate the tool_specific_configurations_options with customised data, we suggest using the pydantic package as it can easily parse the data from the yaml structure. e.g., Define a BaseModel class with the fields that will populate the tool_specific_configuration_options from pydantic import BaseModel , Field class CustomisedConfigurations ( BaseModel ): \"\"\" Class for defining the customised configurations in tool_specific_configurations field, within the input_dir config.yaml Args: environment (str): Environment to run \"\"\" environment : str = Field ( ... ) Within your runner parse the field into an object. from dataclasses import dataclass from pheval.runners.runner import PhEvalRunner from pathlib import Path @dataclass class CustomPhevalRunner ( PhEvalRunner ): \"\"\"CustomPhevalRunner Class.\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str def prepare ( self ): \"\"\"prepare method.\"\"\" print ( \"preparing\" ) config = CustomisedConfigurations . parse_obj ( self . input_dir_config . tool_specific_configuration_options ) environment = config . 
environment def run ( self ): \"\"\"run method.\"\"\" print ( \"running with custom pheval runner\" ) def post_process ( self ): \"\"\"post_process method.\"\"\" print ( \"post processing\" )","title":"Additional tool-specific configurations"},{"location":"developing_a_pheval_plugin/#post-processing-methods","text":"PhEval currently supports the benchmarking of gene, variant, and disease prioritisation results. To benchmark these result types, PhEval TSV result files need to be generated. PhEval can deal with the ranking and generation of these files to the correct location. However, the runner implementation must handle the extraction of essential data from the tool-specific raw results. This involves transforming them into a list comprising PhEval data classes, with each instance representing a result entry. The dataclasses representing essential information extracted from tool-specific output for gene, variant, and disease prioritisation are defined as follows: Bases: PhEvalResult Minimal data required from tool-specific output for gene prioritisation result Args: gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry score (float): The score for the gene result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. 
Source code in src/pheval/post_processing/post_processing.py 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 @dataclass class PhEvalGeneResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for gene prioritisation result Args: gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry score (float): The score for the gene result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" gene_symbol : Union [ List [ str ], str ] gene_identifier : Union [ List [ str ], str ] score : float Bases: PhEvalResult Minimal data required from tool-specific output for variant prioritisation Args: chromosome (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. start (int): The start position of the variant end (int): The end position of the variant ref (str): The reference allele of the variant alt (str): The alternate allele of the variant score (float): The score for the variant result entry Notes: While we recommend providing the variant's chromosome in the specified format, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. Source code in src/pheval/post_processing/post_processing.py 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 @dataclass class PhEvalVariantResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for variant prioritisation Args: chromosome (str): The chromosome position of the variant recommended to be provided in the following format. 
This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. start (int): The start position of the variant end (int): The end position of the variant ref (str): The reference allele of the variant alt (str): The alternate allele of the variant score (float): The score for the variant result entry Notes: While we recommend providing the variant's chromosome in the specified format, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" chromosome : str start : int end : int ref : str alt : str score : float Bases: PhEvalResult Minimal data required from tool-specific output for disease prioritisation Args: disease_name (str): Disease name for the result entry disease_identifier (str): Identifier for the disease result entry in the OMIM namespace score (str): Score for the disease result entry Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. Source code in src/pheval/post_processing/post_processing.py 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 @dataclass class PhEvalDiseaseResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for disease prioritisation Args: disease_name (str): Disease name for the result entry disease_identifier (str): Identifier for the disease result entry in the OMIM namespace score (str): Score for the disease result entry Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" disease_name : str disease_identifier : str score : float The generate_pheval_result() can be implemented in your runner to write out the PhEval TSV results. 
An example of how the method can be called is outlined here: from pheval.post_processing.post_processing import generate_pheval_result generate_pheval_result ( pheval_result = pheval_gene_result , # this is the list of extracted PhEval result requirements sort_order_str = \"descending\" , # or can be ascending - this determines in which order the scores will be ranked output_dir = output_directory , # this can be accessed from the runner instance e.g., self.output_dir tool_result_path = tool_result_json # this is the path to the tool-specific raw results file )","title":"Post-processing methods"},{"location":"developing_a_pheval_plugin/#adding-metadata-to-the-resultsyml","text":"By default, PhEval will write a results.yml to the output directory supplied on the CLI. The results.yml contains basic metadata regarding the run configuration, however, there is also the option to add customised run metadata to the results.yml in the tool_specific_configuration_options field. To achieve this, you'll need to create a construct_meta_data() method within your runner implementation. This method is responsible for appending customised metadata to the metadata object in the form of a defined dataclass. It should return the entire metadata object once the addition is completed. e.g., Defined customised metadata dataclass: from dataclasses import dataclass @dataclass class CustomisedMetaData : customised_field : str Example of implementation in the runner. 
from dataclasses import dataclass from pheval.runners.runner import PhEvalRunner from pathlib import Path @dataclass class CustomPhevalRunner ( PhEvalRunner ): \"\"\"CustomPhevalRunner Class.\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str def prepare ( self ): \"\"\"prepare method.\"\"\" print ( \"preparing\" ) def run ( self ): \"\"\"run method.\"\"\" print ( \"running with custom pheval runner\" ) def post_process ( self ): \"\"\"post_process method.\"\"\" print ( \"post processing\" ) def construct_meta_data ( self ): \"\"\"Add metadata.\"\"\" self . meta_data . tool_specific_configuration_options = CustomisedMetaData ( customised_field = \"customised_value\" ) return self . meta_data","title":"Adding metadata to the results.yml"},{"location":"developing_a_pheval_plugin/#6-test-it","text":"To update your custom pheval runner implementation, you must first install the package poetry install Now you should be able to run PhEval passing your custom runner as a parameter. e.g., pheval run -i ./input_dir -t ./test_data_dir -r 'customphevalrunner' -o output_dir The -r parameter stands for your plugin runner class name, and it must be entirely lowercase. Output: preparing running with custom pheval Runner post processing Pay attention to \" running with custom pheval Runner \" line, this is exactly what we had implemented in the CustomPhevalRunner Example","title":"6. Test it."},{"location":"executing_a_benchmark/","text":"Executing a Benchmark PhEval is designed for benchmarking algorithms across various datasets. To execute a benchmark using PhEval, you need to: Execute your runner, generating the PhEval standardised TSV outputs for gene/variant/disease prioritisation. Configure the benchmarking parameters. Run the benchmark. PhEval will generate various performance reports, allowing you to easily compare the effectiveness of different algorithms. 
After the Runner Execution After executing a run, you may be left with an output directory structure like so: . \u251c\u2500\u2500 pheval_disease_results \u2502 \u251c\u2500\u2500 patient_1-pheval_disease_result.tsv \u251c\u2500\u2500 pheval_gene_results \u2502 \u251c\u2500\u2500 patient_1-pheval_gene_result.tsv \u251c\u2500\u2500 pheval_variant_results \u2502 \u251c\u2500\u2500 patient_1-pheval_variant_result.tsv \u251c\u2500\u2500 raw_results \u2502 \u251c\u2500\u2500 patient_1.json \u251c\u2500\u2500 results.yml \u2514\u2500\u2500 tool_input_commands \u2514\u2500\u2500 tool_input_commands.txt Whether you have populated pheval_disease_results , pheval_gene_results , and pheval_variant_results directories will depend on what is specified in the config.yaml for the runner execution. It is the results in these directories that are consumed in the benchmarking to produce the statistical comparison reports. Benchmarking Configuration File To configure the benchmarking parameters, a YAML configuration file should be created and supplied to the CLI command. 
An outline of the configuration file structure follows below: benchmark_name : exomiser_14_benchmark runs : - run_identifier : run_identifier_1 results_dir : /path/to/results_dir_1 phenopacket_dir : /path/to/phenopacket_dir gene_analysis : True variant_analysis : False disease_analysis : True threshold : score_order : descending - run_identifier : run_identifier_2 results_dir : /path/to/results_dir_2 phenopacket_dir : /path/to/phenopacket_dir gene_analysis : True variant_analysis : True disease_analysis : True threshold : score_order : descending plot_customisation : gene_plots : plot_type : bar_cumulative rank_plot_title : roc_curve_title : precision_recall_title : disease_plots : plot_type : bar_cumulative rank_plot_title : roc_curve_title : precision_recall_title : variant_plots : plot_type : bar_cumulative rank_plot_title : roc_curve_title : precision_recall_title : The benchmark_name is what will be used to name the duckdb database that will contain all the ranking and binary statistics as well as comparisons between runs. The name provided should not have any whitespace or special characters. Runs section The runs section specifies which run configurations should be included in the benchmarking. For each run configuration you will need to populate the following parameters: run_identifier : The identifier associated with the run - this should be meaningful as it will be used in the naming in tables and plots. results_dir : The full path to the root directory where the directories pheval_gene_results / pheval_variant_results / pheval_disease_results can be found. phenopacket_dir : The full path to the phenopacket directory used during the runner execution. gene_analysis : Boolean specifying whether to perform benchmarking for gene prioritisation analysis. 
variant_analysis : Boolean specifying whether to perform benchmarking for variant prioritisation analysis. disease_analysis : Boolean specifying whether to perform benchmarking for disease prioritisation analysis. threshold : OPTIONAL score threshold to consider for inclusion of results. score_order : Ordering of results for ranking. Either ascending or descending. Plot customisation section The plot_customisation section specifies any additional customisation to the plots output from the benchmarking. Here you can specify title names for all the plots output, as well as the plot type for displaying the summary ranking stats. This section is split by the plots output from the gene, variant and disease prioritisation benchmarking. The parameters in this section do not need to be populated - however, if left blank they will default to generic titles. The parameters are as follows: plot_type : The plot type output for the summary rank stats plot. This can be one of bar_cumulative, bar_non_cumulative or bar_stacked. rank_plot_title : The customised title for the summary rank stats plot. roc_curve_title : The customised title for the ROC curve plot. precision_recall_title : The customised title for the precision-recall curve plot. Executing the benchmark After configuring the benchmarking YAML, executing the benchmark is relatively simple. pheval-utils generate-benchmark-stats --run-yaml benchmarking_config.yaml","title":"Executing a Benchmark"},{"location":"executing_a_benchmark/#executing-a-benchmark","text":"PhEval is designed for benchmarking algorithms across various datasets. To execute a benchmark using PhEval, you need to: Execute your runner, generating the PhEval standardised TSV outputs for gene/variant/disease prioritisation. Configure the benchmarking parameters. Run the benchmark. 
PhEval will generate various performance reports, allowing you to easily compare the effectiveness of different algorithms.","title":"Executing a Benchmark"},{"location":"executing_a_benchmark/#after-the-runner-execution","text":"After executing a run, you may be left with an output directory structure like so: . \u251c\u2500\u2500 pheval_disease_results \u2502 \u251c\u2500\u2500 patient_1-pheval_disease_result.tsv \u251c\u2500\u2500 pheval_gene_results \u2502 \u251c\u2500\u2500 patient_1-pheval_gene_result.tsv \u251c\u2500\u2500 pheval_variant_results \u2502 \u251c\u2500\u2500 patient_1-pheval_variant_result.tsv \u251c\u2500\u2500 raw_results \u2502 \u251c\u2500\u2500 patient_1.json \u251c\u2500\u2500 results.yml \u2514\u2500\u2500 tool_input_commands \u2514\u2500\u2500 tool_input_commands.txt Whether you have populated pheval_disease_results , pheval_gene_results , and pheval_variant_results directories will depend on what is specified in the config.yaml for the runner execution. It is the results in these directories that are consumed in the benchmarking to produce the statistical comparison reports.","title":"After the Runner Execution"},{"location":"executing_a_benchmark/#benchmarking-configuration-file","text":"To configure the benchmarking parameters, a YAML configuration file should be created and supplied to the CLI command. 
An outline of the configuration file structure follows below: benchmark_name : exomiser_14_benchmark runs : - run_identifier : run_identifier_1 results_dir : /path/to/results_dir_1 phenopacket_dir : /path/to/phenopacket_dir gene_analysis : True variant_analysis : False disease_analysis : True threshold : score_order : descending - run_identifier : run_identifier_2 results_dir : /path/to/results_dir_2 phenopacket_dir : /path/to/phenopacket_dir gene_analysis : True variant_analysis : True disease_analysis : True threshold : score_order : descending plot_customisation : gene_plots : plot_type : bar_cumulative rank_plot_title : roc_curve_title : precision_recall_title : disease_plots : plot_type : bar_cumulative rank_plot_title : roc_curve_title : precision_recall_title : variant_plots : plot_type : bar_cumulative rank_plot_title : roc_curve_title : precision_recall_title : The benchmark_name is what will be used to name the duckdb database that will contain all the ranking and binary statistics as well as comparisons between runs. The name provided should not have any whitespace or special characters.","title":"Benchmarking Configuration File"},{"location":"executing_a_benchmark/#runs-section","text":"The runs section specifies which run configurations should be included in the benchmarking. For each run configuration you will need to populate the following parameters: run_identifier : The identifier associated with the run - this should be meaningful as it will be used in the naming in tables and plots. results_dir : The full path to the root directory where the directories pheval_gene_results / pheval_variant_results / pheval_disease_results can be found. phenopacket_dir : The full path to the phenopacket directory used during the runner execution. gene_analysis : Boolean specifying whether to perform benchmarking for gene prioritisation analysis. 
variant_analysis : Boolean specifying whether to perform benchmarking for variant prioritisation analysis. disease_analysis : Boolean specifying whether to perform benchmarking for disease prioritisation analysis. threshold : OPTIONAL score threshold to consider for inclusion of results. score_order : Ordering of results for ranking. Either ascending or descending.","title":"Runs section"},{"location":"executing_a_benchmark/#plot-customisation-section","text":"The plot_customisation section specifies any additional customisation to the plots output from the benchmarking. Here you can specify title names for all the plots output, as well as the plot type for displaying the summary ranking stats. This section is split by the plots output from the gene, variant and disease prioritisation benchmarking. The parameters in this section do not need to be populated - however, if left blank they will default to generic titles. The parameters are as follows: plot_type : The plot type output for the summary rank stats plot. This can be one of bar_cumulative, bar_non_cumulative or bar_stacked. rank_plot_title : The customised title for the summary rank stats plot. roc_curve_title : The customised title for the ROC curve plot. precision_recall_title : The customised title for the precision-recall curve plot.","title":"Plot customisation section"},{"location":"executing_a_benchmark/#executing-the-benchmark","text":"After configuring the benchmarking YAML, executing the benchmark is relatively simple. pheval-utils generate-benchmark-stats --run-yaml benchmarking_config.yaml","title":"Executing the benchmark"},{"location":"exomiser_pipeline/","text":"PhEval Pipeline Exomiser Runner Step by Step to PhEval Run Pipeline (with ExomiserRunner) 1. Download Exomiser Software wget https://github.com/exomiser/Exomiser/releases/download/13.2.0/exomiser-cli-13.2.0-distribution.zip 2. 
Download Phenotype Data wget https://data.monarchinitiative.org/exomiser/latest/2302_hg19.zip wget https://data.monarchinitiative.org/exomiser/latest/2302_hg38.zip wget https://data.monarchinitiative.org/exomiser/latest/2302_phenotype.zip 3. Unzip data # unzip the distribution and data files - this will create a directory called 'exomiser-cli-13.1.0' in the current working directory unzip exomiser-cli-13.2.0-distribution.zip unzip 2302_hg19.zip -d exomiser-cli-13.2.0/data unzip 2302_hg38.zip -d exomiser-cli-13.2.0/data 4. Clone PhEval repo and follow steps described in Pipeline Documentation: git clone https://github.com/monarch-initiative/pheval.git cd pheval poetry shell poetry install pip install pheval.exomiser 5. Set PhEval Config YAML File directories : tmp : data/tmp exomiser : /path_where_exomiser_was_extracted phenotype : /path_where_phenotype_was_extracted workspace : /pheval's_path # path where pheval was cloned corpora : - id : small_test scrambled : - factor : 0.5 - factor : 0.7 custom_variants : - id : no_phenotype configs : - tool : exomiser version : 13.2.0 configuration : default exomiser_db : semsim1 runs : - tool : exomiser configuration : default corpus : small_test corpusvariant : scrambled-0.5 version : 13.2.0 6. Generate Makefile based on configuration bash ./resources/generatemakefile.sh 7. Exomiser Runner requires the following configuration The config.yaml file should be formatted like the example below and must be placed in exomiser: /path_where_exomiser_was_extracted declared in pheval-config.yaml file. tool : exomiser tool_version : 13.2.0 variant_analysis : True gene_analysis : True disease_analysis : True tool_specific_configuration_options : environment : local exomiser_software_directory : . 
analysis_configuration_file : preset-exome-analysis.yml max_jobs : 0 application_properties : remm_version : cadd_version : hg19_data_version : 2302 hg19_local_frequency_path : hg38_data_version : 2302 phenotype_data_version : 2302 cache_type : cache_caffeine_spec : post_process : score_name : combinedScore sort_order : DESCENDING 8. Preset Exome Analysis File Exomiser requires a preset-exome-analysis.yml file saved at /path_where_exomiser_was_extracted/preset-exome-analysis.yml This is an example of preset-exome-analysis.yml file ## Exomiser Analysis Template. # These are all the possible options for running exomiser. Use this as a template for # your own set-up. --- analysisMode : PASS_ONLY inheritanceModes : { AUTOSOMAL_DOMINANT : 0.1 , AUTOSOMAL_RECESSIVE_HOM_ALT : 0.1 , AUTOSOMAL_RECESSIVE_COMP_HET : 2.0 , X_DOMINANT : 0.1 , X_RECESSIVE_HOM_ALT : 0.1 , X_RECESSIVE_COMP_HET : 2.0 , MITOCHONDRIAL : 0.2 } frequencySources : [ THOUSAND_GENOMES , TOPMED , UK10K , ESP_AFRICAN_AMERICAN , ESP_EUROPEAN_AMERICAN , ESP_ALL , EXAC_AFRICAN_INC_AFRICAN_AMERICAN , EXAC_AMERICAN , EXAC_SOUTH_ASIAN , EXAC_EAST_ASIAN , EXAC_FINNISH , EXAC_NON_FINNISH_EUROPEAN , EXAC_OTHER , GNOMAD_E_AFR , GNOMAD_E_AMR , # GNOMAD_E_ASJ, GNOMAD_E_EAS , GNOMAD_E_FIN , GNOMAD_E_NFE , GNOMAD_E_OTH , GNOMAD_E_SAS , GNOMAD_G_AFR , GNOMAD_G_AMR , # GNOMAD_G_ASJ, GNOMAD_G_EAS , GNOMAD_G_FIN , GNOMAD_G_NFE , GNOMAD_G_OTH , GNOMAD_G_SAS ] # Possible pathogenicitySources: (POLYPHEN, MUTATION_TASTER, SIFT), (REVEL, MVP), CADD, REMM # REMM is trained on non-coding regulatory regions # *WARNING* if you enable CADD or REMM ensure that you have downloaded and installed the CADD/REMM tabix files # and updated their location in the application.properties. Exomiser will not run without this. pathogenicitySources : [ REVEL , MVP ] #this is the standard exomiser order. 
steps : [ failedVariantFilter : { }, variantEffectFilter : { remove : [ FIVE_PRIME_UTR_EXON_VARIANT , FIVE_PRIME_UTR_INTRON_VARIANT , THREE_PRIME_UTR_EXON_VARIANT , THREE_PRIME_UTR_INTRON_VARIANT , NON_CODING_TRANSCRIPT_EXON_VARIANT , NON_CODING_TRANSCRIPT_INTRON_VARIANT , CODING_TRANSCRIPT_INTRON_VARIANT , UPSTREAM_GENE_VARIANT , DOWNSTREAM_GENE_VARIANT , INTERGENIC_VARIANT , REGULATORY_REGION_VARIANT ] }, frequencyFilter : { maxFrequency : 2.0 }, pathogenicityFilter : { keepNonPathogenic : true }, inheritanceFilter : { }, omimPrioritiser : { }, hiPhivePrioritiser : { } ] 9. PhEval Run make pheval run","title":"PhEval Pipeline Exomiser Runner"},{"location":"exomiser_pipeline/#pheval-pipeline-exomiser-runner","text":"","title":"PhEval Pipeline Exomiser Runner"},{"location":"exomiser_pipeline/#step-by-step-to-pheval-run-pipeline-with-exomiserrunner","text":"","title":"Step by Step to PhEval Run Pipeline (with ExomiserRunner)"},{"location":"exomiser_pipeline/#1-download-exomiser-software","text":"wget https://github.com/exomiser/Exomiser/releases/download/13.2.0/exomiser-cli-13.2.0-distribution.zip","title":"1. Download Exomiser Software"},{"location":"exomiser_pipeline/#2-download-phenotype-data","text":"wget https://data.monarchinitiative.org/exomiser/latest/2302_hg19.zip wget https://data.monarchinitiative.org/exomiser/latest/2302_hg38.zip wget https://data.monarchinitiative.org/exomiser/latest/2302_phenotype.zip","title":"2. Download Phenotype Data"},{"location":"exomiser_pipeline/#3-unzip-data","text":"# unzip the distribution and data files - this will create a directory called 'exomiser-cli-13.1.0' in the current working directory unzip exomiser-cli-13.2.0-distribution.zip unzip 2302_hg19.zip -d exomiser-cli-13.2.0/data unzip 2302_hg38.zip -d exomiser-cli-13.2.0/data","title":"3. 
Unzip data"},{"location":"exomiser_pipeline/#4-clone-pheval-repo-and-follow-steps-described-in-pipeline-documentation","text":"git clone https://github.com/monarch-initiative/pheval.git cd pheval poetry shell poetry install pip install pheval.exomiser","title":"4. Clone PhEval repo and follow steps described in Pipeline Documentation:"},{"location":"exomiser_pipeline/#5-set-pheval-config-yaml-file","text":"directories : tmp : data/tmp exomiser : /path_where_exomiser_was_extracted phenotype : /path_where_phenotype_was_extracted workspace : /pheval's_path # path where pheval was cloned corpora : - id : small_test scrambled : - factor : 0.5 - factor : 0.7 custom_variants : - id : no_phenotype configs : - tool : exomiser version : 13.2.0 configuration : default exomiser_db : semsim1 runs : - tool : exomiser configuration : default corpus : small_test corpusvariant : scrambled-0.5 version : 13.2.0","title":"5. Set PhEval Config YAML File"},{"location":"exomiser_pipeline/#6-generate-makefile-based-on-configuration","text":"bash ./resources/generatemakefile.sh","title":"6. Generate Makefile based on configuration"},{"location":"exomiser_pipeline/#7-exomiser-runner-requires-the-following-configuration","text":"The config.yaml file should be formatted like the example below and must be placed in exomiser: /path_where_exomiser_was_extracted declared in pheval-config.yaml file. tool : exomiser tool_version : 13.2.0 variant_analysis : True gene_analysis : True disease_analysis : True tool_specific_configuration_options : environment : local exomiser_software_directory : . analysis_configuration_file : preset-exome-analysis.yml max_jobs : 0 application_properties : remm_version : cadd_version : hg19_data_version : 2302 hg19_local_frequency_path : hg38_data_version : 2302 phenotype_data_version : 2302 cache_type : cache_caffeine_spec : post_process : score_name : combinedScore sort_order : DESCENDING","title":"7. 
Exomiser Runner requires the following configuration"},{"location":"exomiser_pipeline/#8-preset-exome-analysis-file","text":"Exomiser requires a preset-exome-analysis.yml file saved at /path_where_exomiser_was_extracted/preset-exome-analysis.yml This is an example of preset-exome-analysis.yml file ## Exomiser Analysis Template. # These are all the possible options for running exomiser. Use this as a template for # your own set-up. --- analysisMode : PASS_ONLY inheritanceModes : { AUTOSOMAL_DOMINANT : 0.1 , AUTOSOMAL_RECESSIVE_HOM_ALT : 0.1 , AUTOSOMAL_RECESSIVE_COMP_HET : 2.0 , X_DOMINANT : 0.1 , X_RECESSIVE_HOM_ALT : 0.1 , X_RECESSIVE_COMP_HET : 2.0 , MITOCHONDRIAL : 0.2 } frequencySources : [ THOUSAND_GENOMES , TOPMED , UK10K , ESP_AFRICAN_AMERICAN , ESP_EUROPEAN_AMERICAN , ESP_ALL , EXAC_AFRICAN_INC_AFRICAN_AMERICAN , EXAC_AMERICAN , EXAC_SOUTH_ASIAN , EXAC_EAST_ASIAN , EXAC_FINNISH , EXAC_NON_FINNISH_EUROPEAN , EXAC_OTHER , GNOMAD_E_AFR , GNOMAD_E_AMR , # GNOMAD_E_ASJ, GNOMAD_E_EAS , GNOMAD_E_FIN , GNOMAD_E_NFE , GNOMAD_E_OTH , GNOMAD_E_SAS , GNOMAD_G_AFR , GNOMAD_G_AMR , # GNOMAD_G_ASJ, GNOMAD_G_EAS , GNOMAD_G_FIN , GNOMAD_G_NFE , GNOMAD_G_OTH , GNOMAD_G_SAS ] # Possible pathogenicitySources: (POLYPHEN, MUTATION_TASTER, SIFT), (REVEL, MVP), CADD, REMM # REMM is trained on non-coding regulatory regions # *WARNING* if you enable CADD or REMM ensure that you have downloaded and installed the CADD/REMM tabix files # and updated their location in the application.properties. Exomiser will not run without this. pathogenicitySources : [ REVEL , MVP ] #this is the standard exomiser order. 
steps : [ failedVariantFilter : { }, variantEffectFilter : { remove : [ FIVE_PRIME_UTR_EXON_VARIANT , FIVE_PRIME_UTR_INTRON_VARIANT , THREE_PRIME_UTR_EXON_VARIANT , THREE_PRIME_UTR_INTRON_VARIANT , NON_CODING_TRANSCRIPT_EXON_VARIANT , NON_CODING_TRANSCRIPT_INTRON_VARIANT , CODING_TRANSCRIPT_INTRON_VARIANT , UPSTREAM_GENE_VARIANT , DOWNSTREAM_GENE_VARIANT , INTERGENIC_VARIANT , REGULATORY_REGION_VARIANT ] }, frequencyFilter : { maxFrequency : 2.0 }, pathogenicityFilter : { keepNonPathogenic : true }, inheritanceFilter : { }, omimPrioritiser : { }, hiPhivePrioritiser : { } ]","title":"8. Preset Exome Analysis File"},{"location":"exomiser_pipeline/#9-pheval-run","text":"make pheval run","title":"9. PhEval Run"},{"location":"pipeline/","text":"PhEval Pipeline TLDR The Pipeline presented in the PhEval preprint was moved to a new repository - Monarch PhEval . NOTE: The default Monarch PhEval pipeline, as proposed in the paper preprint, requires approximately 1 TB of disk space. Learn how to modify the pipeline configuration here to customize the experiments. 1. Clone Monarch PhEval git clone https://github.com/monarch-initiative/monarch_pheval.git 2. Installing PhEval Pipeline dependencies Change into the cloned folder and run the following commands: poetry shell poetry install 3. Executing Pipeline make pheval Pipeline Description The Pipeline is divided into three main steps 1. Data Preparation Phase The data preparation phase checks the completeness of the disease, gene and variant input data and optionally prepares simulated VCF files if required. It also gives the user the ability to randomise phenotypic profiles using the PhEval corpus scramble command utility, allowing for the assessment of how well VGPAs handle noise and less specific phenotypic profiles when making predictions. 2. Runner Phase The runner phase is structured into three stages: prepare, run, and post-process. 
The prepare step plays a crucial role in adapting the input data to meet the specific requirements of the tool. In the run step, the VGPA is executed, applying the selected algorithm to the prepared data and generating the tool-specific outputs. Within the run stage, an essential task is the generation of input command files for the algorithm. These files serve as collections of individual commands, each tailored to run the targeted VGPA on specific samples. These commands are configured with the appropriate inputs, outputs and specific configuration settings, allowing for the automated and efficient processing of large corpora. Finally, the post-processing step takes care of harmonising the tool-specific outputs into standardised PhEval TSV format, ensuring uniformity and ease of analysis of results from all VGPAs. In this context, the tool-specific output is condensed to provide only two essential elements, the entity of interest, which can either be a variant, gene, or disease, and its corresponding score. PhEval then assumes the responsibility of subsequent standardisation processes. This involves the reranking of the results in a uniform manner, ensuring that fair and comprehensive comparisons can be made between tools. 3. Analysis Phase In the analysis phase, PhEval generates comprehensive statistical reports based on standardised outputs from the runner phase. Customising PhEval Pipeline Experiments The PhEval pipeline is orchestrated using a Makefile Jinja template strategy. Therefore, to describe a new experiment in the pipeline, the user needs to generate a Makefile workflow based on a configuration file. In the resources folder are the following files responsible for Makefile generation: \ud83d\udce6resources \u2523 \ud83d\udcdcMakefile.j2 \u2523 \ud83d\udcdccustom.Makefile \u2523 \ud83d\udcdcgeneratemakefile.sh \u2517 \ud83d\udcdcpheval-config.yaml Let's begin by describing the pheval-config.yaml file and its structure. 
PhEval Configuration File This file defines the experiment settings and will be used to generate the Makefile using a Jinja template which consumes this YAML configuration file. Directories Section The data and tmp properties are mandatory and must be specified in this section. The data property refers to the folder location where the necessary phenotypic data for the pipeline will be downloaded and extracted. The tmp property points to the folder where all temporary intermediate files will be generated. directories : data : data tmp : data/tmp Corpora Section The corpora section specifies which corpus will be used in the experiment. This example defines the LIRICAL corpus, a small comparison corpus created for benchmarking the LIRICAL system, which contains 385 case reports. The user needs to specify the corpus id, which must match the corpora folder structure, e.g. \ud83d\udce6corpora \u2503 \u2523 \ud83d\udcc2lirical \u2503 \u2523 \u2523 \ud83d\udcc2small_version \u2503 \u2523 \u2523 \u2523 \ud83d\udcc2phenopackets \u2503 \u2523 \u2523 \u2523 \u2523 \ud83d\udcdcPATIENT1.json \u2503 \u2523 \u2523 \u2523 \u2523 \ud83d\udcdcPATIENT2.json \u2503 \u2523 \u2523 \u2523 \ud83d\udcc2vcf \u2503 \u2523 \u2523 \u2523 \u2523 \ud83d\udcdcPATIENT1.vcf.gz \u2503 \u2523 \u2523 \u2523 \u2523 \ud83d\udcdcPATIENT2.vcf.gz \u2503 \u2523 \u2523 \u2523 \ud83d\udcdccorpus.yml \u2503 \u2523 \u2523 \u2523 \ud83d\udcdctemplate_exome_hg19.vcf.gz corpora : - id : lirical variant : small_version Configs Section The configs section holds all custom configurations for the different VGPAs. It must declare: - tool: VGPA tool name. - id: an arbitrary unique identifier that will be used in the runs section - version: VGPA tool version configs : - tool : phen2gene id : phen2gene-1.2.3 version : 1.2.3 The configs section can also deal with special VGPA data preparation steps, for example, Semantic Similarity ingestion into the Exomiser phenotypic database, e.g. 
configs : - tool : exomiser id : exomiser-semsim-ingest-13.3.0 version : 13.3.0 phenotype : 2309 preprocessing : - phenio-monarch-hp-hp.0.4.semsimian.sql phenotype property describes the Exomiser phenotype database version and the preprocessing section will execute SQL scripts into that phenotypic database. Runs Section The \"runs\" section will integrate all previously described sections and pass them to pheval VGPA for concrete execution. tool property specifies which runner will be called corpus and corpusvariant must match properties declared on the corpora section . version should correspond to the tool version configuration must match the id described on the configuration section . runs : - tool : exomiser corpus : lirical corpusvariant : small_version version : 13.3.0 configuration : exomiser-semsim-ingest-13.3.0 Generating new Makefile based on PhEval configuration file \ud83d\udce6resources \u2523 \ud83d\udcdcgeneratemakefile.sh \u2517 \ud83d\udcdcpheval-config.yaml To generate a new Makefile, simply execute the generatemakefile.sh script, which encapsulates the Makefile rendering process dynamically filling it using the pheval-config.yaml configuration file. ./resources/generatemakefile.sh","title":"PhEval Pipeline"},{"location":"pipeline/#pheval-pipeline","text":"","title":"PhEval Pipeline"},{"location":"pipeline/#tldr","text":"The Pipeline presented on PhEval preprint was moved to a new repository - Monarch PhEval . NOTE: The default Monarch PhEval pipeline, as proposed in the paper preprint, requires approximately 1 TB of disk space. Learn how to modify the pipeline configuration here to customize the experiments.","title":"TLDR"},{"location":"pipeline/#1-clone-monarch-pheval","text":"git clone https://github.com/monarch-initiative/monarch_pheval.git","title":"1. 
Clone Monarch PhEval"},{"location":"pipeline/#2-installing-pheval-pipeline-dependencies","text":"Change into the cloned folder and run the following commands: poetry shell poetry install","title":"2. Installing PhEval Pipeline dependencies"},{"location":"pipeline/#3-executing-pipeline","text":"make pheval","title":"3. Executing Pipeline"},{"location":"pipeline/#pipeline-description","text":"The Pipeline is divided into three main steps","title":"Pipeline Description"},{"location":"pipeline/#1-data-preparation-phase","text":"The data preparation phase checks the completeness of the disease, gene and variant input data and optionally prepares simulated VCF files if required. It also gives the user the ability to randomise phenotypic profiles using the PhEval corpus scramble command utility, allowing for the assessment of how well VGPAs handle noise and less specific phenotypic profiles when making predictions.","title":"1. Data Preparation Phase"},{"location":"pipeline/#2-runner-phase","text":"The runner phase is structured into three stages: prepare, run, and post-process. The prepare step plays a crucial role in adapting the input data to meet the specific requirements of the tool. In the run step, the VGPA is executed, applying the selected algorithm to the prepared data and generating the tool-specific outputs. Within the run stage, an essential task is the generation of input command files for the algorithm. These files serve as collections of individual commands, each tailored to run the targeted VGPA on specific samples. These commands are configured with the appropriate inputs, outputs and specific configuration settings, allowing for the automated and efficient processing of large corpora. Finally, the post-processing step takes care of harmonising the tool-specific outputs into standardised PhEval TSV format, ensuring uniformity and ease of analysis of results from all VGPAs. 
In this context, the tool-specific output is condensed to provide only two essential elements, the entity of interest, which can either be a variant, gene, or disease, and its corresponding score. PhEval then assumes the responsibility of subsequent standardisation processes. This involves the reranking of the results in a uniform manner, ensuring that fair and comprehensive comparisons can be made between tools.","title":"2. Runner Phase"},{"location":"pipeline/#3-analysis-phase","text":"In the analysis phase, PhEval generates comprehensive statistical reports based on standardised outputs from the runner phase.","title":"3. Analysis Phase"},{"location":"pipeline/#customising-pheval-pipeline-experiments","text":"The PhEval pipeline is orchestrated using a Makefile Jinja template strategy. Therefore, to describe a new experiment in the pipeline, the user needs to generate a Makefile workflow based on a configuration file. In the resources folder are the following files responsible for Makefile generation: \ud83d\udce6resources \u2523 \ud83d\udcdcMakefile.j2 \u2523 \ud83d\udcdccustom.Makefile \u2523 \ud83d\udcdcgeneratemakefile.sh \u2517 \ud83d\udcdcpheval-config.yaml Let's begin by describing the pheval-config.yaml file and its structure.","title":"Customising PhEval Pipeline Experiments"},{"location":"pipeline/#pheval-configuration-file","text":"This file defines the experiment settings and will be used to generate the Makefile using a Jinja template which consumes this YAML configuration file.","title":"PhEval Configuration File"},{"location":"pipeline/#directories-section","text":"The data and tmp properties are mandatory and must be specified in this section. The data property refers to the folder location where the necessary phenotypic data for the pipeline will be downloaded and extracted. The tmp property points to the folder where all temporary intermediate files will be generated. 
directories : data : data tmp : data/tmp","title":"Directories Section"},{"location":"pipeline/#corpora-section","text":"The corpora section specifies which corpus will be used in the experiment. This example uses the LIRICAL corpus, a small comparison corpus created for benchmarking the LIRICAL system, which contains 385 case reports. The user needs to specify the corpus id, and it must match the corpora folder structure, e.g. \ud83d\udce6corpora \u2503 \u2523 \ud83d\udcc2lirical \u2503 \u2523 \u2523 \ud83d\udcc2small_version \u2503 \u2523 \u2523 \u2523 \ud83d\udcc2phenopackets \u2503 \u2523 \u2523 \u2523 \u2523 \ud83d\udcdcPATIENT1.json \u2503 \u2523 \u2523 \u2523 \u2523 \ud83d\udcdcPATIENT2.json \u2503 \u2523 \u2523 \u2523 \ud83d\udcc2vcf \u2503 \u2523 \u2523 \u2523 \u2523 \ud83d\udcdcPATIENT1.vcf.gz \u2503 \u2523 \u2523 \u2523 \u2523 \ud83d\udcdcPATIENT2.vcf.gz \u2503 \u2523 \u2523 \u2523 \ud83d\udcdccorpus.yml \u2503 \u2523 \u2523 \u2523 \ud83d\udcdctemplate_exome_hg19.vcf.gz corpora : - id : lirical variant : small_version","title":"Corpora Section"},{"location":"pipeline/#configs-section","text":"The configs section holds all custom configurations for the different VGPAs. It must declare: - tool: VGPA tool name. - id: an arbitrary unique identifier that will be used in the runs section - version: VGPA tool version configs : - tool : phen2gene id : phen2gene-1.2.3 version : 1.2.3 configs section can also deal with special VGPA data preparation steps, for example, Semantic Similarity ingestions into Exomiser phenotypic database e.g. 
configs : - tool : exomiser id : exomiser-semsim-ingest-13.3.0 version : 13.3.0 phenotype : 2309 preprocessing : - phenio-monarch-hp-hp.0.4.semsimian.sql phenotype property describes the Exomiser phenotype database version and the preprocessing section will execute SQL scripts into that phenotypic database.","title":"Configs Section"},{"location":"pipeline/#runs-section","text":"The \"runs\" section will integrate all previously described sections and pass them to pheval VGPA for concrete execution. tool property specifies which runner will be called corpus and corpusvariant must match properties declared on the corpora section . version should correspond to the tool version configuration must match the id described on the configuration section . runs : - tool : exomiser corpus : lirical corpusvariant : small_version version : 13.3.0 configuration : exomiser-semsim-ingest-13.3.0","title":"Runs Section"},{"location":"pipeline/#generating-new-makefile-based-on-pheval-configuration-file","text":"\ud83d\udce6resources \u2523 \ud83d\udcdcgeneratemakefile.sh \u2517 \ud83d\udcdcpheval-config.yaml To generate a new Makefile, simply execute the generatemakefile.sh script, which encapsulates the Makefile rendering process dynamically filling it using the pheval-config.yaml configuration file. 
./resources/generatemakefile.sh","title":"Generating new Makefile based on PhEval configuration file"},{"location":"plugins/","text":"A full list of implemented PhEval runners is provided below, along with links to the original tools: Tool PhEval plugin Comment Exomiser Exomiser runner The link to the original tool can be found here Phen2Gene Phen2Gene runner The link to the original tool can be found here LIRICAL LIRICAL runner The link to the original tool can be found here SvAnna SvAnna runner The link to the original tool can be found here GADO GADO runner The link to the original tool can be found here Template Template runner OntoGPT OntoGPT runner ELDER ELDER runner MALCO MALCO runner AI MARRVEL AI MARRVEL runner The link to the original tool can be found here OAK OAK runner","title":"Plugins"},{"location":"roadmap/","text":"Roadmap The Roadmap is a rough plan, changes are expected throughout the year. 2023 Q1 Finalising the PhEval architecture (draft is done) End-to-end pipeline for testing PhEval with Exomiser and two versions of HPO Submitting a poster to Biocuration which outlines the full vision Q2 Focus on an analytic framework around PhEval, focusing on studying how changes to ontologies affect changes in variant prioritisation Extend phenotype pipeline to enable base releases and alternative patterns Q3 Improving the analytic framework of PhEval, especially phenotype analysis All intermediate files of pipeline have a corresponding LinkML model Focus on studying the effect of KG snippets (p2ds) on VP performance Q4 Drafting a PhEval paper Building standalone pipeline that reports changes in algorithm behaviours to ontology developers.","title":"Roadmap"},{"location":"roadmap/#roadmap","text":"The Roadmap is a rough plan, changes are expected throughout the year.","title":"Roadmap"},{"location":"roadmap/#2023","text":"","title":"2023"},{"location":"roadmap/#q1","text":"Finalising the PhEval architecture (draft is done) End-to-end pipeline for testing 
PhEval with Exomiser and two versions of HPO Submitting a poster to Biocuration which outlines the full vision","title":"Q1"},{"location":"roadmap/#q2","text":"Focus on an analytic framework around PhEval, focusing on studying how changes to ontologies affect changes in variant prioritisation Extend phenotype pipeline to enable base releases and alternative patterns","title":"Q2"},{"location":"roadmap/#q3","text":"Improving the analytic framework of PhEval, especially phenotype analysis All intermediate files of pipeline have a corresponding LinkML model Focus on studying the effect of KG snippets (p2ds) on VP performance","title":"Q3"},{"location":"roadmap/#q4","text":"Drafting a PhEval paper Building standalone pipeline that reports changes in algorithm behaviours to ontology developers.","title":"Q4"},{"location":"styleguide/","text":"Monarch Style Guide for PhEval No code in CLI methods","title":"Monarch Style Guide for PhEval"},{"location":"styleguide/#monarch-style-guide-for-pheval","text":"No code in CLI methods","title":"Monarch Style Guide for PhEval"},{"location":"api/pheval/cli/","text":"main main CLI method for PhEval Args: verbose (int, optional): Verbose flag. quiet (bool, optional): Quiet Flag. Usage: main [OPTIONS] COMMAND [ARGS]... Options: Name Type Description Default -v , --verbose integer range ( 0 and above) N/A 0 -q , --quiet text N/A None --help boolean Show this message and exit. False pheval pheval Usage: pheval [OPTIONS] COMMAND [ARGS]... Options: Name Type Description Default --help boolean Show this message and exit. 
False Subcommands run : PhEval Runner Command Line Interface run PhEval Runner Command Line Interface Args: input_dir (Path): The input directory (relative path: e.g exomiser-13.11) testdata_dir (Path): The input directory (relative path: e.g ./data runner (str): Runner implementation (e.g exomiser-13.11) tmp_dir (Path): The path of the temporary directory (optional) output_dir (Path): The path of the output directory config (Path): The path of the configuration file (optional e.g., config.yaml) version (str): The version of the tool implementation Usage: pheval run [OPTIONS] Options: Name Type Description Default --input-dir , -i Path The input directory (relative path: e.g exomiser-13.11) _required --testdata-dir , -t Path The input directory (relative path: e.g ./data) _required --runner , -r text Runner implementation (e.g exomiser-13.11) _required --tmp-dir , -m Path The path of the temporary directory (optional) None --output-dir , -o Path The path of the output directory _required --config , -c Path The path of the configuration file (optional e.g config.yaml) None --version , -v text Version of the tool implementation. None --help boolean Show this message and exit. False pheval-utils pheval_utils Usage: pheval-utils [OPTIONS] COMMAND [ARGS]... Options: Name Type Description Default --help boolean Show this message and exit. False Subcommands create-spiked-vcfs : generate-benchmark-stats : Benchmark the gene/variant/disease prioritisation performance for runs. generate-stats-plot : Generate bar plot from benchmark db. prepare-corpus : scramble-phenopackets : Generate noisy phenopackets from existing ones. semsim-scramble : Scrambles semsim profile multiplying score value by scramble factor semsim-to-exomiserdb : ingests semsim file into exomiser phenotypic database update-phenopackets : Update gene symbols and identifiers for phenopackets. create-spiked-vcfs Create spiked VCF from either a Phenopacket or a Phenopacket directory. 
Args: phenopacket_path (Path): Path to a single Phenopacket file (optional). phenopacket_dir (Path): Path to a directory containing Phenopacket files (optional). output_dir (Path): The directory to store the generated spiked VCF file(s). hg19_template_vcf (Path): Path to the hg19 template VCF file (optional). hg38_template_vcf (Path): Path to the hg38 template VCF file (optional). hg19_vcf_dir (Path): Path to the directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): Path to the directory containing the hg38 VCF files (optional). Usage: pheval-utils create-spiked-vcfs [OPTIONS] Options: Name Type Description Default --phenopacket-path , -p Path Path to phenopacket. NOTE: This argument is mutually exclusive with arguments: [phenopacket_dir]. None --phenopacket-dir , -P Path Path to phenopacket directory for updating. NOTE: This argument is mutually exclusive with arguments: [phenopacket_path]. None --hg19-template-vcf , -hg19 Path Template hg19 VCF file NOTE: This argument is mutually exclusive with arguments: [hg19_vcf_dir]. None --hg38-template-vcf , -hg38 Path Template hg38 VCF file NOTE: This argument is mutually exclusive with arguments: [hg38_vcf_dir]. None --hg19-vcf-dir , -hg19-dir Path Path to directory containing hg19 VCF templates. NOTE: This argument is mutually exclusive with arguments: [hg19_template_vcf]. None --hg38-vcf-dir , -hg38-dir Path Path to directory containing hg38 VCF templates. NOTE: This argument is mutually exclusive with arguments: [hg38_template_vcf]. None --output-dir , -O Path Path for creation of output directory vcf --help boolean Show this message and exit. False generate-benchmark-stats Benchmark the gene/variant/disease prioritisation performance for runs. Usage: pheval-utils generate-benchmark-stats [OPTIONS] Options: Name Type Description Default --run-yaml , -r Path Path to yaml configuration file for benchmarking. _required --help boolean Show this message and exit. 
False generate-stats-plot Generate bar plot from benchmark db. Usage: pheval-utils generate-stats-plot [OPTIONS] Options: Name Type Description Default --benchmark-db , -b Path Path to benchmark db output by PhEval benchmark commands. _required --run-data , -r Path Path to yaml configuration file for benchmarking. _required --help boolean Show this message and exit. False prepare-corpus Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating gene identifiers. Args: phenopacket_dir (Path): The path to the directory containing Phenopackets. variant_analysis (bool): If True, check for complete variant records in the Phenopackets. gene_analysis (bool): If True, check for complete gene records in the Phenopackets. disease_analysis (bool): If True, check for complete disease records in the Phenopackets. gene_identifier (str): Identifier for updating gene identifiers, if applicable. hg19_template_vcf (Path): Path to the hg19 template VCF file (optional). hg38_template_vcf (Path): Path to the hg38 template VCF file (optional). hg19_vcf_dir (Path): Path to the directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): Path to the directory containing the hg38 VCF files (optional). output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files. Notes: To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir or hg38_vcf_dir is required. Usage: pheval-utils prepare-corpus [OPTIONS] Options: Name Type Description Default --phenopacket-dir , -p Path Path to phenopacket corpus directory.. _required --variant-analysis / --no-variant-analysis boolean Specify whether to check for complete variant records in the phenopackets. False --gene-analysis / --no-gene-analysis boolean Specify whether to check for complete gene records in the phenopackets. 
False --disease-analysis / --no-disease-analysis boolean Specify whether to check for complete disease records in the phenopackets. False --gene-identifier , -g choice ( ensembl_id | entrez_id | hgnc_id ) Gene identifier to update in phenopacket None --hg19-template-vcf , -hg19 Path Template hg19 VCF file NOTE: This argument is mutually exclusive with arguments: [hg19_vcf_dir]. None --hg38-template-vcf , -hg38 Path Template hg38 VCF file NOTE: This argument is mutually exclusive with arguments: [hg38_vcf_dir]. None --hg19-vcf-dir , -hg19-dir Path Path to directory containing hg19 VCF templates. NOTE: This argument is mutually exclusive with arguments: [hg19_template_vcf]. None --hg38-vcf-dir , -hg38-dir Path Path to directory containing hg38 VCF templates. NOTE: This argument is mutually exclusive with arguments: [hg38_template_vcf]. None --output-dir , -o Path Path to output prepared corpus. prepared_corpus --help boolean Show this message and exit. False scramble-phenopackets Generate noisy phenopackets from existing ones. Usage: pheval-utils scramble-phenopackets [OPTIONS] Options: Name Type Description Default --phenopacket-path , -p Path Path to phenopacket. NOTE: This argument is mutually exclusive with arguments: [phenopacket_dir]. None --phenopacket-dir , -P Path Path to phenopackets directory. NOTE: This argument is mutually exclusive with arguments: [phenopacket_path]. None --scramble-factor , -s float Scramble factor for randomising phenopacket phenotypic profiles. 0.5 --output-dir , -O Path Path for creation of output directory noisy_phenopackets --local-ontology-cache , -l Path Path to the local ontology cache, e.g., path to the hp.obo. None --help boolean Show this message and exit. 
False semsim-scramble Scrambles semsim profile multiplying score value by scramble factor Args: input (Path): Path file that points out to the semsim profile output (Path): Path file that points out to the output file score_column (List[str]): Score column(s) that will be scrambled scramble_factor (float): Scramble Magnitude Usage: pheval-utils semsim-scramble [OPTIONS] Options: Name Type Description Default --input , -i Path Path to the semantic similarity profile to be scrambled. _required --output , -o Path Path where the scrambled semsim file will be written. _required --score-column , -c choice ( jaccard_similarity | dice_similarity | phenodigm_score ) Score column that will be scrambled _required --scramble-factor , -s float Scramble Magnitude (noise) that will be applied to semantic similarity score column (e.g. jaccard similarity). 0.5 --help boolean Show this message and exit. False semsim-to-exomiserdb ingests semsim file into exomiser phenotypic database Args: input_file (Path): semsim input file. e.g phenio-plus-hp-mp.0.semsimian.tsv object_prefix (str): object prefix. e.g. MP subject_prefix (str): subject prefix e.g HP db_path (Path): Exomiser Phenotypic Database Folder Path. (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/) Usage: pheval-utils semsim-to-exomiserdb [OPTIONS] Options: Name Type Description Default --input-file , -i Path Semsim input file. _required --object-prefix text Object Prefix. e.g. MP _required --subject-prefix text Subject Prefix. e.g. HP _required --db-path , -d Path Exomiser Phenotypic Database Folder Path. (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/). This is the path where the phenotypic database folder will be written out. _required --help boolean Show this message and exit. False update-phenopackets Update gene symbols and identifiers for phenopackets. Usage: pheval-utils update-phenopackets [OPTIONS] Options: Name Type Description Default --phenopacket-path , -p Path Path to phenopacket. 
NOTE: This argument is mutually exclusive with arguments: [phenopacket_dir]. None --phenopacket-dir , -P Path Path to phenopacket directory for updating. NOTE: This argument is mutually exclusive with arguments: [phenopacket_path]. None --output-dir , -o Path Path to write phenopacket. _required --gene-identifier , -g choice ( ensembl_id | entrez_id | hgnc_id ) Gene identifier to add to phenopacket ensembl_id --help boolean Show this message and exit. False","title":"Cli"},{"location":"api/pheval/cli/#main","text":"main CLI method for PhEval Args: verbose (int, optional): Verbose flag. quiet (bool, optional): Quiet Flag. Usage: main [OPTIONS] COMMAND [ARGS]... Options: Name Type Description Default -v , --verbose integer range ( 0 and above) N/A 0 -q , --quiet text N/A None --help boolean Show this message and exit. False","title":"main"},{"location":"api/pheval/cli/#pheval","text":"pheval Usage: pheval [OPTIONS] COMMAND [ARGS]... Options: Name Type Description Default --help boolean Show this message and exit. 
False Subcommands run : PhEval Runner Command Line Interface","title":"pheval"},{"location":"api/pheval/cli/#run","text":"PhEval Runner Command Line Interface Args: input_dir (Path): The input directory (relative path: e.g exomiser-13.11) testdata_dir (Path): The input directory (relative path: e.g ./data runner (str): Runner implementation (e.g exomiser-13.11) tmp_dir (Path): The path of the temporary directory (optional) output_dir (Path): The path of the output directory config (Path): The path of the configuration file (optional e.g., config.yaml) version (str): The version of the tool implementation Usage: pheval run [OPTIONS] Options: Name Type Description Default --input-dir , -i Path The input directory (relative path: e.g exomiser-13.11) _required --testdata-dir , -t Path The input directory (relative path: e.g ./data) _required --runner , -r text Runner implementation (e.g exomiser-13.11) _required --tmp-dir , -m Path The path of the temporary directory (optional) None --output-dir , -o Path The path of the output directory _required --config , -c Path The path of the configuration file (optional e.g config.yaml) None --version , -v text Version of the tool implementation. None --help boolean Show this message and exit. False","title":"run"},{"location":"api/pheval/cli/#pheval-utils","text":"pheval_utils Usage: pheval-utils [OPTIONS] COMMAND [ARGS]... Options: Name Type Description Default --help boolean Show this message and exit. False Subcommands create-spiked-vcfs : generate-benchmark-stats : Benchmark the gene/variant/disease prioritisation performance for runs. generate-stats-plot : Generate bar plot from benchmark db. prepare-corpus : scramble-phenopackets : Generate noisy phenopackets from existing ones. 
semsim-scramble : Scrambles semsim profile multiplying score value by scramble factor semsim-to-exomiserdb : ingests semsim file into exomiser phenotypic database update-phenopackets : Update gene symbols and identifiers for phenopackets.","title":"pheval-utils"},{"location":"api/pheval/cli/#create-spiked-vcfs","text":"Create spiked VCF from either a Phenopacket or a Phenopacket directory. Args: phenopacket_path (Path): Path to a single Phenopacket file (optional). phenopacket_dir (Path): Path to a directory containing Phenopacket files (optional). output_dir (Path): The directory to store the generated spiked VCF file(s). hg19_template_vcf (Path): Path to the hg19 template VCF file (optional). hg38_template_vcf (Path): Path to the hg38 template VCF file (optional). hg19_vcf_dir (Path): Path to the directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): Path to the directory containing the hg38 VCF files (optional). Usage: pheval-utils create-spiked-vcfs [OPTIONS] Options: Name Type Description Default --phenopacket-path , -p Path Path to phenopacket. NOTE: This argument is mutually exclusive with arguments: [phenopacket_dir]. None --phenopacket-dir , -P Path Path to phenopacket directory for updating. NOTE: This argument is mutually exclusive with arguments: [phenopacket_path]. None --hg19-template-vcf , -hg19 Path Template hg19 VCF file NOTE: This argument is mutually exclusive with arguments: [hg19_vcf_dir]. None --hg38-template-vcf , -hg38 Path Template hg38 VCF file NOTE: This argument is mutually exclusive with arguments: [hg38_vcf_dir]. None --hg19-vcf-dir , -hg19-dir Path Path to directory containing hg19 VCF templates. NOTE: This argument is mutually exclusive with arguments: [hg19_template_vcf]. None --hg38-vcf-dir , -hg38-dir Path Path to directory containing hg38 VCF templates. NOTE: This argument is mutually exclusive with arguments: [hg38_template_vcf]. 
None --output-dir , -O Path Path for creation of output directory vcf --help boolean Show this message and exit. False","title":"create-spiked-vcfs"},{"location":"api/pheval/cli/#generate-benchmark-stats","text":"Benchmark the gene/variant/disease prioritisation performance for runs. Usage: pheval-utils generate-benchmark-stats [OPTIONS] Options: Name Type Description Default --run-yaml , -r Path Path to yaml configuration file for benchmarking. _required --help boolean Show this message and exit. False","title":"generate-benchmark-stats"},{"location":"api/pheval/cli/#generate-stats-plot","text":"Generate bar plot from benchmark db. Usage: pheval-utils generate-stats-plot [OPTIONS] Options: Name Type Description Default --benchmark-db , -b Path Path to benchmark db output by PhEval benchmark commands. _required --run-data , -r Path Path to yaml configuration file for benchmarking. _required --help boolean Show this message and exit. False","title":"generate-stats-plot"},{"location":"api/pheval/cli/#prepare-corpus","text":"Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating gene identifiers. Args: phenopacket_dir (Path): The path to the directory containing Phenopackets. variant_analysis (bool): If True, check for complete variant records in the Phenopackets. gene_analysis (bool): If True, check for complete gene records in the Phenopackets. disease_analysis (bool): If True, check for complete disease records in the Phenopackets. gene_identifier (str): Identifier for updating gene identifiers, if applicable. hg19_template_vcf (Path): Path to the hg19 template VCF file (optional). hg38_template_vcf (Path): Path to the hg38 template VCF file (optional). hg19_vcf_dir (Path): Path to the directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): Path to the directory containing the hg38 VCF files (optional). 
output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files. Notes: To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir or hg38_vcf_dir is required. Usage: pheval-utils prepare-corpus [OPTIONS] Options: Name Type Description Default --phenopacket-dir , -p Path Path to phenopacket corpus directory.. _required --variant-analysis / --no-variant-analysis boolean Specify whether to check for complete variant records in the phenopackets. False --gene-analysis / --no-gene-analysis boolean Specify whether to check for complete gene records in the phenopackets. False --disease-analysis / --no-disease-analysis boolean Specify whether to check for complete disease records in the phenopackets. False --gene-identifier , -g choice ( ensembl_id | entrez_id | hgnc_id ) Gene identifier to update in phenopacket None --hg19-template-vcf , -hg19 Path Template hg19 VCF file NOTE: This argument is mutually exclusive with arguments: [hg19_vcf_dir]. None --hg38-template-vcf , -hg38 Path Template hg38 VCF file NOTE: This argument is mutually exclusive with arguments: [hg38_vcf_dir]. None --hg19-vcf-dir , -hg19-dir Path Path to directory containing hg19 VCF templates. NOTE: This argument is mutually exclusive with arguments: [hg19_template_vcf]. None --hg38-vcf-dir , -hg38-dir Path Path to directory containing hg38 VCF templates. NOTE: This argument is mutually exclusive with arguments: [hg38_template_vcf]. None --output-dir , -o Path Path to output prepared corpus. prepared_corpus --help boolean Show this message and exit. False","title":"prepare-corpus"},{"location":"api/pheval/cli/#scramble-phenopackets","text":"Generate noisy phenopackets from existing ones. Usage: pheval-utils scramble-phenopackets [OPTIONS] Options: Name Type Description Default --phenopacket-path , -p Path Path to phenopacket. NOTE: This argument is mutually exclusive with arguments: [phenopacket_dir]. 
None --phenopacket-dir , -P Path Path to phenopackets directory. NOTE: This argument is mutually exclusive with arguments: [phenopacket_path]. None --scramble-factor , -s float Scramble factor for randomising phenopacket phenotypic profiles. 0.5 --output-dir , -O Path Path for creation of output directory noisy_phenopackets --local-ontology-cache , -l Path Path to the local ontology cache, e.g., path to the hp.obo. None --help boolean Show this message and exit. False","title":"scramble-phenopackets"},{"location":"api/pheval/cli/#semsim-scramble","text":"Scrambles semsim profile multiplying score value by scramble factor Args: input (Path): Path file that points out to the semsim profile output (Path): Path file that points out to the output file score_column (List[str]): Score column(s) that will be scrambled scramble_factor (float): Scramble Magnitude Usage: pheval-utils semsim-scramble [OPTIONS] Options: Name Type Description Default --input , -i Path Path to the semantic similarity profile to be scrambled. _required --output , -o Path Path where the scrambled semsim file will be written. _required --score-column , -c choice ( jaccard_similarity | dice_similarity | phenodigm_score ) Score column that will be scrambled _required --scramble-factor , -s float Scramble Magnitude (noise) that will be applied to semantic similarity score column (e.g. jaccard similarity). 0.5 --help boolean Show this message and exit. False","title":"semsim-scramble"},{"location":"api/pheval/cli/#semsim-to-exomiserdb","text":"ingests semsim file into exomiser phenotypic database Args: input_file (Path): semsim input file. e.g phenio-plus-hp-mp.0.semsimian.tsv object_prefix (str): object prefix. e.g. MP subject_prefix (str): subject prefix e.g HP db_path (Path): Exomiser Phenotypic Database Folder Path. (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/) Usage: pheval-utils semsim-to-exomiserdb [OPTIONS] Options: Name Type Description Default --input-file , -i Path Semsim input file. 
_required --object-prefix text Object Prefix. e.g. MP _required --subject-prefix text Subject Prefix. e.g. HP _required --db-path , -d Path Exomiser Phenotypic Database Folder Path. (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/). This is the path where the phenotypic database folder will be written out. _required --help boolean Show this message and exit. False","title":"semsim-to-exomiserdb"},{"location":"api/pheval/cli/#update-phenopackets","text":"Update gene symbols and identifiers for phenopackets. Usage: pheval-utils update-phenopackets [OPTIONS] Options: Name Type Description Default --phenopacket-path , -p Path Path to phenopacket. NOTE: This argument is mutually exclusive with arguments: [phenopacket_dir]. None --phenopacket-dir , -P Path Path to phenopacket directory for updating. NOTE: This argument is mutually exclusive with arguments: [phenopacket_path]. None --output-dir , -o Path Path to write phenopacket. _required --gene-identifier , -g choice ( ensembl_id | entrez_id | hgnc_id ) Gene identifier to add to phenopacket ensembl_id --help boolean Show this message and exit. False","title":"update-phenopackets"},{"location":"api/pheval/config_parser/","text":"InputDirConfig dataclass Class for defining the fields within the input directory config. Parameters: Name Type Description Default tool str Name of the tool implementation (e.g. exomiser/phen2gene) required tool_version str Version of the tool implementation required variant_analysis bool Whether to extract prioritised variants from results. required gene_analysis bool Whether to extract prioritised genes from results. required disease_analysis bool Whether to extract prioritised diseases from results. 
required tool_specific_configuration_options Any Tool specific configurations required Source code in src/pheval/config_parser.py 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 @serde @dataclass class InputDirConfig : \"\"\" Class for defining the fields within the input directory config. Args: tool (str): Name of the tool implementation (e.g. exomiser/phen2gene) tool_version (str): Version of the tool implementation variant_analysis (bool): Whether to extract prioritised variants from results. gene_analysis (bool): Whether to extract prioritised genes from results. disease_analysis (bool): Whether to extract prioritised diseases from results. tool_specific_configuration_options (Any): Tool specific configurations \"\"\" tool : str tool_version : str variant_analysis : bool gene_analysis : bool disease_analysis : bool tool_specific_configuration_options : Any parse_input_dir_config ( input_dir ) Reads the config file. Source code in src/pheval/config_parser.py 35 36 37 38 39 40 def parse_input_dir_config ( input_dir : Path ) -> InputDirConfig : \"\"\"Reads the config file.\"\"\" with open ( Path ( input_dir ) . joinpath ( \"config.yaml\" ), \"r\" ) as config_file : config = yaml . safe_load ( config_file ) config_file . close () return from_yaml ( InputDirConfig , yaml . dump ( config ))","title":"Config parser"},{"location":"api/pheval/config_parser/#src.pheval.config_parser.InputDirConfig","text":"Class for defining the fields within the input directory config. Parameters: Name Type Description Default tool str Name of the tool implementation (e.g. exomiser/phen2gene) required tool_version str Version of the tool implementation required variant_analysis bool Whether to extract prioritised variants from results. required gene_analysis bool Whether to extract prioritised genes from results. required disease_analysis bool Whether to extract prioritised diseases from results. 
required tool_specific_configuration_options Any Tool specific configurations required Source code in src/pheval/config_parser.py 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 @serde @dataclass class InputDirConfig : \"\"\" Class for defining the fields within the input directory config. Args: tool (str): Name of the tool implementation (e.g. exomiser/phen2gene) tool_version (str): Version of the tool implementation variant_analysis (bool): Whether to extract prioritised variants from results. gene_analysis (bool): Whether to extract prioritised genes from results. disease_analysis (bool): Whether to extract prioritised diseases from results. tool_specific_configuration_options (Any): Tool specific configurations \"\"\" tool : str tool_version : str variant_analysis : bool gene_analysis : bool disease_analysis : bool tool_specific_configuration_options : Any","title":"InputDirConfig"},{"location":"api/pheval/config_parser/#src.pheval.config_parser.parse_input_dir_config","text":"Reads the config file. Source code in src/pheval/config_parser.py 35 36 37 38 39 40 def parse_input_dir_config ( input_dir : Path ) -> InputDirConfig : \"\"\"Reads the config file.\"\"\" with open ( Path ( input_dir ) . joinpath ( \"config.yaml\" ), \"r\" ) as config_file : config = yaml . safe_load ( config_file ) config_file . close () return from_yaml ( InputDirConfig , yaml . dump ( config ))","title":"parse_input_dir_config"},{"location":"api/pheval/run_metadata/","text":"BasicOutputRunMetaData dataclass Class for defining variables for the run metadata. 
Args: tool (str): Name of the tool implementation tool_version (str): Version of the tool implementation config (Path): Path to the config file located in the input directory run_timestamp (int): Time taken for run to complete corpus (Path): Path to corpus used in pheval run tool_specific_configuration_options (Any): Special field that can be overwritten by tool implementations to contain any extra tool specific configurations used in the run Source code in src/pheval/run_metadata.py 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 @serde @dataclass class BasicOutputRunMetaData : \"\"\"Class for defining variables for the run metadata. Args: tool (str): Name of the tool implementation tool_version (str): Version of the tool implementation config (Path): Path to the config file located in the input directory run_timestamp (int): Time taken for run to complete corpus (Path): Path to corpus used in pheval run tool_specific_configuration_options (Any): Special field that can be overwritten by tool implementations to contain any extra tool specific configurations used in the run \"\"\" tool : str tool_version : str config : Path run_timestamp : int corpus : Path tool_specific_configuration_options : Any = None","title":"Run metadata"},{"location":"api/pheval/run_metadata/#src.pheval.run_metadata.BasicOutputRunMetaData","text":"Class for defining variables for the run metadata. 
Args: tool (str): Name of the tool implementation tool_version (str): Version of the tool implementation config (Path): Path to the config file located in the input directory run_timestamp (int): Time taken for run to complete corpus (Path): Path to corpus used in pheval run tool_specific_configuration_options (Any): Special field that can be overwritten by tool implementations to contain any extra tool specific configurations used in the run Source code in src/pheval/run_metadata.py 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 @serde @dataclass class BasicOutputRunMetaData : \"\"\"Class for defining variables for the run metadata. Args: tool (str): Name of the tool implementation tool_version (str): Version of the tool implementation config (Path): Path to the config file located in the input directory run_timestamp (int): Time taken for run to complete corpus (Path): Path to corpus used in pheval run tool_specific_configuration_options (Any): Special field that can be overwritten by tool implementations to contain any extra tool specific configurations used in the run \"\"\" tool : str tool_version : str config : Path run_timestamp : int corpus : Path tool_specific_configuration_options : Any = None","title":"BasicOutputRunMetaData"},{"location":"api/pheval/analyse/analysis/","text":"benchmark_run_comparisons ( run_config ) Benchmark prioritisation performance for several runs. Parameters: Name Type Description Default run_config Config Run configurations. required Source code in src/pheval/analyse/analysis.py 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 def benchmark_run_comparisons ( run_config : Config , ) -> None : \"\"\" Benchmark prioritisation performance for several runs. Args: run_config (Config): Run configurations. \"\"\" gene_analysis_runs = Config ( benchmark_name = run_config . benchmark_name , runs = [ run for run in run_config . 
runs if run . gene_analysis ], plot_customisation = run_config . plot_customisation , ) variant_analysis_runs = Config ( benchmark_name = run_config . benchmark_name , runs = [ run for run in run_config . runs if run . variant_analysis ], plot_customisation = run_config . plot_customisation , ) disease_analysis_runs = Config ( benchmark_name = run_config . benchmark_name , runs = [ run for run in run_config . runs if run . disease_analysis ], plot_customisation = run_config . plot_customisation , ) if gene_analysis_runs . runs : _run_benchmark_comparison ( run_config = gene_analysis_runs , benchmark_generator = GeneBenchmarkRunOutputGenerator ( plot_customisation = gene_analysis_runs . plot_customisation . gene_plots ), ) if variant_analysis_runs . runs : _run_benchmark_comparison ( run_config = variant_analysis_runs , benchmark_generator = VariantBenchmarkRunOutputGenerator ( plot_customisation = variant_analysis_runs . plot_customisation . variant_plots ), ) if disease_analysis_runs . runs : _run_benchmark_comparison ( run_config = disease_analysis_runs , benchmark_generator = DiseaseBenchmarkRunOutputGenerator ( plot_customisation = disease_analysis_runs . plot_customisation . disease_plots ), )","title":"Analysis"},{"location":"api/pheval/analyse/analysis/#src.pheval.analyse.analysis.benchmark_run_comparisons","text":"Benchmark prioritisation performance for several runs. Parameters: Name Type Description Default run_config Config Run configurations. required Source code in src/pheval/analyse/analysis.py 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 def benchmark_run_comparisons ( run_config : Config , ) -> None : \"\"\" Benchmark prioritisation performance for several runs. Args: run_config (Config): Run configurations. \"\"\" gene_analysis_runs = Config ( benchmark_name = run_config . benchmark_name , runs = [ run for run in run_config . runs if run . 
gene_analysis ], plot_customisation = run_config . plot_customisation , ) variant_analysis_runs = Config ( benchmark_name = run_config . benchmark_name , runs = [ run for run in run_config . runs if run . variant_analysis ], plot_customisation = run_config . plot_customisation , ) disease_analysis_runs = Config ( benchmark_name = run_config . benchmark_name , runs = [ run for run in run_config . runs if run . disease_analysis ], plot_customisation = run_config . plot_customisation , ) if gene_analysis_runs . runs : _run_benchmark_comparison ( run_config = gene_analysis_runs , benchmark_generator = GeneBenchmarkRunOutputGenerator ( plot_customisation = gene_analysis_runs . plot_customisation . gene_plots ), ) if variant_analysis_runs . runs : _run_benchmark_comparison ( run_config = variant_analysis_runs , benchmark_generator = VariantBenchmarkRunOutputGenerator ( plot_customisation = variant_analysis_runs . plot_customisation . variant_plots ), ) if disease_analysis_runs . runs : _run_benchmark_comparison ( run_config = disease_analysis_runs , benchmark_generator = DiseaseBenchmarkRunOutputGenerator ( plot_customisation = disease_analysis_runs . plot_customisation . disease_plots ), )","title":"benchmark_run_comparisons"},{"location":"api/pheval/analyse/assess_prioritisation_base/","text":"AssessPrioritisationBase Source code in src/pheval/analyse/assess_prioritisation_base.py 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 class AssessPrioritisationBase : def __init__ ( self , db_connection : BenchmarkDBManager , table_name : str , column : str , threshold : float , score_order : str , ): \"\"\" Initialise AssessPrioritisationBase class Args: db_connection (BenchmarkDBManager): DB connection. 
table_name (str): Table name. column (str): Column name. threshold (float): Threshold for scores score_order (str): Score order for results, either ascending or descending \"\"\" self . threshold = threshold self . score_order = score_order self . db_connection = db_connection self . conn = db_connection . conn self . column = column self . table_name = table_name db_connection . add_column_integer_default ( table_name = table_name , column = self . column , default = 0 ) def _assess_with_threshold_ascending_order ( self , result_entry : Union [ RankedPhEvalGeneResult , RankedPhEvalDiseaseResult , RankedPhEvalVariantResult ], ) -> int : \"\"\" Record the prioritisation rank if it meets the ascending order threshold. Args: result_entry (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]): Ranked PhEval result entry Returns: int: Recorded prioritisation rank \"\"\" if float ( self . threshold ) > float ( result_entry . score ): return result_entry . rank else : return 0 def _assess_with_threshold ( self , result_entry : Union [ RankedPhEvalGeneResult , RankedPhEvalDiseaseResult , RankedPhEvalVariantResult ], ) -> int : \"\"\" Record the prioritisation rank if it meets the score threshold. Args: result_entry (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]): Ranked PhEval result entry Returns: int: Recorded prioritisation rank \"\"\" if float ( self . threshold ) < float ( result_entry . score ): return result_entry . rank else : return 0 def _record_matched_entity ( self , standardised_result : Union [ RankedPhEvalGeneResult , RankedPhEvalDiseaseResult , RankedPhEvalVariantResult ], ) -> int : \"\"\" Return the rank result - handling the specification of a threshold. Args: standardised_result (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]): Ranked PhEval disease result entry Returns: int: Recorded entity prioritisation rank \"\"\" if float ( self . 
threshold ) == 0.0 : return standardised_result . rank else : return ( self . _assess_with_threshold ( standardised_result ) if self . score_order != \"ascending\" else self . _assess_with_threshold_ascending_order ( standardised_result , ) ) __init__ ( db_connection , table_name , column , threshold , score_order ) Initialise AssessPrioritisationBase class Parameters: Name Type Description Default db_connection BenchmarkDBManager DB connection. required table_name str Table name. required column str Column name. required threshold float Threshold for scores required score_order str Score order for results, either ascending or descending required Source code in src/pheval/analyse/assess_prioritisation_base.py 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 def __init__ ( self , db_connection : BenchmarkDBManager , table_name : str , column : str , threshold : float , score_order : str , ): \"\"\" Initialise AssessPrioritisationBase class Args: db_connection (BenchmarkDBManager): DB connection. table_name (str): Table name. column (str): Column name. threshold (float): Threshold for scores score_order (str): Score order for results, either ascending or descending \"\"\" self . threshold = threshold self . score_order = score_order self . db_connection = db_connection self . conn = db_connection . conn self . column = column self . table_name = table_name db_connection . add_column_integer_default ( table_name = table_name , column = self . 
column , default = 0 )","title":"Assess prioritisation base"},{"location":"api/pheval/analyse/assess_prioritisation_base/#src.pheval.analyse.assess_prioritisation_base.AssessPrioritisationBase","text":"Source code in src/pheval/analyse/assess_prioritisation_base.py 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 class AssessPrioritisationBase : def __init__ ( self , db_connection : BenchmarkDBManager , table_name : str , column : str , threshold : float , score_order : str , ): \"\"\" Initialise AssessPrioritisationBase class Args: db_connection (BenchmarkDBManager): DB connection. table_name (str): Table name. column (str): Column name. threshold (float): Threshold for scores score_order (str): Score order for results, either ascending or descending \"\"\" self . threshold = threshold self . score_order = score_order self . db_connection = db_connection self . conn = db_connection . conn self . column = column self . table_name = table_name db_connection . add_column_integer_default ( table_name = table_name , column = self . column , default = 0 ) def _assess_with_threshold_ascending_order ( self , result_entry : Union [ RankedPhEvalGeneResult , RankedPhEvalDiseaseResult , RankedPhEvalVariantResult ], ) -> int : \"\"\" Record the prioritisation rank if it meets the ascending order threshold. Args: result_entry (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]): Ranked PhEval result entry Returns: int: Recorded prioritisation rank \"\"\" if float ( self . threshold ) > float ( result_entry . score ): return result_entry . 
rank else : return 0 def _assess_with_threshold ( self , result_entry : Union [ RankedPhEvalGeneResult , RankedPhEvalDiseaseResult , RankedPhEvalVariantResult ], ) -> int : \"\"\" Record the prioritisation rank if it meets the score threshold. Args: result_entry (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]): Ranked PhEval result entry Returns: int: Recorded prioritisation rank \"\"\" if float ( self . threshold ) < float ( result_entry . score ): return result_entry . rank else : return 0 def _record_matched_entity ( self , standardised_result : Union [ RankedPhEvalGeneResult , RankedPhEvalDiseaseResult , RankedPhEvalVariantResult ], ) -> int : \"\"\" Return the rank result - handling the specification of a threshold. Args: standardised_result (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]): Ranked PhEval disease result entry Returns: int: Recorded entity prioritisation rank \"\"\" if float ( self . threshold ) == 0.0 : return standardised_result . rank else : return ( self . _assess_with_threshold ( standardised_result ) if self . score_order != \"ascending\" else self . _assess_with_threshold_ascending_order ( standardised_result , ) )","title":"AssessPrioritisationBase"},{"location":"api/pheval/analyse/assess_prioritisation_base/#src.pheval.analyse.assess_prioritisation_base.AssessPrioritisationBase.__init__","text":"Initialise AssessPrioritisationBase class Parameters: Name Type Description Default db_connection BenchmarkDBManager DB connection. required table_name str Table name. required column str Column name. 
required threshold float Threshold for scores required score_order str Score order for results, either ascending or descending required Source code in src/pheval/analyse/assess_prioritisation_base.py 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 def __init__ ( self , db_connection : BenchmarkDBManager , table_name : str , column : str , threshold : float , score_order : str , ): \"\"\" Initialise AssessPrioritisationBase class Args: db_connection (BenchmarkDBManager): DB connection. table_name (str): Table name. column (str): Column name. threshold (float): Threshold for scores score_order (str): Score order for results, either ascending or descending \"\"\" self . threshold = threshold self . score_order = score_order self . db_connection = db_connection self . conn = db_connection . conn self . column = column self . table_name = table_name db_connection . add_column_integer_default ( table_name = table_name , column = self . column , default = 0 )","title":"__init__"},{"location":"api/pheval/analyse/benchmark_db_manager/","text":"BenchmarkDBManager Class to connect to database. Source code in src/pheval/analyse/benchmark_db_manager.py 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 class BenchmarkDBManager : \"\"\" Class to connect to database. \"\"\" def __init__ ( self , benchmark_name : str ): \"\"\"Initialise the BenchmarkDBManager class.\"\"\" self . conn = self . get_connection ( f \" { benchmark_name } \" if str ( benchmark_name ) . 
endswith ( \".db\" ) else f \" { benchmark_name } .db\" ) def initialise ( self ): \"\"\"Initialise the duckdb connection.\"\"\" self . add_contains_function () @staticmethod def get_connection ( db_name : str ) -> DuckDBPyConnection : \"\"\" Get a connection to the database. Returns: DuckDBPyConnection: Connection to the database. \"\"\" conn = duckdb . connect ( db_name ) return conn def add_column_integer_default ( self , table_name : str , column : str , default : int = 0 ) -> None : \"\"\" Add a column to an existing table with an integer default value. Args: table_name (str): Name of the table. column (str): Name of the column to add. default (int): Default integer value to add. \"\"\" try : self . conn . execute ( f 'ALTER TABLE { table_name } ADD COLUMN \" { column } \" INTEGER DEFAULT { default } ' ) self . conn . execute ( f 'UPDATE { table_name } SET \" { column } \" = ?' , ( default ,)) self . conn . commit () except duckdb . CatalogException : pass def drop_table ( self , table_name : str ) -> None : \"\"\" Drop a table from the database. Args: table_name: Name of the table to drop from the database \"\"\" self . conn . execute ( f \"\"\"DROP TABLE IF EXISTS \" { table_name } \";\"\"\" ) @staticmethod def contains_entity_function ( entity : str , known_causative_entity : str ) -> bool : \"\"\" Determines if a known causative entity is present within an entity or list of entities. Args: entity (str): The entity to be checked. It can be a single entity or a string representation of a list. known_causative_entity (str): The entity to search for within the `entity`. Returns: bool: `True` if `known_causative_entity` is found in `entity` (or its list representation), `False` otherwise. \"\"\" list_pattern = re . compile ( r \"^\\[\\s*(?:[^\\[\\],\\s]+(?:\\s*,\\s*[^\\[\\],\\s]+)*)?\\s*]$\" ) if list_pattern . match ( str ( entity )): list_representation = ast . 
literal_eval ( entity ) if isinstance ( list_representation , list ): return known_causative_entity in list_representation return known_causative_entity == entity def add_contains_function ( self ) -> None : \"\"\" Adds a custom `contains_entity_function` to the DuckDB connection if it does not already exist. \"\"\" result = self . conn . execute ( \"SELECT * FROM duckdb_functions() WHERE function_name = ?\" , [ \"contains_entity_function\" ] ) . fetchall () if not result : self . conn . create_function ( \"contains_entity_function\" , self . contains_entity_function ) def parse_table_into_dataclass ( self , table_name : str , dataclass : Union [ Type [ RankedPhEvalGeneResult ], Type [ RankedPhEvalVariantResult ], Type [ RankedPhEvalDiseaseResult ], ], ) -> Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ]: \"\"\" Parses a DuckDB table into a list of dataclass instances. Args: table_name (str): The name of the DuckDB table to be parsed. dataclass (Union[Type[RankedPhEvalGeneResult], Type[RankedPhEvalVariantResult], Type[RankedPhEvalDiseaseResult]]): The dataclass type to which each row in the table should be mapped. Returns: List[dataclass]: A list of instances of the provided dataclass, each representing a row from the table. \"\"\" result = ( self . conn . execute ( f \"SELECT * FROM ' { table_name } '\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) return [ dataclass ( ** row ) for row in result ] def check_table_exists ( self , table_name : str ) -> bool : \"\"\" Check if a table exists in the connected DuckDB database. Args: table_name (str): The name of the table to check for existence. Returns: bool: Returns `True` if the table exists in the database, `False` otherwise. \"\"\" result = self . conn . execute ( f \"SELECT * FROM information_schema.tables WHERE table_name = ' { table_name } '\" ) . 
fetchall () if result : return True return False def close ( self ): \"\"\"Close the connection to the database.\"\"\" self . conn . close () __init__ ( benchmark_name ) Initialise the BenchmarkDBManager class. Source code in src/pheval/analyse/benchmark_db_manager.py 20 21 22 23 24 def __init__ ( self , benchmark_name : str ): \"\"\"Initialise the BenchmarkDBManager class.\"\"\" self . conn = self . get_connection ( f \" { benchmark_name } \" if str ( benchmark_name ) . endswith ( \".db\" ) else f \" { benchmark_name } .db\" ) add_column_integer_default ( table_name , column , default = 0 ) Add a column to an existing table with an integer default value. Args: table_name (str): Name of the table. column (str): Name of the column to add. default (int): Default integer value to add. Source code in src/pheval/analyse/benchmark_db_manager.py 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 def add_column_integer_default ( self , table_name : str , column : str , default : int = 0 ) -> None : \"\"\" Add a column to an existing table with an integer default value. Args: table_name (str): Name of the table. column (str): Name of the column to add. default (int): Default integer value to add. \"\"\" try : self . conn . execute ( f 'ALTER TABLE { table_name } ADD COLUMN \" { column } \" INTEGER DEFAULT { default } ' ) self . conn . execute ( f 'UPDATE { table_name } SET \" { column } \" = ?' , ( default ,)) self . conn . commit () except duckdb . CatalogException : pass add_contains_function () Adds a custom contains_entity_function to the DuckDB connection if it does not already exist. Source code in src/pheval/analyse/benchmark_db_manager.py 84 85 86 87 88 89 90 91 92 def add_contains_function ( self ) -> None : \"\"\" Adds a custom `contains_entity_function` to the DuckDB connection if it does not already exist. \"\"\" result = self . conn . execute ( \"SELECT * FROM duckdb_functions() WHERE function_name = ?\" , [ \"contains_entity_function\" ] ) . 
fetchall () if not result : self . conn . create_function ( \"contains_entity_function\" , self . contains_entity_function ) check_table_exists ( table_name ) Check if a table exists in the connected DuckDB database. Args: table_name (str): The name of the table to check for existence. Returns: bool: Returns True if the table exists in the database, False otherwise. Source code in src/pheval/analyse/benchmark_db_manager.py 123 124 125 126 127 128 129 130 131 132 133 134 135 136 def check_table_exists ( self , table_name : str ) -> bool : \"\"\" Check if a table exists in the connected DuckDB database. Args: table_name (str): The name of the table to check for existence. Returns: bool: Returns `True` if the table exists in the database, `False` otherwise. \"\"\" result = self . conn . execute ( f \"SELECT * FROM information_schema.tables WHERE table_name = ' { table_name } '\" ) . fetchall () if result : return True return False close () Close the connection to the database. Source code in src/pheval/analyse/benchmark_db_manager.py 138 139 140 def close ( self ): \"\"\"Close the connection to the database.\"\"\" self . conn . close () contains_entity_function ( entity , known_causative_entity ) staticmethod Determines if a known causative entity is present within an entity or list of entities. Args: entity (str): The entity to be checked. It can be a single entity or a string representation of a list. known_causative_entity (str): The entity to search for within the entity . Returns: Name Type Description bool bool True if known_causative_entity is found in entity (or its list representation), False otherwise. Source code in src/pheval/analyse/benchmark_db_manager.py 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 @staticmethod def contains_entity_function ( entity : str , known_causative_entity : str ) -> bool : \"\"\" Determines if a known causative entity is present within an entity or list of entities. Args: entity (str): The entity to be checked. 
It can be a single entity or a string representation of a list. known_causative_entity (str): The entity to search for within the `entity`. Returns: bool: `True` if `known_causative_entity` is found in `entity` (or its list representation), `False` otherwise. \"\"\" list_pattern = re . compile ( r \"^\\[\\s*(?:[^\\[\\],\\s]+(?:\\s*,\\s*[^\\[\\],\\s]+)*)?\\s*]$\" ) if list_pattern . match ( str ( entity )): list_representation = ast . literal_eval ( entity ) if isinstance ( list_representation , list ): return known_causative_entity in list_representation return known_causative_entity == entity drop_table ( table_name ) Drop a table from the database. Args: table_name: Name of the table to drop from the database Source code in src/pheval/analyse/benchmark_db_manager.py 57 58 59 60 61 62 63 def drop_table ( self , table_name : str ) -> None : \"\"\" Drop a table from the database. Args: table_name: Name of the table to drop from the database \"\"\" self . conn . execute ( f \"\"\"DROP TABLE IF EXISTS \" { table_name } \";\"\"\" ) get_connection ( db_name ) staticmethod Get a connection to the database. Returns: DuckDBPyConnection: Connection to the database. Source code in src/pheval/analyse/benchmark_db_manager.py 30 31 32 33 34 35 36 37 38 @staticmethod def get_connection ( db_name : str ) -> DuckDBPyConnection : \"\"\" Get a connection to the database. Returns: DuckDBPyConnection: Connection to the database. \"\"\" conn = duckdb . connect ( db_name ) return conn initialise () Initialise the duckdb connection. Source code in src/pheval/analyse/benchmark_db_manager.py 26 27 28 def initialise ( self ): \"\"\"Initialise the duckdb connection.\"\"\" self . add_contains_function () parse_table_into_dataclass ( table_name , dataclass ) Parses a DuckDB table into a list of dataclass instances. Args: table_name (str): The name of the DuckDB table to be parsed. 
dataclass (Union[Type[RankedPhEvalGeneResult], Type[RankedPhEvalVariantResult], Type[RankedPhEvalDiseaseResult]]): The dataclass type to which each row in the table should be mapped. Returns: Type Description Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ]] List[dataclass]: A list of instances of the provided dataclass, each representing a row from the table. Source code in src/pheval/analyse/benchmark_db_manager.py 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 def parse_table_into_dataclass ( self , table_name : str , dataclass : Union [ Type [ RankedPhEvalGeneResult ], Type [ RankedPhEvalVariantResult ], Type [ RankedPhEvalDiseaseResult ], ], ) -> Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ]: \"\"\" Parses a DuckDB table into a list of dataclass instances. Args: table_name (str): The name of the DuckDB table to be parsed. dataclass (Union[Type[RankedPhEvalGeneResult], Type[RankedPhEvalVariantResult], Type[RankedPhEvalDiseaseResult]]): The dataclass type to which each row in the table should be mapped. Returns: List[dataclass]: A list of instances of the provided dataclass, each representing a row from the table. \"\"\" result = ( self . conn . execute ( f \"SELECT * FROM ' { table_name } '\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) return [ dataclass ( ** row ) for row in result ]","title":"Benchmark db manager"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager","text":"Class to connect to database. 
Source code in src/pheval/analyse/benchmark_db_manager.py 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 class BenchmarkDBManager : \"\"\" Class to connect to database. \"\"\" def __init__ ( self , benchmark_name : str ): \"\"\"Initialise the BenchmarkDBManager class.\"\"\" self . conn = self . get_connection ( f \" { benchmark_name } \" if str ( benchmark_name ) . endswith ( \".db\" ) else f \" { benchmark_name } .db\" ) def initialise ( self ): \"\"\"Initialise the duckdb connection.\"\"\" self . add_contains_function () @staticmethod def get_connection ( db_name : str ) -> DuckDBPyConnection : \"\"\" Get a connection to the database. Returns: DuckDBPyConnection: Connection to the database. \"\"\" conn = duckdb . connect ( db_name ) return conn def add_column_integer_default ( self , table_name : str , column : str , default : int = 0 ) -> None : \"\"\" Add a column to an existing table with an integer default value. Args: table_name (str): Name of the table. column (str): Name of the column to add. default (int): Default integer value to add. \"\"\" try : self . conn . execute ( f 'ALTER TABLE { table_name } ADD COLUMN \" { column } \" INTEGER DEFAULT { default } ' ) self . conn . execute ( f 'UPDATE { table_name } SET \" { column } \" = ?' , ( default ,)) self . conn . commit () except duckdb . CatalogException : pass def drop_table ( self , table_name : str ) -> None : \"\"\" Drop a table from the database. Args: table_name: Name of the table to drop from the database \"\"\" self . conn . 
execute ( f \"\"\"DROP TABLE IF EXISTS \" { table_name } \";\"\"\" ) @staticmethod def contains_entity_function ( entity : str , known_causative_entity : str ) -> bool : \"\"\" Determines if a known causative entity is present within an entity or list of entities. Args: entity (str): The entity to be checked. It can be a single entity or a string representation of a list. known_causative_entity (str): The entity to search for within the `entity`. Returns: bool: `True` if `known_causative_entity` is found in `entity` (or its list representation), `False` otherwise. \"\"\" list_pattern = re . compile ( r \"^\\[\\s*(?:[^\\[\\],\\s]+(?:\\s*,\\s*[^\\[\\],\\s]+)*)?\\s*]$\" ) if list_pattern . match ( str ( entity )): list_representation = ast . literal_eval ( entity ) if isinstance ( list_representation , list ): return known_causative_entity in list_representation return known_causative_entity == entity def add_contains_function ( self ) -> None : \"\"\" Adds a custom `contains_entity_function` to the DuckDB connection if it does not already exist. \"\"\" result = self . conn . execute ( \"SELECT * FROM duckdb_functions() WHERE function_name = ?\" , [ \"contains_entity_function\" ] ) . fetchall () if not result : self . conn . create_function ( \"contains_entity_function\" , self . contains_entity_function ) def parse_table_into_dataclass ( self , table_name : str , dataclass : Union [ Type [ RankedPhEvalGeneResult ], Type [ RankedPhEvalVariantResult ], Type [ RankedPhEvalDiseaseResult ], ], ) -> Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ]: \"\"\" Parses a DuckDB table into a list of dataclass instances. Args: table_name (str): The name of the DuckDB table to be parsed. dataclass (Union[Type[RankedPhEvalGeneResult], Type[RankedPhEvalVariantResult], Type[RankedPhEvalDiseaseResult]]): The dataclass type to which each row in the table should be mapped. 
Returns: List[dataclass]: A list of instances of the provided dataclass, each representing a row from the table. \"\"\" result = ( self . conn . execute ( f \"SELECT * FROM ' { table_name } '\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) return [ dataclass ( ** row ) for row in result ] def check_table_exists ( self , table_name : str ) -> bool : \"\"\" Check if a table exists in the connected DuckDB database. Args: table_name (str): The name of the table to check for existence. Returns: bool: Returns `True` if the table exists in the database, `False` otherwise. \"\"\" result = self . conn . execute ( f \"SELECT * FROM information_schema.tables WHERE table_name = ' { table_name } '\" ) . fetchall () if result : return True return False def close ( self ): \"\"\"Close the connection to the database.\"\"\" self . conn . close ()","title":"BenchmarkDBManager"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.__init__","text":"Initialise the BenchmarkDBManager class. Source code in src/pheval/analyse/benchmark_db_manager.py 20 21 22 23 24 def __init__ ( self , benchmark_name : str ): \"\"\"Initialise the BenchmarkDBManager class.\"\"\" self . conn = self . get_connection ( f \" { benchmark_name } \" if str ( benchmark_name ) . endswith ( \".db\" ) else f \" { benchmark_name } .db\" )","title":"__init__"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.add_column_integer_default","text":"Add a column to an existing table with an integer default value. Args: table_name (str): Name of the table. column (str): Name of the column to add. default (int): Default integer value to add. 
Source code in src/pheval/analyse/benchmark_db_manager.py 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 def add_column_integer_default ( self , table_name : str , column : str , default : int = 0 ) -> None : \"\"\" Add a column to an existing table with an integer default value. Args: table_name (str): Name of the table. column (str): Name of the column to add. default (int): Default integer value to add. \"\"\" try : self . conn . execute ( f 'ALTER TABLE { table_name } ADD COLUMN \" { column } \" INTEGER DEFAULT { default } ' ) self . conn . execute ( f 'UPDATE { table_name } SET \" { column } \" = ?' , ( default ,)) self . conn . commit () except duckdb . CatalogException : pass","title":"add_column_integer_default"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.add_contains_function","text":"Adds a custom contains_entity_function to the DuckDB connection if it does not already exist. Source code in src/pheval/analyse/benchmark_db_manager.py 84 85 86 87 88 89 90 91 92 def add_contains_function ( self ) -> None : \"\"\" Adds a custom `contains_entity_function` to the DuckDB connection if it does not already exist. \"\"\" result = self . conn . execute ( \"SELECT * FROM duckdb_functions() WHERE function_name = ?\" , [ \"contains_entity_function\" ] ) . fetchall () if not result : self . conn . create_function ( \"contains_entity_function\" , self . contains_entity_function )","title":"add_contains_function"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.check_table_exists","text":"Check if a table exists in the connected DuckDB database. Args: table_name (str): The name of the table to check for existence. Returns: bool: Returns True if the table exists in the database, False otherwise. 
Source code in src/pheval/analyse/benchmark_db_manager.py 123 124 125 126 127 128 129 130 131 132 133 134 135 136 def check_table_exists ( self , table_name : str ) -> bool : \"\"\" Check if a table exists in the connected DuckDB database. Args: table_name (str): The name of the table to check for existence. Returns: bool: Returns `True` if the table exists in the database, `False` otherwise. \"\"\" result = self . conn . execute ( f \"SELECT * FROM information_schema.tables WHERE table_name = ' { table_name } '\" ) . fetchall () if result : return True return False","title":"check_table_exists"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.close","text":"Close the connection to the database. Source code in src/pheval/analyse/benchmark_db_manager.py 138 139 140 def close ( self ): \"\"\"Close the connection to the database.\"\"\" self . conn . close ()","title":"close"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.contains_entity_function","text":"Determines if a known causative entity is present within an entity or list of entities. Args: entity (str): The entity to be checked. It can be a single entity or a string representation of a list. known_causative_entity (str): The entity to search for within the entity . Returns: Name Type Description bool bool True if known_causative_entity is found in entity (or its list representation), False otherwise. Source code in src/pheval/analyse/benchmark_db_manager.py 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 @staticmethod def contains_entity_function ( entity : str , known_causative_entity : str ) -> bool : \"\"\" Determines if a known causative entity is present within an entity or list of entities. Args: entity (str): The entity to be checked. It can be a single entity or a string representation of a list. 
known_causative_entity (str): The entity to search for within the `entity`. Returns: bool: `True` if `known_causative_entity` is found in `entity` (or its list representation), `False` otherwise. \"\"\" list_pattern = re . compile ( r \"^\\[\\s*(?:[^\\[\\],\\s]+(?:\\s*,\\s*[^\\[\\],\\s]+)*)?\\s*]$\" ) if list_pattern . match ( str ( entity )): list_representation = ast . literal_eval ( entity ) if isinstance ( list_representation , list ): return known_causative_entity in list_representation return known_causative_entity == entity","title":"contains_entity_function"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.drop_table","text":"Drop a table from the database. Args: table_name: Name of the table to drop from the database Source code in src/pheval/analyse/benchmark_db_manager.py 57 58 59 60 61 62 63 def drop_table ( self , table_name : str ) -> None : \"\"\" Drop a table from the database. Args: table_name: Name of the table to drop from the database \"\"\" self . conn . execute ( f \"\"\"DROP TABLE IF EXISTS \" { table_name } \";\"\"\" )","title":"drop_table"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.get_connection","text":"Get a connection to the database. Returns: DuckDBPyConnection: Connection to the database. Source code in src/pheval/analyse/benchmark_db_manager.py 30 31 32 33 34 35 36 37 38 @staticmethod def get_connection ( db_name : str ) -> DuckDBPyConnection : \"\"\" Get a connection to the database. Returns: DuckDBPyConnection: Connection to the database. \"\"\" conn = duckdb . connect ( db_name ) return conn","title":"get_connection"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.initialise","text":"Initialise the duckdb connection. 
Source code in src/pheval/analyse/benchmark_db_manager.py 26 27 28 def initialise ( self ): \"\"\"Initialise the duckdb connection.\"\"\" self . add_contains_function ()","title":"initialise"},{"location":"api/pheval/analyse/benchmark_db_manager/#src.pheval.analyse.benchmark_db_manager.BenchmarkDBManager.parse_table_into_dataclass","text":"Parses a DuckDB table into a list of dataclass instances. Args: table_name (str): The name of the DuckDB table to be parsed. dataclass (Union[Type[RankedPhEvalGeneResult], Type[RankedPhEvalVariantResult], Type[RankedPhEvalDiseaseResult]]): The dataclass type to which each row in the table should be mapped. Returns: Type Description Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ]] List[dataclass]: A list of instances of the provided dataclass, each representing a row from the table. Source code in src/pheval/analyse/benchmark_db_manager.py 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 def parse_table_into_dataclass ( self , table_name : str , dataclass : Union [ Type [ RankedPhEvalGeneResult ], Type [ RankedPhEvalVariantResult ], Type [ RankedPhEvalDiseaseResult ], ], ) -> Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ]: \"\"\" Parses a DuckDB table into a list of dataclass instances. Args: table_name (str): The name of the DuckDB table to be parsed. dataclass (Union[Type[RankedPhEvalGeneResult], Type[RankedPhEvalVariantResult], Type[RankedPhEvalDiseaseResult]]): The dataclass type to which each row in the table should be mapped. Returns: List[dataclass]: A list of instances of the provided dataclass, each representing a row from the table. \"\"\" result = ( self . conn . execute ( f \"SELECT * FROM ' { table_name } '\" ) . fetchdf () . 
to_dict ( orient = \"records\" ) ) return [ dataclass ( ** row ) for row in result ]","title":"parse_table_into_dataclass"},{"location":"api/pheval/analyse/benchmark_generator/","text":"BenchmarkRunOutputGenerator dataclass Base class for recording data required for generating benchmarking outputs. Attributes: Name Type Description plot_customisation SinglePlotCustomisation Customisation for plot. prioritisation_type_string str Prioritisation type string. y_label str Label for the y-axis in benchmarking outputs. generate_benchmark_run_results Callable Callable to generate benchmark run results. Takes parameters: input and results directory, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file str Suffix for the rank comparison file. Source code in src/pheval/analyse/benchmark_generator.py 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 @dataclass class BenchmarkRunOutputGenerator : \"\"\"Base class for recording data required for generating benchmarking outputs. Attributes: plot_customisation (SinglePlotCustomisation): Customisation for plot. prioritisation_type_string (str): Prioritisation type string. y_label (str): Label for the y-axis in benchmarking outputs. generate_benchmark_run_results (Callable): Callable to generate benchmark run results. Takes parameters: input and results directory, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file (str): Suffix for the rank comparison file. \"\"\" plot_customisation : SinglePlotCustomisation prioritisation_type_string : str y_label : str generate_benchmark_run_results : Callable [[ str , RunConfig , str , float ], BenchmarkRunResults ] stats_comparison_file : str DiseaseBenchmarkRunOutputGenerator dataclass Bases: BenchmarkRunOutputGenerator Subclass of BenchmarkRunOutputGenerator specialised for producing disease prioritisation benchmarking outputs. 
This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for disease prioritisation benchmarking. Attributes: Name Type Description plot_customisation SinglePlotCustomisation Customisation for plot. prioritisation_type_string str Prioritisation type string. Defaults to DISEASE_PRIORITISATION_TYPE_STR. y_label str Label for the y-axis in disease prioritisation benchmarking outputs. Defaults to DISEASE_PLOT_Y_LABEL. generate_benchmark_run_results Callable Callable to generate disease prioritisation benchmark run results. Defaults to benchmark_disease_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file str Suffix for the disease rank comparison file. Defaults to \"-disease_summary\". Source code in src/pheval/analyse/benchmark_generator.py 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 @dataclass class DiseaseBenchmarkRunOutputGenerator ( BenchmarkRunOutputGenerator ): \"\"\" Subclass of BenchmarkRunOutputGenerator specialised for producing disease prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for disease prioritisation benchmarking. Attributes: plot_customisation (SinglePlotCustomisation): Customisation for plot. prioritisation_type_string (str): Prioritisation type string. Defaults to DISEASE_PRIORITISATION_TYPE_STR. y_label (str): Label for the y-axis in disease prioritisation benchmarking outputs. Defaults to DISEASE_PLOT_Y_LABEL. generate_benchmark_run_results (Callable): Callable to generate disease prioritisation benchmark run results. Defaults to benchmark_disease_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file (str): Suffix for the disease rank comparison file. 
Defaults to \"-disease_summary\". \"\"\" plot_customisation : SinglePlotCustomisation = None prioritisation_type_string : str = \"disease\" y_label : str = \"Known diseases (%)\" generate_benchmark_run_results : Callable [[ str , RunConfig , str , float ], BenchmarkRunResults ] = ( benchmark_disease_prioritisation ) stats_comparison_file : str = \"disease_summary\" GeneBenchmarkRunOutputGenerator dataclass Bases: BenchmarkRunOutputGenerator Subclass of BenchmarkRunOutputGenerator specialised for producing gene prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for gene prioritisation benchmarking. Attributes: Name Type Description plot_customisation SinglePlotCustomisation Customisation for plot. prioritisation_type_string str Prioritisation type string. Defaults to GENE_PRIORITISATION_TYPE_STR. y_label str Label for the y-axis in gene prioritisation benchmarking outputs. Defaults to GENE_PLOT_Y_LABEL. generate_benchmark_run_results Callable Callable to generate gene prioritisation benchmark run results. Defaults to benchmark_gene_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file str Suffix for the gene rank comparison file. Defaults to \"-gene_summary\". Source code in src/pheval/analyse/benchmark_generator.py 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 @dataclass class GeneBenchmarkRunOutputGenerator ( BenchmarkRunOutputGenerator ): \"\"\" Subclass of BenchmarkRunOutputGenerator specialised for producing gene prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for gene prioritisation benchmarking. Attributes: plot_customisation (SinglePlotCustomisation): Customisation for plot. prioritisation_type_string (str): Prioritisation type string. 
Defaults to GENE_PRIORITISATION_TYPE_STR. y_label (str): Label for the y-axis in gene prioritisation benchmarking outputs. Defaults to GENE_PLOT_Y_LABEL. generate_benchmark_run_results (Callable): Callable to generate gene prioritisation benchmark run results. Defaults to benchmark_gene_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file (str): Suffix for the gene rank comparison file. Defaults to \"-gene_summary\". \"\"\" plot_customisation : SinglePlotCustomisation = None prioritisation_type_string : str = \"gene\" y_label : str = \"Disease-causing genes (%)\" generate_benchmark_run_results : Callable [[ str , RunConfig , str , float ], BenchmarkRunResults ] = ( benchmark_gene_prioritisation ) stats_comparison_file : str = \"gene_summary\" VariantBenchmarkRunOutputGenerator dataclass Bases: BenchmarkRunOutputGenerator Subclass of BenchmarkRunOutputGenerator specialised for producing variant prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for variant prioritisation benchmarking. Attributes: Name Type Description plot_customisation SinglePlotCustomisation Customisation for plot. prioritisation_type_string str Prioritisation type string. Defaults to VARIANT_PRIORITISATION_TYPE_STR. y_label str Label for the y-axis in variant prioritisation benchmarking outputs. Defaults to VARIANT_PLOT_Y_LABEL. generate_benchmark_run_results Callable Callable to generate variant prioritisation benchmark run results. Defaults to benchmark_variant_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file str Suffix for the variant rank comparison file. Defaults to \"-variant_summary\". 
Source code in src/pheval/analyse/benchmark_generator.py 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 @dataclass class VariantBenchmarkRunOutputGenerator ( BenchmarkRunOutputGenerator ): \"\"\" Subclass of BenchmarkRunOutputGenerator specialised for producing variant prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for variant prioritisation benchmarking. Attributes: plot_customisation (SinglePlotCustomisation): Customisation for plot. prioritisation_type_string (str): Prioritisation type string. Defaults to VARIANT_PRIORITISATION_TYPE_STR. y_label (str): Label for the y-axis in variant prioritisation benchmarking outputs. Defaults to VARIANT_PLOT_Y_LABEL. generate_benchmark_run_results (Callable): Callable to generate variant prioritisation benchmark run results. Defaults to benchmark_variant_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file (str): Suffix for the variant rank comparison file. Defaults to \"-variant_summary\". \"\"\" plot_customisation : SinglePlotCustomisation = None prioritisation_type_string : str = \"variant\" y_label : str = \"Disease-causing variants (%)\" generate_benchmark_run_results : Callable [[ str , RunConfig , str , float ], BenchmarkRunResults ] = ( benchmark_variant_prioritisation ) stats_comparison_file : str = \"variant_summary\"","title":"Benchmark generator"},{"location":"api/pheval/analyse/benchmark_generator/#src.pheval.analyse.benchmark_generator.BenchmarkRunOutputGenerator","text":"Base class for recording data required for generating benchmarking outputs. Attributes: Name Type Description plot_customisation SinglePlotCustomisation Customisation for plot. prioritisation_type_string str Prioritisation type string. y_label str Label for the y-axis in benchmarking outputs. 
generate_benchmark_run_results Callable Callable to generate benchmark run results. Takes parameters: input and results directory, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file str Suffix for the rank comparison file. Source code in src/pheval/analyse/benchmark_generator.py 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 @dataclass class BenchmarkRunOutputGenerator : \"\"\"Base class for recording data required for generating benchmarking outputs. Attributes: plot_customisation (SinglePlotCustomisation): Customisation for plot. prioritisation_type_string (str): Prioritisation type string. y_label (str): Label for the y-axis in benchmarking outputs. generate_benchmark_run_results (Callable): Callable to generate benchmark run results. Takes parameters: input and results directory, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file (str): Suffix for the rank comparison file. \"\"\" plot_customisation : SinglePlotCustomisation prioritisation_type_string : str y_label : str generate_benchmark_run_results : Callable [[ str , RunConfig , str , float ], BenchmarkRunResults ] stats_comparison_file : str","title":"BenchmarkRunOutputGenerator"},{"location":"api/pheval/analyse/benchmark_generator/#src.pheval.analyse.benchmark_generator.DiseaseBenchmarkRunOutputGenerator","text":"Bases: BenchmarkRunOutputGenerator Subclass of BenchmarkRunOutputGenerator specialised for producing disease prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for disease prioritisation benchmarking. Attributes: Name Type Description plot_customisation SinglePlotCustomisation Customisation for plot. prioritisation_type_string str Prioritisation type string. Defaults to DISEASE_PRIORITISATION_TYPE_STR. y_label str Label for the y-axis in disease prioritisation benchmarking outputs. Defaults to DISEASE_PLOT_Y_LABEL. 
generate_benchmark_run_results Callable Callable to generate disease prioritisation benchmark run results. Defaults to benchmark_disease_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file str Suffix for the disease rank comparison file. Defaults to \"-disease_summary\". Source code in src/pheval/analyse/benchmark_generator.py 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 @dataclass class DiseaseBenchmarkRunOutputGenerator ( BenchmarkRunOutputGenerator ): \"\"\" Subclass of BenchmarkRunOutputGenerator specialised for producing disease prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for disease prioritisation benchmarking. Attributes: plot_customisation (SinglePlotCustomisation): Customisation for plot. prioritisation_type_string (str): Prioritisation type string. Defaults to DISEASE_PRIORITISATION_TYPE_STR. y_label (str): Label for the y-axis in disease prioritisation benchmarking outputs. Defaults to DISEASE_PLOT_Y_LABEL. generate_benchmark_run_results (Callable): Callable to generate disease prioritisation benchmark run results. Defaults to benchmark_disease_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file (str): Suffix for the disease rank comparison file. Defaults to \"-disease_summary\". 
\"\"\" plot_customisation : SinglePlotCustomisation = None prioritisation_type_string : str = \"disease\" y_label : str = \"Known diseases (%)\" generate_benchmark_run_results : Callable [[ str , RunConfig , str , float ], BenchmarkRunResults ] = ( benchmark_disease_prioritisation ) stats_comparison_file : str = \"disease_summary\"","title":"DiseaseBenchmarkRunOutputGenerator"},{"location":"api/pheval/analyse/benchmark_generator/#src.pheval.analyse.benchmark_generator.GeneBenchmarkRunOutputGenerator","text":"Bases: BenchmarkRunOutputGenerator Subclass of BenchmarkRunOutputGenerator specialised for producing gene prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for gene prioritisation benchmarking. Attributes: Name Type Description plot_customisation SinglePlotCustomisation Customisation for plot. prioritisation_type_string str Prioritisation type string. Defaults to GENE_PRIORITISATION_TYPE_STR. y_label str Label for the y-axis in gene prioritisation benchmarking outputs. Defaults to GENE_PLOT_Y_LABEL. generate_benchmark_run_results Callable Callable to generate gene prioritisation benchmark run results. Defaults to benchmark_gene_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file str Suffix for the gene rank comparison file. Defaults to \"-gene_summary\". Source code in src/pheval/analyse/benchmark_generator.py 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 @dataclass class GeneBenchmarkRunOutputGenerator ( BenchmarkRunOutputGenerator ): \"\"\" Subclass of BenchmarkRunOutputGenerator specialised for producing gene prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for gene prioritisation benchmarking. 
Attributes: plot_customisation (SinglePlotCustomisation): Customisation for plot. prioritisation_type_string (str): Prioritisation type string. Defaults to GENE_PRIORITISATION_TYPE_STR. y_label (str): Label for the y-axis in gene prioritisation benchmarking outputs. Defaults to GENE_PLOT_Y_LABEL. generate_benchmark_run_results (Callable): Callable to generate gene prioritisation benchmark run results. Defaults to benchmark_gene_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file (str): Suffix for the gene rank comparison file. Defaults to \"-gene_summary\". \"\"\" plot_customisation : SinglePlotCustomisation = None prioritisation_type_string : str = \"gene\" y_label : str = \"Disease-causing genes (%)\" generate_benchmark_run_results : Callable [[ str , RunConfig , str , float ], BenchmarkRunResults ] = ( benchmark_gene_prioritisation ) stats_comparison_file : str = \"gene_summary\"","title":"GeneBenchmarkRunOutputGenerator"},{"location":"api/pheval/analyse/benchmark_generator/#src.pheval.analyse.benchmark_generator.VariantBenchmarkRunOutputGenerator","text":"Bases: BenchmarkRunOutputGenerator Subclass of BenchmarkRunOutputGenerator specialised for producing variant prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for variant prioritisation benchmarking. Attributes: Name Type Description plot_customisation SinglePlotCustomisation Customisation for plot. prioritisation_type_string str Prioritisation type string. Defaults to VARIANT_PRIORITISATION_TYPE_STR. y_label str Label for the y-axis in variant prioritisation benchmarking outputs. Defaults to VARIANT_PLOT_Y_LABEL. generate_benchmark_run_results Callable Callable to generate variant prioritisation benchmark run results. Defaults to benchmark_variant_prioritisation. 
Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file str Suffix for the variant rank comparison file. Defaults to \"-variant_summary\". Source code in src/pheval/analyse/benchmark_generator.py 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 @dataclass class VariantBenchmarkRunOutputGenerator ( BenchmarkRunOutputGenerator ): \"\"\" Subclass of BenchmarkRunOutputGenerator specialised for producing variant prioritisation benchmarking outputs. This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes specifically for variant prioritisation benchmarking. Attributes: plot_customisation (SinglePlotCustomisation): Customisation for plot. prioritisation_type_string (str): Prioritisation type string. Defaults to VARIANT_PRIORITISATION_TYPE_STR. y_label (str): Label for the y-axis in variant prioritisation benchmarking outputs. Defaults to VARIANT_PLOT_Y_LABEL. generate_benchmark_run_results (Callable): Callable to generate variant prioritisation benchmark run results. Defaults to benchmark_variant_prioritisation. Takes parameters: run configuration, score order, threshold, rank comparison, and returns BenchmarkRunResults. stats_comparison_file (str): Suffix for the variant rank comparison file. Defaults to \"-variant_summary\". \"\"\" plot_customisation : SinglePlotCustomisation = None prioritisation_type_string : str = \"variant\" y_label : str = \"Disease-causing variants (%)\" generate_benchmark_run_results : Callable [[ str , RunConfig , str , float ], BenchmarkRunResults ] = ( benchmark_variant_prioritisation ) stats_comparison_file : str = \"variant_summary\"","title":"VariantBenchmarkRunOutputGenerator"},{"location":"api/pheval/analyse/benchmarking_data/","text":"BenchmarkRunResults dataclass Benchmarking results for a run. Attributes: Name Type Description rank_stats RankStats Statistics related to benchmark. 
binary_classification_stats BinaryClassificationStats Binary statistics related to benchmark. results_dir Path Path to the result directory. Defaults to None. benchmark_name str Name of the benchmark run. Defaults to None. phenopacket_dir Path Path to the phenopacket directory. Defaults to None. Source code in src/pheval/analyse/benchmarking_data.py 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 @dataclass class BenchmarkRunResults : \"\"\" Benchmarking results for a run. Attributes: rank_stats (RankStats): Statistics related to benchmark. binary_classification_stats (BinaryClassificationStats): Binary statistics related to benchmark. results_dir (Path, optional): Path to the result directory. Defaults to None. benchmark_name (str, optional): Name of the benchmark run. Defaults to None. phenopacket_dir (Path, optional): Path to the phenopacket directory. Defaults to None. \"\"\" rank_stats : RankStats binary_classification_stats : BinaryClassificationStats results_dir : Path = None benchmark_name : str = None phenopacket_dir : Path = None","title":"Benchmarking data"},{"location":"api/pheval/analyse/benchmarking_data/#src.pheval.analyse.benchmarking_data.BenchmarkRunResults","text":"Benchmarking results for a run. Attributes: Name Type Description rank_stats RankStats Statistics related to benchmark. binary_classification_stats BinaryClassificationStats Binary statistics related to benchmark. results_dir Path Path to the result directory. Defaults to None. benchmark_name str Name of the benchmark run. Defaults to None. phenopacket_dir Path Path to the phenopacket directory. Defaults to None. Source code in src/pheval/analyse/benchmarking_data.py 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 @dataclass class BenchmarkRunResults : \"\"\" Benchmarking results for a run. Attributes: rank_stats (RankStats): Statistics related to benchmark. binary_classification_stats (BinaryClassificationStats): Binary statistics related to benchmark. 
results_dir (Path, optional): Path to the result directory. Defaults to None. benchmark_name (str, optional): Name of the benchmark run. Defaults to None. phenopacket_dir (Path, optional): Path to the phenopacket directory. Defaults to None. \"\"\" rank_stats : RankStats binary_classification_stats : BinaryClassificationStats results_dir : Path = None benchmark_name : str = None phenopacket_dir : Path = None","title":"BenchmarkRunResults"},{"location":"api/pheval/analyse/binary_classification_stats/","text":"BinaryClassificationStats dataclass A data class representing counts of different categories in binary classification. Attributes: Name Type Description true_positives int The count of true positive instances - i.e., the number of known entities ranked 1 in the results. true_negatives int The count of true negative instances - i.e., the number of non-relevant entities ranked at a position other than 1 in the results. false_positives int The count of false positive instances - i.e., the number of non-relevant entities ranked at position 1 in the results. false_negatives int The count of false negative instances - i.e., the number of known entities ranked at a position other than 1 in the results. 
Source code in src/pheval/analyse/binary_classification_stats.py 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 @dataclass class BinaryClassificationStats : \"\"\" A data class representing counts of different categories in binary classification. Attributes: true_positives (int): The count of true positive instances - i.e., the number of known entities ranked 1 in the results. true_negatives (int): The count of true negative instances - i.e., the number of non-relevant entities ranked at a position other than 1 in the results. false_positives (int): The count of false positive instances - i.e., the number of non-relevant entities ranked at position 1 in the results. false_negatives (int): The count of false negative instances - i.e., the number of known entities ranked at a position other than 1 in the results. 
\"\"\" true_positives : int = 0 true_negatives : int = 0 false_positives : int = 0 false_negatives : int = 0 labels : List = field ( default_factory = list ) scores : List = field ( default_factory = list ) @staticmethod def remove_relevant_ranks ( pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ) -> List [ int ]: \"\"\" Remove the relevant entity ranks from all result ranks Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. Returns: List[int]: A list of the ranks with the relevant entity ranks removed. \"\"\" all_result_ranks = [ pheval_result . rank for pheval_result in pheval_results ] for rank in relevant_ranks : if rank in all_result_ranks : all_result_ranks . remove ( rank ) continue return all_result_ranks def add_classification_for_known_entities ( self , relevant_ranks : List [ int ]) -> None : \"\"\" Update binary classification metrics for known entities based on their ranking. Args: relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" for rank in relevant_ranks : if rank == 1 : self . true_positives += 1 elif rank != 1 : self . false_negatives += 1 def add_classification_for_other_entities ( self , ranks : List [ int ]) -> None : \"\"\" Update binary classification metrics for other entities based on their ranking. Args: ranks (List[int]): A list of the ranks for all other entities. \"\"\" for rank in ranks : if rank == 1 : self . false_positives += 1 elif rank != 1 : self . 
true_negatives += 1 def add_labels_and_scores ( self , pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ): \"\"\" Adds scores and labels from the PhEval results. Args: pheval_results (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): List of all PhEval results relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" relevant_ranks_copy = relevant_ranks . copy () for result in pheval_results : self . scores . append ( result . score ) label = 1 if result . rank in relevant_ranks_copy else 0 self . labels . append ( label ) relevant_ranks_copy . remove ( result . rank ) if label == 1 else None def add_classification ( self , pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ) -> None : \"\"\" Update binary classification metrics for known and unknown entities based on their ranks. Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" self . add_classification_for_known_entities ( relevant_ranks ) self . add_classification_for_other_entities ( self . remove_relevant_ranks ( pheval_results , relevant_ranks ) ) self . add_labels_and_scores ( pheval_results , relevant_ranks ) def sensitivity ( self ) -> float : \"\"\" Calculate sensitivity. Sensitivity measures the proportion of actual positive instances correctly identified by the model. Returns: float: The sensitivity of the model, calculated as true positives divided by the sum of true positives and false negatives. Returns 0 if both true positives and false negatives are zero. \"\"\" return ( self . 
true_positives / ( self . true_positives + self . false_negatives ) if ( self . true_positives + self . false_negatives ) > 0 else 0.0 ) def specificity ( self ) -> float : \"\"\" Calculate specificity. Specificity measures the proportion of actual negative instances correctly identified by the model. Returns: float: The specificity of the model, calculated as true negatives divided by the sum of true negatives and false positives. Returns 0.0 if both true negatives and false positives are zero. \"\"\" return ( self . true_negatives / ( self . true_negatives + self . false_positives ) if ( self . true_negatives + self . false_positives ) > 0 else 0.0 ) def precision ( self ) -> float : \"\"\" Calculate precision. Precision measures the proportion of correctly predicted positive instances out of all instances predicted as positive. Returns: float: The precision of the model, calculated as true positives divided by the sum of true positives and false positives. Returns 0.0 if both true positives and false positives are zero. \"\"\" return ( self . true_positives / ( self . true_positives + self . false_positives ) if ( self . true_positives + self . false_positives ) > 0 else 0.0 ) def negative_predictive_value ( self ) -> float : \"\"\" Calculate Negative Predictive Value (NPV). NPV measures the proportion of correctly predicted negative instances out of all instances predicted negative. Returns: float: The Negative Predictive Value of the model, calculated as true negatives divided by the sum of true negatives and false negatives. Returns 0.0 if both true negatives and false negatives are zero. \"\"\" return ( self . true_negatives / ( self . true_negatives + self . false_negatives ) if ( self . true_negatives + self . false_negatives ) > 0 else 0.0 ) def false_positive_rate ( self ) -> float : \"\"\" Calculate False Positive Rate (FPR). FPR measures the proportion of instances predicted as positive that are actually negative. 
Returns: float: The False Positive Rate of the model, calculated as false positives divided by the sum of false positives and true negatives. Returns 0.0 if both false positives and true negatives are zero. \"\"\" return ( self . false_positives / ( self . false_positives + self . true_negatives ) if ( self . false_positives + self . true_negatives ) > 0 else 0.0 ) def false_discovery_rate ( self ) -> float : \"\"\" Calculate False Discovery Rate (FDR). FDR measures the proportion of instances predicted as positive that are actually negative. Returns: float: The False Discovery Rate of the model, calculated as false positives divided by the sum of false positives and true positives. Returns 0.0 if both false positives and true positives are zero. \"\"\" return ( self . false_positives / ( self . false_positives + self . true_positives ) if ( self . false_positives + self . true_positives ) > 0 else 0.0 ) def false_negative_rate ( self ) -> float : \"\"\" Calculate False Negative Rate (FNR). FNR measures the proportion of instances that are actually positive but predicted as negative. Returns: float: The False Negative Rate of the model, calculated as false negatives divided by the sum of false negatives and true positives. Returns 0.0 if both false negatives and true positives are zero. \"\"\" return ( self . false_negatives / ( self . false_negatives + self . true_positives ) if ( self . false_negatives + self . true_positives ) > 0 else 0.0 ) def accuracy ( self ) -> float : \"\"\" Calculate Accuracy. Accuracy measures the proportion of correctly predicted instances out of all instances. Returns: float: The Accuracy of the model, calculated as the sum of true positives and true negatives divided by the sum of true positives, false positives, true negatives, and false negatives. Returns 0.0 if the total sum of counts is zero. \"\"\" return ( ( self . true_positives + self . true_negatives ) / ( self . true_positives + self . false_positives + self . 
true_negatives + self . false_negatives ) if ( self . true_positives + self . false_negatives + self . true_negatives + self . false_negatives ) > 0 else 0.0 ) def f1_score ( self ) -> float : \"\"\" Calculate F1 Score. F1 Score is the harmonic mean of precision and recall, providing a balance between false positives and false negatives. Returns: float: The F1 Score of the model, calculated as 2 * TP / (2 * TP + FP + FN). Returns 0.0 if the denominator is zero. \"\"\" return ( ( 2 * self . true_positives ) / (( 2 * self . true_positives ) + self . false_positives + self . false_negatives ) if ( self . true_positives + self . false_positives + self . false_negatives ) > 0 else 0.0 ) def matthews_correlation_coefficient ( self ) -> float : \"\"\" Calculate Matthews Correlation Coefficient (MCC). MCC is a measure of the quality of binary classifications, accounting for imbalances in the data. Returns: float: The Matthews Correlation Coefficient of the model, calculated as ((TP * TN) - (FP * FN)) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)). Returns 0.0 if the denominator is zero. \"\"\" return ( ( ( self . true_positives * self . true_negatives ) - ( self . false_positives * self . false_negatives ) ) / ( sqrt ( ( self . true_positives + self . false_positives ) * ( self . true_positives + self . false_negatives ) * ( self . true_negatives + self . false_positives ) * ( self . true_negatives + self . false_negatives ) ) ) if ( self . true_positives + self . false_negatives + self . true_negatives + self . false_negatives ) > 0 else 0.0 ) accuracy () Calculate Accuracy. Accuracy measures the proportion of correctly predicted instances out of all instances. Returns: Name Type Description float float The Accuracy of the model, calculated as the sum of true positives and true negatives divided by float the sum of true positives, false positives, true negatives, and false negatives. float Returns 0.0 if the total sum of counts is zero. 
Source code in src/pheval/analyse/binary_classification_stats.py 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 def accuracy ( self ) -> float : \"\"\" Calculate Accuracy. Accuracy measures the proportion of correctly predicted instances out of all instances. Returns: float: The Accuracy of the model, calculated as the sum of true positives and true negatives divided by the sum of true positives, false positives, true negatives, and false negatives. Returns 0.0 if the total sum of counts is zero. \"\"\" return ( ( self . true_positives + self . true_negatives ) / ( self . true_positives + self . false_positives + self . true_negatives + self . false_negatives ) if ( self . true_positives + self . false_negatives + self . true_negatives + self . false_negatives ) > 0 else 0.0 ) add_classification ( pheval_results , relevant_ranks ) Update binary classification metrics for known and unknown entities based on their ranks. Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. Source code in src/pheval/analyse/binary_classification_stats.py 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 def add_classification ( self , pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ) -> None : \"\"\" Update binary classification metrics for known and unknown entities based on their ranks. Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" self . 
add_classification_for_known_entities ( relevant_ranks ) self . add_classification_for_other_entities ( self . remove_relevant_ranks ( pheval_results , relevant_ranks ) ) self . add_labels_and_scores ( pheval_results , relevant_ranks ) add_classification_for_known_entities ( relevant_ranks ) Update binary classification metrics for known entities based on their ranking. Parameters: Name Type Description Default relevant_ranks List [ int ] A list of the ranks associated with the known entities. required Source code in src/pheval/analyse/binary_classification_stats.py 63 64 65 66 67 68 69 70 71 72 73 74 def add_classification_for_known_entities ( self , relevant_ranks : List [ int ]) -> None : \"\"\" Update binary classification metrics for known entities based on their ranking. Args: relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" for rank in relevant_ranks : if rank == 1 : self . true_positives += 1 elif rank != 1 : self . false_negatives += 1 add_classification_for_other_entities ( ranks ) Update binary classification metrics for other entities based on their ranking. Parameters: Name Type Description Default ranks List [ int ] A list of the ranks for all other entities. required Source code in src/pheval/analyse/binary_classification_stats.py 76 77 78 79 80 81 82 83 84 85 86 87 def add_classification_for_other_entities ( self , ranks : List [ int ]) -> None : \"\"\" Update binary classification metrics for other entities based on their ranking. Args: ranks (List[int]): A list of the ranks for all other entities. \"\"\" for rank in ranks : if rank == 1 : self . false_positives += 1 elif rank != 1 : self . true_negatives += 1 add_labels_and_scores ( pheval_results , relevant_ranks ) Adds scores and labels from the PhEval results. Parameters: Name Type Description Default relevant_ranks List [ int ] A list of the ranks associated with the known entities. 
required Source code in src/pheval/analyse/binary_classification_stats.py 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 def add_labels_and_scores ( self , pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ): \"\"\" Adds scores and labels from the PhEval results. Args: pheval_results (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): List of all PhEval results relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" relevant_ranks_copy = relevant_ranks . copy () for result in pheval_results : self . scores . append ( result . score ) label = 1 if result . rank in relevant_ranks_copy else 0 self . labels . append ( label ) relevant_ranks_copy . remove ( result . rank ) if label == 1 else None f1_score () Calculate F1 Score. F1 Score is the harmonic mean of precision and recall, providing a balance between false positives and false negatives. Returns: Name Type Description float float The F1 Score of the model, calculated as 2 * TP / (2 * TP + FP + FN). float Returns 0.0 if the denominator is zero. Source code in src/pheval/analyse/binary_classification_stats.py 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 def f1_score ( self ) -> float : \"\"\" Calculate F1 Score. F1 Score is the harmonic mean of precision and recall, providing a balance between false positives and false negatives. Returns: float: The F1 Score of the model, calculated as 2 * TP / (2 * TP + FP + FN). Returns 0.0 if the denominator is zero. \"\"\" return ( ( 2 * self . true_positives ) / (( 2 * self . true_positives ) + self . false_positives + self . false_negatives ) if ( self . true_positives + self . false_positives + self . false_negatives ) > 0 else 0.0 ) false_discovery_rate () Calculate False Discovery Rate (FDR). 
FDR measures the proportion of instances predicted as positive that are actually negative. Returns: Name Type Description float float The False Discovery Rate of the model, calculated as false positives divided by the sum of float false positives and true positives. Returns 0.0 if both false positives and true positives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 def false_discovery_rate ( self ) -> float : \"\"\" Calculate False Discovery Rate (FDR). FDR measures the proportion of instances predicted as positive that are actually negative. Returns: float: The False Discovery Rate of the model, calculated as false positives divided by the sum of false positives and true positives. Returns 0.0 if both false positives and true positives are zero. \"\"\" return ( self . false_positives / ( self . false_positives + self . true_positives ) if ( self . false_positives + self . true_positives ) > 0 else 0.0 ) false_negative_rate () Calculate False Negative Rate (FNR). FNR measures the proportion of instances that are actually positive but predicted as negative. Returns: Name Type Description float float The False Negative Rate of the model, calculated as false negatives divided by the sum of float false negatives and true positives. Returns 0.0 if both false negatives and true positives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 def false_negative_rate ( self ) -> float : \"\"\" Calculate False Negative Rate (FNR). FNR measures the proportion of instances that are actually positive but predicted as negative. Returns: float: The False Negative Rate of the model, calculated as false negatives divided by the sum of false negatives and true positives. Returns 0.0 if both false negatives and true positives are zero. \"\"\" return ( self . false_negatives / ( self . false_negatives + self . 
true_positives ) if ( self . false_negatives + self . true_positives ) > 0 else 0.0 ) false_positive_rate () Calculate False Positive Rate (FPR). FPR measures the proportion of instances predicted as positive that are actually negative. Returns: Name Type Description float float The False Positive Rate of the model, calculated as false positives divided by the sum of float false positives and true negatives. Returns 0.0 if both false positives and true negatives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 def false_positive_rate ( self ) -> float : \"\"\" Calculate False Positive Rate (FPR). FPR measures the proportion of instances predicted as positive that are actually negative. Returns: float: The False Positive Rate of the model, calculated as false positives divided by the sum of false positives and true negatives. Returns 0.0 if both false positives and true negatives are zero. \"\"\" return ( self . false_positives / ( self . false_positives + self . true_negatives ) if ( self . false_positives + self . true_negatives ) > 0 else 0.0 ) matthews_correlation_coefficient () Calculate Matthews Correlation Coefficient (MCC). MCC is a measure of the quality of binary classifications, accounting for imbalances in the data. Returns: Name Type Description float float The Matthews Correlation Coefficient of the model, calculated as float ((TP * TN) - (FP * FN)) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)). float Returns 0.0 if the denominator is zero. Source code in src/pheval/analyse/binary_classification_stats.py 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 def matthews_correlation_coefficient ( self ) -> float : \"\"\" Calculate Matthews Correlation Coefficient (MCC). MCC is a measure of the quality of binary classifications, accounting for imbalances in the data. 
Returns: float: The Matthews Correlation Coefficient of the model, calculated as ((TP * TN) - (FP * FN)) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)). Returns 0.0 if the denominator is zero. \"\"\" return ( ( ( self . true_positives * self . true_negatives ) - ( self . false_positives * self . false_negatives ) ) / ( sqrt ( ( self . true_positives + self . false_positives ) * ( self . true_positives + self . false_negatives ) * ( self . true_negatives + self . false_positives ) * ( self . true_negatives + self . false_negatives ) ) ) if ( self . true_positives + self . false_negatives + self . true_negatives + self . false_negatives ) > 0 else 0.0 ) negative_predictive_value () Calculate Negative Predictive Value (NPV). NPV measures the proportion of correctly predicted negative instances out of all instances predicted negative. Returns: Name Type Description float float The Negative Predictive Value of the model, calculated as true negatives divided by the sum of float true negatives and false negatives. Returns 0.0 if both true negatives and false negatives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 def negative_predictive_value ( self ) -> float : \"\"\" Calculate Negative Predictive Value (NPV). NPV measures the proportion of correctly predicted negative instances out of all instances predicted negative. Returns: float: The Negative Predictive Value of the model, calculated as true negatives divided by the sum of true negatives and false negatives. Returns 0.0 if both true negatives and false negatives are zero. \"\"\" return ( self . true_negatives / ( self . true_negatives + self . false_negatives ) if ( self . true_negatives + self . false_negatives ) > 0 else 0.0 ) precision () Calculate precision. Precision measures the proportion of correctly predicted positive instances out of all instances predicted as positive. 
Returns: Name Type Description float float The precision of the model, calculated as true positives divided by the sum of true positives float and false positives. Returns 0.0 if both true positives and false positives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 def precision ( self ) -> float : \"\"\" Calculate precision. Precision measures the proportion of correctly predicted positive instances out of all instances predicted as positive. Returns: float: The precision of the model, calculated as true positives divided by the sum of true positives and false positives. Returns 0.0 if both true positives and false positives are zero. \"\"\" return ( self . true_positives / ( self . true_positives + self . false_positives ) if ( self . true_positives + self . false_positives ) > 0 else 0.0 ) remove_relevant_ranks ( pheval_results , relevant_ranks ) staticmethod Remove the relevant entity ranks from all result ranks Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. Returns: Type Description List [ int ] List[int]: A list of the ranks with the relevant entity ranks removed. Source code in src/pheval/analyse/binary_classification_stats.py 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 @staticmethod def remove_relevant_ranks ( pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ) -> List [ int ]: \"\"\" Remove the relevant entity ranks from all result ranks Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. 
relevant_ranks (List[int]): A list of the ranks associated with the known entities. Returns: List[int]: A list of the ranks with the relevant entity ranks removed. \"\"\" all_result_ranks = [ pheval_result . rank for pheval_result in pheval_results ] for rank in relevant_ranks : if rank in all_result_ranks : all_result_ranks . remove ( rank ) continue return all_result_ranks sensitivity () Calculate sensitivity. Sensitivity measures the proportion of actual positive instances correctly identified by the model. Returns: Name Type Description float float The sensitivity of the model, calculated as true positives divided by the sum of true positives float and false negatives. Returns 0 if both true positives and false negatives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 def sensitivity ( self ) -> float : \"\"\" Calculate sensitivity. Sensitivity measures the proportion of actual positive instances correctly identified by the model. Returns: float: The sensitivity of the model, calculated as true positives divided by the sum of true positives and false negatives. Returns 0 if both true positives and false negatives are zero. \"\"\" return ( self . true_positives / ( self . true_positives + self . false_negatives ) if ( self . true_positives + self . false_negatives ) > 0 else 0.0 ) specificity () Calculate specificity. Specificity measures the proportion of actual negative instances correctly identified by the model. Returns: Name Type Description float float The specificity of the model, calculated as true negatives divided by the sum of true negatives float and false positives. Returns 0.0 if both true negatives and false positives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 def specificity ( self ) -> float : \"\"\" Calculate specificity. 
Specificity measures the proportion of actual negative instances correctly identified by the model. Returns: float: The specificity of the model, calculated as true negatives divided by the sum of true negatives and false positives. Returns 0.0 if both true negatives and false positives are zero. \"\"\" return ( self . true_negatives / ( self . true_negatives + self . false_positives ) if ( self . true_negatives + self . false_positives ) > 0 else 0.0 )","title":"Binary classification stats"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats","text":"A data class representing counts of different categories in binary classification. Attributes: Name Type Description true_positives int The count of true positive instances - i.e., the number of known entities ranked 1 in the results. true_negatives int The count of true negative instances - i.e., the number of non-relevant entities ranked at a position other than 1 in the results. false_positives int The count of false positive instances - i.e., the number of non-relevant entities ranked at position 1 in the results. false_negatives int The count of false negative instances - i.e., the number of known entities ranked at a position other than 1 in the results. 
Source code in src/pheval/analyse/binary_classification_stats.py 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 @dataclass class BinaryClassificationStats : \"\"\" A data class representing counts of different categories in binary classification. Attributes: true_positives (int): The count of true positive instances - i.e., the number of known entities ranked 1 in the results. true_negatives (int): The count of true negative instances - i.e., the number of non-relevant entities ranked at a position other than 1 in the results. false_positives (int): The count of false positive instances - i.e., the number of non-relevant entities ranked at position 1 in the results. false_negatives (int): The count of false negative instances - i.e., the number of known entities ranked at a position other than 1 in the results. 
\"\"\" true_positives : int = 0 true_negatives : int = 0 false_positives : int = 0 false_negatives : int = 0 labels : List = field ( default_factory = list ) scores : List = field ( default_factory = list ) @staticmethod def remove_relevant_ranks ( pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ) -> List [ int ]: \"\"\" Remove the relevant entity ranks from all result ranks Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. Returns: List[int]: A list of the ranks with the relevant entity ranks removed. \"\"\" all_result_ranks = [ pheval_result . rank for pheval_result in pheval_results ] for rank in relevant_ranks : if rank in all_result_ranks : all_result_ranks . remove ( rank ) continue return all_result_ranks def add_classification_for_known_entities ( self , relevant_ranks : List [ int ]) -> None : \"\"\" Update binary classification metrics for known entities based on their ranking. Args: relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" for rank in relevant_ranks : if rank == 1 : self . true_positives += 1 elif rank != 1 : self . false_negatives += 1 def add_classification_for_other_entities ( self , ranks : List [ int ]) -> None : \"\"\" Update binary classification metrics for other entities based on their ranking. Args: ranks (List[int]): A list of the ranks for all other entities. \"\"\" for rank in ranks : if rank == 1 : self . false_positives += 1 elif rank != 1 : self . 
true_negatives += 1 def add_labels_and_scores ( self , pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ): \"\"\" Adds scores and labels from the PhEval results. Args: pheval_results (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): List of all PhEval results relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" relevant_ranks_copy = relevant_ranks . copy () for result in pheval_results : self . scores . append ( result . score ) label = 1 if result . rank in relevant_ranks_copy else 0 self . labels . append ( label ) relevant_ranks_copy . remove ( result . rank ) if label == 1 else None def add_classification ( self , pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ) -> None : \"\"\" Update binary classification metrics for known and unknown entities based on their ranks. Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" self . add_classification_for_known_entities ( relevant_ranks ) self . add_classification_for_other_entities ( self . remove_relevant_ranks ( pheval_results , relevant_ranks ) ) self . add_labels_and_scores ( pheval_results , relevant_ranks ) def sensitivity ( self ) -> float : \"\"\" Calculate sensitivity. Sensitivity measures the proportion of actual positive instances correctly identified by the model. Returns: float: The sensitivity of the model, calculated as true positives divided by the sum of true positives and false negatives. Returns 0 if both true positives and false negatives are zero. \"\"\" return ( self . 
true_positives / ( self . true_positives + self . false_negatives ) if ( self . true_positives + self . false_negatives ) > 0 else 0.0 ) def specificity ( self ) -> float : \"\"\" Calculate specificity. Specificity measures the proportion of actual negative instances correctly identified by the model. Returns: float: The specificity of the model, calculated as true negatives divided by the sum of true negatives and false positives. Returns 0.0 if both true negatives and false positives are zero. \"\"\" return ( self . true_negatives / ( self . true_negatives + self . false_positives ) if ( self . true_negatives + self . false_positives ) > 0 else 0.0 ) def precision ( self ) -> float : \"\"\" Calculate precision. Precision measures the proportion of correctly predicted positive instances out of all instances predicted as positive. Returns: float: The precision of the model, calculated as true positives divided by the sum of true positives and false positives. Returns 0.0 if both true positives and false positives are zero. \"\"\" return ( self . true_positives / ( self . true_positives + self . false_positives ) if ( self . true_positives + self . false_positives ) > 0 else 0.0 ) def negative_predictive_value ( self ) -> float : \"\"\" Calculate Negative Predictive Value (NPV). NPV measures the proportion of correctly predicted negative instances out of all instances predicted negative. Returns: float: The Negative Predictive Value of the model, calculated as true negatives divided by the sum of true negatives and false negatives. Returns 0.0 if both true negatives and false negatives are zero. \"\"\" return ( self . true_negatives / ( self . true_negatives + self . false_negatives ) if ( self . true_negatives + self . false_negatives ) > 0 else 0.0 ) def false_positive_rate ( self ) -> float : \"\"\" Calculate False Positive Rate (FPR). FPR measures the proportion of instances predicted as positive that are actually negative. 
Returns: float: The False Positive Rate of the model, calculated as false positives divided by the sum of false positives and true negatives. Returns 0.0 if both false positives and true negatives are zero. \"\"\" return ( self . false_positives / ( self . false_positives + self . true_negatives ) if ( self . false_positives + self . true_negatives ) > 0 else 0.0 ) def false_discovery_rate ( self ) -> float : \"\"\" Calculate False Discovery Rate (FDR). FDR measures the proportion of instances predicted as positive that are actually negative. Returns: float: The False Discovery Rate of the model, calculated as false positives divided by the sum of false positives and true positives. Returns 0.0 if both false positives and true positives are zero. \"\"\" return ( self . false_positives / ( self . false_positives + self . true_positives ) if ( self . false_positives + self . true_positives ) > 0 else 0.0 ) def false_negative_rate ( self ) -> float : \"\"\" Calculate False Negative Rate (FNR). FNR measures the proportion of instances that are actually positive but predicted as negative. Returns: float: The False Negative Rate of the model, calculated as false negatives divided by the sum of false negatives and true positives. Returns 0.0 if both false negatives and true positives are zero. \"\"\" return ( self . false_negatives / ( self . false_negatives + self . true_positives ) if ( self . false_negatives + self . true_positives ) > 0 else 0.0 ) def accuracy ( self ) -> float : \"\"\" Calculate Accuracy. Accuracy measures the proportion of correctly predicted instances out of all instances. Returns: float: The Accuracy of the model, calculated as the sum of true positives and true negatives divided by the sum of true positives, false positives, true negatives, and false negatives. Returns 0.0 if the total sum of counts is zero. \"\"\" return ( ( self . true_positives + self . true_negatives ) / ( self . true_positives + self . false_positives + self . 
true_negatives + self . false_negatives ) if ( self . true_positives + self . false_negatives + self . true_negatives + self . false_negatives ) > 0 else 0.0 ) def f1_score ( self ) -> float : \"\"\" Calculate F1 Score. F1 Score is the harmonic mean of precision and recall, providing a balance between false positives and false negatives. Returns: float: The F1 Score of the model, calculated as 2 * TP / (2 * TP + FP + FN). Returns 0.0 if the denominator is zero. \"\"\" return ( ( 2 * self . true_positives ) / (( 2 * self . true_positives ) + self . false_positives + self . false_negatives ) if ( self . true_positives + self . false_positives + self . false_negatives ) > 0 else 0.0 ) def matthews_correlation_coefficient ( self ) -> float : \"\"\" Calculate Matthews Correlation Coefficient (MCC). MCC is a measure of the quality of binary classifications, accounting for imbalances in the data. Returns: float: The Matthews Correlation Coefficient of the model, calculated as ((TP * TN) - (FP * FN)) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)). Returns 0.0 if the denominator is zero. \"\"\" return ( ( ( self . true_positives * self . true_negatives ) - ( self . false_positives * self . false_negatives ) ) / ( sqrt ( ( self . true_positives + self . false_positives ) * ( self . true_positives + self . false_negatives ) * ( self . true_negatives + self . false_positives ) * ( self . true_negatives + self . false_negatives ) ) ) if ( self . true_positives + self . false_negatives + self . true_negatives + self . false_negatives ) > 0 else 0.0 )","title":"BinaryClassificationStats"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.accuracy","text":"Calculate Accuracy. Accuracy measures the proportion of correctly predicted instances out of all instances. 
Returns: Name Type Description float float The Accuracy of the model, calculated as the sum of true positives and true negatives divided by float the sum of true positives, false positives, true negatives, and false negatives. float Returns 0.0 if the total sum of counts is zero. Source code in src/pheval/analyse/binary_classification_stats.py 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 def accuracy ( self ) -> float : \"\"\" Calculate Accuracy. Accuracy measures the proportion of correctly predicted instances out of all instances. Returns: float: The Accuracy of the model, calculated as the sum of true positives and true negatives divided by the sum of true positives, false positives, true negatives, and false negatives. Returns 0.0 if the total sum of counts is zero. \"\"\" return ( ( self . true_positives + self . true_negatives ) / ( self . true_positives + self . false_positives + self . true_negatives + self . false_negatives ) if ( self . true_positives + self . false_negatives + self . true_negatives + self . false_negatives ) > 0 else 0.0 )","title":"accuracy"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.add_classification","text":"Update binary classification metrics for known and unknown entities based on their ranks. Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. 
Source code in src/pheval/analyse/binary_classification_stats.py 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 def add_classification ( self , pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ) -> None : \"\"\" Update binary classification metrics for known and unknown entities based on their ranks. Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" self . add_classification_for_known_entities ( relevant_ranks ) self . add_classification_for_other_entities ( self . remove_relevant_ranks ( pheval_results , relevant_ranks ) ) self . add_labels_and_scores ( pheval_results , relevant_ranks )","title":"add_classification"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.add_classification_for_known_entities","text":"Update binary classification metrics for known entities based on their ranking. Parameters: Name Type Description Default relevant_ranks List [ int ] A list of the ranks associated with the known entities. required Source code in src/pheval/analyse/binary_classification_stats.py 63 64 65 66 67 68 69 70 71 72 73 74 def add_classification_for_known_entities ( self , relevant_ranks : List [ int ]) -> None : \"\"\" Update binary classification metrics for known entities based on their ranking. Args: relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" for rank in relevant_ranks : if rank == 1 : self . true_positives += 1 elif rank != 1 : self . 
false_negatives += 1","title":"add_classification_for_known_entities"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.add_classification_for_other_entities","text":"Update binary classification metrics for other entities based on their ranking. Parameters: Name Type Description Default ranks List [ int ] A list of the ranks for all other entities. required Source code in src/pheval/analyse/binary_classification_stats.py 76 77 78 79 80 81 82 83 84 85 86 87 def add_classification_for_other_entities ( self , ranks : List [ int ]) -> None : \"\"\" Update binary classification metrics for other entities based on their ranking. Args: ranks (List[int]): A list of the ranks for all other entities. \"\"\" for rank in ranks : if rank == 1 : self . false_positives += 1 elif rank != 1 : self . true_negatives += 1","title":"add_classification_for_other_entities"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.add_labels_and_scores","text":"Adds scores and labels from the PhEval results. Parameters: Name Type Description Default relevant_ranks List [ int ] A list of the ranks associated with the known entities. required Source code in src/pheval/analyse/binary_classification_stats.py 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 def add_labels_and_scores ( self , pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ): \"\"\" Adds scores and labels from the PhEval results. Args: pheval_results (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): List of all PhEval results relevant_ranks (List[int]): A list of the ranks associated with the known entities. \"\"\" relevant_ranks_copy = relevant_ranks . 
copy () for result in pheval_results : self . scores . append ( result . score ) label = 1 if result . rank in relevant_ranks_copy else 0 self . labels . append ( label ) relevant_ranks_copy . remove ( result . rank ) if label == 1 else None","title":"add_labels_and_scores"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.f1_score","text":"Calculate F1 Score. F1 Score is the harmonic mean of precision and recall, providing a balance between false positives and false negatives. Returns: Name Type Description float float The F1 Score of the model, calculated as 2 * TP / (2 * TP + FP + FN). float Returns 0.0 if the denominator is zero. Source code in src/pheval/analyse/binary_classification_stats.py 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 def f1_score ( self ) -> float : \"\"\" Calculate F1 Score. F1 Score is the harmonic mean of precision and recall, providing a balance between false positives and false negatives. Returns: float: The F1 Score of the model, calculated as 2 * TP / (2 * TP + FP + FN). Returns 0.0 if the denominator is zero. \"\"\" return ( ( 2 * self . true_positives ) / (( 2 * self . true_positives ) + self . false_positives + self . false_negatives ) if ( self . true_positives + self . false_positives + self . false_negatives ) > 0 else 0.0 )","title":"f1_score"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.false_discovery_rate","text":"Calculate False Discovery Rate (FDR). FDR measures the proportion of instances predicted as positive that are actually negative. Returns: Name Type Description float float The False Discovery Rate of the model, calculated as false positives divided by the sum of float false positives and true positives. Returns 0.0 if both false positives and true positives are zero. 
Source code in src/pheval/analyse/binary_classification_stats.py 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 def false_discovery_rate ( self ) -> float : \"\"\" Calculate False Discovery Rate (FDR). FDR measures the proportion of instances predicted as positive that are actually negative. Returns: float: The False Discovery Rate of the model, calculated as false positives divided by the sum of false positives and true positives. Returns 0.0 if both false positives and true positives are zero. \"\"\" return ( self . false_positives / ( self . false_positives + self . true_positives ) if ( self . false_positives + self . true_positives ) > 0 else 0.0 )","title":"false_discovery_rate"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.false_negative_rate","text":"Calculate False Negative Rate (FNR). FNR measures the proportion of instances that are actually positive but predicted as negative. Returns: Name Type Description float float The False Negative Rate of the model, calculated as false negatives divided by the sum of float false negatives and true positives. Returns 0.0 if both false negatives and true positives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 def false_negative_rate ( self ) -> float : \"\"\" Calculate False Negative Rate (FNR). FNR measures the proportion of instances that are actually positive but predicted as negative. Returns: float: The False Negative Rate of the model, calculated as false negatives divided by the sum of false negatives and true positives. Returns 0.0 if both false negatives and true positives are zero. \"\"\" return ( self . false_negatives / ( self . false_negatives + self . true_positives ) if ( self . false_negatives + self . 
true_positives ) > 0 else 0.0 )","title":"false_negative_rate"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.false_positive_rate","text":"Calculate False Positive Rate (FPR). FPR measures the proportion of instances predicted as positive that are actually negative. Returns: Name Type Description float float The False Positive Rate of the model, calculated as false positives divided by the sum of float false positives and true negatives. Returns 0.0 if both false positives and true negatives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 def false_positive_rate ( self ) -> float : \"\"\" Calculate False Positive Rate (FPR). FPR measures the proportion of instances predicted as positive that are actually negative. Returns: float: The False Positive Rate of the model, calculated as false positives divided by the sum of false positives and true negatives. Returns 0.0 if both false positives and true negatives are zero. \"\"\" return ( self . false_positives / ( self . false_positives + self . true_negatives ) if ( self . false_positives + self . true_negatives ) > 0 else 0.0 )","title":"false_positive_rate"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.matthews_correlation_coefficient","text":"Calculate Matthews Correlation Coefficient (MCC). MCC is a measure of the quality of binary classifications, accounting for imbalances in the data. Returns: Name Type Description float float The Matthews Correlation Coefficient of the model, calculated as float ((TP * TN) - (FP * FN)) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)). float Returns 0.0 if the denominator is zero. 
Source code in src/pheval/analyse/binary_classification_stats.py 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 def matthews_correlation_coefficient ( self ) -> float : \"\"\" Calculate Matthews Correlation Coefficient (MCC). MCC is a measure of the quality of binary classifications, accounting for imbalances in the data. Returns: float: The Matthews Correlation Coefficient of the model, calculated as ((TP * TN) - (FP * FN)) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)). Returns 0.0 if the denominator is zero. \"\"\" return ( ( ( self . true_positives * self . true_negatives ) - ( self . false_positives * self . false_negatives ) ) / ( sqrt ( ( self . true_positives + self . false_positives ) * ( self . true_positives + self . false_negatives ) * ( self . true_negatives + self . false_positives ) * ( self . true_negatives + self . false_negatives ) ) ) if ( self . true_positives + self . false_negatives + self . true_negatives + self . false_negatives ) > 0 else 0.0 )","title":"matthews_correlation_coefficient"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.negative_predictive_value","text":"Calculate Negative Predictive Value (NPV). NPV measures the proportion of correctly predicted negative instances out of all instances predicted negative. Returns: Name Type Description float float The Negative Predictive Value of the model, calculated as true negatives divided by the sum of float true negatives and false negatives. Returns 0.0 if both true negatives and false negatives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 def negative_predictive_value ( self ) -> float : \"\"\" Calculate Negative Predictive Value (NPV). 
NPV measures the proportion of correctly predicted negative instances out of all instances predicted negative. Returns: float: The Negative Predictive Value of the model, calculated as true negatives divided by the sum of true negatives and false negatives. Returns 0.0 if both true negatives and false negatives are zero. \"\"\" return ( self . true_negatives / ( self . true_negatives + self . false_negatives ) if ( self . true_negatives + self . false_negatives ) > 0 else 0.0 )","title":"negative_predictive_value"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.precision","text":"Calculate precision. Precision measures the proportion of correctly predicted positive instances out of all instances predicted as positive. Returns: Name Type Description float float The precision of the model, calculated as true positives divided by the sum of true positives float and false positives. Returns 0.0 if both true positives and false positives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 def precision ( self ) -> float : \"\"\" Calculate precision. Precision measures the proportion of correctly predicted positive instances out of all instances predicted as positive. Returns: float: The precision of the model, calculated as true positives divided by the sum of true positives and false positives. Returns 0.0 if both true positives and false positives are zero. \"\"\" return ( self . true_positives / ( self . true_positives + self . false_positives ) if ( self . true_positives + self . 
false_positives ) > 0 else 0.0 )","title":"precision"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.remove_relevant_ranks","text":"Remove the relevant entity ranks from all result ranks Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. Returns: Type Description List [ int ] List[int]: A list of the ranks with the relevant entity ranks removed. Source code in src/pheval/analyse/binary_classification_stats.py 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 @staticmethod def remove_relevant_ranks ( pheval_results : Union [ List [ RankedPhEvalGeneResult ], List [ RankedPhEvalVariantResult ], List [ RankedPhEvalDiseaseResult ], ], relevant_ranks : List [ int ], ) -> List [ int ]: \"\"\" Remove the relevant entity ranks from all result ranks Args: pheval_results: (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult], List[RankedPhEvalDiseaseResult]]): The list of all pheval results. relevant_ranks (List[int]): A list of the ranks associated with the known entities. Returns: List[int]: A list of the ranks with the relevant entity ranks removed. \"\"\" all_result_ranks = [ pheval_result . rank for pheval_result in pheval_results ] for rank in relevant_ranks : if rank in all_result_ranks : all_result_ranks . remove ( rank ) continue return all_result_ranks","title":"remove_relevant_ranks"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.sensitivity","text":"Calculate sensitivity. Sensitivity measures the proportion of actual positive instances correctly identified by the model. 
Returns: Name Type Description float float The sensitivity of the model, calculated as true positives divided by the sum of true positives float and false negatives. Returns 0 if both true positives and false negatives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 def sensitivity ( self ) -> float : \"\"\" Calculate sensitivity. Sensitivity measures the proportion of actual positive instances correctly identified by the model. Returns: float: The sensitivity of the model, calculated as true positives divided by the sum of true positives and false negatives. Returns 0 if both true positives and false negatives are zero. \"\"\" return ( self . true_positives / ( self . true_positives + self . false_negatives ) if ( self . true_positives + self . false_negatives ) > 0 else 0.0 )","title":"sensitivity"},{"location":"api/pheval/analyse/binary_classification_stats/#src.pheval.analyse.binary_classification_stats.BinaryClassificationStats.specificity","text":"Calculate specificity. Specificity measures the proportion of actual negative instances correctly identified by the model. Returns: Name Type Description float float The specificity of the model, calculated as true negatives divided by the sum of true negatives float and false positives. Returns 0.0 if both true negatives and false positives are zero. Source code in src/pheval/analyse/binary_classification_stats.py 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 def specificity ( self ) -> float : \"\"\" Calculate specificity. Specificity measures the proportion of actual negative instances correctly identified by the model. Returns: float: The specificity of the model, calculated as true negatives divided by the sum of true negatives and false positives. Returns 0.0 if both true negatives and false positives are zero. \"\"\" return ( self . true_negatives / ( self . true_negatives + self . 
false_positives ) if ( self . true_negatives + self . false_positives ) > 0 else 0.0 )","title":"specificity"},{"location":"api/pheval/analyse/disease_prioritisation_analysis/","text":"AssessDiseasePrioritisation Bases: AssessPrioritisationBase Class for assessing disease prioritisation based on thresholds and scoring orders. Source code in src/pheval/analyse/disease_prioritisation_analysis.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 class AssessDiseasePrioritisation ( AssessPrioritisationBase ): \"\"\"Class for assessing disease prioritisation based on thresholds and scoring orders.\"\"\" def assess_disease_prioritisation ( self , standardised_disease_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess disease prioritisation. This method assesses the prioritisation of diseases based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_disease_result_path (Path): Path to the standardised disease TSV result. phenopacket_path (Path): Path to the phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"SELECT * FROM { self . table_name } WHERE phenopacket = ? \" , ( phenopacket_path . name ,), ) . fetchdf () for _i , row in df . iterrows (): result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_disease_result_path } ' \" f \"WHERE contains_entity_function(CAST(COALESCE(disease_identifier, '') AS VARCHAR),\" f \" ' { row [ 'disease_identifier' ] } ') \" f \"OR contains_entity_function(CAST(COALESCE(disease_name, '') AS VARCHAR), \" f \"' { row [ 'disease_name' ] } ')\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : disease_match = self . 
_record_matched_entity ( RankedPhEvalDiseaseResult ( ** result [ 0 ])) relevant_ranks . append ( disease_match ) primary_key = f \" { phenopacket_path . name } - { row [ 'disease_identifier' ] } \" self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( disease_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_disease_result_path ), RankedPhEvalDiseaseResult ), relevant_ranks , ) assess_disease_prioritisation ( standardised_disease_result_path , phenopacket_path , binary_classification_stats ) Assess disease prioritisation. This method assesses the prioritisation of diseases based on the provided criteria and records ranks using a PrioritisationRankRecorder. Parameters: Name Type Description Default standardised_disease_result_path Path Path to the standardised disease TSV result. required phenopacket_path Path Path to the phenopacket. required binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required Source code in src/pheval/analyse/disease_prioritisation_analysis.py 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 def assess_disease_prioritisation ( self , standardised_disease_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess disease prioritisation. This method assesses the prioritisation of diseases based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_disease_result_path (Path): Path to the standardised disease TSV result. phenopacket_path (Path): Path to the phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . 
execute ( f \"SELECT * FROM { self . table_name } WHERE phenopacket = ? \" , ( phenopacket_path . name ,), ) . fetchdf () for _i , row in df . iterrows (): result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_disease_result_path } ' \" f \"WHERE contains_entity_function(CAST(COALESCE(disease_identifier, '') AS VARCHAR),\" f \" ' { row [ 'disease_identifier' ] } ') \" f \"OR contains_entity_function(CAST(COALESCE(disease_name, '') AS VARCHAR), \" f \"' { row [ 'disease_name' ] } ')\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : disease_match = self . _record_matched_entity ( RankedPhEvalDiseaseResult ( ** result [ 0 ])) relevant_ranks . append ( disease_match ) primary_key = f \" { phenopacket_path . name } - { row [ 'disease_identifier' ] } \" self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( disease_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_disease_result_path ), RankedPhEvalDiseaseResult ), relevant_ranks , ) assess_phenopacket_disease_prioritisation ( phenopacket_path , run , disease_binary_classification_stats , disease_benchmarker ) Assess disease prioritisation for a Phenopacket by comparing PhEval standardised disease results against the recorded causative diseases for a proband in the Phenopacket. Parameters: Name Type Description Default phenopacket_path Path Path to the Phenopacket. required run RunConfig Run configuration. required disease_binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required disease_benchmarker AssessDiseasePrioritisation AssessDiseasePrioritisation class instance. 
required Source code in src/pheval/analyse/disease_prioritisation_analysis.py 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 def assess_phenopacket_disease_prioritisation ( phenopacket_path : Path , run : RunConfig , disease_binary_classification_stats : BinaryClassificationStats , disease_benchmarker : AssessDiseasePrioritisation , ) -> None : \"\"\" Assess disease prioritisation for a Phenopacket by comparing PhEval standardised disease results against the recorded causative diseases for a proband in the Phenopacket. Args: phenopacket_path (Path): Path to the Phenopacket. run (RunConfig): Run configuration. disease_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. disease_benchmarker (AssessDiseasePrioritisation): AssessDiseasePrioritisation class instance. \"\"\" standardised_disease_result_path = run . results_dir . joinpath ( f \"pheval_disease_results/ { phenopacket_path . stem } -pheval_disease_result.tsv\" ) disease_benchmarker . assess_disease_prioritisation ( standardised_disease_result_path , phenopacket_path , disease_binary_classification_stats , ) benchmark_disease_prioritisation ( benchmark_name , run , score_order , threshold ) Benchmark a directory based on disease prioritisation results. Parameters: Name Type Description Default benchmark_name str Name of the benchmark. required run RunConfig Run configuration. required score_order str The order in which scores are arranged. required threshold float Threshold for assessment. required Returns: Name Type Description BenchmarkRunResults An object containing benchmarking results for disease prioritisation, including ranks and rank statistics for the benchmarked directory. 
Source code in src/pheval/analyse/disease_prioritisation_analysis.py 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 def benchmark_disease_prioritisation ( benchmark_name : str , run : RunConfig , score_order : str , threshold : float , ): \"\"\" Benchmark a directory based on disease prioritisation results. Args: benchmark_name (str): Name of the benchmark. run (RunConfig): Run configuration. score_order (str): The order in which scores are arranged. threshold (float): Threshold for assessment. Returns: BenchmarkRunResults: An object containing benchmarking results for disease prioritisation, including ranks and rank statistics for the benchmarked directory. \"\"\" disease_binary_classification_stats = BinaryClassificationStats () db_connection = BenchmarkDBManager ( benchmark_name ) db_connection . initialise () disease_benchmarker = AssessDiseasePrioritisation ( db_connection , f \" { run . phenopacket_dir . parents [ 0 ] . name } _disease\" , run . run_identifier , threshold , score_order , ) for phenopacket_path in all_files ( run . phenopacket_dir ): assess_phenopacket_disease_prioritisation ( phenopacket_path , run , disease_binary_classification_stats , disease_benchmarker , ) db_connection . close () disease_rank_stats = RankStats () disease_rank_stats . add_ranks ( benchmark_name = benchmark_name , table_name = f \" { run . phenopacket_dir . parents [ 0 ] . name } _disease\" , column_name = str ( run . run_identifier ), ) return BenchmarkRunResults ( rank_stats = disease_rank_stats , benchmark_name = run . run_identifier , binary_classification_stats = disease_binary_classification_stats , phenopacket_dir = run . 
phenopacket_dir , )","title":"Disease prioritisation analysis"},{"location":"api/pheval/analyse/disease_prioritisation_analysis/#src.pheval.analyse.disease_prioritisation_analysis.AssessDiseasePrioritisation","text":"Bases: AssessPrioritisationBase Class for assessing disease prioritisation based on thresholds and scoring orders. Source code in src/pheval/analyse/disease_prioritisation_analysis.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 class AssessDiseasePrioritisation ( AssessPrioritisationBase ): \"\"\"Class for assessing disease prioritisation based on thresholds and scoring orders.\"\"\" def assess_disease_prioritisation ( self , standardised_disease_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess disease prioritisation. This method assesses the prioritisation of diseases based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_disease_result_path (Path): Path to the standardised disease TSV result. phenopacket_path (Path): Path to the phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"SELECT * FROM { self . table_name } WHERE phenopacket = ? \" , ( phenopacket_path . name ,), ) . fetchdf () for _i , row in df . iterrows (): result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_disease_result_path } ' \" f \"WHERE contains_entity_function(CAST(COALESCE(disease_identifier, '') AS VARCHAR),\" f \" ' { row [ 'disease_identifier' ] } ') \" f \"OR contains_entity_function(CAST(COALESCE(disease_name, '') AS VARCHAR), \" f \"' { row [ 'disease_name' ] } ')\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : disease_match = self . 
_record_matched_entity ( RankedPhEvalDiseaseResult ( ** result [ 0 ])) relevant_ranks . append ( disease_match ) primary_key = f \" { phenopacket_path . name } - { row [ 'disease_identifier' ] } \" self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( disease_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_disease_result_path ), RankedPhEvalDiseaseResult ), relevant_ranks , )","title":"AssessDiseasePrioritisation"},{"location":"api/pheval/analyse/disease_prioritisation_analysis/#src.pheval.analyse.disease_prioritisation_analysis.AssessDiseasePrioritisation.assess_disease_prioritisation","text":"Assess disease prioritisation. This method assesses the prioritisation of diseases based on the provided criteria and records ranks using a PrioritisationRankRecorder. Parameters: Name Type Description Default standardised_disease_result_path Path Path to the standardised disease TSV result. required phenopacket_path Path Path to the phenopacket. required binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required Source code in src/pheval/analyse/disease_prioritisation_analysis.py 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 def assess_disease_prioritisation ( self , standardised_disease_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess disease prioritisation. This method assesses the prioritisation of diseases based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_disease_result_path (Path): Path to the standardised disease TSV result. phenopacket_path (Path): Path to the phenopacket. 
binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"SELECT * FROM { self . table_name } WHERE phenopacket = ? \" , ( phenopacket_path . name ,), ) . fetchdf () for _i , row in df . iterrows (): result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_disease_result_path } ' \" f \"WHERE contains_entity_function(CAST(COALESCE(disease_identifier, '') AS VARCHAR),\" f \" ' { row [ 'disease_identifier' ] } ') \" f \"OR contains_entity_function(CAST(COALESCE(disease_name, '') AS VARCHAR), \" f \"' { row [ 'disease_name' ] } ')\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : disease_match = self . _record_matched_entity ( RankedPhEvalDiseaseResult ( ** result [ 0 ])) relevant_ranks . append ( disease_match ) primary_key = f \" { phenopacket_path . name } - { row [ 'disease_identifier' ] } \" self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( disease_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_disease_result_path ), RankedPhEvalDiseaseResult ), relevant_ranks , )","title":"assess_disease_prioritisation"},{"location":"api/pheval/analyse/disease_prioritisation_analysis/#src.pheval.analyse.disease_prioritisation_analysis.assess_phenopacket_disease_prioritisation","text":"Assess disease prioritisation for a Phenopacket by comparing PhEval standardised disease results against the recorded causative diseases for a proband in the Phenopacket. Parameters: Name Type Description Default phenopacket_path Path Path to the Phenopacket. required run RunConfig Run configuration. required disease_binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. 
required disease_benchmarker AssessDiseasePrioritisation AssessDiseasePrioritisation class instance. required Source code in src/pheval/analyse/disease_prioritisation_analysis.py 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 def assess_phenopacket_disease_prioritisation ( phenopacket_path : Path , run : RunConfig , disease_binary_classification_stats : BinaryClassificationStats , disease_benchmarker : AssessDiseasePrioritisation , ) -> None : \"\"\" Assess disease prioritisation for a Phenopacket by comparing PhEval standardised disease results against the recorded causative diseases for a proband in the Phenopacket. Args: phenopacket_path (Path): Path to the Phenopacket. run (RunConfig): Run configuration. disease_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. disease_benchmarker (AssessDiseasePrioritisation): AssessDiseasePrioritisation class instance. \"\"\" standardised_disease_result_path = run . results_dir . joinpath ( f \"pheval_disease_results/ { phenopacket_path . stem } -pheval_disease_result.tsv\" ) disease_benchmarker . assess_disease_prioritisation ( standardised_disease_result_path , phenopacket_path , disease_binary_classification_stats , )","title":"assess_phenopacket_disease_prioritisation"},{"location":"api/pheval/analyse/disease_prioritisation_analysis/#src.pheval.analyse.disease_prioritisation_analysis.benchmark_disease_prioritisation","text":"Benchmark a directory based on disease prioritisation results. Parameters: Name Type Description Default benchmark_name str Name of the benchmark. required run RunConfig Run configuration. required score_order str The order in which scores are arranged. required threshold float Threshold for assessment. required Returns: Name Type Description BenchmarkRunResults An object containing benchmarking results for disease prioritisation, including ranks and rank statistics for the benchmarked directory. 
Source code in src/pheval/analyse/disease_prioritisation_analysis.py 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 def benchmark_disease_prioritisation ( benchmark_name : str , run : RunConfig , score_order : str , threshold : float , ): \"\"\" Benchmark a directory based on disease prioritisation results. Args: benchmark_name (str): Name of the benchmark. run (RunConfig): Run configuration. score_order (str): The order in which scores are arranged. threshold (float): Threshold for assessment. Returns: BenchmarkRunResults: An object containing benchmarking results for disease prioritisation, including ranks and rank statistics for the benchmarked directory. \"\"\" disease_binary_classification_stats = BinaryClassificationStats () db_connection = BenchmarkDBManager ( benchmark_name ) db_connection . initialise () disease_benchmarker = AssessDiseasePrioritisation ( db_connection , f \" { run . phenopacket_dir . parents [ 0 ] . name } _disease\" , run . run_identifier , threshold , score_order , ) for phenopacket_path in all_files ( run . phenopacket_dir ): assess_phenopacket_disease_prioritisation ( phenopacket_path , run , disease_binary_classification_stats , disease_benchmarker , ) db_connection . close () disease_rank_stats = RankStats () disease_rank_stats . add_ranks ( benchmark_name = benchmark_name , table_name = f \" { run . phenopacket_dir . parents [ 0 ] . name } _disease\" , column_name = str ( run . run_identifier ), ) return BenchmarkRunResults ( rank_stats = disease_rank_stats , benchmark_name = run . run_identifier , binary_classification_stats = disease_binary_classification_stats , phenopacket_dir = run . 
phenopacket_dir , )","title":"benchmark_disease_prioritisation"},{"location":"api/pheval/analyse/gene_prioritisation_analysis/","text":"AssessGenePrioritisation Bases: AssessPrioritisationBase Class for assessing gene prioritisation based on thresholds and scoring orders. Source code in src/pheval/analyse/gene_prioritisation_analysis.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 class AssessGenePrioritisation ( AssessPrioritisationBase ): \"\"\"Class for assessing gene prioritisation based on thresholds and scoring orders.\"\"\" def assess_gene_prioritisation ( self , standardised_gene_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess gene prioritisation. This method assesses the prioritisation of genes based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_gene_result_path (Path): Path to the standardised gene TSV result. phenopacket_path (Path): Path to the Phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"\"\"SELECT * FROM { self . table_name } WHERE phenopacket = ' { phenopacket_path . name } '\"\"\" ) . fetchdf () for _i , row in df . iterrows (): result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_gene_result_path } ' \" f \"WHERE contains_entity_function(CAST(COALESCE(gene_identifier, '') AS VARCHAR),\" f \" ' { row [ 'gene_identifier' ] } ') \" f \"OR contains_entity_function(CAST(COALESCE(gene_symbol, '') AS VARCHAR), \" f \"' { row [ 'gene_symbol' ] } ')\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : gene_match = self . _record_matched_entity ( RankedPhEvalGeneResult ( ** result [ 0 ])) relevant_ranks . 
append ( gene_match ) primary_key = f \" { phenopacket_path . name } - { row [ 'gene_symbol' ] } \" self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( gene_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_gene_result_path ), RankedPhEvalGeneResult ), relevant_ranks , ) assess_gene_prioritisation ( standardised_gene_result_path , phenopacket_path , binary_classification_stats ) Assess gene prioritisation. This method assesses the prioritisation of genes based on the provided criteria and records ranks using a PrioritisationRankRecorder. Parameters: Name Type Description Default standardised_gene_result_path Path Path to the standardised gene TSV result. required phenopacket_path Path Path to the Phenopacket. required binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required Source code in src/pheval/analyse/gene_prioritisation_analysis.py 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 def assess_gene_prioritisation ( self , standardised_gene_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess gene prioritisation. This method assesses the prioritisation of genes based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_gene_result_path (Path): Path to the standardised gene TSV result. phenopacket_path (Path): Path to the Phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"\"\"SELECT * FROM { self . table_name } WHERE phenopacket = ' { phenopacket_path . name } '\"\"\" ) . fetchdf () for _i , row in df . iterrows (): result = ( self . 
conn . execute ( f \"SELECT * FROM ' { standardised_gene_result_path } ' \" f \"WHERE contains_entity_function(CAST(COALESCE(gene_identifier, '') AS VARCHAR),\" f \" ' { row [ 'gene_identifier' ] } ') \" f \"OR contains_entity_function(CAST(COALESCE(gene_symbol, '') AS VARCHAR), \" f \"' { row [ 'gene_symbol' ] } ')\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : gene_match = self . _record_matched_entity ( RankedPhEvalGeneResult ( ** result [ 0 ])) relevant_ranks . append ( gene_match ) primary_key = f \" { phenopacket_path . name } - { row [ 'gene_symbol' ] } \" self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( gene_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_gene_result_path ), RankedPhEvalGeneResult ), relevant_ranks , ) assess_phenopacket_gene_prioritisation ( phenopacket_path , run , gene_binary_classification_stats , gene_benchmarker ) Assess gene prioritisation for a Phenopacket by comparing PhEval standardised gene results against the recorded causative genes for a proband in the Phenopacket. Parameters: Name Type Description Default phenopacket_path Path Path to the Phenopacket. required run RunConfig Run configuration. required gene_binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required gene_benchmarker AssessGenePrioritisation AssessGenePrioritisation class instance. 
required Source code in src/pheval/analyse/gene_prioritisation_analysis.py 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 def assess_phenopacket_gene_prioritisation ( phenopacket_path : Path , run : RunConfig , gene_binary_classification_stats : BinaryClassificationStats , gene_benchmarker : AssessGenePrioritisation , ) -> None : \"\"\" Assess gene prioritisation for a Phenopacket by comparing PhEval standardised gene results against the recorded causative genes for a proband in the Phenopacket. Args: phenopacket_path (Path): Path to the Phenopacket. run (RunConfig): Run configuration. gene_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. gene_benchmarker (AssessGenePrioritisation): AssessGenePrioritisation class instance. \"\"\" standardised_gene_result_path = run . results_dir . joinpath ( f \"pheval_gene_results/ { phenopacket_path . stem } -pheval_gene_result.tsv\" ) gene_benchmarker . assess_gene_prioritisation ( standardised_gene_result_path , phenopacket_path , gene_binary_classification_stats , ) benchmark_gene_prioritisation ( benchmark_name , run , score_order , threshold ) Benchmark a directory based on gene prioritisation results. Args: benchmark_name (str): Name of the benchmark. run (RunConfig): Run configuration. score_order (str): The order in which scores are arranged. threshold (float): Threshold for assessment. Returns: BenchmarkRunResults: An object containing benchmarking results for gene prioritisation, including ranks and rank statistics for the benchmarked directory. 
Source code in src/pheval/analyse/gene_prioritisation_analysis.py 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 def benchmark_gene_prioritisation ( benchmark_name : str , run : RunConfig , score_order : str , threshold : float , ) -> BenchmarkRunResults : \"\"\" Benchmark a directory based on gene prioritisation results. Args: benchmark_name (str): Name of the benchmark. run (RunConfig): Run configuration. score_order (str): The order in which scores are arranged. threshold (float): Threshold for assessment. Returns: BenchmarkRunResults: An object containing benchmarking results for gene prioritisation, including ranks and rank statistics for the benchmarked directory. \"\"\" gene_binary_classification_stats = BinaryClassificationStats () db_connection = BenchmarkDBManager ( benchmark_name ) db_connection . initialise () gene_benchmarker = AssessGenePrioritisation ( db_connection , f \" { run . phenopacket_dir . parents [ 0 ] . name } \" f \"_gene\" , run . run_identifier , threshold , score_order , ) for phenopacket_path in all_files ( run . phenopacket_dir ): assess_phenopacket_gene_prioritisation ( phenopacket_path , run , gene_binary_classification_stats , gene_benchmarker , ) db_connection . close () gene_rank_stats = RankStats () gene_rank_stats . add_ranks ( benchmark_name = benchmark_name , table_name = f \" { run . phenopacket_dir . parents [ 0 ] . name } _gene\" , column_name = str ( run . run_identifier ), ) return BenchmarkRunResults ( rank_stats = gene_rank_stats , benchmark_name = run . run_identifier , binary_classification_stats = gene_binary_classification_stats , phenopacket_dir = run . 
phenopacket_dir , )","title":"Gene prioritisation analysis"},{"location":"api/pheval/analyse/gene_prioritisation_analysis/#src.pheval.analyse.gene_prioritisation_analysis.AssessGenePrioritisation","text":"Bases: AssessPrioritisationBase Class for assessing gene prioritisation based on thresholds and scoring orders. Source code in src/pheval/analyse/gene_prioritisation_analysis.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 class AssessGenePrioritisation ( AssessPrioritisationBase ): \"\"\"Class for assessing gene prioritisation based on thresholds and scoring orders.\"\"\" def assess_gene_prioritisation ( self , standardised_gene_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess gene prioritisation. This method assesses the prioritisation of genes based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_gene_result_path (Path): Path to the standardised gene TSV result. phenopacket_path (Path): Path to the Phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"\"\"SELECT * FROM { self . table_name } WHERE phenopacket = ' { phenopacket_path . name } '\"\"\" ) . fetchdf () for _i , row in df . iterrows (): result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_gene_result_path } ' \" f \"WHERE contains_entity_function(CAST(COALESCE(gene_identifier, '') AS VARCHAR),\" f \" ' { row [ 'gene_identifier' ] } ') \" f \"OR contains_entity_function(CAST(COALESCE(gene_symbol, '') AS VARCHAR), \" f \"' { row [ 'gene_symbol' ] } ')\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : gene_match = self . _record_matched_entity ( RankedPhEvalGeneResult ( ** result [ 0 ])) relevant_ranks . 
append ( gene_match ) primary_key = f \" { phenopacket_path . name } - { row [ 'gene_symbol' ] } \" self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( gene_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_gene_result_path ), RankedPhEvalGeneResult ), relevant_ranks , )","title":"AssessGenePrioritisation"},{"location":"api/pheval/analyse/gene_prioritisation_analysis/#src.pheval.analyse.gene_prioritisation_analysis.AssessGenePrioritisation.assess_gene_prioritisation","text":"Assess gene prioritisation. This method assesses the prioritisation of genes based on the provided criteria and records ranks using a PrioritisationRankRecorder. Parameters: Name Type Description Default standardised_gene_result_path Path Path to the standardised gene TSV result. required phenopacket_path Path Path to the Phenopacket. required binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required Source code in src/pheval/analyse/gene_prioritisation_analysis.py 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 def assess_gene_prioritisation ( self , standardised_gene_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess gene prioritisation. This method assesses the prioritisation of genes based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_gene_result_path (Path): Path to the standardised gene TSV result. phenopacket_path (Path): Path to the Phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"\"\"SELECT * FROM { self . 
table_name } WHERE phenopacket = ' { phenopacket_path . name } '\"\"\" ) . fetchdf () for _i , row in df . iterrows (): result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_gene_result_path } ' \" f \"WHERE contains_entity_function(CAST(COALESCE(gene_identifier, '') AS VARCHAR),\" f \" ' { row [ 'gene_identifier' ] } ') \" f \"OR contains_entity_function(CAST(COALESCE(gene_symbol, '') AS VARCHAR), \" f \"' { row [ 'gene_symbol' ] } ')\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : gene_match = self . _record_matched_entity ( RankedPhEvalGeneResult ( ** result [ 0 ])) relevant_ranks . append ( gene_match ) primary_key = f \" { phenopacket_path . name } - { row [ 'gene_symbol' ] } \" self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( gene_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_gene_result_path ), RankedPhEvalGeneResult ), relevant_ranks , )","title":"assess_gene_prioritisation"},{"location":"api/pheval/analyse/gene_prioritisation_analysis/#src.pheval.analyse.gene_prioritisation_analysis.assess_phenopacket_gene_prioritisation","text":"Assess gene prioritisation for a Phenopacket by comparing PhEval standardised gene results against the recorded causative genes for a proband in the Phenopacket. Parameters: Name Type Description Default phenopacket_path Path Path to the Phenopacket. required run RunConfig Run configuration. required gene_binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required gene_benchmarker AssessGenePrioritisation AssessGenePrioritisation class instance. 
required Source code in src/pheval/analyse/gene_prioritisation_analysis.py 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 def assess_phenopacket_gene_prioritisation ( phenopacket_path : Path , run : RunConfig , gene_binary_classification_stats : BinaryClassificationStats , gene_benchmarker : AssessGenePrioritisation , ) -> None : \"\"\" Assess gene prioritisation for a Phenopacket by comparing PhEval standardised gene results against the recorded causative genes for a proband in the Phenopacket. Args: phenopacket_path (Path): Path to the Phenopacket. run (RunConfig): Run configuration. gene_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. gene_benchmarker (AssessGenePrioritisation): AssessGenePrioritisation class instance. \"\"\" standardised_gene_result_path = run . results_dir . joinpath ( f \"pheval_gene_results/ { phenopacket_path . stem } -pheval_gene_result.tsv\" ) gene_benchmarker . assess_gene_prioritisation ( standardised_gene_result_path , phenopacket_path , gene_binary_classification_stats , )","title":"assess_phenopacket_gene_prioritisation"},{"location":"api/pheval/analyse/gene_prioritisation_analysis/#src.pheval.analyse.gene_prioritisation_analysis.benchmark_gene_prioritisation","text":"Benchmark a directory based on gene prioritisation results. Args: benchmark_name (str): Name of the benchmark. run (RunConfig): Run configuration. score_order (str): The order in which scores are arranged. threshold (float): Threshold for assessment. Returns: BenchmarkRunResults: An object containing benchmarking results for gene prioritisation, including ranks and rank statistics for the benchmarked directory. 
Source code in src/pheval/analyse/gene_prioritisation_analysis.py 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 def benchmark_gene_prioritisation ( benchmark_name : str , run : RunConfig , score_order : str , threshold : float , ) -> BenchmarkRunResults : \"\"\" Benchmark a directory based on gene prioritisation results. Args: benchmark_name (str): Name of the benchmark. run (RunConfig): Run configuration. score_order (str): The order in which scores are arranged. threshold (float): Threshold for assessment. Returns: BenchmarkRunResults: An object containing benchmarking results for gene prioritisation, including ranks and rank statistics for the benchmarked directory. \"\"\" gene_binary_classification_stats = BinaryClassificationStats () db_connection = BenchmarkDBManager ( benchmark_name ) db_connection . initialise () gene_benchmarker = AssessGenePrioritisation ( db_connection , f \" { run . phenopacket_dir . parents [ 0 ] . name } \" f \"_gene\" , run . run_identifier , threshold , score_order , ) for phenopacket_path in all_files ( run . phenopacket_dir ): assess_phenopacket_gene_prioritisation ( phenopacket_path , run , gene_binary_classification_stats , gene_benchmarker , ) db_connection . close () gene_rank_stats = RankStats () gene_rank_stats . add_ranks ( benchmark_name = benchmark_name , table_name = f \" { run . phenopacket_dir . parents [ 0 ] . name } _gene\" , column_name = str ( run . run_identifier ), ) return BenchmarkRunResults ( rank_stats = gene_rank_stats , benchmark_name = run . run_identifier , binary_classification_stats = gene_binary_classification_stats , phenopacket_dir = run . phenopacket_dir , )","title":"benchmark_gene_prioritisation"},{"location":"api/pheval/analyse/generate_plots/","text":"PlotGenerator Class to generate plots. 
Source code in src/pheval/analyse/generate_plots.py 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 class PlotGenerator : \"\"\"Class to generate plots.\"\"\" palette_hex_codes = [ \"#f4ae3d\" , \"#ee5825\" , \"#2b7288\" , \"#9a84b2\" , \"#0c604c\" , \"#c94c4c\" , \"#3d8e83\" , \"#725ac1\" , \"#e7ba52\" , 
\"#1b9e77\" , ] def __init__ ( self , benchmark_name : str ): \"\"\" Initialise the PlotGenerator class. Note: `self.stats` will be used to store statistics data. `self.mrr` will store Mean Reciprocal Rank (MRR) values. Matplotlib settings are configured to remove the right and top axes spines for generated plots. \"\"\" self . benchmark_name = benchmark_name self . stats , self . mrr = [], [] matplotlib . rcParams [ \"axes.spines.right\" ] = False matplotlib . rcParams [ \"axes.spines.top\" ] = False @staticmethod def _create_run_identifier ( results_dir : Path ) -> str : \"\"\" Create a run identifier from a path. Args: results_dir (Path): The directory path for results. Returns: str: A string representing the run identifier created from the given path. \"\"\" return f \" { Path ( results_dir ) . parents [ 0 ] . name } _ { trim_corpus_results_directory_suffix ( Path ( results_dir ) . name ) } \" def return_benchmark_name ( self , benchmark_result : BenchmarkRunResults ) -> str : \"\"\" Return the benchmark name for a run. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. Returns: str: The benchmark name obtained from the given BenchmarkRunResults instance. \"\"\" return ( benchmark_result . benchmark_name if benchmark_result . results_dir is None else self . _create_run_identifier ( benchmark_result . results_dir ) ) def _generate_stacked_bar_plot_data ( self , benchmark_result : BenchmarkRunResults ) -> None : \"\"\" Generate data in the correct format for dataframe creation for a stacked bar plot, appending to the self.stats attribute of the class. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. \"\"\" rank_stats = benchmark_result . rank_stats self . stats . append ( { \"Run\" : self . return_benchmark_name ( benchmark_result ), \"Top\" : benchmark_result . rank_stats . percentage_top (), \"2-3\" : rank_stats . percentage_difference ( rank_stats . percentage_top3 (), rank_stats . 
percentage_top () ), \"4-5\" : rank_stats . percentage_difference ( rank_stats . percentage_top5 (), rank_stats . percentage_top3 () ), \"6-10\" : rank_stats . percentage_difference ( rank_stats . percentage_top10 (), rank_stats . percentage_top5 () ), \">10\" : rank_stats . percentage_difference ( rank_stats . percentage_found (), rank_stats . percentage_top10 () ), \"Missed\" : rank_stats . percentage_difference ( 100 , rank_stats . percentage_found ()), } ) def _generate_stats_mrr_bar_plot_data ( self , benchmark_result : BenchmarkRunResults ) -> None : \"\"\" Generate data in the correct format for dataframe creation for MRR (Mean Reciprocal Rank) bar plot, appending to the self.mrr attribute of the class. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. \"\"\" self . mrr . extend ( [ { \"Rank\" : \"MRR\" , \"Percentage\" : benchmark_result . rank_stats . return_mean_reciprocal_rank (), \"Run\" : self . return_benchmark_name ( benchmark_result ), } ] ) def generate_stacked_bar_plot ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a stacked bar plot and Mean Reciprocal Rank (MRR) bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" for benchmark_result in benchmarking_results : self . _generate_stacked_bar_plot_data ( benchmark_result ) self . _generate_stats_mrr_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) plt . clf () stats_df . set_index ( \"Run\" ) . plot ( kind = \"bar\" , stacked = True , color = self . palette_hex_codes , ylabel = benchmark_generator . y_label , edgecolor = \"white\" , ) . legend ( loc = \"center left\" , bbox_to_anchor = ( 1.0 , 0.5 )) if benchmark_generator . plot_customisation . 
rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 100 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) mrr_df = pd . DataFrame ( self . mrr ) mrr_df . set_index ( \"Run\" ) . plot ( kind = \"bar\" , color = self . palette_hex_codes , ylabel = f \" { benchmark_generator . prioritisation_type_string . capitalize () } mean reciprocal rank\" , legend = False , edgecolor = \"white\" , ) plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } results - mean reciprocal rank\" ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _mrr.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) def _generate_cumulative_bar_plot_data ( self , benchmark_result : BenchmarkRunResults ): \"\"\" Generate data in the correct format for dataframe creation for a cumulative bar plot, appending to the self.stats attribute of the class. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. \"\"\" rank_stats = benchmark_result . rank_stats run_identifier = self . return_benchmark_name ( benchmark_result ) self . stats . extend ( [ { \"Rank\" : \"Top\" , \"Percentage\" : rank_stats . percentage_top () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Top3\" , \"Percentage\" : rank_stats . percentage_top3 () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Top5\" , \"Percentage\" : rank_stats . percentage_top5 () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Top10\" , \"Percentage\" : rank_stats . percentage_top10 () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Found\" , \"Percentage\" : rank_stats . 
percentage_found () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Missed\" , \"Percentage\" : rank_stats . percentage_difference ( 100 , rank_stats . percentage_found () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"MRR\" , \"Percentage\" : rank_stats . return_mean_reciprocal_rank (), \"Run\" : run_identifier , }, ] ) def generate_cumulative_bar ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a cumulative bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" for benchmark_result in benchmarking_results : self . _generate_cumulative_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) plt . clf () sns . catplot ( data = stats_df , kind = \"bar\" , x = \"Rank\" , y = \"Percentage\" , hue = \"Run\" , palette = self . palette_hex_codes , edgecolor = \"white\" , legend = False , ) . set ( xlabel = \"Rank\" , ylabel = benchmark_generator . y_label ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 ), ncol = 3 , title = \"Run\" ) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Cumulative Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . 
prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) def _generate_non_cumulative_bar_plot_data ( self , benchmark_result : BenchmarkRunResults ) -> [ dict ]: \"\"\" Generate data in the correct format for dataframe creation for a non-cumulative bar plot, appending to the self.stats attribute of the class. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. \"\"\" rank_stats = benchmark_result . rank_stats run_identifier = self . return_benchmark_name ( benchmark_result ) self . stats . extend ( [ { \"Rank\" : \"Top\" , \"Percentage\" : rank_stats . percentage_top () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"2-3\" , \"Percentage\" : rank_stats . percentage_difference ( rank_stats . percentage_top3 (), rank_stats . percentage_top () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"4-5\" , \"Percentage\" : rank_stats . percentage_difference ( rank_stats . percentage_top5 (), rank_stats . percentage_top3 () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"6-10\" , \"Percentage\" : rank_stats . percentage_difference ( rank_stats . percentage_top10 (), rank_stats . percentage_top5 () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \">10\" , \"Percentage\" : rank_stats . percentage_difference ( rank_stats . percentage_found (), rank_stats . percentage_top10 () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Missed\" , \"Percentage\" : rank_stats . percentage_difference ( 100 , rank_stats . percentage_found () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"MRR\" , \"Percentage\" : rank_stats . return_mean_reciprocal_rank (), \"Run\" : run_identifier , }, ] ) def generate_roc_curve ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ): \"\"\" Generate and plot Receiver Operating Characteristic (ROC) curves for binary classification benchmark results. 
Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () for i , benchmark_result in enumerate ( benchmarking_results ): fpr , tpr , thresh = roc_curve ( benchmark_result . binary_classification_stats . labels , benchmark_result . binary_classification_stats . scores , pos_label = 1 , ) roc_auc = auc ( fpr , tpr ) plt . plot ( fpr , tpr , label = f \" { self . return_benchmark_name ( benchmark_result ) } ROC Curve (AUC = { roc_auc : .2f } )\" , color = self . palette_hex_codes [ i ], ) plt . plot ( linestyle = \"--\" , color = \"gray\" ) plt . xlabel ( \"False Positive Rate\" ) plt . ylabel ( \"True Positive Rate\" ) if benchmark_generator . plot_customisation . roc_curve_title is None : plt . title ( \"Receiver Operating Characteristic (ROC) Curve\" ) else : plt . title ( benchmark_generator . plot_customisation . roc_curve_title ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 )) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _roc_curve.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) def generate_precision_recall ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ): \"\"\" Generate and plot Precision-Recall curves for binary classification benchmark results. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () plt . figure () for i , benchmark_result in enumerate ( benchmarking_results ): precision , recall , thresh = precision_recall_curve ( benchmark_result . binary_classification_stats . labels , benchmark_result . binary_classification_stats . 
scores , ) precision_recall_auc = auc ( recall , precision ) plt . plot ( recall , precision , label = f \" { self . return_benchmark_name ( benchmark_result ) } Precision-Recall Curve \" f \"(AUC = { precision_recall_auc : .2f } )\" , color = self . palette_hex_codes [ i ], ) plt . plot ( linestyle = \"--\" , color = \"gray\" ) plt . xlabel ( \"Recall\" ) plt . ylabel ( \"Precision\" ) if benchmark_generator . plot_customisation . precision_recall_title is None : plt . title ( \"Precision-Recall Curve\" ) else : plt . title ( benchmark_generator . plot_customisation . precision_recall_title ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 )) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _pr_curve.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) def generate_non_cumulative_bar ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a non-cumulative bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () for benchmark_result in benchmarking_results : self . _generate_non_cumulative_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) sns . catplot ( data = stats_df , kind = \"bar\" , x = \"Rank\" , y = \"Percentage\" , hue = \"Run\" , palette = self . palette_hex_codes , edgecolor = \"white\" , legend = False , ) . set ( xlabel = \"Rank\" , ylabel = benchmark_generator . y_label ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 ), ncol = 3 , title = \"Run\" ) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Non-Cumulative Rank Stats\" ) else : plt . 
title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) __init__ ( benchmark_name ) Initialise the PlotGenerator class. Note: self.stats will be used to store statistics data. self.mrr will store Mean Reciprocal Rank (MRR) values. Matplotlib settings are configured to remove the right and top axes spines for generated plots. Source code in src/pheval/analyse/generate_plots.py 50 51 52 53 54 55 56 57 58 59 60 61 62 def __init__ ( self , benchmark_name : str ): \"\"\" Initialise the PlotGenerator class. Note: `self.stats` will be used to store statistics data. `self.mrr` will store Mean Reciprocal Rank (MRR) values. Matplotlib settings are configured to remove the right and top axes spines for generated plots. \"\"\" self . benchmark_name = benchmark_name self . stats , self . mrr = [], [] matplotlib . rcParams [ \"axes.spines.right\" ] = False matplotlib . rcParams [ \"axes.spines.top\" ] = False generate_cumulative_bar ( benchmarking_results , benchmark_generator ) Generate a cumulative bar plot. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required Source code in src/pheval/analyse/generate_plots.py 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 def generate_cumulative_bar ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a cumulative bar plot. 
Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" for benchmark_result in benchmarking_results : self . _generate_cumulative_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) plt . clf () sns . catplot ( data = stats_df , kind = \"bar\" , x = \"Rank\" , y = \"Percentage\" , hue = \"Run\" , palette = self . palette_hex_codes , edgecolor = \"white\" , legend = False , ) . set ( xlabel = \"Rank\" , ylabel = benchmark_generator . y_label ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 ), ncol = 3 , title = \"Run\" ) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Cumulative Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) generate_non_cumulative_bar ( benchmarking_results , benchmark_generator ) Generate a non-cumulative bar plot. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. 
required Source code in src/pheval/analyse/generate_plots.py 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 def generate_non_cumulative_bar ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a non-cumulative bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () for benchmark_result in benchmarking_results : self . _generate_non_cumulative_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) sns . catplot ( data = stats_df , kind = \"bar\" , x = \"Rank\" , y = \"Percentage\" , hue = \"Run\" , palette = self . palette_hex_codes , edgecolor = \"white\" , legend = False , ) . set ( xlabel = \"Rank\" , ylabel = benchmark_generator . y_label ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 ), ncol = 3 , title = \"Run\" ) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Non-Cumulative Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) generate_precision_recall ( benchmarking_results , benchmark_generator ) Generate and plot Precision-Recall curves for binary classification benchmark results. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. 
required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required Source code in src/pheval/analyse/generate_plots.py 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 def generate_precision_recall ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ): \"\"\" Generate and plot Precision-Recall curves for binary classification benchmark results. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () plt . figure () for i , benchmark_result in enumerate ( benchmarking_results ): precision , recall , thresh = precision_recall_curve ( benchmark_result . binary_classification_stats . labels , benchmark_result . binary_classification_stats . scores , ) precision_recall_auc = auc ( recall , precision ) plt . plot ( recall , precision , label = f \" { self . return_benchmark_name ( benchmark_result ) } Precision-Recall Curve \" f \"(AUC = { precision_recall_auc : .2f } )\" , color = self . palette_hex_codes [ i ], ) plt . plot ( linestyle = \"--\" , color = \"gray\" ) plt . xlabel ( \"Recall\" ) plt . ylabel ( \"Precision\" ) if benchmark_generator . plot_customisation . precision_recall_title is None : plt . title ( \"Precision-Recall Curve\" ) else : plt . title ( benchmark_generator . plot_customisation . precision_recall_title ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 )) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . 
prioritisation_type_string } _pr_curve.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) generate_roc_curve ( benchmarking_results , benchmark_generator ) Generate and plot Receiver Operating Characteristic (ROC) curves for binary classification benchmark results. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required Source code in src/pheval/analyse/generate_plots.py 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 def generate_roc_curve ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ): \"\"\" Generate and plot Receiver Operating Characteristic (ROC) curves for binary classification benchmark results. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () for i , benchmark_result in enumerate ( benchmarking_results ): fpr , tpr , thresh = roc_curve ( benchmark_result . binary_classification_stats . labels , benchmark_result . binary_classification_stats . scores , pos_label = 1 , ) roc_auc = auc ( fpr , tpr ) plt . plot ( fpr , tpr , label = f \" { self . return_benchmark_name ( benchmark_result ) } ROC Curve (AUC = { roc_auc : .2f } )\" , color = self . palette_hex_codes [ i ], ) plt . plot ( linestyle = \"--\" , color = \"gray\" ) plt . xlabel ( \"False Positive Rate\" ) plt . ylabel ( \"True Positive Rate\" ) if benchmark_generator . plot_customisation . roc_curve_title is None : plt . title ( \"Receiver Operating Characteristic (ROC) Curve\" ) else : plt . title ( benchmark_generator . 
plot_customisation . roc_curve_title ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 )) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _roc_curve.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) generate_stacked_bar_plot ( benchmarking_results , benchmark_generator ) Generate a stacked bar plot and Mean Reciprocal Rank (MRR) bar plot. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required Source code in src/pheval/analyse/generate_plots.py 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 def generate_stacked_bar_plot ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a stacked bar plot and Mean Reciprocal Rank (MRR) bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" for benchmark_result in benchmarking_results : self . _generate_stacked_bar_plot_data ( benchmark_result ) self . _generate_stats_mrr_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) plt . clf () stats_df . set_index ( \"Run\" ) . plot ( kind = \"bar\" , stacked = True , color = self . palette_hex_codes , ylabel = benchmark_generator . y_label , edgecolor = \"white\" , ) . legend ( loc = \"center left\" , bbox_to_anchor = ( 1.0 , 0.5 )) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . 
prioritisation_type_string . capitalize () } Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 100 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) mrr_df = pd . DataFrame ( self . mrr ) mrr_df . set_index ( \"Run\" ) . plot ( kind = \"bar\" , color = self . palette_hex_codes , ylabel = f \" { benchmark_generator . prioritisation_type_string . capitalize () } mean reciprocal rank\" , legend = False , edgecolor = \"white\" , ) plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } results - mean reciprocal rank\" ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _mrr.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) return_benchmark_name ( benchmark_result ) Return the benchmark name for a run. Parameters: Name Type Description Default benchmark_result BenchmarkRunResults The benchmarking results for a run. required Returns: Name Type Description str str The benchmark name obtained from the given BenchmarkRunResults instance. Source code in src/pheval/analyse/generate_plots.py 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 def return_benchmark_name ( self , benchmark_result : BenchmarkRunResults ) -> str : \"\"\" Return the benchmark name for a run. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. Returns: str: The benchmark name obtained from the given BenchmarkRunResults instance. \"\"\" return ( benchmark_result . benchmark_name if benchmark_result . results_dir is None else self . _create_run_identifier ( benchmark_result . results_dir ) ) generate_plots ( benchmark_name , benchmarking_results , benchmark_generator , generate_from_db = False ) Generate summary statistics bar plots for prioritisation. 
This method generates summary statistics bar plots based on the provided benchmarking results and plot type. Parameters: Name Type Description Default benchmarking_results list [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required generate_from_db bool Specify whether to generate plots from the db file. Defaults to False. False Source code in src/pheval/analyse/generate_plots.py 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 def generate_plots ( benchmark_name : str , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , generate_from_db : bool = False , ) -> None : \"\"\" Generate summary statistics bar plots for prioritisation. This method generates summary statistics bar plots based on the provided benchmarking results and plot type. Args: benchmarking_results (list[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. generate_from_db (bool): Specify whether to generate plots from the db file. Defaults to False. \"\"\" plot_generator = PlotGenerator ( benchmark_name ) if not generate_from_db : plot_generator . generate_roc_curve ( benchmarking_results , benchmark_generator ) plot_generator . generate_precision_recall ( benchmarking_results , benchmark_generator ) if benchmark_generator . plot_customisation . plot_type == \"bar_stacked\" : plot_generator . generate_stacked_bar_plot ( benchmarking_results , benchmark_generator ) elif benchmark_generator . plot_customisation . plot_type == \"bar_cumulative\" : plot_generator . generate_cumulative_bar ( benchmarking_results , benchmark_generator ) elif benchmark_generator . plot_customisation . 
plot_type == \"bar_non_cumulative\" : plot_generator . generate_non_cumulative_bar ( benchmarking_results , benchmark_generator ) generate_plots_from_benchmark_summary_db ( benchmark_db , run_data ) Generate bar plot from summary benchmark results. Reads a summary of benchmark results from a benchmark db and generates a bar plot based on the analysis type and plot type. Parameters: Name Type Description Default benchmark_db Path Path to the summary TSV file containing benchmark results. required run_data Path Path to YAML benchmarking configuration file. required Source code in src/pheval/analyse/generate_plots.py 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 def generate_plots_from_benchmark_summary_db ( benchmark_db : Path , run_data : Path , ): \"\"\" Generate bar plot from summary benchmark results. Reads a summary of benchmark results from a benchmark db and generates a bar plot based on the analysis type and plot type. Args: benchmark_db (Path): Path to the summary TSV file containing benchmark results. run_data (Path): Path to YAML benchmarking configuration file. \"\"\" benchmark_stats_summary = parse_benchmark_db ( benchmark_db ) config = parse_run_config ( run_data ) if benchmark_stats_summary . gene_results : generate_plots ( config . benchmark_name , benchmark_stats_summary . gene_results , GeneBenchmarkRunOutputGenerator ( config . plot_customisation . gene_plots ), True , ) if benchmark_stats_summary . variant_results : generate_plots ( config . benchmark_name , benchmark_stats_summary . variant_results , VariantBenchmarkRunOutputGenerator ( config . plot_customisation . variant_plots ), True , ) elif benchmark_stats_summary . disease_results : generate_plots ( config . benchmark_name , benchmark_stats_summary . disease_results , DiseaseBenchmarkRunOutputGenerator ( config . plot_customisation . 
disease_plots ), True , ) trim_corpus_results_directory_suffix ( corpus_results_directory ) Trim the suffix from the corpus results directory name. Parameters: Name Type Description Default corpus_results_directory Path The directory path containing corpus results. required Returns: Name Type Description Path Path The Path object with the suffix removed from the directory name. Source code in src/pheval/analyse/generate_plots.py 21 22 23 24 25 26 27 28 29 30 31 def trim_corpus_results_directory_suffix ( corpus_results_directory : Path ) -> Path : \"\"\" Trim the suffix from the corpus results directory name. Args: corpus_results_directory (Path): The directory path containing corpus results. Returns: Path: The Path object with the suffix removed from the directory name. \"\"\" return Path ( str ( corpus_results_directory ) . replace ( \"_results\" , \"\" ))","title":"Generate plots"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.PlotGenerator","text":"Class to generate plots. 
Source code in src/pheval/analyse/generate_plots.py 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 class PlotGenerator : \"\"\"Class to generate plots.\"\"\" palette_hex_codes = [ \"#f4ae3d\" , \"#ee5825\" , \"#2b7288\" , \"#9a84b2\" , \"#0c604c\" , \"#c94c4c\" , \"#3d8e83\" , \"#725ac1\" , \"#e7ba52\" , 
\"#1b9e77\" , ] def __init__ ( self , benchmark_name : str ): \"\"\" Initialise the PlotGenerator class. Note: `self.stats` will be used to store statistics data. `self.mrr` will store Mean Reciprocal Rank (MRR) values. Matplotlib settings are configured to remove the right and top axes spines for generated plots. \"\"\" self . benchmark_name = benchmark_name self . stats , self . mrr = [], [] matplotlib . rcParams [ \"axes.spines.right\" ] = False matplotlib . rcParams [ \"axes.spines.top\" ] = False @staticmethod def _create_run_identifier ( results_dir : Path ) -> str : \"\"\" Create a run identifier from a path. Args: results_dir (Path): The directory path for results. Returns: str: A string representing the run identifier created from the given path. \"\"\" return f \" { Path ( results_dir ) . parents [ 0 ] . name } _ { trim_corpus_results_directory_suffix ( Path ( results_dir ) . name ) } \" def return_benchmark_name ( self , benchmark_result : BenchmarkRunResults ) -> str : \"\"\" Return the benchmark name for a run. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. Returns: str: The benchmark name obtained from the given BenchmarkRunResults instance. \"\"\" return ( benchmark_result . benchmark_name if benchmark_result . results_dir is None else self . _create_run_identifier ( benchmark_result . results_dir ) ) def _generate_stacked_bar_plot_data ( self , benchmark_result : BenchmarkRunResults ) -> None : \"\"\" Generate data in the correct format for dataframe creation for a stacked bar plot, appending to the self.stats attribute of the class. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. \"\"\" rank_stats = benchmark_result . rank_stats self . stats . append ( { \"Run\" : self . return_benchmark_name ( benchmark_result ), \"Top\" : benchmark_result . rank_stats . percentage_top (), \"2-3\" : rank_stats . percentage_difference ( rank_stats . percentage_top3 (), rank_stats . 
percentage_top () ), \"4-5\" : rank_stats . percentage_difference ( rank_stats . percentage_top5 (), rank_stats . percentage_top3 () ), \"6-10\" : rank_stats . percentage_difference ( rank_stats . percentage_top10 (), rank_stats . percentage_top5 () ), \">10\" : rank_stats . percentage_difference ( rank_stats . percentage_found (), rank_stats . percentage_top10 () ), \"Missed\" : rank_stats . percentage_difference ( 100 , rank_stats . percentage_found ()), } ) def _generate_stats_mrr_bar_plot_data ( self , benchmark_result : BenchmarkRunResults ) -> None : \"\"\" Generate data in the correct format for dataframe creation for MRR (Mean Reciprocal Rank) bar plot, appending to the self.mrr attribute of the class. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. \"\"\" self . mrr . extend ( [ { \"Rank\" : \"MRR\" , \"Percentage\" : benchmark_result . rank_stats . return_mean_reciprocal_rank (), \"Run\" : self . return_benchmark_name ( benchmark_result ), } ] ) def generate_stacked_bar_plot ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a stacked bar plot and Mean Reciprocal Rank (MRR) bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" for benchmark_result in benchmarking_results : self . _generate_stacked_bar_plot_data ( benchmark_result ) self . _generate_stats_mrr_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) plt . clf () stats_df . set_index ( \"Run\" ) . plot ( kind = \"bar\" , stacked = True , color = self . palette_hex_codes , ylabel = benchmark_generator . y_label , edgecolor = \"white\" , ) . legend ( loc = \"center left\" , bbox_to_anchor = ( 1.0 , 0.5 )) if benchmark_generator . plot_customisation . 
rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 100 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) mrr_df = pd . DataFrame ( self . mrr ) mrr_df . set_index ( \"Run\" ) . plot ( kind = \"bar\" , color = self . palette_hex_codes , ylabel = f \" { benchmark_generator . prioritisation_type_string . capitalize () } mean reciprocal rank\" , legend = False , edgecolor = \"white\" , ) plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } results - mean reciprocal rank\" ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _mrr.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) def _generate_cumulative_bar_plot_data ( self , benchmark_result : BenchmarkRunResults ): \"\"\" Generate data in the correct format for dataframe creation for a cumulative bar plot, appending to the self.stats attribute of the class. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. \"\"\" rank_stats = benchmark_result . rank_stats run_identifier = self . return_benchmark_name ( benchmark_result ) self . stats . extend ( [ { \"Rank\" : \"Top\" , \"Percentage\" : rank_stats . percentage_top () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Top3\" , \"Percentage\" : rank_stats . percentage_top3 () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Top5\" , \"Percentage\" : rank_stats . percentage_top5 () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Top10\" , \"Percentage\" : rank_stats . percentage_top10 () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Found\" , \"Percentage\" : rank_stats . 
percentage_found () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Missed\" , \"Percentage\" : rank_stats . percentage_difference ( 100 , rank_stats . percentage_found () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"MRR\" , \"Percentage\" : rank_stats . return_mean_reciprocal_rank (), \"Run\" : run_identifier , }, ] ) def generate_cumulative_bar ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a cumulative bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" for benchmark_result in benchmarking_results : self . _generate_cumulative_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) plt . clf () sns . catplot ( data = stats_df , kind = \"bar\" , x = \"Rank\" , y = \"Percentage\" , hue = \"Run\" , palette = self . palette_hex_codes , edgecolor = \"white\" , legend = False , ) . set ( xlabel = \"Rank\" , ylabel = benchmark_generator . y_label ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 ), ncol = 3 , title = \"Run\" ) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Cumulative Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . 
prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) def _generate_non_cumulative_bar_plot_data ( self , benchmark_result : BenchmarkRunResults ) -> [ dict ]: \"\"\" Generate data in the correct format for dataframe creation for a non-cumulative bar plot, appending to the self.stats attribute of the class. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. \"\"\" rank_stats = benchmark_result . rank_stats run_identifier = self . return_benchmark_name ( benchmark_result ) self . stats . extend ( [ { \"Rank\" : \"Top\" , \"Percentage\" : rank_stats . percentage_top () / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"2-3\" , \"Percentage\" : rank_stats . percentage_difference ( rank_stats . percentage_top3 (), rank_stats . percentage_top () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"4-5\" , \"Percentage\" : rank_stats . percentage_difference ( rank_stats . percentage_top5 (), rank_stats . percentage_top3 () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"6-10\" , \"Percentage\" : rank_stats . percentage_difference ( rank_stats . percentage_top10 (), rank_stats . percentage_top5 () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \">10\" , \"Percentage\" : rank_stats . percentage_difference ( rank_stats . percentage_found (), rank_stats . percentage_top10 () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"Missed\" , \"Percentage\" : rank_stats . percentage_difference ( 100 , rank_stats . percentage_found () ) / 100 , \"Run\" : run_identifier , }, { \"Rank\" : \"MRR\" , \"Percentage\" : rank_stats . return_mean_reciprocal_rank (), \"Run\" : run_identifier , }, ] ) def generate_roc_curve ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ): \"\"\" Generate and plot Receiver Operating Characteristic (ROC) curves for binary classification benchmark results. 
Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () for i , benchmark_result in enumerate ( benchmarking_results ): fpr , tpr , thresh = roc_curve ( benchmark_result . binary_classification_stats . labels , benchmark_result . binary_classification_stats . scores , pos_label = 1 , ) roc_auc = auc ( fpr , tpr ) plt . plot ( fpr , tpr , label = f \" { self . return_benchmark_name ( benchmark_result ) } ROC Curve (AUC = { roc_auc : .2f } )\" , color = self . palette_hex_codes [ i ], ) plt . plot ( linestyle = \"--\" , color = \"gray\" ) plt . xlabel ( \"False Positive Rate\" ) plt . ylabel ( \"True Positive Rate\" ) if benchmark_generator . plot_customisation . roc_curve_title is None : plt . title ( \"Receiver Operating Characteristic (ROC) Curve\" ) else : plt . title ( benchmark_generator . plot_customisation . roc_curve_title ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 )) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _roc_curve.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) def generate_precision_recall ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ): \"\"\" Generate and plot Precision-Recall curves for binary classification benchmark results. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () plt . figure () for i , benchmark_result in enumerate ( benchmarking_results ): precision , recall , thresh = precision_recall_curve ( benchmark_result . binary_classification_stats . labels , benchmark_result . binary_classification_stats . 
scores , ) precision_recall_auc = auc ( recall , precision ) plt . plot ( recall , precision , label = f \" { self . return_benchmark_name ( benchmark_result ) } Precision-Recall Curve \" f \"(AUC = { precision_recall_auc : .2f } )\" , color = self . palette_hex_codes [ i ], ) plt . plot ( linestyle = \"--\" , color = \"gray\" ) plt . xlabel ( \"Recall\" ) plt . ylabel ( \"Precision\" ) if benchmark_generator . plot_customisation . precision_recall_title is None : plt . title ( \"Precision-Recall Curve\" ) else : plt . title ( benchmark_generator . plot_customisation . precision_recall_title ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 )) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _pr_curve.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) def generate_non_cumulative_bar ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a non-cumulative bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () for benchmark_result in benchmarking_results : self . _generate_non_cumulative_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) sns . catplot ( data = stats_df , kind = \"bar\" , x = \"Rank\" , y = \"Percentage\" , hue = \"Run\" , palette = self . palette_hex_codes , edgecolor = \"white\" , legend = False , ) . set ( xlabel = \"Rank\" , ylabel = benchmark_generator . y_label ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 ), ncol = 3 , title = \"Run\" ) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Non-Cumulative Rank Stats\" ) else : plt . 
title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , )","title":"PlotGenerator"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.PlotGenerator.__init__","text":"Initialise the PlotGenerator class. Note: self.stats will be used to store statistics data. self.mrr will store Mean Reciprocal Rank (MRR) values. Matplotlib settings are configured to remove the right and top axes spines for generated plots. Source code in src/pheval/analyse/generate_plots.py 50 51 52 53 54 55 56 57 58 59 60 61 62 def __init__ ( self , benchmark_name : str ): \"\"\" Initialise the PlotGenerator class. Note: `self.stats` will be used to store statistics data. `self.mrr` will store Mean Reciprocal Rank (MRR) values. Matplotlib settings are configured to remove the right and top axes spines for generated plots. \"\"\" self . benchmark_name = benchmark_name self . stats , self . mrr = [], [] matplotlib . rcParams [ \"axes.spines.right\" ] = False matplotlib . rcParams [ \"axes.spines.top\" ] = False","title":"__init__"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.PlotGenerator.generate_cumulative_bar","text":"Generate a cumulative bar plot. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. 
required Source code in src/pheval/analyse/generate_plots.py 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 def generate_cumulative_bar ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a cumulative bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" for benchmark_result in benchmarking_results : self . _generate_cumulative_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) plt . clf () sns . catplot ( data = stats_df , kind = \"bar\" , x = \"Rank\" , y = \"Percentage\" , hue = \"Run\" , palette = self . palette_hex_codes , edgecolor = \"white\" , legend = False , ) . set ( xlabel = \"Rank\" , ylabel = benchmark_generator . y_label ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 ), ncol = 3 , title = \"Run\" ) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Cumulative Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , )","title":"generate_cumulative_bar"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.PlotGenerator.generate_non_cumulative_bar","text":"Generate a non-cumulative bar plot. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. 
required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required Source code in src/pheval/analyse/generate_plots.py 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 def generate_non_cumulative_bar ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a non-cumulative bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () for benchmark_result in benchmarking_results : self . _generate_non_cumulative_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) sns . catplot ( data = stats_df , kind = \"bar\" , x = \"Rank\" , y = \"Percentage\" , hue = \"Run\" , palette = self . palette_hex_codes , edgecolor = \"white\" , legend = False , ) . set ( xlabel = \"Rank\" , ylabel = benchmark_generator . y_label ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 ), ncol = 3 , title = \"Run\" ) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Non-Cumulative Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . 
prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , )","title":"generate_non_cumulative_bar"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.PlotGenerator.generate_precision_recall","text":"Generate and plot Precision-Recall curves for binary classification benchmark results. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required Source code in src/pheval/analyse/generate_plots.py 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 def generate_precision_recall ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ): \"\"\" Generate and plot Precision-Recall curves for binary classification benchmark results. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () plt . figure () for i , benchmark_result in enumerate ( benchmarking_results ): precision , recall , thresh = precision_recall_curve ( benchmark_result . binary_classification_stats . labels , benchmark_result . binary_classification_stats . scores , ) precision_recall_auc = auc ( recall , precision ) plt . plot ( recall , precision , label = f \" { self . return_benchmark_name ( benchmark_result ) } Precision-Recall Curve \" f \"(AUC = { precision_recall_auc : .2f } )\" , color = self . palette_hex_codes [ i ], ) plt . plot ( linestyle = \"--\" , color = \"gray\" ) plt . xlabel ( \"Recall\" ) plt . ylabel ( \"Precision\" ) if benchmark_generator . plot_customisation . 
precision_recall_title is None : plt . title ( \"Precision-Recall Curve\" ) else : plt . title ( benchmark_generator . plot_customisation . precision_recall_title ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 )) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _pr_curve.svg\" , format = \"svg\" , bbox_inches = \"tight\" , )","title":"generate_precision_recall"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.PlotGenerator.generate_roc_curve","text":"Generate and plot Receiver Operating Characteristic (ROC) curves for binary classification benchmark results. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required Source code in src/pheval/analyse/generate_plots.py 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 def generate_roc_curve ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ): \"\"\" Generate and plot Receiver Operating Characteristic (ROC) curves for binary classification benchmark results. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" plt . clf () for i , benchmark_result in enumerate ( benchmarking_results ): fpr , tpr , thresh = roc_curve ( benchmark_result . binary_classification_stats . labels , benchmark_result . binary_classification_stats . scores , pos_label = 1 , ) roc_auc = auc ( fpr , tpr ) plt . plot ( fpr , tpr , label = f \" { self . 
return_benchmark_name ( benchmark_result ) } ROC Curve (AUC = { roc_auc : .2f } )\" , color = self . palette_hex_codes [ i ], ) plt . plot ( linestyle = \"--\" , color = \"gray\" ) plt . xlabel ( \"False Positive Rate\" ) plt . ylabel ( \"True Positive Rate\" ) if benchmark_generator . plot_customisation . roc_curve_title is None : plt . title ( \"Receiver Operating Characteristic (ROC) Curve\" ) else : plt . title ( benchmark_generator . plot_customisation . roc_curve_title ) plt . legend ( loc = \"upper center\" , bbox_to_anchor = ( 0.5 , - 0.15 )) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _roc_curve.svg\" , format = \"svg\" , bbox_inches = \"tight\" , )","title":"generate_roc_curve"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.PlotGenerator.generate_stacked_bar_plot","text":"Generate a stacked bar plot and Mean Reciprocal Rank (MRR) bar plot. Parameters: Name Type Description Default benchmarking_results List [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required Source code in src/pheval/analyse/generate_plots.py 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 def generate_stacked_bar_plot ( self , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , ) -> None : \"\"\" Generate a stacked bar plot and Mean Reciprocal Rank (MRR) bar plot. Args: benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. \"\"\" for benchmark_result in benchmarking_results : self . 
_generate_stacked_bar_plot_data ( benchmark_result ) self . _generate_stats_mrr_bar_plot_data ( benchmark_result ) stats_df = pd . DataFrame ( self . stats ) plt . clf () stats_df . set_index ( \"Run\" ) . plot ( kind = \"bar\" , stacked = True , color = self . palette_hex_codes , ylabel = benchmark_generator . y_label , edgecolor = \"white\" , ) . legend ( loc = \"center left\" , bbox_to_anchor = ( 1.0 , 0.5 )) if benchmark_generator . plot_customisation . rank_plot_title is None : plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } Rank Stats\" ) else : plt . title ( benchmark_generator . plot_customisation . rank_plot_title , loc = \"center\" , fontsize = 15 ) plt . ylim ( 0 , 100 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _rank_stats.svg\" , format = \"svg\" , bbox_inches = \"tight\" , ) mrr_df = pd . DataFrame ( self . mrr ) mrr_df . set_index ( \"Run\" ) . plot ( kind = \"bar\" , color = self . palette_hex_codes , ylabel = f \" { benchmark_generator . prioritisation_type_string . capitalize () } mean reciprocal rank\" , legend = False , edgecolor = \"white\" , ) plt . title ( f \" { benchmark_generator . prioritisation_type_string . capitalize () } results - mean reciprocal rank\" ) plt . ylim ( 0 , 1 ) plt . savefig ( f \" { self . benchmark_name } _ { benchmark_generator . prioritisation_type_string } _mrr.svg\" , format = \"svg\" , bbox_inches = \"tight\" , )","title":"generate_stacked_bar_plot"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.PlotGenerator.return_benchmark_name","text":"Return the benchmark name for a run. Parameters: Name Type Description Default benchmark_result BenchmarkRunResults The benchmarking results for a run. required Returns: Name Type Description str str The benchmark name obtained from the given BenchmarkRunResults instance. 
Source code in src/pheval/analyse/generate_plots.py 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 def return_benchmark_name ( self , benchmark_result : BenchmarkRunResults ) -> str : \"\"\" Return the benchmark name for a run. Args: benchmark_result (BenchmarkRunResults): The benchmarking results for a run. Returns: str: The benchmark name obtained from the given BenchmarkRunResults instance. \"\"\" return ( benchmark_result . benchmark_name if benchmark_result . results_dir is None else self . _create_run_identifier ( benchmark_result . results_dir ) )","title":"return_benchmark_name"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.generate_plots","text":"Generate summary statistics bar plots for prioritisation. This method generates summary statistics bar plots based on the provided benchmarking results and plot type. Parameters: Name Type Description Default benchmarking_results list [ BenchmarkRunResults ] List of benchmarking results for multiple runs. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required generate_from_db bool Specify whether to generate plots from the db file. Defaults to False. False Source code in src/pheval/analyse/generate_plots.py 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 def generate_plots ( benchmark_name : str , benchmarking_results : List [ BenchmarkRunResults ], benchmark_generator : BenchmarkRunOutputGenerator , generate_from_db : bool = False , ) -> None : \"\"\" Generate summary statistics bar plots for prioritisation. This method generates summary statistics bar plots based on the provided benchmarking results and plot type. Args: benchmarking_results (list[BenchmarkRunResults]): List of benchmarking results for multiple runs. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. 
generate_from_db (bool): Specify whether to generate plots from the db file. Defaults to False. \"\"\" plot_generator = PlotGenerator ( benchmark_name ) if not generate_from_db : plot_generator . generate_roc_curve ( benchmarking_results , benchmark_generator ) plot_generator . generate_precision_recall ( benchmarking_results , benchmark_generator ) if benchmark_generator . plot_customisation . plot_type == \"bar_stacked\" : plot_generator . generate_stacked_bar_plot ( benchmarking_results , benchmark_generator ) elif benchmark_generator . plot_customisation . plot_type == \"bar_cumulative\" : plot_generator . generate_cumulative_bar ( benchmarking_results , benchmark_generator ) elif benchmark_generator . plot_customisation . plot_type == \"bar_non_cumulative\" : plot_generator . generate_non_cumulative_bar ( benchmarking_results , benchmark_generator )","title":"generate_plots"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.generate_plots_from_benchmark_summary_db","text":"Generate bar plot from summary benchmark results. Reads a summary of benchmark results from a benchmark db and generates a bar plot based on the analysis type and plot type. Parameters: Name Type Description Default benchmark_db Path Path to the summary TSV file containing benchmark results. required run_data Path Path to YAML benchmarking configuration file. required Source code in src/pheval/analyse/generate_plots.py 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 def generate_plots_from_benchmark_summary_db ( benchmark_db : Path , run_data : Path , ): \"\"\" Generate bar plot from summary benchmark results. Reads a summary of benchmark results from a benchmark db and generates a bar plot based on the analysis type and plot type. Args: benchmark_db (Path): Path to the summary TSV file containing benchmark results. 
run_data (Path): Path to YAML benchmarking configuration file. \"\"\" benchmark_stats_summary = parse_benchmark_db ( benchmark_db ) config = parse_run_config ( run_data ) if benchmark_stats_summary . gene_results : generate_plots ( config . benchmark_name , benchmark_stats_summary . gene_results , GeneBenchmarkRunOutputGenerator ( config . plot_customisation . gene_plots ), True , ) if benchmark_stats_summary . variant_results : generate_plots ( config . benchmark_name , benchmark_stats_summary . variant_results , VariantBenchmarkRunOutputGenerator ( config . plot_customisation . variant_plots ), True , ) elif benchmark_stats_summary . disease_results : generate_plots ( config . benchmark_name , benchmark_stats_summary . disease_results , DiseaseBenchmarkRunOutputGenerator ( config . plot_customisation . disease_plots ), True , )","title":"generate_plots_from_benchmark_summary_db"},{"location":"api/pheval/analyse/generate_plots/#src.pheval.analyse.generate_plots.trim_corpus_results_directory_suffix","text":"Trim the suffix from the corpus results directory name. Parameters: Name Type Description Default corpus_results_directory Path The directory path containing corpus results. required Returns: Name Type Description Path Path The Path object with the suffix removed from the directory name. Source code in src/pheval/analyse/generate_plots.py 21 22 23 24 25 26 27 28 29 30 31 def trim_corpus_results_directory_suffix ( corpus_results_directory : Path ) -> Path : \"\"\" Trim the suffix from the corpus results directory name. Args: corpus_results_directory (Path): The directory path containing corpus results. Returns: Path: The Path object with the suffix removed from the directory name. \"\"\" return Path ( str ( corpus_results_directory ) . 
replace ( \"_results\" , \"\" ))","title":"trim_corpus_results_directory_suffix"},{"location":"api/pheval/analyse/generate_summary_outputs/","text":"create_comparison_table ( comparison_table_name , connector , drop_columns , run_identifier_1 , run_identifier_2 , table_name ) Create rank comparison tables. Args: comparison_table_name (str): Name of the comparison table to create. connector (BenchmarkDBManager): DBConnector instance. drop_columns (List[str]): List of columns to drop. run_identifier_1 (str): The first run identifier. run_identifier_2 (str): The second run identifier. table_name (str): Name of the table to extract ranks from Source code in src/pheval/analyse/generate_summary_outputs.py 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 def create_comparison_table ( comparison_table_name : str , connector : BenchmarkDBManager , drop_columns : List [ str ], run_identifier_1 : str , run_identifier_2 : str , table_name : str , ) -> None : \"\"\" Create rank comparison tables. Args: comparison_table_name (str): Name of the comparison table to create. connector (BenchmarkDBManager): DBConnector instance. drop_columns (List[str]): List of columns to drop. run_identifier_1 (str): The first run identifier. run_identifier_2 (str): The second run identifier. table_name (str): Name of the table to extract ranks from \"\"\" connector . drop_table ( comparison_table_name ) excluded_columns = tuple ( drop_columns + [ \"identifier\" ]) if drop_columns else ( \"identifier\" ,) connector . conn . execute ( f 'CREATE TABLE \" { comparison_table_name } \" AS SELECT * ' f \"EXCLUDE { excluded_columns } FROM { table_name } \" ) connector . conn . execute ( f \"\"\"ALTER TABLE \" { comparison_table_name } \" ADD COLUMN rank_change VARCHAR;\"\"\" ) connector . conn . 
execute ( f 'UPDATE \" { comparison_table_name } \" SET rank_change = CASE WHEN \" { run_identifier_1 } \" = 0 ' f 'AND \" { run_identifier_2 } \" != 0 ' f \"THEN 'GAINED' WHEN \\\" { run_identifier_1 } \\\" != 0 AND \\\" { run_identifier_2 } \\\" = 0 THEN 'LOST' ELSE \" f 'CAST (\" { run_identifier_1 } \" - \" { run_identifier_2 } \" AS VARCHAR) END;' ) connector . conn . commit () generate_benchmark_comparison_output ( benchmark_name , benchmarking_results , run_identifiers , benchmark_generator , table_name ) Generate prioritisation outputs for benchmarking multiple runs. This function generates comparison outputs for benchmarking multiple runs. It compares the results between pairs of BenchmarkRunResults instances in benchmarking_results and generates rank comparison outputs using RankComparisonGenerator for each pair. Parameters: Name Type Description Default benchmark_name str Name of the benchmark. required benchmarking_results List [ BenchmarkRunResults ] A list containing BenchmarkRunResults instances representing the benchmarking results of multiple runs. required run_identifiers List [ str ] A list of run identifiers. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required table_name str The name of the table where ranks are stored. required Source code in src/pheval/analyse/generate_summary_outputs.py 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 def generate_benchmark_comparison_output ( benchmark_name : str , benchmarking_results : List [ BenchmarkRunResults ], run_identifiers : List [ str ], benchmark_generator : BenchmarkRunOutputGenerator , table_name : str , ) -> None : \"\"\" Generate prioritisation outputs for benchmarking multiple runs. This function generates comparison outputs for benchmarking multiple runs. 
It compares the results between pairs of `BenchmarkRunResults` instances in `benchmarking_results` and generates rank comparison outputs using `RankComparisonGenerator` for each pair. Args: benchmark_name (str): Name of the benchmark. benchmarking_results (List[BenchmarkRunResults]): A list containing BenchmarkRunResults instances representing the benchmarking results of multiple runs. run_identifiers (List[str]): A list of run identifiers. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. table_name (str): The name of the table where ranks are stored. \"\"\" output_prefix = benchmark_generator . prioritisation_type_string connector = BenchmarkDBManager ( benchmark_name ) for pair in itertools . combinations ( [ str ( result . benchmark_name ) for result in benchmarking_results ], 2 ): run_identifier_1 = pair [ 0 ] run_identifier_2 = pair [ 1 ] drop_columns = [ run for run in run_identifiers if run not in pair ] comparison_table_name = get_new_table_name ( run_identifier_1 , run_identifier_2 , output_prefix ) create_comparison_table ( comparison_table_name , connector , drop_columns , run_identifier_1 , run_identifier_2 , table_name , ) generate_plots ( benchmark_name , benchmarking_results , benchmark_generator , ) get_new_table_name ( run_identifier_1 , run_identifier_2 , output_prefix ) Get the new table name for rank comparison tables. Args: run_identifier_1: The first run identifier. run_identifier_2: The second run identifier. output_prefix: The output prefix of the table Returns: The new table name. Source code in src/pheval/analyse/generate_summary_outputs.py 10 11 12 13 14 15 16 17 18 19 20 def get_new_table_name ( run_identifier_1 : str , run_identifier_2 : str , output_prefix : str ) -> str : \"\"\" Get the new table name for rank comparison tables. Args: run_identifier_1: The first run identifier. run_identifier_2: The second run identifier. 
output_prefix: The output prefix of the table Returns: The new table name. \"\"\" return f \" { run_identifier_1 } _vs_\" f \" { run_identifier_2 } _\" f \" { output_prefix } _rank_comparison\"","title":"Generate summary outputs"},{"location":"api/pheval/analyse/generate_summary_outputs/#src.pheval.analyse.generate_summary_outputs.create_comparison_table","text":"Create rank comparison tables. Args: comparison_table_name (str): Name of the comparison table to create. connector (BenchmarkDBManager): DBConnector instance. drop_columns (List[str]): List of columns to drop. run_identifier_1 (str): The first run identifier. run_identifier_2 (str): The second run identifier. table_name (str): Name of the table to extract ranks from Source code in src/pheval/analyse/generate_summary_outputs.py 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 def create_comparison_table ( comparison_table_name : str , connector : BenchmarkDBManager , drop_columns : List [ str ], run_identifier_1 : str , run_identifier_2 : str , table_name : str , ) -> None : \"\"\" Create rank comparison tables. Args: comparison_table_name (str): Name of the comparison table to create. connector (BenchmarkDBManager): DBConnector instance. drop_columns (List[str]): List of columns to drop. run_identifier_1 (str): The first run identifier. run_identifier_2 (str): The second run identifier. table_name (str): Name of the table to extract ranks from \"\"\" connector . drop_table ( comparison_table_name ) excluded_columns = tuple ( drop_columns + [ \"identifier\" ]) if drop_columns else ( \"identifier\" ,) connector . conn . execute ( f 'CREATE TABLE \" { comparison_table_name } \" AS SELECT * ' f \"EXCLUDE { excluded_columns } FROM { table_name } \" ) connector . conn . execute ( f \"\"\"ALTER TABLE \" { comparison_table_name } \" ADD COLUMN rank_change VARCHAR;\"\"\" ) connector . conn . 
execute ( f 'UPDATE \" { comparison_table_name } \" SET rank_change = CASE WHEN \" { run_identifier_1 } \" = 0 ' f 'AND \" { run_identifier_2 } \" != 0 ' f \"THEN 'GAINED' WHEN \\\" { run_identifier_1 } \\\" != 0 AND \\\" { run_identifier_2 } \\\" = 0 THEN 'LOST' ELSE \" f 'CAST (\" { run_identifier_1 } \" - \" { run_identifier_2 } \" AS VARCHAR) END;' ) connector . conn . commit ()","title":"create_comparison_table"},{"location":"api/pheval/analyse/generate_summary_outputs/#src.pheval.analyse.generate_summary_outputs.generate_benchmark_comparison_output","text":"Generate prioritisation outputs for benchmarking multiple runs. This function generates comparison outputs for benchmarking multiple runs. It compares the results between pairs of BenchmarkRunResults instances in benchmarking_results and generates rank comparison outputs using RankComparisonGenerator for each pair. Parameters: Name Type Description Default benchmark_name str Name of the benchmark. required benchmarking_results List [ BenchmarkRunResults ] A list containing BenchmarkRunResults instances representing the benchmarking results of multiple runs. required run_identifiers List [ str ] A list of run identifiers. required benchmark_generator BenchmarkRunOutputGenerator Object containing benchmarking output generation details. required table_name str The name of the table where ranks are stored. required Source code in src/pheval/analyse/generate_summary_outputs.py 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 def generate_benchmark_comparison_output ( benchmark_name : str , benchmarking_results : List [ BenchmarkRunResults ], run_identifiers : List [ str ], benchmark_generator : BenchmarkRunOutputGenerator , table_name : str , ) -> None : \"\"\" Generate prioritisation outputs for benchmarking multiple runs. This function generates comparison outputs for benchmarking multiple runs. 
It compares the results between pairs of `BenchmarkRunResults` instances in `benchmarking_results` and generates rank comparison outputs using `RankComparisonGenerator` for each pair. Args: benchmark_name (str): Name of the benchmark. benchmarking_results (List[BenchmarkRunResults]): A list containing BenchmarkRunResults instances representing the benchmarking results of multiple runs. run_identifiers (List[str]): A list of run identifiers. benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details. table_name (str): The name of the table where ranks are stored. \"\"\" output_prefix = benchmark_generator . prioritisation_type_string connector = BenchmarkDBManager ( benchmark_name ) for pair in itertools . combinations ( [ str ( result . benchmark_name ) for result in benchmarking_results ], 2 ): run_identifier_1 = pair [ 0 ] run_identifier_2 = pair [ 1 ] drop_columns = [ run for run in run_identifiers if run not in pair ] comparison_table_name = get_new_table_name ( run_identifier_1 , run_identifier_2 , output_prefix ) create_comparison_table ( comparison_table_name , connector , drop_columns , run_identifier_1 , run_identifier_2 , table_name , ) generate_plots ( benchmark_name , benchmarking_results , benchmark_generator , )","title":"generate_benchmark_comparison_output"},{"location":"api/pheval/analyse/generate_summary_outputs/#src.pheval.analyse.generate_summary_outputs.get_new_table_name","text":"Get the new table name for rank comparison tables. Args: run_identifier_1: The first run identifier. run_identifier_2: The second run identifier. output_prefix: The output prefix of the table Returns: The new table name. Source code in src/pheval/analyse/generate_summary_outputs.py 10 11 12 13 14 15 16 17 18 19 20 def get_new_table_name ( run_identifier_1 : str , run_identifier_2 : str , output_prefix : str ) -> str : \"\"\" Get the new table name for rank comparison tables. 
Args: run_identifier_1: The first run identifier. run_identifier_2: The second run identifier. output_prefix: The output prefix of the table Returns: The new table name. \"\"\" return f \" { run_identifier_1 } _vs_\" f \" { run_identifier_2 } _\" f \" { output_prefix } _rank_comparison\"","title":"get_new_table_name"},{"location":"api/pheval/analyse/parse_benchmark_summary/","text":"parse_benchmark_db ( benchmarking_db ) Read the summary benchmark TSV output generated from the benchmark-comparison command. Parameters: Name Type Description Default benchmarking_db Path Path to the benchmark db. required Returns: Name Type Description BenchmarkSummaryResults BenchmarkSummaryResults A dataclass containing all benchmarking results contained in the db. Source code in src/pheval/analyse/parse_benchmark_summary.py 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 def parse_benchmark_db ( benchmarking_db : Path ) -> BenchmarkSummaryResults : \"\"\" Read the summary benchmark TSV output generated from the benchmark-comparison command. Args: benchmarking_db (Path): Path to the benchmark db. Returns: BenchmarkSummaryResults: A dataclass containing all benchmarking results contained in the db. \"\"\" db_connector = BenchmarkDBManager ( benchmarking_db ) gene_benchmarking_results , disease_benchmarking_results , variant_benchmarking_results = ( None , None , None , ) if db_connector . check_table_exists ( \"gene_summary\" ): gene_benchmarking_results = parse_benchmark_results ( db_connector . conn . execute ( \"SELECT * FROM gene_summary\" ) . fetchdf () ) if db_connector . check_table_exists ( \"disease_summary\" ): disease_benchmarking_results = parse_benchmark_results ( db_connector . conn . execute ( \"SELECT * FROM disease_summary\" ) . fetchdf () ) if db_connector . check_table_exists ( \"variant_summary\" ): variant_benchmarking_results = parse_benchmark_results ( db_connector . conn . 
execute ( \"SELECT * FROM variant_summary\" ) . fetchdf () ) return BenchmarkSummaryResults ( gene_results = gene_benchmarking_results , disease_results = disease_benchmarking_results , variant_results = variant_benchmarking_results , ) parse_benchmark_results ( benchmark_summary_table ) Parse benchmark results from a DataFrame. Parameters: Name Type Description Default benchmark_summary_table DataFrame DataFrame containing benchmark results. required Returns: Type Description List [ BenchmarkRunResults ] List[BenchmarkRunResults]: A list of BenchmarkRunResults objects parsed from the DataFrame. Source code in src/pheval/analyse/parse_benchmark_summary.py 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 def parse_benchmark_results ( benchmark_summary_table : pd . DataFrame ) -> List [ BenchmarkRunResults ]: \"\"\" Parse benchmark results from a DataFrame. Args: benchmark_summary_table (pd.DataFrame): DataFrame containing benchmark results. Returns: List[BenchmarkRunResults]: A list of BenchmarkRunResults objects parsed from the DataFrame. \"\"\" results = [] for _ , row in benchmark_summary_table . iterrows (): benchmarking_result = BenchmarkRunResults ( rank_stats = RankStats ( top = row [ \"top\" ], top3 = row [ \"top3\" ], top5 = row [ \"top5\" ], top10 = row [ \"top10\" ], found = row [ \"found\" ], total = row [ \"total\" ], mrr = row [ \"mean_reciprocal_rank\" ], ), benchmark_name = row [ \"results_directory_path\" ], binary_classification_stats = BinaryClassificationStats (), ) results . append ( benchmarking_result ) return results","title":"Parse benchmark summary"},{"location":"api/pheval/analyse/parse_benchmark_summary/#src.pheval.analyse.parse_benchmark_summary.parse_benchmark_db","text":"Read the summary benchmark TSV output generated from the benchmark-comparison command. Parameters: Name Type Description Default benchmarking_db Path Path to the benchmark db. 
required Returns: Name Type Description BenchmarkSummaryResults BenchmarkSummaryResults A dataclass containing all benchmarking results contained in the db. Source code in src/pheval/analyse/parse_benchmark_summary.py 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 def parse_benchmark_db ( benchmarking_db : Path ) -> BenchmarkSummaryResults : \"\"\" Read the summary benchmark TSV output generated from the benchmark-comparison command. Args: benchmarking_db (Path): Path to the benchmark db. Returns: BenchmarkSummaryResults: A dataclass containing all benchmarking results contained in the db. \"\"\" db_connector = BenchmarkDBManager ( benchmarking_db ) gene_benchmarking_results , disease_benchmarking_results , variant_benchmarking_results = ( None , None , None , ) if db_connector . check_table_exists ( \"gene_summary\" ): gene_benchmarking_results = parse_benchmark_results ( db_connector . conn . execute ( \"SELECT * FROM gene_summary\" ) . fetchdf () ) if db_connector . check_table_exists ( \"disease_summary\" ): disease_benchmarking_results = parse_benchmark_results ( db_connector . conn . execute ( \"SELECT * FROM disease_summary\" ) . fetchdf () ) if db_connector . check_table_exists ( \"variant_summary\" ): variant_benchmarking_results = parse_benchmark_results ( db_connector . conn . execute ( \"SELECT * FROM variant_summary\" ) . fetchdf () ) return BenchmarkSummaryResults ( gene_results = gene_benchmarking_results , disease_results = disease_benchmarking_results , variant_results = variant_benchmarking_results , )","title":"parse_benchmark_db"},{"location":"api/pheval/analyse/parse_benchmark_summary/#src.pheval.analyse.parse_benchmark_summary.parse_benchmark_results","text":"Parse benchmark results from a DataFrame. Parameters: Name Type Description Default benchmark_summary_table DataFrame DataFrame containing benchmark results. 
required Returns: Type Description List [ BenchmarkRunResults ] List[BenchmarkRunResults]: A list of BenchmarkRunResults objects parsed from the DataFrame. Source code in src/pheval/analyse/parse_benchmark_summary.py 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 def parse_benchmark_results ( benchmark_summary_table : pd . DataFrame ) -> List [ BenchmarkRunResults ]: \"\"\" Parse benchmark results from a DataFrame. Args: benchmark_summary_table (pd.DataFrame): DataFrame containing benchmark results. Returns: List[BenchmarkRunResults]: A list of BenchmarkRunResults objects parsed from the DataFrame. \"\"\" results = [] for _ , row in benchmark_summary_table . iterrows (): benchmarking_result = BenchmarkRunResults ( rank_stats = RankStats ( top = row [ \"top\" ], top3 = row [ \"top3\" ], top5 = row [ \"top5\" ], top10 = row [ \"top10\" ], found = row [ \"found\" ], total = row [ \"total\" ], mrr = row [ \"mean_reciprocal_rank\" ], ), benchmark_name = row [ \"results_directory_path\" ], binary_classification_stats = BinaryClassificationStats (), ) results . append ( benchmarking_result ) return results","title":"parse_benchmark_results"},{"location":"api/pheval/analyse/parse_corpus/","text":"CorpusParser Class for parsing phenopacket corpus and retrieving known variants/genes/diseases. 
Source code in src/pheval/analyse/parse_corpus.py 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 class CorpusParser : \"\"\"Class for parsing phenopacket corpus and retrieving known variants/genes/diseases.\"\"\" def __init__ ( self , benchmark_name : str , phenopacket_dir : Path ) -> None : \"\"\" Initialise the CorpusParser class. Args: phenopacket_dir (Path): Path to the Phenopacket directory. \"\"\" self . phenopacket_dir = phenopacket_dir self . conn = BenchmarkDBManager ( benchmark_name ) . conn self . table_name = phenopacket_dir . parents [ 0 ] . name def _create_gene_table ( self ) -> None : \"\"\" Create the Gene benchmarking table if it doesn't already exist. \"\"\" self . conn . execute ( f \"\"\" CREATE TABLE IF NOT EXISTS { self . table_name } _gene ( identifier VARCHAR(255) PRIMARY KEY, phenopacket VARCHAR, gene_symbol VARCHAR, gene_identifier VARCHAR ) \"\"\" ) def _create_variant_table ( self ) -> None : \"\"\" Create the Variant benchmarking table if it doesn't already exist. \"\"\" self . conn . execute ( f \"\"\" CREATE TABLE IF NOT EXISTS { self . table_name } _variant ( identifier VARCHAR(255) PRIMARY KEY, phenopacket VARCHAR, chrom VARCHAR, pos INTEGER, \"ref\" VARCHAR, alt VARCHAR ) \"\"\" ) def _create_disease_table ( self ): \"\"\" Create the Disease benchmarking table if it doesn't already exist. \"\"\" self . conn . execute ( f \"\"\" CREATE TABLE IF NOT EXISTS { self . 
table_name } _disease ( identifier VARCHAR(255) PRIMARY KEY, phenopacket VARCHAR, disease_identifier VARCHAR, disease_name VARCHAR ) \"\"\" ) def _create_tables ( self , benchmark_generator : BenchmarkRunOutputGenerator ) -> None : \"\"\" Create tables based on the benchmarking analysis specified. Args: benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type. \"\"\" if isinstance ( benchmark_generator , GeneBenchmarkRunOutputGenerator ): self . _create_gene_table () if isinstance ( benchmark_generator , VariantBenchmarkRunOutputGenerator ): self . _create_variant_table () if isinstance ( benchmark_generator , DiseaseBenchmarkRunOutputGenerator ): self . _create_disease_table () def _insert_genes ( self , phenopacket_path : Path , genes : List [ ProbandCausativeGene ]) -> None : \"\"\" Insert known disease-causing genes into the Gene benchmarking table. Args: phenopacket_path(Path): Path to the Phenopacket file. genes(List[ProbandCausativeGene]): List of known genes associated with the proband. \"\"\" for gene in genes : identifier = f \" { phenopacket_path . name } - { gene . gene_symbol } \" self . conn . execute ( f \"\"\" INSERT OR IGNORE INTO { self . table_name } _gene (identifier, phenopacket, gene_symbol, gene_identifier) VALUES (?, ?, ?, ?) \"\"\" , ( identifier , phenopacket_path . name , gene . gene_symbol , gene . gene_identifier ), ) def _insert_variants ( self , phenopacket_path : Path , variants : List [ GenomicVariant ]) -> None : \"\"\" Insert known variants into the Variant benchmarking table. Args: phenopacket_path (Path): Path to the Phenopacket file.: variants (List[GenomicVariant]): List of known variants associated with the proband. \"\"\" for variant in variants : identifier = ( f \" { phenopacket_path . name } - { variant . chrom } - { variant . pos } - { variant . ref } - { variant . alt } \" ) self . conn . execute ( f \"\"\" INSERT OR IGNORE INTO { self . 
table_name } _variant (identifier, phenopacket, chrom, pos, \"ref\", alt) VALUES (?, ?, ?, ?, ?, ?) \"\"\" , ( identifier , phenopacket_path . name , variant . chrom , variant . pos , variant . ref , variant . alt , ), ) def _insert_diseases ( self , phenopacket_path : Path , diseases : List [ ProbandDisease ]) -> None : \"\"\" Insert known diseases into the Disease benchmarking table. Args: phenopacket_path (Path): Path to the Phenopacket file.: diseases (List[ProbandDisease]): List of known diseases associated with the proband. \"\"\" for disease in diseases : identifier = f \" { phenopacket_path . name } - { disease . disease_identifier } \" self . conn . execute ( f \"INSERT OR IGNORE INTO { self . table_name } _disease \" f \"(identifier, phenopacket, disease_identifier, disease_name) VALUES (?, ?, ?, ?)\" , ( identifier , phenopacket_path . name , disease . disease_identifier , disease . disease_name , ), ) def parse_corpus ( self , benchmark_generator : BenchmarkRunOutputGenerator ) -> None : \"\"\" Parse the phenopacket corpus and add known genes/variants/diseases to relevant benchmarking tables. Args: benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type. \"\"\" self . _create_tables ( benchmark_generator ) for phenopacket_path in all_files ( self . phenopacket_dir ): if isinstance ( benchmark_generator , GeneBenchmarkRunOutputGenerator ): genes = _obtain_causative_genes ( phenopacket_path ) self . _insert_genes ( phenopacket_path , genes ) if isinstance ( benchmark_generator , VariantBenchmarkRunOutputGenerator ): variants = _obtain_causative_variants ( phenopacket_path ) self . _insert_variants ( phenopacket_path , variants ) if isinstance ( benchmark_generator , DiseaseBenchmarkRunOutputGenerator ): diseases = _obtain_causative_diseases ( phenopacket_path ) self . _insert_diseases ( phenopacket_path , diseases ) self . conn . 
close () __init__ ( benchmark_name , phenopacket_dir ) Initialise the CorpusParser class. Args: phenopacket_dir (Path): Path to the Phenopacket directory. Source code in src/pheval/analyse/parse_corpus.py 68 69 70 71 72 73 74 75 76 def __init__ ( self , benchmark_name : str , phenopacket_dir : Path ) -> None : \"\"\" Initialise the CorpusParser class. Args: phenopacket_dir (Path): Path to the Phenopacket directory. \"\"\" self . phenopacket_dir = phenopacket_dir self . conn = BenchmarkDBManager ( benchmark_name ) . conn self . table_name = phenopacket_dir . parents [ 0 ] . name parse_corpus ( benchmark_generator ) Parse the phenopacket corpus and add known genes/variants/diseases to relevant benchmarking tables. Args: benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type. Source code in src/pheval/analyse/parse_corpus.py 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 def parse_corpus ( self , benchmark_generator : BenchmarkRunOutputGenerator ) -> None : \"\"\" Parse the phenopacket corpus and add known genes/variants/diseases to relevant benchmarking tables. Args: benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type. \"\"\" self . _create_tables ( benchmark_generator ) for phenopacket_path in all_files ( self . phenopacket_dir ): if isinstance ( benchmark_generator , GeneBenchmarkRunOutputGenerator ): genes = _obtain_causative_genes ( phenopacket_path ) self . _insert_genes ( phenopacket_path , genes ) if isinstance ( benchmark_generator , VariantBenchmarkRunOutputGenerator ): variants = _obtain_causative_variants ( phenopacket_path ) self . _insert_variants ( phenopacket_path , variants ) if isinstance ( benchmark_generator , DiseaseBenchmarkRunOutputGenerator ): diseases = _obtain_causative_diseases ( phenopacket_path ) self . _insert_diseases ( phenopacket_path , diseases ) self . conn . 
close ()","title":"Parse corpus"},{"location":"api/pheval/analyse/parse_corpus/#src.pheval.analyse.parse_corpus.CorpusParser","text":"Class for parsing phenopacket corpus and retrieving known variants/genes/diseases. Source code in src/pheval/analyse/parse_corpus.py 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 class CorpusParser : \"\"\"Class for parsing phenopacket corpus and retrieving known variants/genes/diseases.\"\"\" def __init__ ( self , benchmark_name : str , phenopacket_dir : Path ) -> None : \"\"\" Initialise the CorpusParser class. Args: phenopacket_dir (Path): Path to the Phenopacket directory. \"\"\" self . phenopacket_dir = phenopacket_dir self . conn = BenchmarkDBManager ( benchmark_name ) . conn self . table_name = phenopacket_dir . parents [ 0 ] . name def _create_gene_table ( self ) -> None : \"\"\" Create the Gene benchmarking table if it doesn't already exist. \"\"\" self . conn . execute ( f \"\"\" CREATE TABLE IF NOT EXISTS { self . table_name } _gene ( identifier VARCHAR(255) PRIMARY KEY, phenopacket VARCHAR, gene_symbol VARCHAR, gene_identifier VARCHAR ) \"\"\" ) def _create_variant_table ( self ) -> None : \"\"\" Create the Variant benchmarking table if it doesn't already exist. \"\"\" self . conn . execute ( f \"\"\" CREATE TABLE IF NOT EXISTS { self . 
table_name } _variant ( identifier VARCHAR(255) PRIMARY KEY, phenopacket VARCHAR, chrom VARCHAR, pos INTEGER, \"ref\" VARCHAR, alt VARCHAR ) \"\"\" ) def _create_disease_table ( self ): \"\"\" Create the Disease benchmarking table if it doesn't already exist. \"\"\" self . conn . execute ( f \"\"\" CREATE TABLE IF NOT EXISTS { self . table_name } _disease ( identifier VARCHAR(255) PRIMARY KEY, phenopacket VARCHAR, disease_identifier VARCHAR, disease_name VARCHAR ) \"\"\" ) def _create_tables ( self , benchmark_generator : BenchmarkRunOutputGenerator ) -> None : \"\"\" Create tables based on the benchmarking analysis specified. Args: benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type. \"\"\" if isinstance ( benchmark_generator , GeneBenchmarkRunOutputGenerator ): self . _create_gene_table () if isinstance ( benchmark_generator , VariantBenchmarkRunOutputGenerator ): self . _create_variant_table () if isinstance ( benchmark_generator , DiseaseBenchmarkRunOutputGenerator ): self . _create_disease_table () def _insert_genes ( self , phenopacket_path : Path , genes : List [ ProbandCausativeGene ]) -> None : \"\"\" Insert known disease-causing genes into the Gene benchmarking table. Args: phenopacket_path(Path): Path to the Phenopacket file. genes(List[ProbandCausativeGene]): List of known genes associated with the proband. \"\"\" for gene in genes : identifier = f \" { phenopacket_path . name } - { gene . gene_symbol } \" self . conn . execute ( f \"\"\" INSERT OR IGNORE INTO { self . table_name } _gene (identifier, phenopacket, gene_symbol, gene_identifier) VALUES (?, ?, ?, ?) \"\"\" , ( identifier , phenopacket_path . name , gene . gene_symbol , gene . gene_identifier ), ) def _insert_variants ( self , phenopacket_path : Path , variants : List [ GenomicVariant ]) -> None : \"\"\" Insert known variants into the Variant benchmarking table. 
Args: phenopacket_path (Path): Path to the Phenopacket file.: variants (List[GenomicVariant]): List of known variants associated with the proband. \"\"\" for variant in variants : identifier = ( f \" { phenopacket_path . name } - { variant . chrom } - { variant . pos } - { variant . ref } - { variant . alt } \" ) self . conn . execute ( f \"\"\" INSERT OR IGNORE INTO { self . table_name } _variant (identifier, phenopacket, chrom, pos, \"ref\", alt) VALUES (?, ?, ?, ?, ?, ?) \"\"\" , ( identifier , phenopacket_path . name , variant . chrom , variant . pos , variant . ref , variant . alt , ), ) def _insert_diseases ( self , phenopacket_path : Path , diseases : List [ ProbandDisease ]) -> None : \"\"\" Insert known diseases into the Disease benchmarking table. Args: phenopacket_path (Path): Path to the Phenopacket file.: diseases (List[ProbandDisease]): List of known diseases associated with the proband. \"\"\" for disease in diseases : identifier = f \" { phenopacket_path . name } - { disease . disease_identifier } \" self . conn . execute ( f \"INSERT OR IGNORE INTO { self . table_name } _disease \" f \"(identifier, phenopacket, disease_identifier, disease_name) VALUES (?, ?, ?, ?)\" , ( identifier , phenopacket_path . name , disease . disease_identifier , disease . disease_name , ), ) def parse_corpus ( self , benchmark_generator : BenchmarkRunOutputGenerator ) -> None : \"\"\" Parse the phenopacket corpus and add known genes/variants/diseases to relevant benchmarking tables. Args: benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type. \"\"\" self . _create_tables ( benchmark_generator ) for phenopacket_path in all_files ( self . phenopacket_dir ): if isinstance ( benchmark_generator , GeneBenchmarkRunOutputGenerator ): genes = _obtain_causative_genes ( phenopacket_path ) self . 
_insert_genes ( phenopacket_path , genes ) if isinstance ( benchmark_generator , VariantBenchmarkRunOutputGenerator ): variants = _obtain_causative_variants ( phenopacket_path ) self . _insert_variants ( phenopacket_path , variants ) if isinstance ( benchmark_generator , DiseaseBenchmarkRunOutputGenerator ): diseases = _obtain_causative_diseases ( phenopacket_path ) self . _insert_diseases ( phenopacket_path , diseases ) self . conn . close ()","title":"CorpusParser"},{"location":"api/pheval/analyse/parse_corpus/#src.pheval.analyse.parse_corpus.CorpusParser.__init__","text":"Initialise the CorpusParser class. Args: phenopacket_dir (Path): Path to the Phenopacket directory. Source code in src/pheval/analyse/parse_corpus.py 68 69 70 71 72 73 74 75 76 def __init__ ( self , benchmark_name : str , phenopacket_dir : Path ) -> None : \"\"\" Initialise the CorpusParser class. Args: phenopacket_dir (Path): Path to the Phenopacket directory. \"\"\" self . phenopacket_dir = phenopacket_dir self . conn = BenchmarkDBManager ( benchmark_name ) . conn self . table_name = phenopacket_dir . parents [ 0 ] . name","title":"__init__"},{"location":"api/pheval/analyse/parse_corpus/#src.pheval.analyse.parse_corpus.CorpusParser.parse_corpus","text":"Parse the phenopacket corpus and add known genes/variants/diseases to relevant benchmarking tables. Args: benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type. Source code in src/pheval/analyse/parse_corpus.py 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 def parse_corpus ( self , benchmark_generator : BenchmarkRunOutputGenerator ) -> None : \"\"\" Parse the phenopacket corpus and add known genes/variants/diseases to relevant benchmarking tables. Args: benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type. \"\"\" self . _create_tables ( benchmark_generator ) for phenopacket_path in all_files ( self . 
phenopacket_dir ): if isinstance ( benchmark_generator , GeneBenchmarkRunOutputGenerator ): genes = _obtain_causative_genes ( phenopacket_path ) self . _insert_genes ( phenopacket_path , genes ) if isinstance ( benchmark_generator , VariantBenchmarkRunOutputGenerator ): variants = _obtain_causative_variants ( phenopacket_path ) self . _insert_variants ( phenopacket_path , variants ) if isinstance ( benchmark_generator , DiseaseBenchmarkRunOutputGenerator ): diseases = _obtain_causative_diseases ( phenopacket_path ) self . _insert_diseases ( phenopacket_path , diseases ) self . conn . close ()","title":"parse_corpus"},{"location":"api/pheval/analyse/prioritisation_result_types/","text":"DiseasePrioritisationResult dataclass Store rank data for known diseases. Attributes: Name Type Description phenopacket_path Path Path to the phenopacket. disease ProbandDisease The proband disease. rank int The assigned rank for the disease. Defaults to 0. Source code in src/pheval/analyse/prioritisation_result_types.py 39 40 41 42 43 44 45 46 47 48 49 50 51 52 @dataclass class DiseasePrioritisationResult : \"\"\" Store rank data for known diseases. Attributes: phenopacket_path (Path): Path to the phenopacket. disease (ProbandDisease): The proband disease. rank (int): The assigned rank for the disease. Defaults to 0. \"\"\" phenopacket_path : Path disease : ProbandDisease rank : int = 0 GenePrioritisationResult dataclass Store rank data for causative genes. Attributes: Name Type Description phenopacket_path Path Path to the phenopacket. gene str The causative gene. rank int The assigned rank for the gene. Defaults to 0. Source code in src/pheval/analyse/prioritisation_result_types.py 7 8 9 10 11 12 13 14 15 16 17 18 19 20 @dataclass class GenePrioritisationResult : \"\"\" Store rank data for causative genes. Attributes: phenopacket_path (Path): Path to the phenopacket. gene (str): The causative gene. rank (int): The assigned rank for the gene. Defaults to 0. 
\"\"\" phenopacket_path : Path gene : str rank : int = 0 VariantPrioritisationResult dataclass Store rank data for variants. Attributes: Name Type Description phenopacket_path Path Path to the phenopacket. variant GenomicVariant The genomic variant. rank int The assigned rank for the variant. Defaults to 0. Source code in src/pheval/analyse/prioritisation_result_types.py 23 24 25 26 27 28 29 30 31 32 33 34 35 36 @dataclass class VariantPrioritisationResult : \"\"\" Store rank data for variants. Attributes: phenopacket_path (Path): Path to the phenopacket. variant (GenomicVariant): The genomic variant. rank (int): The assigned rank for the variant. Defaults to 0. \"\"\" phenopacket_path : Path variant : GenomicVariant rank : int = 0","title":"Prioritisation result types"},{"location":"api/pheval/analyse/prioritisation_result_types/#src.pheval.analyse.prioritisation_result_types.DiseasePrioritisationResult","text":"Store rank data for known diseases. Attributes: Name Type Description phenopacket_path Path Path to the phenopacket. disease ProbandDisease The proband disease. rank int The assigned rank for the disease. Defaults to 0. Source code in src/pheval/analyse/prioritisation_result_types.py 39 40 41 42 43 44 45 46 47 48 49 50 51 52 @dataclass class DiseasePrioritisationResult : \"\"\" Store rank data for known diseases. Attributes: phenopacket_path (Path): Path to the phenopacket. disease (ProbandDisease): The proband disease. rank (int): The assigned rank for the disease. Defaults to 0. \"\"\" phenopacket_path : Path disease : ProbandDisease rank : int = 0","title":"DiseasePrioritisationResult"},{"location":"api/pheval/analyse/prioritisation_result_types/#src.pheval.analyse.prioritisation_result_types.GenePrioritisationResult","text":"Store rank data for causative genes. Attributes: Name Type Description phenopacket_path Path Path to the phenopacket. gene str The causative gene. rank int The assigned rank for the gene. Defaults to 0. 
Source code in src/pheval/analyse/prioritisation_result_types.py 7 8 9 10 11 12 13 14 15 16 17 18 19 20 @dataclass class GenePrioritisationResult : \"\"\" Store rank data for causative genes. Attributes: phenopacket_path (Path): Path to the phenopacket. gene (str): The causative gene. rank (int): The assigned rank for the gene. Defaults to 0. \"\"\" phenopacket_path : Path gene : str rank : int = 0","title":"GenePrioritisationResult"},{"location":"api/pheval/analyse/prioritisation_result_types/#src.pheval.analyse.prioritisation_result_types.VariantPrioritisationResult","text":"Store rank data for variants. Attributes: Name Type Description phenopacket_path Path Path to the phenopacket. variant GenomicVariant The genomic variant. rank int The assigned rank for the variant. Defaults to 0. Source code in src/pheval/analyse/prioritisation_result_types.py 23 24 25 26 27 28 29 30 31 32 33 34 35 36 @dataclass class VariantPrioritisationResult : \"\"\" Store rank data for variants. Attributes: phenopacket_path (Path): Path to the phenopacket. variant (GenomicVariant): The genomic variant. rank (int): The assigned rank for the variant. Defaults to 0. \"\"\" phenopacket_path : Path variant : GenomicVariant rank : int = 0","title":"VariantPrioritisationResult"},{"location":"api/pheval/analyse/rank_stats/","text":"RankStats dataclass Store statistics related to ranking. Attributes: Name Type Description top int Count of top-ranked matches. top3 int Count of matches within the top 3 ranks. top5 int Count of matches within the top 5 ranks. top10 int Count of matches within the top 10 ranks. found int Count of found matches. total int Total count of matches. reciprocal_ranks List [ float ] List of reciprocal ranks. relevant_ranks List [ List [ int ]] Nested list of ranks for the known entities for all cases in a run. mrr float Mean Reciprocal Rank (MRR). Defaults to None. 
Source code in src/pheval/analyse/rank_stats.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 @dataclass class RankStats : \"\"\"Store statistics related to ranking. Attributes: top (int): Count of top-ranked matches. top3 (int): Count of matches within the top 3 ranks. top5 (int): Count of matches within the top 5 ranks. top10 (int): Count of matches within the top 10 ranks. found (int): Count of found matches. total (int): Total count of matches. reciprocal_ranks (List[float]): List of reciprocal ranks. relevant_ranks List[List[int]]: Nested list of ranks for the known entities for all cases in a run. mrr (float): Mean Reciprocal Rank (MRR). Defaults to None. 
\"\"\" top : int = 0 top3 : int = 0 top5 : int = 0 top10 : int = 0 found : int = 0 total : int = 0 reciprocal_ranks : List = field ( default_factory = list ) relevant_result_ranks : List [ List [ int ]] = field ( default_factory = list ) mrr : float = None def add_ranks ( self , benchmark_name : str , table_name : str , column_name : str ) -> None : \"\"\" Add ranks to RankStats instance from table. Args: table_name (str): Name of the table to add ranks from. column_name (str): Name of the column to add ranks from.: \"\"\" conn = BenchmarkDBManager ( benchmark_name ) . conn self . top = self . _execute_count_query ( conn , table_name , column_name , \" = 1\" ) self . top3 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 3\" ) self . top5 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 5\" ) self . top10 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 10\" ) self . found = self . _execute_count_query ( conn , table_name , column_name , \" > 0\" ) self . total = self . _execute_count_query ( conn , table_name , column_name , \" >= 0\" ) self . reciprocal_ranks = self . _fetch_reciprocal_ranks ( conn , table_name , column_name ) self . relevant_result_ranks = self . _fetch_relevant_ranks ( conn , table_name , column_name ) conn . close () @staticmethod def _execute_count_query ( conn : DuckDBPyConnection , table_name : str , column_name : str , condition : str ) -> int : \"\"\" Execute count query on table. Args: conn (DuckDBPyConnection): Connection to the database. table_name (str): Name of the table to execute count query on. column_name (str): Name of the column to execute count query on. condition (str): Condition to execute count query. Returns: int: Count query result. \"\"\" query = f 'SELECT COUNT(*) FROM { table_name } WHERE \" { column_name } \" { condition } ' return conn . execute ( query ) . 
fetchone ()[ 0 ] @staticmethod def _fetch_reciprocal_ranks ( conn : DuckDBPyConnection , table_name : str , column_name : str ) -> List [ float ]: \"\"\" Fetch reciprocal ranks from table. Args: conn (DuckDBPyConnection): Connection to the database. table_name (str): Name of the table to fetch reciprocal ranks from. column_name (str): Name of the column to fetch reciprocal ranks from. Returns: List[float]: List of reciprocal ranks. \"\"\" query = f 'SELECT \" { column_name } \" FROM { table_name } ' return [ 1 / rank [ 0 ] if rank [ 0 ] > 0 else 0 for rank in conn . execute ( query ) . fetchall ()] @staticmethod def _fetch_relevant_ranks ( conn : DuckDBPyConnection , table_name : str , column_name : str ) -> List [ List [ int ]]: \"\"\" Fetch relevant ranks from table. Args: conn (DuckDBPyConnection): Connection to the database. table_name (str): Name of the table to fetch relevant ranks from. column_name (str): Name of the column to fetch relevant ranks from. Returns: List[List[int]]: List of relevant ranks. \"\"\" query = ( f 'SELECT LIST(\" { column_name } \") as values_list FROM { table_name } GROUP BY phenopacket' ) return [ rank [ 0 ] for rank in conn . execute ( query ) . fetchall ()] def percentage_rank ( self , value : int ) -> float : \"\"\" Calculate the percentage rank. Args: value (int): The value for which the percentage rank needs to be calculated. Returns: float: The calculated percentage rank based on the provided value and the total count. \"\"\" return 100 * value / self . total def percentage_top ( self ) -> float : \"\"\" Calculate the percentage of top matches. Returns: float: The percentage of top matches compared to the total count. \"\"\" return self . percentage_rank ( self . top ) def percentage_top3 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 3. Returns: float: The percentage of matches within the top 3 compared to the total count. \"\"\" return self . percentage_rank ( self . 
top3 ) def percentage_top5 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 5. Returns: float: The percentage of matches within the top 5 compared to the total count. \"\"\" return self . percentage_rank ( self . top5 ) def percentage_top10 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 10. Returns: float: The percentage of matches within the top 10 compared to the total count. \"\"\" return self . percentage_rank ( self . top10 ) def percentage_found ( self ) -> float : \"\"\" Calculate the percentage of matches found. Returns: float: The percentage of matches found compared to the total count. \"\"\" return self . percentage_rank ( self . found ) @staticmethod def percentage_difference ( percentage_value_1 : float , percentage_value_2 : float ) -> float : \"\"\" Calculate the percentage difference between two percentage values. Args: percentage_value_1 (float): The first percentage value. percentage_value_2 (float): The second percentage value. Returns: float: The difference between the two percentage values. \"\"\" return percentage_value_1 - percentage_value_2 def mean_reciprocal_rank ( self ) -> float : \"\"\" Calculate the Mean Reciprocal Rank (MRR) for the stored ranks. The Mean Reciprocal Rank is computed as the mean of the reciprocal ranks for the found cases. If the total number of cases differs from the number of found cases, this method extends the reciprocal ranks list with zeroes for missing cases. Returns: float: The calculated Mean Reciprocal Rank. \"\"\" if len ( self . reciprocal_ranks ) != self . total : missing_cases = self . total - self . found self . reciprocal_ranks . extend ([ 0 ] * missing_cases ) return mean ( self . reciprocal_ranks ) return mean ( self . reciprocal_ranks ) def return_mean_reciprocal_rank ( self ) -> float : \"\"\" Retrieve or calculate the Mean Reciprocal Rank (MRR). 
If a pre-calculated MRR value exists (stored in the 'mrr' attribute), this method returns that value. Otherwise, it computes the Mean Reciprocal Rank using the 'mean_reciprocal_rank' method. Returns: float: The Mean Reciprocal Rank value. \"\"\" if self . mrr is not None : return self . mrr else : return self . mean_reciprocal_rank () def precision_at_k ( self , k : int ) -> float : \"\"\" Calculate the precision at k. Precision at k is the ratio of relevant items in the top-k predictions to the total number of predictions. It measures the accuracy of the top-k predictions made by a model. Args: k (int): The number of top predictions to consider. Returns: float: The precision at k, ranging from 0.0 to 1.0. A higher precision indicates a better performance in identifying relevant items in the top-k predictions. \"\"\" k_attr = getattr ( self , f \"top { k } \" ) if k > 1 else self . top return k_attr / ( self . total * k ) @staticmethod def _average_precision_at_k ( number_of_relevant_entities_at_k : int , precision_at_k : float ) -> float : \"\"\" Calculate the Average Precision at k. Average Precision at k (AP@k) is a metric used to evaluate the precision of a ranked retrieval system. It measures the precision at each relevant position up to k and takes the average. Args: number_of_relevant_entities_at_k (int): The count of relevant entities in the top-k predictions. precision_at_k (float): The precision at k - the sum of the precision values at each relevant position. Returns: float: The Average Precision at k, ranging from 0.0 to 1.0. A higher value indicates better precision in the top-k predictions. \"\"\" return ( ( 1 / number_of_relevant_entities_at_k ) * precision_at_k if number_of_relevant_entities_at_k > 0 else 0.0 ) def mean_average_precision_at_k ( self , k : int ) -> float : \"\"\" Calculate the Mean Average Precision at k. Mean Average Precision at k (MAP@k) is a performance metric for ranked data. 
It calculates the average precision at k for each result rank and then takes the mean across all queries. Args: k (int): The number of top predictions to consider for precision calculation. Returns: float: The Mean Average Precision at k, ranging from 0.0 to 1.0. A higher value indicates better performance in ranking relevant entities higher in the predictions. \"\"\" cumulative_average_precision_scores = 0 for result_ranks in self . relevant_result_ranks : precision_at_k , number_of_relevant_entities_at_k = 0 , 0 for rank in result_ranks : if 0 < rank <= k : number_of_relevant_entities_at_k += 1 precision_at_k += number_of_relevant_entities_at_k / rank cumulative_average_precision_scores += self . _average_precision_at_k ( number_of_relevant_entities_at_k , precision_at_k ) return ( 1 / self . total ) * cumulative_average_precision_scores def f_beta_score_at_k ( self , percentage_at_k : float , k : int ) -> float : \"\"\" Calculate the F-beta score at k. The F-beta score is a metric that combines precision and recall, with beta controlling the emphasis on precision. The Beta value is set to the value of 1 to allow for equal weighting for both precision and recall. This method computes the F-beta score at a specific percentage threshold within the top-k predictions. Args: percentage_at_k (float): The percentage of true positive predictions within the top-k. k (int): The number of top predictions to consider. Returns: float: The F-beta score at k, ranging from 0.0 to 1.0. A higher score indicates better trade-off between precision and recall. \"\"\" precision = self . precision_at_k ( k ) recall_at_k = percentage_at_k / 100 return ( ( 2 * precision * recall_at_k ) / ( precision + recall_at_k ) if ( precision + recall_at_k ) > 0 else 0 ) def mean_normalised_discounted_cumulative_gain ( self , k : int ) -> float : \"\"\" Calculate the mean Normalised Discounted Cumulative Gain (NDCG) for a given rank cutoff. 
NDCG measures the effectiveness of a ranking by considering both the relevance and the order of items. Args: k (int): The rank cutoff for calculating NDCG. Returns: float: The mean NDCG score across all query results. \"\"\" ndcg_scores = [] for result_ranks in self . relevant_result_ranks : result_ranks = [ rank for rank in result_ranks if rank <= k ] result_ranks = [ 3 if i in result_ranks else 0 for i in range ( k )] ideal_ranking = sorted ( result_ranks , reverse = True ) ndcg_scores . append ( ndcg_score ( np . asarray ([ ideal_ranking ]), np . asarray ([ result_ranks ]))) return np . mean ( ndcg_scores ) add_ranks ( benchmark_name , table_name , column_name ) Add ranks to RankStats instance from table. Args: table_name (str): Name of the table to add ranks from. column_name (str): Name of the column to add ranks from.: Source code in src/pheval/analyse/rank_stats.py 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 def add_ranks ( self , benchmark_name : str , table_name : str , column_name : str ) -> None : \"\"\" Add ranks to RankStats instance from table. Args: table_name (str): Name of the table to add ranks from. column_name (str): Name of the column to add ranks from.: \"\"\" conn = BenchmarkDBManager ( benchmark_name ) . conn self . top = self . _execute_count_query ( conn , table_name , column_name , \" = 1\" ) self . top3 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 3\" ) self . top5 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 5\" ) self . top10 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 10\" ) self . found = self . _execute_count_query ( conn , table_name , column_name , \" > 0\" ) self . total = self . _execute_count_query ( conn , table_name , column_name , \" >= 0\" ) self . reciprocal_ranks = self . _fetch_reciprocal_ranks ( conn , table_name , column_name ) self . relevant_result_ranks = self . 
_fetch_relevant_ranks ( conn , table_name , column_name ) conn . close () f_beta_score_at_k ( percentage_at_k , k ) Calculate the F-beta score at k. The F-beta score is a metric that combines precision and recall, with beta controlling the emphasis on precision. The Beta value is set to the value of 1 to allow for equal weighting for both precision and recall. This method computes the F-beta score at a specific percentage threshold within the top-k predictions. Parameters: Name Type Description Default percentage_at_k float The percentage of true positive predictions within the top-k. required k int The number of top predictions to consider. required Returns: Name Type Description float float The F-beta score at k, ranging from 0.0 to 1.0. A higher score indicates better trade-off between precision and recall. Source code in src/pheval/analyse/rank_stats.py 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 def f_beta_score_at_k ( self , percentage_at_k : float , k : int ) -> float : \"\"\" Calculate the F-beta score at k. The F-beta score is a metric that combines precision and recall, with beta controlling the emphasis on precision. The Beta value is set to the value of 1 to allow for equal weighting for both precision and recall. This method computes the F-beta score at a specific percentage threshold within the top-k predictions. Args: percentage_at_k (float): The percentage of true positive predictions within the top-k. k (int): The number of top predictions to consider. Returns: float: The F-beta score at k, ranging from 0.0 to 1.0. A higher score indicates better trade-off between precision and recall. \"\"\" precision = self . precision_at_k ( k ) recall_at_k = percentage_at_k / 100 return ( ( 2 * precision * recall_at_k ) / ( precision + recall_at_k ) if ( precision + recall_at_k ) > 0 else 0 ) mean_average_precision_at_k ( k ) Calculate the Mean Average Precision at k. 
Mean Average Precision at k (MAP@k) is a performance metric for ranked data. It calculates the average precision at k for each result rank and then takes the mean across all queries. Parameters: Name Type Description Default k int The number of top predictions to consider for precision calculation. required Returns: Name Type Description float float The Mean Average Precision at k, ranging from 0.0 to 1.0. A higher value indicates better performance in ranking relevant entities higher in the predictions. Source code in src/pheval/analyse/rank_stats.py 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 def mean_average_precision_at_k ( self , k : int ) -> float : \"\"\" Calculate the Mean Average Precision at k. Mean Average Precision at k (MAP@k) is a performance metric for ranked data. It calculates the average precision at k for each result rank and then takes the mean across all queries. Args: k (int): The number of top predictions to consider for precision calculation. Returns: float: The Mean Average Precision at k, ranging from 0.0 to 1.0. A higher value indicates better performance in ranking relevant entities higher in the predictions. \"\"\" cumulative_average_precision_scores = 0 for result_ranks in self . relevant_result_ranks : precision_at_k , number_of_relevant_entities_at_k = 0 , 0 for rank in result_ranks : if 0 < rank <= k : number_of_relevant_entities_at_k += 1 precision_at_k += number_of_relevant_entities_at_k / rank cumulative_average_precision_scores += self . _average_precision_at_k ( number_of_relevant_entities_at_k , precision_at_k ) return ( 1 / self . total ) * cumulative_average_precision_scores mean_normalised_discounted_cumulative_gain ( k ) Calculate the mean Normalised Discounted Cumulative Gain (NDCG) for a given rank cutoff. NDCG measures the effectiveness of a ranking by considering both the relevance and the order of items. 
Parameters: Name Type Description Default k int The rank cutoff for calculating NDCG. required Returns: Name Type Description float float The mean NDCG score across all query results. Source code in src/pheval/analyse/rank_stats.py 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 def mean_normalised_discounted_cumulative_gain ( self , k : int ) -> float : \"\"\" Calculate the mean Normalised Discounted Cumulative Gain (NDCG) for a given rank cutoff. NDCG measures the effectiveness of a ranking by considering both the relevance and the order of items. Args: k (int): The rank cutoff for calculating NDCG. Returns: float: The mean NDCG score across all query results. \"\"\" ndcg_scores = [] for result_ranks in self . relevant_result_ranks : result_ranks = [ rank for rank in result_ranks if rank <= k ] result_ranks = [ 3 if i in result_ranks else 0 for i in range ( k )] ideal_ranking = sorted ( result_ranks , reverse = True ) ndcg_scores . append ( ndcg_score ( np . asarray ([ ideal_ranking ]), np . asarray ([ result_ranks ]))) return np . mean ( ndcg_scores ) mean_reciprocal_rank () Calculate the Mean Reciprocal Rank (MRR) for the stored ranks. The Mean Reciprocal Rank is computed as the mean of the reciprocal ranks for the found cases. If the total number of cases differs from the number of found cases, this method extends the reciprocal ranks list with zeroes for missing cases. Returns: Name Type Description float float The calculated Mean Reciprocal Rank. Source code in src/pheval/analyse/rank_stats.py 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 def mean_reciprocal_rank ( self ) -> float : \"\"\" Calculate the Mean Reciprocal Rank (MRR) for the stored ranks. The Mean Reciprocal Rank is computed as the mean of the reciprocal ranks for the found cases. If the total number of cases differs from the number of found cases, this method extends the reciprocal ranks list with zeroes for missing cases. 
Returns: float: The calculated Mean Reciprocal Rank. \"\"\" if len ( self . reciprocal_ranks ) != self . total : missing_cases = self . total - self . found self . reciprocal_ranks . extend ([ 0 ] * missing_cases ) return mean ( self . reciprocal_ranks ) return mean ( self . reciprocal_ranks ) percentage_difference ( percentage_value_1 , percentage_value_2 ) staticmethod Calculate the percentage difference between two percentage values. Parameters: Name Type Description Default percentage_value_1 float The first percentage value. required percentage_value_2 float The second percentage value. required Returns: Name Type Description float float The difference between the two percentage values. Source code in src/pheval/analyse/rank_stats.py 167 168 169 170 171 172 173 174 175 176 177 178 179 @staticmethod def percentage_difference ( percentage_value_1 : float , percentage_value_2 : float ) -> float : \"\"\" Calculate the percentage difference between two percentage values. Args: percentage_value_1 (float): The first percentage value. percentage_value_2 (float): The second percentage value. Returns: float: The difference between the two percentage values. \"\"\" return percentage_value_1 - percentage_value_2 percentage_found () Calculate the percentage of matches found. Returns: Name Type Description float float The percentage of matches found compared to the total count. Source code in src/pheval/analyse/rank_stats.py 158 159 160 161 162 163 164 165 def percentage_found ( self ) -> float : \"\"\" Calculate the percentage of matches found. Returns: float: The percentage of matches found compared to the total count. \"\"\" return self . percentage_rank ( self . found ) percentage_rank ( value ) Calculate the percentage rank. Parameters: Name Type Description Default value int The value for which the percentage rank needs to be calculated. required Returns: Name Type Description float float The calculated percentage rank based on the provided value and the total count. 
Source code in src/pheval/analyse/rank_stats.py 110 111 112 113 114 115 116 117 118 119 120 def percentage_rank ( self , value : int ) -> float : \"\"\" Calculate the percentage rank. Args: value (int): The value for which the percentage rank needs to be calculated. Returns: float: The calculated percentage rank based on the provided value and the total count. \"\"\" return 100 * value / self . total percentage_top () Calculate the percentage of top matches. Returns: Name Type Description float float The percentage of top matches compared to the total count. Source code in src/pheval/analyse/rank_stats.py 122 123 124 125 126 127 128 129 def percentage_top ( self ) -> float : \"\"\" Calculate the percentage of top matches. Returns: float: The percentage of top matches compared to the total count. \"\"\" return self . percentage_rank ( self . top ) percentage_top10 () Calculate the percentage of matches within the top 10. Returns: Name Type Description float float The percentage of matches within the top 10 compared to the total count. Source code in src/pheval/analyse/rank_stats.py 149 150 151 152 153 154 155 156 def percentage_top10 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 10. Returns: float: The percentage of matches within the top 10 compared to the total count. \"\"\" return self . percentage_rank ( self . top10 ) percentage_top3 () Calculate the percentage of matches within the top 3. Returns: Name Type Description float float The percentage of matches within the top 3 compared to the total count. Source code in src/pheval/analyse/rank_stats.py 131 132 133 134 135 136 137 138 def percentage_top3 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 3. Returns: float: The percentage of matches within the top 3 compared to the total count. \"\"\" return self . percentage_rank ( self . top3 ) percentage_top5 () Calculate the percentage of matches within the top 5. 
Returns: Name Type Description float float The percentage of matches within the top 5 compared to the total count. Source code in src/pheval/analyse/rank_stats.py 140 141 142 143 144 145 146 147 def percentage_top5 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 5. Returns: float: The percentage of matches within the top 5 compared to the total count. \"\"\" return self . percentage_rank ( self . top5 ) precision_at_k ( k ) Calculate the precision at k. Precision at k is the ratio of relevant items in the top-k predictions to the total number of predictions. It measures the accuracy of the top-k predictions made by a model. Parameters: Name Type Description Default k int The number of top predictions to consider. required Returns: Name Type Description float float The precision at k, ranging from 0.0 to 1.0. float A higher precision indicates a better performance in identifying relevant items in the top-k predictions. Source code in src/pheval/analyse/rank_stats.py 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 def precision_at_k ( self , k : int ) -> float : \"\"\" Calculate the precision at k. Precision at k is the ratio of relevant items in the top-k predictions to the total number of predictions. It measures the accuracy of the top-k predictions made by a model. Args: k (int): The number of top predictions to consider. Returns: float: The precision at k, ranging from 0.0 to 1.0. A higher precision indicates a better performance in identifying relevant items in the top-k predictions. \"\"\" k_attr = getattr ( self , f \"top { k } \" ) if k > 1 else self . top return k_attr / ( self . total * k ) return_mean_reciprocal_rank () Retrieve or calculate the Mean Reciprocal Rank (MRR). If a pre-calculated MRR value exists (stored in the 'mrr' attribute), this method returns that value. Otherwise, it computes the Mean Reciprocal Rank using the 'mean_reciprocal_rank' method. 
Returns: Name Type Description float float The Mean Reciprocal Rank value. Source code in src/pheval/analyse/rank_stats.py 200 201 202 203 204 205 206 207 208 209 210 211 212 213 def return_mean_reciprocal_rank ( self ) -> float : \"\"\" Retrieve or calculate the Mean Reciprocal Rank (MRR). If a pre-calculated MRR value exists (stored in the 'mrr' attribute), this method returns that value. Otherwise, it computes the Mean Reciprocal Rank using the 'mean_reciprocal_rank' method. Returns: float: The Mean Reciprocal Rank value. \"\"\" if self . mrr is not None : return self . mrr else : return self . mean_reciprocal_rank () RankStatsWriter Class for writing the rank stats to a file. Source code in src/pheval/analyse/rank_stats.py 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 class RankStatsWriter : \"\"\"Class for writing the rank stats to a file.\"\"\" def __init__ ( self , benchmark_name : str , table_name : str ): \"\"\" Initialise the RankStatsWriter class Args: table_name (str): Name of table to add statistics. \"\"\" self . table_name = table_name self . benchmark_name = benchmark_name conn = BenchmarkDBManager ( benchmark_name ) . conn conn . execute ( f 'CREATE TABLE IF NOT EXISTS \" { self . 
table_name } \" (' f \"results_directory_path VARCHAR,\" f \"top INT,\" f \"top3 INT,\" f \"top5 INT,\" f \"top10 INT,\" f '\"found\" INT,' f \"total INT,\" f \"mean_reciprocal_rank FLOAT,\" f \"percentage_top FLOAT,\" f \"percentage_top3 FLOAT,\" f \"percentage_top5 FLOAT,\" f \"percentage_top10 FLOAT,\" f \"percentage_found FLOAT,\" f '\"precision@1\" FLOAT,' f '\"precision@3\" FLOAT,' f '\"precision@5\" FLOAT,' f '\"precision@10\" FLOAT,' f '\"MAP@1\" FLOAT,' f '\"MAP@3\" FLOAT,' f '\"MAP@5\" FLOAT,' f '\"MAP@10\" FLOAT,' f '\"f_beta_score@1\" FLOAT,' f '\"f_beta_score@3\"FLOAT,' f '\"f_beta_score@5\" FLOAT,' f '\"f_beta_score@10\" FLOAT,' f '\"NDCG@3\" FLOAT,' f '\"NDCG@5\" FLOAT,' f '\"NDCG@10\" FLOAT,' f \"true_positives INT,\" f \"false_positives INT,\" f \"true_negatives INT,\" f \"false_negatives INT,\" f \"sensitivity FLOAT,\" f \"specificity FLOAT,\" f '\"precision\" FLOAT,' f \"negative_predictive_value FLOAT,\" f \"false_positive_rate FLOAT,\" f \"false_discovery_rate FLOAT,\" f \"false_negative_rate FLOAT,\" f \"accuracy FLOAT,\" f \"f1_score FLOAT,\" f \"matthews_correlation_coefficient FLOAT, )\" ) conn . close () def add_statistics_entry ( self , run_identifier : str , rank_stats : RankStats , binary_classification : BinaryClassificationStats , ): \"\"\" Add statistics row to table for a run. Args: run_identifier (str): The run identifier. rank_stats (RankStats): RankStats object for the run. binary_classification (BinaryClassificationStats): BinaryClassificationStats object for the run. \"\"\" conn = BenchmarkDBManager ( self . benchmark_name ) . conn conn . execute ( f ' INSERT INTO \" { self . table_name } \" VALUES ( ' f \"' { run_identifier } ',\" f \" { rank_stats . top } ,\" f \" { rank_stats . top3 } ,\" f \" { rank_stats . top5 } ,\" f \" { rank_stats . top10 } ,\" f \" { rank_stats . found } ,\" f \" { rank_stats . total } ,\" f \" { rank_stats . mean_reciprocal_rank () } ,\" f \" { rank_stats . percentage_top () } ,\" f \" { rank_stats . 
percentage_top3 () } ,\" f \" { rank_stats . percentage_top5 () } ,\" f \" { rank_stats . percentage_top10 () } ,\" f \" { rank_stats . percentage_found () } ,\" f \" { rank_stats . precision_at_k ( 1 ) } ,\" f \" { rank_stats . precision_at_k ( 3 ) } ,\" f \" { rank_stats . precision_at_k ( 5 ) } ,\" f \" { rank_stats . precision_at_k ( 10 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 1 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 3 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 5 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 10 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 1 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 3 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 5 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 10 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 3 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 5 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 10 ) } ,\" f \" { binary_classification . true_positives } ,\" f \" { binary_classification . false_positives } ,\" f \" { binary_classification . true_negatives } ,\" f \" { binary_classification . false_negatives } ,\" f \" { binary_classification . sensitivity () } ,\" f \" { binary_classification . specificity () } ,\" f \" { binary_classification . precision () } ,\" f \" { binary_classification . negative_predictive_value () } ,\" f \" { binary_classification . false_positive_rate () } ,\" f \" { binary_classification . false_discovery_rate () } ,\" f \" { binary_classification . false_negative_rate () } ,\" f \" { binary_classification . accuracy () } ,\" f \" { binary_classification . f1_score () } ,\" f \" { binary_classification . matthews_correlation_coefficient () } )\" ) conn . 
close () __init__ ( benchmark_name , table_name ) Initialise the RankStatsWriter class Args: table_name (str): Name of table to add statistics. Source code in src/pheval/analyse/rank_stats.py 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 def __init__ ( self , benchmark_name : str , table_name : str ): \"\"\" Initialise the RankStatsWriter class Args: table_name (str): Name of table to add statistics. \"\"\" self . table_name = table_name self . benchmark_name = benchmark_name conn = BenchmarkDBManager ( benchmark_name ) . conn conn . execute ( f 'CREATE TABLE IF NOT EXISTS \" { self . table_name } \" (' f \"results_directory_path VARCHAR,\" f \"top INT,\" f \"top3 INT,\" f \"top5 INT,\" f \"top10 INT,\" f '\"found\" INT,' f \"total INT,\" f \"mean_reciprocal_rank FLOAT,\" f \"percentage_top FLOAT,\" f \"percentage_top3 FLOAT,\" f \"percentage_top5 FLOAT,\" f \"percentage_top10 FLOAT,\" f \"percentage_found FLOAT,\" f '\"precision@1\" FLOAT,' f '\"precision@3\" FLOAT,' f '\"precision@5\" FLOAT,' f '\"precision@10\" FLOAT,' f '\"MAP@1\" FLOAT,' f '\"MAP@3\" FLOAT,' f '\"MAP@5\" FLOAT,' f '\"MAP@10\" FLOAT,' f '\"f_beta_score@1\" FLOAT,' f '\"f_beta_score@3\"FLOAT,' f '\"f_beta_score@5\" FLOAT,' f '\"f_beta_score@10\" FLOAT,' f '\"NDCG@3\" FLOAT,' f '\"NDCG@5\" FLOAT,' f '\"NDCG@10\" FLOAT,' f \"true_positives INT,\" f \"false_positives INT,\" f \"true_negatives INT,\" f \"false_negatives INT,\" f \"sensitivity FLOAT,\" f \"specificity FLOAT,\" f '\"precision\" FLOAT,' f \"negative_predictive_value FLOAT,\" f \"false_positive_rate FLOAT,\" f \"false_discovery_rate FLOAT,\" f \"false_negative_rate FLOAT,\" f \"accuracy FLOAT,\" f \"f1_score FLOAT,\" f \"matthews_correlation_coefficient FLOAT, )\" ) conn . 
close () add_statistics_entry ( run_identifier , rank_stats , binary_classification ) Add statistics row to table for a run. Args: run_identifier (str): The run identifier. rank_stats (RankStats): RankStats object for the run. binary_classification (BinaryClassificationStats): BinaryClassificationStats object for the run. Source code in src/pheval/analyse/rank_stats.py 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 def add_statistics_entry ( self , run_identifier : str , rank_stats : RankStats , binary_classification : BinaryClassificationStats , ): \"\"\" Add statistics row to table for a run. Args: run_identifier (str): The run identifier. rank_stats (RankStats): RankStats object for the run. binary_classification (BinaryClassificationStats): BinaryClassificationStats object for the run. \"\"\" conn = BenchmarkDBManager ( self . benchmark_name ) . conn conn . execute ( f ' INSERT INTO \" { self . table_name } \" VALUES ( ' f \"' { run_identifier } ',\" f \" { rank_stats . top } ,\" f \" { rank_stats . top3 } ,\" f \" { rank_stats . top5 } ,\" f \" { rank_stats . top10 } ,\" f \" { rank_stats . found } ,\" f \" { rank_stats . total } ,\" f \" { rank_stats . mean_reciprocal_rank () } ,\" f \" { rank_stats . percentage_top () } ,\" f \" { rank_stats . percentage_top3 () } ,\" f \" { rank_stats . percentage_top5 () } ,\" f \" { rank_stats . percentage_top10 () } ,\" f \" { rank_stats . percentage_found () } ,\" f \" { rank_stats . precision_at_k ( 1 ) } ,\" f \" { rank_stats . precision_at_k ( 3 ) } ,\" f \" { rank_stats . precision_at_k ( 5 ) } ,\" f \" { rank_stats . precision_at_k ( 10 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 1 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 3 ) } ,\" f \" { rank_stats . 
mean_average_precision_at_k ( 5 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 10 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 1 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 3 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 5 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 10 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 3 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 5 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 10 ) } ,\" f \" { binary_classification . true_positives } ,\" f \" { binary_classification . false_positives } ,\" f \" { binary_classification . true_negatives } ,\" f \" { binary_classification . false_negatives } ,\" f \" { binary_classification . sensitivity () } ,\" f \" { binary_classification . specificity () } ,\" f \" { binary_classification . precision () } ,\" f \" { binary_classification . negative_predictive_value () } ,\" f \" { binary_classification . false_positive_rate () } ,\" f \" { binary_classification . false_discovery_rate () } ,\" f \" { binary_classification . false_negative_rate () } ,\" f \" { binary_classification . accuracy () } ,\" f \" { binary_classification . f1_score () } ,\" f \" { binary_classification . matthews_correlation_coefficient () } )\" ) conn . close ()","title":"Rank stats"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats","text":"Store statistics related to ranking. Attributes: Name Type Description top int Count of top-ranked matches. top3 int Count of matches within the top 3 ranks. top5 int Count of matches within the top 5 ranks. top10 int Count of matches within the top 10 ranks. found int Count of found matches. total int Total count of matches. reciprocal_ranks List [ float ] List of reciprocal ranks. 
relevant_ranks List [ List [ int ]] Nested list of ranks for the known entities for all cases in a run. mrr float Mean Reciprocal Rank (MRR). Defaults to None. Source code in src/pheval/analyse/rank_stats.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 @dataclass class RankStats : \"\"\"Store statistics related to ranking. Attributes: top (int): Count of top-ranked matches. top3 (int): Count of matches within the top 3 ranks. top5 (int): Count of matches within the top 5 ranks. top10 (int): Count of matches within the top 10 ranks. found (int): Count of found matches. total (int): Total count of matches. reciprocal_ranks (List[float]): List of reciprocal ranks. relevant_ranks List[List[int]]: Nested list of ranks for the known entities for all cases in a run. mrr (float): Mean Reciprocal Rank (MRR). Defaults to None. 
\"\"\" top : int = 0 top3 : int = 0 top5 : int = 0 top10 : int = 0 found : int = 0 total : int = 0 reciprocal_ranks : List = field ( default_factory = list ) relevant_result_ranks : List [ List [ int ]] = field ( default_factory = list ) mrr : float = None def add_ranks ( self , benchmark_name : str , table_name : str , column_name : str ) -> None : \"\"\" Add ranks to RankStats instance from table. Args: table_name (str): Name of the table to add ranks from. column_name (str): Name of the column to add ranks from.: \"\"\" conn = BenchmarkDBManager ( benchmark_name ) . conn self . top = self . _execute_count_query ( conn , table_name , column_name , \" = 1\" ) self . top3 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 3\" ) self . top5 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 5\" ) self . top10 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 10\" ) self . found = self . _execute_count_query ( conn , table_name , column_name , \" > 0\" ) self . total = self . _execute_count_query ( conn , table_name , column_name , \" >= 0\" ) self . reciprocal_ranks = self . _fetch_reciprocal_ranks ( conn , table_name , column_name ) self . relevant_result_ranks = self . _fetch_relevant_ranks ( conn , table_name , column_name ) conn . close () @staticmethod def _execute_count_query ( conn : DuckDBPyConnection , table_name : str , column_name : str , condition : str ) -> int : \"\"\" Execute count query on table. Args: conn (DuckDBPyConnection): Connection to the database. table_name (str): Name of the table to execute count query on. column_name (str): Name of the column to execute count query on. condition (str): Condition to execute count query. Returns: int: Count query result. \"\"\" query = f 'SELECT COUNT(*) FROM { table_name } WHERE \" { column_name } \" { condition } ' return conn . execute ( query ) . 
fetchone ()[ 0 ] @staticmethod def _fetch_reciprocal_ranks ( conn : DuckDBPyConnection , table_name : str , column_name : str ) -> List [ float ]: \"\"\" Fetch reciprocal ranks from table. Args: conn (DuckDBPyConnection): Connection to the database. table_name (str): Name of the table to fetch reciprocal ranks from. column_name (str): Name of the column to fetch reciprocal ranks from. Returns: List[float]: List of reciprocal ranks. \"\"\" query = f 'SELECT \" { column_name } \" FROM { table_name } ' return [ 1 / rank [ 0 ] if rank [ 0 ] > 0 else 0 for rank in conn . execute ( query ) . fetchall ()] @staticmethod def _fetch_relevant_ranks ( conn : DuckDBPyConnection , table_name : str , column_name : str ) -> List [ List [ int ]]: \"\"\" Fetch relevant ranks from table. Args: conn (DuckDBPyConnection): Connection to the database. table_name (str): Name of the table to fetch relevant ranks from. column_name (str): Name of the column to fetch relevant ranks from. Returns: List[List[int]]: List of relevant ranks. \"\"\" query = ( f 'SELECT LIST(\" { column_name } \") as values_list FROM { table_name } GROUP BY phenopacket' ) return [ rank [ 0 ] for rank in conn . execute ( query ) . fetchall ()] def percentage_rank ( self , value : int ) -> float : \"\"\" Calculate the percentage rank. Args: value (int): The value for which the percentage rank needs to be calculated. Returns: float: The calculated percentage rank based on the provided value and the total count. \"\"\" return 100 * value / self . total def percentage_top ( self ) -> float : \"\"\" Calculate the percentage of top matches. Returns: float: The percentage of top matches compared to the total count. \"\"\" return self . percentage_rank ( self . top ) def percentage_top3 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 3. Returns: float: The percentage of matches within the top 3 compared to the total count. \"\"\" return self . percentage_rank ( self . 
top3 ) def percentage_top5 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 5. Returns: float: The percentage of matches within the top 5 compared to the total count. \"\"\" return self . percentage_rank ( self . top5 ) def percentage_top10 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 10. Returns: float: The percentage of matches within the top 10 compared to the total count. \"\"\" return self . percentage_rank ( self . top10 ) def percentage_found ( self ) -> float : \"\"\" Calculate the percentage of matches found. Returns: float: The percentage of matches found compared to the total count. \"\"\" return self . percentage_rank ( self . found ) @staticmethod def percentage_difference ( percentage_value_1 : float , percentage_value_2 : float ) -> float : \"\"\" Calculate the percentage difference between two percentage values. Args: percentage_value_1 (float): The first percentage value. percentage_value_2 (float): The second percentage value. Returns: float: The difference between the two percentage values. \"\"\" return percentage_value_1 - percentage_value_2 def mean_reciprocal_rank ( self ) -> float : \"\"\" Calculate the Mean Reciprocal Rank (MRR) for the stored ranks. The Mean Reciprocal Rank is computed as the mean of the reciprocal ranks for the found cases. If the total number of cases differs from the number of found cases, this method extends the reciprocal ranks list with zeroes for missing cases. Returns: float: The calculated Mean Reciprocal Rank. \"\"\" if len ( self . reciprocal_ranks ) != self . total : missing_cases = self . total - self . found self . reciprocal_ranks . extend ([ 0 ] * missing_cases ) return mean ( self . reciprocal_ranks ) return mean ( self . reciprocal_ranks ) def return_mean_reciprocal_rank ( self ) -> float : \"\"\" Retrieve or calculate the Mean Reciprocal Rank (MRR). 
If a pre-calculated MRR value exists (stored in the 'mrr' attribute), this method returns that value. Otherwise, it computes the Mean Reciprocal Rank using the 'mean_reciprocal_rank' method. Returns: float: The Mean Reciprocal Rank value. \"\"\" if self . mrr is not None : return self . mrr else : return self . mean_reciprocal_rank () def precision_at_k ( self , k : int ) -> float : \"\"\" Calculate the precision at k. Precision at k is the ratio of relevant items in the top-k predictions to the total number of predictions. It measures the accuracy of the top-k predictions made by a model. Args: k (int): The number of top predictions to consider. Returns: float: The precision at k, ranging from 0.0 to 1.0. A higher precision indicates a better performance in identifying relevant items in the top-k predictions. \"\"\" k_attr = getattr ( self , f \"top { k } \" ) if k > 1 else self . top return k_attr / ( self . total * k ) @staticmethod def _average_precision_at_k ( number_of_relevant_entities_at_k : int , precision_at_k : float ) -> float : \"\"\" Calculate the Average Precision at k. Average Precision at k (AP@k) is a metric used to evaluate the precision of a ranked retrieval system. It measures the precision at each relevant position up to k and takes the average. Args: number_of_relevant_entities_at_k (int): The count of relevant entities in the top-k predictions. precision_at_k (float): The precision at k - the sum of the precision values at each relevant position. Returns: float: The Average Precision at k, ranging from 0.0 to 1.0. A higher value indicates better precision in the top-k predictions. \"\"\" return ( ( 1 / number_of_relevant_entities_at_k ) * precision_at_k if number_of_relevant_entities_at_k > 0 else 0.0 ) def mean_average_precision_at_k ( self , k : int ) -> float : \"\"\" Calculate the Mean Average Precision at k. Mean Average Precision at k (MAP@k) is a performance metric for ranked data. 
It calculates the average precision at k for each result rank and then takes the mean across all queries. Args: k (int): The number of top predictions to consider for precision calculation. Returns: float: The Mean Average Precision at k, ranging from 0.0 to 1.0. A higher value indicates better performance in ranking relevant entities higher in the predictions. \"\"\" cumulative_average_precision_scores = 0 for result_ranks in self . relevant_result_ranks : precision_at_k , number_of_relevant_entities_at_k = 0 , 0 for rank in result_ranks : if 0 < rank <= k : number_of_relevant_entities_at_k += 1 precision_at_k += number_of_relevant_entities_at_k / rank cumulative_average_precision_scores += self . _average_precision_at_k ( number_of_relevant_entities_at_k , precision_at_k ) return ( 1 / self . total ) * cumulative_average_precision_scores def f_beta_score_at_k ( self , percentage_at_k : float , k : int ) -> float : \"\"\" Calculate the F-beta score at k. The F-beta score is a metric that combines precision and recall, with beta controlling the emphasis on precision. The Beta value is set to the value of 1 to allow for equal weighting for both precision and recall. This method computes the F-beta score at a specific percentage threshold within the top-k predictions. Args: percentage_at_k (float): The percentage of true positive predictions within the top-k. k (int): The number of top predictions to consider. Returns: float: The F-beta score at k, ranging from 0.0 to 1.0. A higher score indicates better trade-off between precision and recall. \"\"\" precision = self . precision_at_k ( k ) recall_at_k = percentage_at_k / 100 return ( ( 2 * precision * recall_at_k ) / ( precision + recall_at_k ) if ( precision + recall_at_k ) > 0 else 0 ) def mean_normalised_discounted_cumulative_gain ( self , k : int ) -> float : \"\"\" Calculate the mean Normalised Discounted Cumulative Gain (NDCG) for a given rank cutoff. 
NDCG measures the effectiveness of a ranking by considering both the relevance and the order of items. Args: k (int): The rank cutoff for calculating NDCG. Returns: float: The mean NDCG score across all query results. \"\"\" ndcg_scores = [] for result_ranks in self . relevant_result_ranks : result_ranks = [ rank for rank in result_ranks if rank <= k ] result_ranks = [ 3 if i in result_ranks else 0 for i in range ( k )] ideal_ranking = sorted ( result_ranks , reverse = True ) ndcg_scores . append ( ndcg_score ( np . asarray ([ ideal_ranking ]), np . asarray ([ result_ranks ]))) return np . mean ( ndcg_scores )","title":"RankStats"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.add_ranks","text":"Add ranks to RankStats instance from table. Args: table_name (str): Name of the table to add ranks from. column_name (str): Name of the column to add ranks from.: Source code in src/pheval/analyse/rank_stats.py 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 def add_ranks ( self , benchmark_name : str , table_name : str , column_name : str ) -> None : \"\"\" Add ranks to RankStats instance from table. Args: table_name (str): Name of the table to add ranks from. column_name (str): Name of the column to add ranks from.: \"\"\" conn = BenchmarkDBManager ( benchmark_name ) . conn self . top = self . _execute_count_query ( conn , table_name , column_name , \" = 1\" ) self . top3 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 3\" ) self . top5 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 5\" ) self . top10 = self . _execute_count_query ( conn , table_name , column_name , \" BETWEEN 1 AND 10\" ) self . found = self . _execute_count_query ( conn , table_name , column_name , \" > 0\" ) self . total = self . _execute_count_query ( conn , table_name , column_name , \" >= 0\" ) self . reciprocal_ranks = self . 
_fetch_reciprocal_ranks ( conn , table_name , column_name ) self . relevant_result_ranks = self . _fetch_relevant_ranks ( conn , table_name , column_name ) conn . close ()","title":"add_ranks"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.f_beta_score_at_k","text":"Calculate the F-beta score at k. The F-beta score is a metric that combines precision and recall, with beta controlling the emphasis on precision. The Beta value is set to the value of 1 to allow for equal weighting for both precision and recall. This method computes the F-beta score at a specific percentage threshold within the top-k predictions. Parameters: Name Type Description Default percentage_at_k float The percentage of true positive predictions within the top-k. required k int The number of top predictions to consider. required Returns: Name Type Description float float The F-beta score at k, ranging from 0.0 to 1.0. A higher score indicates better trade-off between precision and recall. Source code in src/pheval/analyse/rank_stats.py 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 def f_beta_score_at_k ( self , percentage_at_k : float , k : int ) -> float : \"\"\" Calculate the F-beta score at k. The F-beta score is a metric that combines precision and recall, with beta controlling the emphasis on precision. The Beta value is set to the value of 1 to allow for equal weighting for both precision and recall. This method computes the F-beta score at a specific percentage threshold within the top-k predictions. Args: percentage_at_k (float): The percentage of true positive predictions within the top-k. k (int): The number of top predictions to consider. Returns: float: The F-beta score at k, ranging from 0.0 to 1.0. A higher score indicates better trade-off between precision and recall. \"\"\" precision = self . 
precision_at_k ( k ) recall_at_k = percentage_at_k / 100 return ( ( 2 * precision * recall_at_k ) / ( precision + recall_at_k ) if ( precision + recall_at_k ) > 0 else 0 )","title":"f_beta_score_at_k"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.mean_average_precision_at_k","text":"Calculate the Mean Average Precision at k. Mean Average Precision at k (MAP@k) is a performance metric for ranked data. It calculates the average precision at k for each result rank and then takes the mean across all queries. Parameters: Name Type Description Default k int The number of top predictions to consider for precision calculation. required Returns: Name Type Description float float The Mean Average Precision at k, ranging from 0.0 to 1.0. A higher value indicates better performance in ranking relevant entities higher in the predictions. Source code in src/pheval/analyse/rank_stats.py 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 def mean_average_precision_at_k ( self , k : int ) -> float : \"\"\" Calculate the Mean Average Precision at k. Mean Average Precision at k (MAP@k) is a performance metric for ranked data. It calculates the average precision at k for each result rank and then takes the mean across all queries. Args: k (int): The number of top predictions to consider for precision calculation. Returns: float: The Mean Average Precision at k, ranging from 0.0 to 1.0. A higher value indicates better performance in ranking relevant entities higher in the predictions. \"\"\" cumulative_average_precision_scores = 0 for result_ranks in self . relevant_result_ranks : precision_at_k , number_of_relevant_entities_at_k = 0 , 0 for rank in result_ranks : if 0 < rank <= k : number_of_relevant_entities_at_k += 1 precision_at_k += number_of_relevant_entities_at_k / rank cumulative_average_precision_scores += self . 
_average_precision_at_k ( number_of_relevant_entities_at_k , precision_at_k ) return ( 1 / self . total ) * cumulative_average_precision_scores","title":"mean_average_precision_at_k"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.mean_normalised_discounted_cumulative_gain","text":"Calculate the mean Normalised Discounted Cumulative Gain (NDCG) for a given rank cutoff. NDCG measures the effectiveness of a ranking by considering both the relevance and the order of items. Parameters: Name Type Description Default k int The rank cutoff for calculating NDCG. required Returns: Name Type Description float float The mean NDCG score across all query results. Source code in src/pheval/analyse/rank_stats.py 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 def mean_normalised_discounted_cumulative_gain ( self , k : int ) -> float : \"\"\" Calculate the mean Normalised Discounted Cumulative Gain (NDCG) for a given rank cutoff. NDCG measures the effectiveness of a ranking by considering both the relevance and the order of items. Args: k (int): The rank cutoff for calculating NDCG. Returns: float: The mean NDCG score across all query results. \"\"\" ndcg_scores = [] for result_ranks in self . relevant_result_ranks : result_ranks = [ rank for rank in result_ranks if rank <= k ] result_ranks = [ 3 if i in result_ranks else 0 for i in range ( k )] ideal_ranking = sorted ( result_ranks , reverse = True ) ndcg_scores . append ( ndcg_score ( np . asarray ([ ideal_ranking ]), np . asarray ([ result_ranks ]))) return np . mean ( ndcg_scores )","title":"mean_normalised_discounted_cumulative_gain"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.mean_reciprocal_rank","text":"Calculate the Mean Reciprocal Rank (MRR) for the stored ranks. The Mean Reciprocal Rank is computed as the mean of the reciprocal ranks for the found cases. 
If the total number of cases differs from the number of found cases, this method extends the reciprocal ranks list with zeroes for missing cases. Returns: Name Type Description float float The calculated Mean Reciprocal Rank. Source code in src/pheval/analyse/rank_stats.py 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 def mean_reciprocal_rank ( self ) -> float : \"\"\" Calculate the Mean Reciprocal Rank (MRR) for the stored ranks. The Mean Reciprocal Rank is computed as the mean of the reciprocal ranks for the found cases. If the total number of cases differs from the number of found cases, this method extends the reciprocal ranks list with zeroes for missing cases. Returns: float: The calculated Mean Reciprocal Rank. \"\"\" if len ( self . reciprocal_ranks ) != self . total : missing_cases = self . total - self . found self . reciprocal_ranks . extend ([ 0 ] * missing_cases ) return mean ( self . reciprocal_ranks ) return mean ( self . reciprocal_ranks )","title":"mean_reciprocal_rank"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.percentage_difference","text":"Calculate the percentage difference between two percentage values. Parameters: Name Type Description Default percentage_value_1 float The first percentage value. required percentage_value_2 float The second percentage value. required Returns: Name Type Description float float The difference between the two percentage values. Source code in src/pheval/analyse/rank_stats.py 167 168 169 170 171 172 173 174 175 176 177 178 179 @staticmethod def percentage_difference ( percentage_value_1 : float , percentage_value_2 : float ) -> float : \"\"\" Calculate the percentage difference between two percentage values. Args: percentage_value_1 (float): The first percentage value. percentage_value_2 (float): The second percentage value. Returns: float: The difference between the two percentage values. 
\"\"\" return percentage_value_1 - percentage_value_2","title":"percentage_difference"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.percentage_found","text":"Calculate the percentage of matches found. Returns: Name Type Description float float The percentage of matches found compared to the total count. Source code in src/pheval/analyse/rank_stats.py 158 159 160 161 162 163 164 165 def percentage_found ( self ) -> float : \"\"\" Calculate the percentage of matches found. Returns: float: The percentage of matches found compared to the total count. \"\"\" return self . percentage_rank ( self . found )","title":"percentage_found"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.percentage_rank","text":"Calculate the percentage rank. Parameters: Name Type Description Default value int The value for which the percentage rank needs to be calculated. required Returns: Name Type Description float float The calculated percentage rank based on the provided value and the total count. Source code in src/pheval/analyse/rank_stats.py 110 111 112 113 114 115 116 117 118 119 120 def percentage_rank ( self , value : int ) -> float : \"\"\" Calculate the percentage rank. Args: value (int): The value for which the percentage rank needs to be calculated. Returns: float: The calculated percentage rank based on the provided value and the total count. \"\"\" return 100 * value / self . total","title":"percentage_rank"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.percentage_top","text":"Calculate the percentage of top matches. Returns: Name Type Description float float The percentage of top matches compared to the total count. Source code in src/pheval/analyse/rank_stats.py 122 123 124 125 126 127 128 129 def percentage_top ( self ) -> float : \"\"\" Calculate the percentage of top matches. Returns: float: The percentage of top matches compared to the total count. 
\"\"\" return self . percentage_rank ( self . top )","title":"percentage_top"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.percentage_top10","text":"Calculate the percentage of matches within the top 10. Returns: Name Type Description float float The percentage of matches within the top 10 compared to the total count. Source code in src/pheval/analyse/rank_stats.py 149 150 151 152 153 154 155 156 def percentage_top10 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 10. Returns: float: The percentage of matches within the top 10 compared to the total count. \"\"\" return self . percentage_rank ( self . top10 )","title":"percentage_top10"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.percentage_top3","text":"Calculate the percentage of matches within the top 3. Returns: Name Type Description float float The percentage of matches within the top 3 compared to the total count. Source code in src/pheval/analyse/rank_stats.py 131 132 133 134 135 136 137 138 def percentage_top3 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 3. Returns: float: The percentage of matches within the top 3 compared to the total count. \"\"\" return self . percentage_rank ( self . top3 )","title":"percentage_top3"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.percentage_top5","text":"Calculate the percentage of matches within the top 5. Returns: Name Type Description float float The percentage of matches within the top 5 compared to the total count. Source code in src/pheval/analyse/rank_stats.py 140 141 142 143 144 145 146 147 def percentage_top5 ( self ) -> float : \"\"\" Calculate the percentage of matches within the top 5. Returns: float: The percentage of matches within the top 5 compared to the total count. \"\"\" return self . percentage_rank ( self . 
top5 )","title":"percentage_top5"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.precision_at_k","text":"Calculate the precision at k. Precision at k is the ratio of relevant items in the top-k predictions to the total number of predictions. It measures the accuracy of the top-k predictions made by a model. Parameters: Name Type Description Default k int The number of top predictions to consider. required Returns: Name Type Description float float The precision at k, ranging from 0.0 to 1.0. float A higher precision indicates a better performance in identifying relevant items in the top-k predictions. Source code in src/pheval/analyse/rank_stats.py 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 def precision_at_k ( self , k : int ) -> float : \"\"\" Calculate the precision at k. Precision at k is the ratio of relevant items in the top-k predictions to the total number of predictions. It measures the accuracy of the top-k predictions made by a model. Args: k (int): The number of top predictions to consider. Returns: float: The precision at k, ranging from 0.0 to 1.0. A higher precision indicates a better performance in identifying relevant items in the top-k predictions. \"\"\" k_attr = getattr ( self , f \"top { k } \" ) if k > 1 else self . top return k_attr / ( self . total * k )","title":"precision_at_k"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStats.return_mean_reciprocal_rank","text":"Retrieve or calculate the Mean Reciprocal Rank (MRR). If a pre-calculated MRR value exists (stored in the 'mrr' attribute), this method returns that value. Otherwise, it computes the Mean Reciprocal Rank using the 'mean_reciprocal_rank' method. Returns: Name Type Description float float The Mean Reciprocal Rank value. 
Source code in src/pheval/analyse/rank_stats.py 200 201 202 203 204 205 206 207 208 209 210 211 212 213 def return_mean_reciprocal_rank ( self ) -> float : \"\"\" Retrieve or calculate the Mean Reciprocal Rank (MRR). If a pre-calculated MRR value exists (stored in the 'mrr' attribute), this method returns that value. Otherwise, it computes the Mean Reciprocal Rank using the 'mean_reciprocal_rank' method. Returns: float: The Mean Reciprocal Rank value. \"\"\" if self . mrr is not None : return self . mrr else : return self . mean_reciprocal_rank ()","title":"return_mean_reciprocal_rank"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStatsWriter","text":"Class for writing the rank stats to a file. Source code in src/pheval/analyse/rank_stats.py 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 class RankStatsWriter : \"\"\"Class for writing the rank stats to a file.\"\"\" def __init__ ( self , benchmark_name : str , table_name : str ): \"\"\" Initialise the RankStatsWriter class Args: table_name (str): Name of table to add statistics. \"\"\" self . table_name = table_name self . benchmark_name = benchmark_name conn = BenchmarkDBManager ( benchmark_name ) . conn conn . execute ( f 'CREATE TABLE IF NOT EXISTS \" { self . 
table_name } \" (' f \"results_directory_path VARCHAR,\" f \"top INT,\" f \"top3 INT,\" f \"top5 INT,\" f \"top10 INT,\" f '\"found\" INT,' f \"total INT,\" f \"mean_reciprocal_rank FLOAT,\" f \"percentage_top FLOAT,\" f \"percentage_top3 FLOAT,\" f \"percentage_top5 FLOAT,\" f \"percentage_top10 FLOAT,\" f \"percentage_found FLOAT,\" f '\"precision@1\" FLOAT,' f '\"precision@3\" FLOAT,' f '\"precision@5\" FLOAT,' f '\"precision@10\" FLOAT,' f '\"MAP@1\" FLOAT,' f '\"MAP@3\" FLOAT,' f '\"MAP@5\" FLOAT,' f '\"MAP@10\" FLOAT,' f '\"f_beta_score@1\" FLOAT,' f '\"f_beta_score@3\"FLOAT,' f '\"f_beta_score@5\" FLOAT,' f '\"f_beta_score@10\" FLOAT,' f '\"NDCG@3\" FLOAT,' f '\"NDCG@5\" FLOAT,' f '\"NDCG@10\" FLOAT,' f \"true_positives INT,\" f \"false_positives INT,\" f \"true_negatives INT,\" f \"false_negatives INT,\" f \"sensitivity FLOAT,\" f \"specificity FLOAT,\" f '\"precision\" FLOAT,' f \"negative_predictive_value FLOAT,\" f \"false_positive_rate FLOAT,\" f \"false_discovery_rate FLOAT,\" f \"false_negative_rate FLOAT,\" f \"accuracy FLOAT,\" f \"f1_score FLOAT,\" f \"matthews_correlation_coefficient FLOAT, )\" ) conn . close () def add_statistics_entry ( self , run_identifier : str , rank_stats : RankStats , binary_classification : BinaryClassificationStats , ): \"\"\" Add statistics row to table for a run. Args: run_identifier (str): The run identifier. rank_stats (RankStats): RankStats object for the run. binary_classification (BinaryClassificationStats): BinaryClassificationStats object for the run. \"\"\" conn = BenchmarkDBManager ( self . benchmark_name ) . conn conn . execute ( f ' INSERT INTO \" { self . table_name } \" VALUES ( ' f \"' { run_identifier } ',\" f \" { rank_stats . top } ,\" f \" { rank_stats . top3 } ,\" f \" { rank_stats . top5 } ,\" f \" { rank_stats . top10 } ,\" f \" { rank_stats . found } ,\" f \" { rank_stats . total } ,\" f \" { rank_stats . mean_reciprocal_rank () } ,\" f \" { rank_stats . percentage_top () } ,\" f \" { rank_stats . 
percentage_top3 () } ,\" f \" { rank_stats . percentage_top5 () } ,\" f \" { rank_stats . percentage_top10 () } ,\" f \" { rank_stats . percentage_found () } ,\" f \" { rank_stats . precision_at_k ( 1 ) } ,\" f \" { rank_stats . precision_at_k ( 3 ) } ,\" f \" { rank_stats . precision_at_k ( 5 ) } ,\" f \" { rank_stats . precision_at_k ( 10 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 1 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 3 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 5 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 10 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 1 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 3 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 5 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 10 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 3 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 5 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 10 ) } ,\" f \" { binary_classification . true_positives } ,\" f \" { binary_classification . false_positives } ,\" f \" { binary_classification . true_negatives } ,\" f \" { binary_classification . false_negatives } ,\" f \" { binary_classification . sensitivity () } ,\" f \" { binary_classification . specificity () } ,\" f \" { binary_classification . precision () } ,\" f \" { binary_classification . negative_predictive_value () } ,\" f \" { binary_classification . false_positive_rate () } ,\" f \" { binary_classification . false_discovery_rate () } ,\" f \" { binary_classification . false_negative_rate () } ,\" f \" { binary_classification . accuracy () } ,\" f \" { binary_classification . f1_score () } ,\" f \" { binary_classification . matthews_correlation_coefficient () } )\" ) conn . 
close ()","title":"RankStatsWriter"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStatsWriter.__init__","text":"Initialise the RankStatsWriter class Args: table_name (str): Name of table to add statistics. Source code in src/pheval/analyse/rank_stats.py 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 def __init__ ( self , benchmark_name : str , table_name : str ): \"\"\" Initialise the RankStatsWriter class Args: table_name (str): Name of table to add statistics. \"\"\" self . table_name = table_name self . benchmark_name = benchmark_name conn = BenchmarkDBManager ( benchmark_name ) . conn conn . execute ( f 'CREATE TABLE IF NOT EXISTS \" { self . table_name } \" (' f \"results_directory_path VARCHAR,\" f \"top INT,\" f \"top3 INT,\" f \"top5 INT,\" f \"top10 INT,\" f '\"found\" INT,' f \"total INT,\" f \"mean_reciprocal_rank FLOAT,\" f \"percentage_top FLOAT,\" f \"percentage_top3 FLOAT,\" f \"percentage_top5 FLOAT,\" f \"percentage_top10 FLOAT,\" f \"percentage_found FLOAT,\" f '\"precision@1\" FLOAT,' f '\"precision@3\" FLOAT,' f '\"precision@5\" FLOAT,' f '\"precision@10\" FLOAT,' f '\"MAP@1\" FLOAT,' f '\"MAP@3\" FLOAT,' f '\"MAP@5\" FLOAT,' f '\"MAP@10\" FLOAT,' f '\"f_beta_score@1\" FLOAT,' f '\"f_beta_score@3\"FLOAT,' f '\"f_beta_score@5\" FLOAT,' f '\"f_beta_score@10\" FLOAT,' f '\"NDCG@3\" FLOAT,' f '\"NDCG@5\" FLOAT,' f '\"NDCG@10\" FLOAT,' f \"true_positives INT,\" f \"false_positives INT,\" f \"true_negatives INT,\" f \"false_negatives INT,\" f \"sensitivity FLOAT,\" f \"specificity FLOAT,\" f '\"precision\" FLOAT,' f \"negative_predictive_value FLOAT,\" f \"false_positive_rate FLOAT,\" f \"false_discovery_rate FLOAT,\" f \"false_negative_rate FLOAT,\" f \"accuracy FLOAT,\" f \"f1_score FLOAT,\" f \"matthews_correlation_coefficient FLOAT, )\" ) 
conn . close ()","title":"__init__"},{"location":"api/pheval/analyse/rank_stats/#src.pheval.analyse.rank_stats.RankStatsWriter.add_statistics_entry","text":"Add statistics row to table for a run. Args: run_identifier (str): The run identifier. rank_stats (RankStats): RankStats object for the run. binary_classification (BinaryClassificationStats): BinaryClassificationStats object for the run. Source code in src/pheval/analyse/rank_stats.py 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 def add_statistics_entry ( self , run_identifier : str , rank_stats : RankStats , binary_classification : BinaryClassificationStats , ): \"\"\" Add statistics row to table for a run. Args: run_identifier (str): The run identifier. rank_stats (RankStats): RankStats object for the run. binary_classification (BinaryClassificationStats): BinaryClassificationStats object for the run. \"\"\" conn = BenchmarkDBManager ( self . benchmark_name ) . conn conn . execute ( f ' INSERT INTO \" { self . table_name } \" VALUES ( ' f \"' { run_identifier } ',\" f \" { rank_stats . top } ,\" f \" { rank_stats . top3 } ,\" f \" { rank_stats . top5 } ,\" f \" { rank_stats . top10 } ,\" f \" { rank_stats . found } ,\" f \" { rank_stats . total } ,\" f \" { rank_stats . mean_reciprocal_rank () } ,\" f \" { rank_stats . percentage_top () } ,\" f \" { rank_stats . percentage_top3 () } ,\" f \" { rank_stats . percentage_top5 () } ,\" f \" { rank_stats . percentage_top10 () } ,\" f \" { rank_stats . percentage_found () } ,\" f \" { rank_stats . precision_at_k ( 1 ) } ,\" f \" { rank_stats . precision_at_k ( 3 ) } ,\" f \" { rank_stats . precision_at_k ( 5 ) } ,\" f \" { rank_stats . precision_at_k ( 10 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 1 ) } ,\" f \" { rank_stats . 
mean_average_precision_at_k ( 3 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 5 ) } ,\" f \" { rank_stats . mean_average_precision_at_k ( 10 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 1 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 3 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 5 ) } ,\" f \" { rank_stats . f_beta_score_at_k ( rank_stats . percentage_top (), 10 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 3 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 5 ) } ,\" f \" { rank_stats . mean_normalised_discounted_cumulative_gain ( 10 ) } ,\" f \" { binary_classification . true_positives } ,\" f \" { binary_classification . false_positives } ,\" f \" { binary_classification . true_negatives } ,\" f \" { binary_classification . false_negatives } ,\" f \" { binary_classification . sensitivity () } ,\" f \" { binary_classification . specificity () } ,\" f \" { binary_classification . precision () } ,\" f \" { binary_classification . negative_predictive_value () } ,\" f \" { binary_classification . false_positive_rate () } ,\" f \" { binary_classification . false_discovery_rate () } ,\" f \" { binary_classification . false_negative_rate () } ,\" f \" { binary_classification . accuracy () } ,\" f \" { binary_classification . f1_score () } ,\" f \" { binary_classification . matthews_correlation_coefficient () } )\" ) conn . close ()","title":"add_statistics_entry"},{"location":"api/pheval/analyse/run_data_parser/","text":"Config Bases: BaseModel Store configurations for runs. Attributes: runs (List[RunConfig]): The list of run configurations. Source code in src/pheval/analyse/run_data_parser.py 101 102 103 104 105 106 107 108 109 110 class Config ( BaseModel ): \"\"\" Store configurations for runs. Attributes: runs (List[RunConfig]): The list of run configurations. 
\"\"\" benchmark_name : str runs : List [ RunConfig ] plot_customisation : PlotCustomisation PlotCustomisation Bases: BaseModel Store customisations for all plots. Attributes: gene_plots (SinglePlotCustomisation): Customisation for all gene benchmarking plots. disease_plots (SinglePlotCustomisation): Customisation for all disease benchmarking plots. variant_plots (SinglePlotCustomisation): Customisation for all variant benchmarking plots. Source code in src/pheval/analyse/run_data_parser.py 87 88 89 90 91 92 93 94 95 96 97 98 class PlotCustomisation ( BaseModel ): \"\"\" Store customisations for all plots. Attributes: gene_plots (SinglePlotCustomisation): Customisation for all gene benchmarking plots. disease_plots (SinglePlotCustomisation): Customisation for all disease benchmarking plots. variant_plots (SinglePlotCustomisation): Customisation for all variant benchmarking plots. \"\"\" gene_plots : SinglePlotCustomisation disease_plots : SinglePlotCustomisation variant_plots : SinglePlotCustomisation RunConfig Bases: BaseModel Store configurations for a run. Attributes: Name Type Description run_identifier str The run identifier. phenopacket_dir str The path to the phenopacket directory used for generating the results. results_dir str The path to the results directory. gene_analysis bool Whether or not to benchmark gene analysis results. variant_analysis bool Whether or not to benchmark variant analysis results. disease_analysis bool Whether or not to benchmark disease analysis results. threshold Optional [ float ] The threshold to consider for benchmarking. score_order Optional [ str ] The order of scores to consider for benchmarking, either ascending or descending. Source code in src/pheval/analyse/run_data_parser.py 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 class RunConfig ( BaseModel ): \"\"\" Store configurations for a run. 
Attributes: run_identifier (str): The run identifier. phenopacket_dir (str): The path to the phenopacket directory used for generating the results. results_dir (str): The path to the results directory. gene_analysis (bool): Whether or not to benchmark gene analysis results. variant_analysis (bool): Whether or not to benchmark variant analysis results. disease_analysis (bool): Whether or not to benchmark disease analysis results. threshold (Optional[float]): The threshold to consider for benchmarking. score_order (Optional[str]): The order of scores to consider for benchmarking, either ascending or descending. \"\"\" run_identifier : str phenopacket_dir : Path results_dir : Path gene_analysis : bool variant_analysis : bool disease_analysis : bool threshold : Optional [ float ] score_order : Optional [ str ] @root_validator ( pre = True ) def handle_blank_fields ( cls , values : dict ) -> dict : # noqa: N805 \"\"\" Root validator to handle fields that may be explicitly set to None. This method checks if 'threshold' and 'score_order' are None and assigns default values if so. Args: values (dict): The input values provided to the model. Returns: dict: The updated values with defaults applied where necessary. \"\"\" if values . get ( \"threshold\" ) is None : values [ \"threshold\" ] = 0 print ( \"setting default threshold\" ) if values . get ( \"score_order\" ) is None : values [ \"score_order\" ] = \"descending\" return values handle_blank_fields ( values ) Root validator to handle fields that may be explicitly set to None. This method checks if 'threshold' and 'score_order' are None and assigns default values if so. Parameters: Name Type Description Default values dict The input values provided to the model. required Returns: Name Type Description dict dict The updated values with defaults applied where necessary. 
Source code in src/pheval/analyse/run_data_parser.py 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 @root_validator ( pre = True ) def handle_blank_fields ( cls , values : dict ) -> dict : # noqa: N805 \"\"\" Root validator to handle fields that may be explicitly set to None. This method checks if 'threshold' and 'score_order' are None and assigns default values if so. Args: values (dict): The input values provided to the model. Returns: dict: The updated values with defaults applied where necessary. \"\"\" if values . get ( \"threshold\" ) is None : values [ \"threshold\" ] = 0 print ( \"setting default threshold\" ) if values . get ( \"score_order\" ) is None : values [ \"score_order\" ] = \"descending\" return values SinglePlotCustomisation Bases: BaseModel Store customisations for plots. Attributes: Name Type Description plot_type str The plot type. rank_plot_title str The title for the rank summary plot. roc_curve_title str The title for the roc curve plot. precision_recall_title str The title for the precision-recall plot. Source code in src/pheval/analyse/run_data_parser.py 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 class SinglePlotCustomisation ( BaseModel ): \"\"\" Store customisations for plots. Attributes: plot_type (str): The plot type. rank_plot_title (str): The title for the rank summary plot. roc_curve_title (str): The title for the roc curve plot. precision_recall_title (str): The title for the precision-recall plot. \"\"\" plot_type : Optional [ str ] = \"bar_cumulative\" rank_plot_title : Optional [ str ] roc_curve_title : Optional [ str ] precision_recall_title : Optional [ str ] @root_validator ( pre = True ) def handle_blank_fields ( cls , values : dict ) -> dict : # noqa: N805 \"\"\" Root validator to handle fields that may be explicitly set to None. This method checks if 'plot_type' is None and assigns default value if so. 
Args: values (dict): The input values provided to the model. Returns: dict: The updated values with defaults applied where necessary. \"\"\" if values . get ( \"plot_type\" ) is None : values [ \"plot_type\" ] = \"bar_cumulative\" return values handle_blank_fields ( values ) Root validator to handle fields that may be explicitly set to None. This method checks if 'plot_type' is None and assigns default value if so. Parameters: Name Type Description Default values dict The input values provided to the model. required Returns: Name Type Description dict dict The updated values with defaults applied where necessary. Source code in src/pheval/analyse/run_data_parser.py 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 @root_validator ( pre = True ) def handle_blank_fields ( cls , values : dict ) -> dict : # noqa: N805 \"\"\" Root validator to handle fields that may be explicitly set to None. This method checks if 'plot_type' is None and assigns default value if so. Args: values (dict): The input values provided to the model. Returns: dict: The updated values with defaults applied where necessary. \"\"\" if values . get ( \"plot_type\" ) is None : values [ \"plot_type\" ] = \"bar_cumulative\" return values parse_run_config ( run_config ) Parse a run configuration yaml file. Args: run_config (Path): The path to the run data yaml configuration. Returns: Config: The parsed run configurations. Source code in src/pheval/analyse/run_data_parser.py 113 114 115 116 117 118 119 120 121 122 123 124 125 def parse_run_config ( run_config : Path ) -> Config : \"\"\" Parse a run configuration yaml file. Args: run_config (Path): The path to the run data yaml configuration. Returns: Config: The parsed run configurations. \"\"\" with open ( run_config , \"r\" ) as f : config_data = yaml . safe_load ( f ) f . 
close () config = Config ( ** config_data ) return config","title":"Run data parser"},{"location":"api/pheval/analyse/run_data_parser/#src.pheval.analyse.run_data_parser.Config","text":"Bases: BaseModel Store configurations for runs. Attributes: runs (List[RunConfig]): The list of run configurations. Source code in src/pheval/analyse/run_data_parser.py 101 102 103 104 105 106 107 108 109 110 class Config ( BaseModel ): \"\"\" Store configurations for runs. Attributes: runs (List[RunConfig]): The list of run configurations. \"\"\" benchmark_name : str runs : List [ RunConfig ] plot_customisation : PlotCustomisation","title":"Config"},{"location":"api/pheval/analyse/run_data_parser/#src.pheval.analyse.run_data_parser.PlotCustomisation","text":"Bases: BaseModel Store customisations for all plots. Attributes: gene_plots (SinglePlotCustomisation): Customisation for all gene benchmarking plots. disease_plots (SinglePlotCustomisation): Customisation for all disease benchmarking plots. variant_plots (SinglePlotCustomisation): Customisation for all variant benchmarking plots. Source code in src/pheval/analyse/run_data_parser.py 87 88 89 90 91 92 93 94 95 96 97 98 class PlotCustomisation ( BaseModel ): \"\"\" Store customisations for all plots. Attributes: gene_plots (SinglePlotCustomisation): Customisation for all gene benchmarking plots. disease_plots (SinglePlotCustomisation): Customisation for all disease benchmarking plots. variant_plots (SinglePlotCustomisation): Customisation for all variant benchmarking plots. \"\"\" gene_plots : SinglePlotCustomisation disease_plots : SinglePlotCustomisation variant_plots : SinglePlotCustomisation","title":"PlotCustomisation"},{"location":"api/pheval/analyse/run_data_parser/#src.pheval.analyse.run_data_parser.RunConfig","text":"Bases: BaseModel Store configurations for a run. Attributes: Name Type Description run_identifier str The run identifier. 
phenopacket_dir str The path to the phenopacket directory used for generating the results. results_dir str The path to the results directory. gene_analysis bool Whether or not to benchmark gene analysis results. variant_analysis bool Whether or not to benchmark variant analysis results. disease_analysis bool Whether or not to benchmark disease analysis results. threshold Optional [ float ] The threshold to consider for benchmarking. score_order Optional [ str ] The order of scores to consider for benchmarking, either ascending or descending. Source code in src/pheval/analyse/run_data_parser.py 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 class RunConfig ( BaseModel ): \"\"\" Store configurations for a run. Attributes: run_identifier (str): The run identifier. phenopacket_dir (str): The path to the phenopacket directory used for generating the results. results_dir (str): The path to the results directory. gene_analysis (bool): Whether or not to benchmark gene analysis results. variant_analysis (bool): Whether or not to benchmark variant analysis results. disease_analysis (bool): Whether or not to benchmark disease analysis results. threshold (Optional[float]): The threshold to consider for benchmarking. score_order (Optional[str]): The order of scores to consider for benchmarking, either ascending or descending. \"\"\" run_identifier : str phenopacket_dir : Path results_dir : Path gene_analysis : bool variant_analysis : bool disease_analysis : bool threshold : Optional [ float ] score_order : Optional [ str ] @root_validator ( pre = True ) def handle_blank_fields ( cls , values : dict ) -> dict : # noqa: N805 \"\"\" Root validator to handle fields that may be explicitly set to None. This method checks if 'threshold' and 'score_order' are None and assigns default values if so. Args: values (dict): The input values provided to the model. 
Returns: dict: The updated values with defaults applied where necessary. \"\"\" if values . get ( \"threshold\" ) is None : values [ \"threshold\" ] = 0 print ( \"setting default threshold\" ) if values . get ( \"score_order\" ) is None : values [ \"score_order\" ] = \"descending\" return values","title":"RunConfig"},{"location":"api/pheval/analyse/run_data_parser/#src.pheval.analyse.run_data_parser.RunConfig.handle_blank_fields","text":"Root validator to handle fields that may be explicitly set to None. This method checks if 'threshold' and 'score_order' are None and assigns default values if so. Parameters: Name Type Description Default values dict The input values provided to the model. required Returns: Name Type Description dict dict The updated values with defaults applied where necessary. Source code in src/pheval/analyse/run_data_parser.py 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 @root_validator ( pre = True ) def handle_blank_fields ( cls , values : dict ) -> dict : # noqa: N805 \"\"\" Root validator to handle fields that may be explicitly set to None. This method checks if 'threshold' and 'score_order' are None and assigns default values if so. Args: values (dict): The input values provided to the model. Returns: dict: The updated values with defaults applied where necessary. \"\"\" if values . get ( \"threshold\" ) is None : values [ \"threshold\" ] = 0 print ( \"setting default threshold\" ) if values . get ( \"score_order\" ) is None : values [ \"score_order\" ] = \"descending\" return values","title":"handle_blank_fields"},{"location":"api/pheval/analyse/run_data_parser/#src.pheval.analyse.run_data_parser.SinglePlotCustomisation","text":"Bases: BaseModel Store customisations for plots. Attributes: Name Type Description plot_type str The plot type. rank_plot_title str The title for the rank summary plot. roc_curve_title str The title for the roc curve plot. precision_recall_title str The title for the precision-recall plot. 
Source code in src/pheval/analyse/run_data_parser.py 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 class SinglePlotCustomisation ( BaseModel ): \"\"\" Store customisations for plots. Attributes: plot_type (str): The plot type. rank_plot_title (str): The title for the rank summary plot. roc_curve_title (str): The title for the roc curve plot. precision_recall_title (str): The title for the precision-recall plot. \"\"\" plot_type : Optional [ str ] = \"bar_cumulative\" rank_plot_title : Optional [ str ] roc_curve_title : Optional [ str ] precision_recall_title : Optional [ str ] @root_validator ( pre = True ) def handle_blank_fields ( cls , values : dict ) -> dict : # noqa: N805 \"\"\" Root validator to handle fields that may be explicitly set to None. This method checks if 'plot_type' is None and assigns default value if so. Args: values (dict): The input values provided to the model. Returns: dict: The updated values with defaults applied where necessary. \"\"\" if values . get ( \"plot_type\" ) is None : values [ \"plot_type\" ] = \"bar_cumulative\" return values","title":"SinglePlotCustomisation"},{"location":"api/pheval/analyse/run_data_parser/#src.pheval.analyse.run_data_parser.SinglePlotCustomisation.handle_blank_fields","text":"Root validator to handle fields that may be explicitly set to None. This method checks if 'plot_type' is None and assigns default value if so. Parameters: Name Type Description Default values dict The input values provided to the model. required Returns: Name Type Description dict dict The updated values with defaults applied where necessary. Source code in src/pheval/analyse/run_data_parser.py 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 @root_validator ( pre = True ) def handle_blank_fields ( cls , values : dict ) -> dict : # noqa: N805 \"\"\" Root validator to handle fields that may be explicitly set to None. 
This method checks if 'plot_type' is None and assigns default value if so. Args: values (dict): The input values provided to the model. Returns: dict: The updated values with defaults applied where necessary. \"\"\" if values . get ( \"plot_type\" ) is None : values [ \"plot_type\" ] = \"bar_cumulative\" return values","title":"handle_blank_fields"},{"location":"api/pheval/analyse/run_data_parser/#src.pheval.analyse.run_data_parser.parse_run_config","text":"Parse a run configuration yaml file. Args: run_config (Path): The path to the run data yaml configuration. Returns: Config: The parsed run configurations. Source code in src/pheval/analyse/run_data_parser.py 113 114 115 116 117 118 119 120 121 122 123 124 125 def parse_run_config ( run_config : Path ) -> Config : \"\"\" Parse a run configuration yaml file. Args: run_config (Path): The path to the run data yaml configuration. Returns: Config: The parsed run configurations. \"\"\" with open ( run_config , \"r\" ) as f : config_data = yaml . safe_load ( f ) f . close () config = Config ( ** config_data ) return config","title":"parse_run_config"},{"location":"api/pheval/analyse/variant_prioritisation_analysis/","text":"AssessVariantPrioritisation Bases: AssessPrioritisationBase Class for assessing variant prioritisation based on thresholds and scoring orders. Source code in src/pheval/analyse/variant_prioritisation_analysis.py 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 class AssessVariantPrioritisation ( AssessPrioritisationBase ): \"\"\"Class for assessing variant prioritisation based on thresholds and scoring orders.\"\"\" def assess_variant_prioritisation ( self , standardised_variant_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess variant prioritisation. 
This method assesses the prioritisation of variants based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_variant_result_path (Path): Path to standardised variant TSV result. phenopacket_path (Path): Path to the phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"\"\"SELECT * FROM { self . table_name } WHERE phenopacket = ' { phenopacket_path . name } '\"\"\" ) . fetchdf () for _i , row in df . iterrows (): causative_variant = GenomicVariant ( chrom = row [ \"chrom\" ], pos = int ( row [ \"pos\" ]), ref = row [ \"ref\" ], alt = row [ \"alt\" ], ) result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_variant_result_path } ' \" f \"WHERE \" f \"chromosome == ' { causative_variant . chrom } ' AND \" f \"start == { causative_variant . pos } AND \" f \"ref == ' { causative_variant . ref } ' AND \" f \"alt == ' { causative_variant . alt } '\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : variant_match = self . _record_matched_entity ( RankedPhEvalVariantResult ( ** result [ 0 ])) relevant_ranks . append ( variant_match ) primary_key = ( f \" { phenopacket_path . name } - { causative_variant . chrom } - { causative_variant . pos } -\" f \" { causative_variant . ref } - { causative_variant . alt } \" ) self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( variant_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_variant_result_path ), RankedPhEvalVariantResult ), relevant_ranks , ) assess_variant_prioritisation ( standardised_variant_result_path , phenopacket_path , binary_classification_stats ) Assess variant prioritisation. 
This method assesses the prioritisation of variants based on the provided criteria and records ranks using a PrioritisationRankRecorder. Parameters: Name Type Description Default standardised_variant_result_path Path Path to standardised variant TSV result. required phenopacket_path Path Path to the phenopacket. required binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required Source code in src/pheval/analyse/variant_prioritisation_analysis.py 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 def assess_variant_prioritisation ( self , standardised_variant_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess variant prioritisation. This method assesses the prioritisation of variants based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_variant_result_path (Path): Path to standardised variant TSV result. phenopacket_path (Path): Path to the phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"\"\"SELECT * FROM { self . table_name } WHERE phenopacket = ' { phenopacket_path . name } '\"\"\" ) . fetchdf () for _i , row in df . iterrows (): causative_variant = GenomicVariant ( chrom = row [ \"chrom\" ], pos = int ( row [ \"pos\" ]), ref = row [ \"ref\" ], alt = row [ \"alt\" ], ) result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_variant_result_path } ' \" f \"WHERE \" f \"chromosome == ' { causative_variant . chrom } ' AND \" f \"start == { causative_variant . pos } AND \" f \"ref == ' { causative_variant . ref } ' AND \" f \"alt == ' { causative_variant . alt } '\" ) . fetchdf () . 
to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : variant_match = self . _record_matched_entity ( RankedPhEvalVariantResult ( ** result [ 0 ])) relevant_ranks . append ( variant_match ) primary_key = ( f \" { phenopacket_path . name } - { causative_variant . chrom } - { causative_variant . pos } -\" f \" { causative_variant . ref } - { causative_variant . alt } \" ) self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( variant_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_variant_result_path ), RankedPhEvalVariantResult ), relevant_ranks , ) assess_phenopacket_variant_prioritisation ( phenopacket_path , run , variant_binary_classification_stats , variant_benchmarker ) Assess variant prioritisation for a Phenopacket by comparing PhEval standardised variant results against the recorded causative variants for a proband in the Phenopacket. Parameters: Name Type Description Default phenopacket_path Path Path to the Phenopacket. required run RunConfig Run configuration. required variant_binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required variant_benchmarker AssessVariantPrioritisation AssessVariantPrioritisation class instance. required Source code in src/pheval/analyse/variant_prioritisation_analysis.py 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 def assess_phenopacket_variant_prioritisation ( phenopacket_path : Path , run : RunConfig , variant_binary_classification_stats : BinaryClassificationStats , variant_benchmarker : AssessVariantPrioritisation , ) -> None : \"\"\" Assess variant prioritisation for a Phenopacket by comparing PhEval standardised variant results against the recorded causative variants for a proband in the Phenopacket. Args: phenopacket_path (Path): Path to the Phenopacket. 
run (RunConfig): Run configuration. variant_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. variant_benchmarker (AssessVariantPrioritisation): AssessVariantPrioritisation class instance. \"\"\" standardised_variant_result_path = run . results_dir . joinpath ( f \"pheval_variant_results/ { phenopacket_path . stem } -pheval_variant_result.tsv\" ) variant_benchmarker . assess_variant_prioritisation ( standardised_variant_result_path , phenopacket_path , variant_binary_classification_stats , ) benchmark_variant_prioritisation ( benchmark_name , run , score_order , threshold ) Benchmark a directory based on variant prioritisation results. Parameters: Name Type Description Default benchmark_name str Name of the benchmark. required run RunConfig Run configuration. required score_order str The order in which scores are arranged. required threshold float Threshold for assessment. required Returns: Name Type Description BenchmarkRunResults An object containing benchmarking results for variant prioritisation, including ranks and rank statistics for the benchmarked directory. Source code in src/pheval/analyse/variant_prioritisation_analysis.py 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 def benchmark_variant_prioritisation ( benchmark_name : str , run : RunConfig , score_order : str , threshold : float , ): \"\"\" Benchmark a directory based on variant prioritisation results. Args: benchmark_name (str): Name of the benchmark. run (RunConfig): Run configuration. score_order (str): The order in which scores are arranged. threshold (float): Threshold for assessment. Returns: BenchmarkRunResults: An object containing benchmarking results for variant prioritisation, including ranks and rank statistics for the benchmarked directory. 
\"\"\" variant_binary_classification_stats = BinaryClassificationStats () db_connection = BenchmarkDBManager ( benchmark_name ) variant_benchmarker = AssessVariantPrioritisation ( db_connection , f \" { run . phenopacket_dir . parents [ 0 ] . name } \" f \"_variant\" , run . run_identifier , threshold , score_order , ) for phenopacket_path in all_files ( run . phenopacket_dir ): assess_phenopacket_variant_prioritisation ( phenopacket_path , run , variant_binary_classification_stats , variant_benchmarker , ) variant_rank_stats = RankStats () variant_rank_stats . add_ranks ( benchmark_name = benchmark_name , table_name = f \" { run . phenopacket_dir . parents [ 0 ] . name } _variant\" , column_name = str ( run . run_identifier ), ) return BenchmarkRunResults ( benchmark_name = run . run_identifier , rank_stats = variant_rank_stats , binary_classification_stats = variant_binary_classification_stats , phenopacket_dir = run . phenopacket_dir , )","title":"Variant prioritisation analysis"},{"location":"api/pheval/analyse/variant_prioritisation_analysis/#src.pheval.analyse.variant_prioritisation_analysis.AssessVariantPrioritisation","text":"Bases: AssessPrioritisationBase Class for assessing variant prioritisation based on thresholds and scoring orders. Source code in src/pheval/analyse/variant_prioritisation_analysis.py 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 class AssessVariantPrioritisation ( AssessPrioritisationBase ): \"\"\"Class for assessing variant prioritisation based on thresholds and scoring orders.\"\"\" def assess_variant_prioritisation ( self , standardised_variant_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess variant prioritisation. 
This method assesses the prioritisation of variants based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_variant_result_path (Path): Path to standardised variant TSV result. phenopacket_path (Path): Path to the phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"\"\"SELECT * FROM { self . table_name } WHERE phenopacket = ' { phenopacket_path . name } '\"\"\" ) . fetchdf () for _i , row in df . iterrows (): causative_variant = GenomicVariant ( chrom = row [ \"chrom\" ], pos = int ( row [ \"pos\" ]), ref = row [ \"ref\" ], alt = row [ \"alt\" ], ) result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_variant_result_path } ' \" f \"WHERE \" f \"chromosome == ' { causative_variant . chrom } ' AND \" f \"start == { causative_variant . pos } AND \" f \"ref == ' { causative_variant . ref } ' AND \" f \"alt == ' { causative_variant . alt } '\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : variant_match = self . _record_matched_entity ( RankedPhEvalVariantResult ( ** result [ 0 ])) relevant_ranks . append ( variant_match ) primary_key = ( f \" { phenopacket_path . name } - { causative_variant . chrom } - { causative_variant . pos } -\" f \" { causative_variant . ref } - { causative_variant . alt } \" ) self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( variant_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . 
parse_table_into_dataclass ( str ( standardised_variant_result_path ), RankedPhEvalVariantResult ), relevant_ranks , )","title":"AssessVariantPrioritisation"},{"location":"api/pheval/analyse/variant_prioritisation_analysis/#src.pheval.analyse.variant_prioritisation_analysis.AssessVariantPrioritisation.assess_variant_prioritisation","text":"Assess variant prioritisation. This method assesses the prioritisation of variants based on the provided criteria and records ranks using a PrioritisationRankRecorder. Parameters: Name Type Description Default standardised_variant_result_path Path Path to standardised variant TSV result. required phenopacket_path Path Path to the phenopacket. required binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required Source code in src/pheval/analyse/variant_prioritisation_analysis.py 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 def assess_variant_prioritisation ( self , standardised_variant_result_path : Path , phenopacket_path : Path , binary_classification_stats : BinaryClassificationStats , ) -> None : \"\"\" Assess variant prioritisation. This method assesses the prioritisation of variants based on the provided criteria and records ranks using a PrioritisationRankRecorder. Args: standardised_variant_result_path (Path): Path to standardised variant TSV result. phenopacket_path (Path): Path to the phenopacket. binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. \"\"\" relevant_ranks = [] df = self . conn . execute ( f \"\"\"SELECT * FROM { self . table_name } WHERE phenopacket = ' { phenopacket_path . name } '\"\"\" ) . fetchdf () for _i , row in df . 
iterrows (): causative_variant = GenomicVariant ( chrom = row [ \"chrom\" ], pos = int ( row [ \"pos\" ]), ref = row [ \"ref\" ], alt = row [ \"alt\" ], ) result = ( self . conn . execute ( f \"SELECT * FROM ' { standardised_variant_result_path } ' \" f \"WHERE \" f \"chromosome == ' { causative_variant . chrom } ' AND \" f \"start == { causative_variant . pos } AND \" f \"ref == ' { causative_variant . ref } ' AND \" f \"alt == ' { causative_variant . alt } '\" ) . fetchdf () . to_dict ( orient = \"records\" ) ) if len ( result ) > 0 : variant_match = self . _record_matched_entity ( RankedPhEvalVariantResult ( ** result [ 0 ])) relevant_ranks . append ( variant_match ) primary_key = ( f \" { phenopacket_path . name } - { causative_variant . chrom } - { causative_variant . pos } -\" f \" { causative_variant . ref } - { causative_variant . alt } \" ) self . conn . execute ( f 'UPDATE { self . table_name } SET \" { self . column } \" = ? WHERE identifier = ?' , ( variant_match , primary_key ), ) binary_classification_stats . add_classification ( self . db_connection . parse_table_into_dataclass ( str ( standardised_variant_result_path ), RankedPhEvalVariantResult ), relevant_ranks , )","title":"assess_variant_prioritisation"},{"location":"api/pheval/analyse/variant_prioritisation_analysis/#src.pheval.analyse.variant_prioritisation_analysis.assess_phenopacket_variant_prioritisation","text":"Assess variant prioritisation for a Phenopacket by comparing PhEval standardised variant results against the recorded causative variants for a proband in the Phenopacket. Parameters: Name Type Description Default phenopacket_path Path Path to the Phenopacket. required run RunConfig Run configuration. required variant_binary_classification_stats BinaryClassificationStats BinaryClassificationStats class instance. required variant_benchmarker AssessVariantPrioritisation AssessVariantPrioritisation class instance. 
required Source code in src/pheval/analyse/variant_prioritisation_analysis.py 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 def assess_phenopacket_variant_prioritisation ( phenopacket_path : Path , run : RunConfig , variant_binary_classification_stats : BinaryClassificationStats , variant_benchmarker : AssessVariantPrioritisation , ) -> None : \"\"\" Assess variant prioritisation for a Phenopacket by comparing PhEval standardised variant results against the recorded causative variants for a proband in the Phenopacket. Args: phenopacket_path (Path): Path to the Phenopacket. run (RunConfig): Run configuration. variant_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance. variant_benchmarker (AssessVariantPrioritisation): AssessVariantPrioritisation class instance. \"\"\" standardised_variant_result_path = run . results_dir . joinpath ( f \"pheval_variant_results/ { phenopacket_path . stem } -pheval_variant_result.tsv\" ) variant_benchmarker . assess_variant_prioritisation ( standardised_variant_result_path , phenopacket_path , variant_binary_classification_stats , )","title":"assess_phenopacket_variant_prioritisation"},{"location":"api/pheval/analyse/variant_prioritisation_analysis/#src.pheval.analyse.variant_prioritisation_analysis.benchmark_variant_prioritisation","text":"Benchmark a directory based on variant prioritisation results. Parameters: Name Type Description Default benchmark_name str Name of the benchmark. required run RunConfig Run configuration. required score_order str The order in which scores are arranged. required threshold float Threshold for assessment. required Returns: Name Type Description BenchmarkRunResults An object containing benchmarking results for variant prioritisation, including ranks and rank statistics for the benchmarked directory. 
Source code in src/pheval/analyse/variant_prioritisation_analysis.py 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 def benchmark_variant_prioritisation ( benchmark_name : str , run : RunConfig , score_order : str , threshold : float , ): \"\"\" Benchmark a directory based on variant prioritisation results. Args: benchmark_name (str): Name of the benchmark. run (RunConfig): Run configuration. score_order (str): The order in which scores are arranged. threshold (float): Threshold for assessment. Returns: BenchmarkRunResults: An object containing benchmarking results for variant prioritisation, including ranks and rank statistics for the benchmarked directory. \"\"\" variant_binary_classification_stats = BinaryClassificationStats () db_connection = BenchmarkDBManager ( benchmark_name ) variant_benchmarker = AssessVariantPrioritisation ( db_connection , f \" { run . phenopacket_dir . parents [ 0 ] . name } \" f \"_variant\" , run . run_identifier , threshold , score_order , ) for phenopacket_path in all_files ( run . phenopacket_dir ): assess_phenopacket_variant_prioritisation ( phenopacket_path , run , variant_binary_classification_stats , variant_benchmarker , ) variant_rank_stats = RankStats () variant_rank_stats . add_ranks ( benchmark_name = benchmark_name , table_name = f \" { run . phenopacket_dir . parents [ 0 ] . name } _variant\" , column_name = str ( run . run_identifier ), ) return BenchmarkRunResults ( benchmark_name = run . run_identifier , rank_stats = variant_rank_stats , binary_classification_stats = variant_binary_classification_stats , phenopacket_dir = run . 
phenopacket_dir , )","title":"benchmark_variant_prioritisation"},{"location":"api/pheval/infra/exomiserdb/","text":"DBConnection Source code in src/pheval/infra/exomiserdb.py 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 class DBConnection : connection = None def __init__ ( self , connection ): DBConnection . connection = connection @classmethod def get_connection ( cls ) -> jaydebeapi . Connection : \"\"\"Creates return new Singleton database connection\"\"\" return DBConnection . connection def close ( self ): return self . connection . close () @classmethod def get_cursor ( cls ) -> jaydebeapi . Cursor : connection = cls . get_connection () return connection . cursor () get_connection () classmethod Creates return new Singleton database connection Source code in src/pheval/infra/exomiserdb.py 49 50 51 52 @classmethod def get_connection ( cls ) -> jaydebeapi . Connection : \"\"\"Creates return new Singleton database connection\"\"\" return DBConnection . connection DBConnector Source code in src/pheval/infra/exomiserdb.py 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 class DBConnector : def __init__ ( self , jar : Path , driver : str , server : str , database : str , user : str , password : str ): self . jar = jar self . driver = driver self . server = server self . database = database self . user = user self . password = password self . dbconn = None def create_connection ( self ) -> jaydebeapi . Connection : \"\"\"creates h2 database connection\"\"\" return jaydebeapi . connect ( self . driver , f \" { self . server }{ self . database } \" , [ self . user , self . password ], self . jar , ) def __enter__ ( self ) -> jaydebeapi . Connection : self . dbconn = self . create_connection () return self . dbconn def __exit__ ( self , * other ): self . dbconn . 
close () create_connection () creates h2 database connection Source code in src/pheval/infra/exomiserdb.py 26 27 28 29 30 31 32 33 def create_connection ( self ) -> jaydebeapi . Connection : \"\"\"creates h2 database connection\"\"\" return jaydebeapi . connect ( self . driver , f \" { self . server }{ self . database } \" , [ self . user , self . password ], self . jar , ) ExomiserDB Source code in src/pheval/infra/exomiserdb.py 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 class ExomiserDB : def __init__ ( self , db_path : Path ): try : self . connector = DBConnector ( # noqa jar = os . path . join ( os . path . dirname ( __file__ ), \"../../../lib/h2-1.4.199.jar\" ), driver = \"org.h2.Driver\" , server = f \"jdbc:h2: { db_path } \" , user = \"sa\" , password = \"\" , database = \"\" , ) except Exception as e : print ( \"An exception occurred\" , e ) def import_from_semsim_file ( self , input_file : Path , subject_prefix : str , object_prefix : str ): \"\"\"imports semsim tsv profile into exomiser phenotype database Args: input_file (Path): semsim profile subject_prefix (str): Subject Prefix. e.g HP object_prefix (str): Object Prefix. e.g MP \"\"\" with self . connector as cnn : conn = DBConnection ( cnn ) reader = pl . read_csv_batched ( input_file , separator = \" \\t \" ) batch_length = 5 batches = reader . next_batches ( batch_length ) cursor = conn . get_cursor () # # TODO: Refactor this with open ( input_file , \"r\" ) as f : total = sum ( 1 for line in f ) pbar = tqdm ( total = total - 1 ) mapping_id = 1 while batches : input_data = pl . concat ( batches ) sql = _semsim2h2 ( input_data , object_prefix , subject_prefix , mapping_id = mapping_id ) cursor . execute ( sql ) len_input_data = len ( input_data ) mapping_id += len_input_data pbar . update ( len_input_data ) batches = reader . 
next_batches ( batch_length ) import_from_semsim_file ( input_file , subject_prefix , object_prefix ) imports semsim tsv profile into exomiser phenotype database Parameters: Name Type Description Default input_file Path semsim profile required subject_prefix str Subject Prefix. e.g HP required object_prefix str Object Prefix. e.g MP required Source code in src/pheval/infra/exomiserdb.py 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 def import_from_semsim_file ( self , input_file : Path , subject_prefix : str , object_prefix : str ): \"\"\"imports semsim tsv profile into exomiser phenotype database Args: input_file (Path): semsim profile subject_prefix (str): Subject Prefix. e.g HP object_prefix (str): Object Prefix. e.g MP \"\"\" with self . connector as cnn : conn = DBConnection ( cnn ) reader = pl . read_csv_batched ( input_file , separator = \" \\t \" ) batch_length = 5 batches = reader . next_batches ( batch_length ) cursor = conn . get_cursor () # # TODO: Refactor this with open ( input_file , \"r\" ) as f : total = sum ( 1 for line in f ) pbar = tqdm ( total = total - 1 ) mapping_id = 1 while batches : input_data = pl . concat ( batches ) sql = _semsim2h2 ( input_data , object_prefix , subject_prefix , mapping_id = mapping_id ) cursor . execute ( sql ) len_input_data = len ( input_data ) mapping_id += len_input_data pbar . update ( len_input_data ) batches = reader . next_batches ( batch_length )","title":"Exomiserdb"},{"location":"api/pheval/infra/exomiserdb/#src.pheval.infra.exomiserdb.DBConnection","text":"Source code in src/pheval/infra/exomiserdb.py 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 class DBConnection : connection = None def __init__ ( self , connection ): DBConnection . connection = connection @classmethod def get_connection ( cls ) -> jaydebeapi . Connection : \"\"\"Creates return new Singleton database connection\"\"\" return DBConnection . connection def close ( self ): return self . 
connection . close () @classmethod def get_cursor ( cls ) -> jaydebeapi . Cursor : connection = cls . get_connection () return connection . cursor ()","title":"DBConnection"},{"location":"api/pheval/infra/exomiserdb/#src.pheval.infra.exomiserdb.DBConnection.get_connection","text":"Creates return new Singleton database connection Source code in src/pheval/infra/exomiserdb.py 49 50 51 52 @classmethod def get_connection ( cls ) -> jaydebeapi . Connection : \"\"\"Creates return new Singleton database connection\"\"\" return DBConnection . connection","title":"get_connection"},{"location":"api/pheval/infra/exomiserdb/#src.pheval.infra.exomiserdb.DBConnector","text":"Source code in src/pheval/infra/exomiserdb.py 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 class DBConnector : def __init__ ( self , jar : Path , driver : str , server : str , database : str , user : str , password : str ): self . jar = jar self . driver = driver self . server = server self . database = database self . user = user self . password = password self . dbconn = None def create_connection ( self ) -> jaydebeapi . Connection : \"\"\"creates h2 database connection\"\"\" return jaydebeapi . connect ( self . driver , f \" { self . server }{ self . database } \" , [ self . user , self . password ], self . jar , ) def __enter__ ( self ) -> jaydebeapi . Connection : self . dbconn = self . create_connection () return self . dbconn def __exit__ ( self , * other ): self . dbconn . close ()","title":"DBConnector"},{"location":"api/pheval/infra/exomiserdb/#src.pheval.infra.exomiserdb.DBConnector.create_connection","text":"creates h2 database connection Source code in src/pheval/infra/exomiserdb.py 26 27 28 29 30 31 32 33 def create_connection ( self ) -> jaydebeapi . Connection : \"\"\"creates h2 database connection\"\"\" return jaydebeapi . connect ( self . driver , f \" { self . server }{ self . database } \" , [ self . user , self . password ], self . 
jar , )","title":"create_connection"},{"location":"api/pheval/infra/exomiserdb/#src.pheval.infra.exomiserdb.ExomiserDB","text":"Source code in src/pheval/infra/exomiserdb.py 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 class ExomiserDB : def __init__ ( self , db_path : Path ): try : self . connector = DBConnector ( # noqa jar = os . path . join ( os . path . dirname ( __file__ ), \"../../../lib/h2-1.4.199.jar\" ), driver = \"org.h2.Driver\" , server = f \"jdbc:h2: { db_path } \" , user = \"sa\" , password = \"\" , database = \"\" , ) except Exception as e : print ( \"An exception occurred\" , e ) def import_from_semsim_file ( self , input_file : Path , subject_prefix : str , object_prefix : str ): \"\"\"imports semsim tsv profile into exomiser phenotype database Args: input_file (Path): semsim profile subject_prefix (str): Subject Prefix. e.g HP object_prefix (str): Object Prefix. e.g MP \"\"\" with self . connector as cnn : conn = DBConnection ( cnn ) reader = pl . read_csv_batched ( input_file , separator = \" \\t \" ) batch_length = 5 batches = reader . next_batches ( batch_length ) cursor = conn . get_cursor () # # TODO: Refactor this with open ( input_file , \"r\" ) as f : total = sum ( 1 for line in f ) pbar = tqdm ( total = total - 1 ) mapping_id = 1 while batches : input_data = pl . concat ( batches ) sql = _semsim2h2 ( input_data , object_prefix , subject_prefix , mapping_id = mapping_id ) cursor . execute ( sql ) len_input_data = len ( input_data ) mapping_id += len_input_data pbar . update ( len_input_data ) batches = reader . next_batches ( batch_length )","title":"ExomiserDB"},{"location":"api/pheval/infra/exomiserdb/#src.pheval.infra.exomiserdb.ExomiserDB.import_from_semsim_file","text":"imports semsim tsv profile into exomiser phenotype database Parameters: Name Type Description Default input_file Path semsim profile required subject_prefix str Subject Prefix. 
e.g HP required object_prefix str Object Prefix. e.g MP required Source code in src/pheval/infra/exomiserdb.py 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 def import_from_semsim_file ( self , input_file : Path , subject_prefix : str , object_prefix : str ): \"\"\"imports semsim tsv profile into exomiser phenotype database Args: input_file (Path): semsim profile subject_prefix (str): Subject Prefix. e.g HP object_prefix (str): Object Prefix. e.g MP \"\"\" with self . connector as cnn : conn = DBConnection ( cnn ) reader = pl . read_csv_batched ( input_file , separator = \" \\t \" ) batch_length = 5 batches = reader . next_batches ( batch_length ) cursor = conn . get_cursor () # # TODO: Refactor this with open ( input_file , \"r\" ) as f : total = sum ( 1 for line in f ) pbar = tqdm ( total = total - 1 ) mapping_id = 1 while batches : input_data = pl . concat ( batches ) sql = _semsim2h2 ( input_data , object_prefix , subject_prefix , mapping_id = mapping_id ) cursor . execute ( sql ) len_input_data = len ( input_data ) mapping_id += len_input_data pbar . update ( len_input_data ) batches = reader . next_batches ( batch_length )","title":"import_from_semsim_file"},{"location":"api/pheval/post_processing/post_processing/","text":"PhEvalDiseaseResult dataclass Bases: PhEvalResult Minimal data required from tool-specific output for disease prioritisation Args: disease_name (str): Disease name for the result entry disease_identifier (str): Identifier for the disease result entry in the OMIM namespace score (str): Score for the disease result entry Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. 
Source code in src/pheval/post_processing/post_processing.py 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 @dataclass class PhEvalDiseaseResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for disease prioritisation Args: disease_name (str): Disease name for the result entry disease_identifier (str): Identifier for the disease result entry in the OMIM namespace score (str): Score for the disease result entry Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" disease_name : str disease_identifier : str score : float PhEvalGeneResult dataclass Bases: PhEvalResult Minimal data required from tool-specific output for gene prioritisation result Args: gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry score (float): The score for the gene result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. Source code in src/pheval/post_processing/post_processing.py 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 @dataclass class PhEvalGeneResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for gene prioritisation result Args: gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry score (float): The score for the gene result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. 
\"\"\" gene_symbol : Union [ List [ str ], str ] gene_identifier : Union [ List [ str ], str ] score : float PhEvalResult dataclass Base class for PhEval results. Source code in src/pheval/post_processing/post_processing.py 25 26 27 @dataclass class PhEvalResult : \"\"\"Base class for PhEval results.\"\"\" PhEvalVariantResult dataclass Bases: PhEvalResult Minimal data required from tool-specific output for variant prioritisation Args: chromosome (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. start (int): The start position of the variant end (int): The end position of the variant ref (str): The reference allele of the variant alt (str): The alternate allele of the variant score (float): The score for the variant result entry Notes: While we recommend providing the variant's chromosome in the specified format, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. Source code in src/pheval/post_processing/post_processing.py 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 @dataclass class PhEvalVariantResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for variant prioritisation Args: chromosome (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. 
start (int): The start position of the variant end (int): The end position of the variant ref (str): The reference allele of the variant alt (str): The alternate allele of the variant score (float): The score for the variant result entry Notes: While we recommend providing the variant's chromosome in the specified format, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" chromosome : str start : int end : int ref : str alt : str score : float RankedPhEvalDiseaseResult dataclass Bases: PhEvalDiseaseResult PhEval disease result with corresponding rank Args: rank (int): The rank for the result entry Source code in src/pheval/post_processing/post_processing.py 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 @dataclass class RankedPhEvalDiseaseResult ( PhEvalDiseaseResult ): \"\"\"PhEval disease result with corresponding rank Args: rank (int): The rank for the result entry \"\"\" rank : int @staticmethod def from_disease_result ( pheval_disease_result : PhEvalDiseaseResult , rank : int ): \"\"\"Return RankedPhEvalDiseaseResult from a PhEvalDiseaseResult and rank Args: pheval_disease_result (PhEvalDiseaseResult): The disease result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalDiseaseResult: The result as a RankedPhEvalDiseaseResult \"\"\" return RankedPhEvalDiseaseResult ( disease_name = pheval_disease_result . disease_name , disease_identifier = pheval_disease_result . disease_identifier , score = pheval_disease_result . 
score , rank = rank , ) from_disease_result ( pheval_disease_result , rank ) staticmethod Return RankedPhEvalDiseaseResult from a PhEvalDiseaseResult and rank Args: pheval_disease_result (PhEvalDiseaseResult): The disease result entry rank (int): The corresponding rank for the result entry Returns: Name Type Description RankedPhEvalDiseaseResult The result as a RankedPhEvalDiseaseResult Source code in src/pheval/post_processing/post_processing.py 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 @staticmethod def from_disease_result ( pheval_disease_result : PhEvalDiseaseResult , rank : int ): \"\"\"Return RankedPhEvalDiseaseResult from a PhEvalDiseaseResult and rank Args: pheval_disease_result (PhEvalDiseaseResult): The disease result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalDiseaseResult: The result as a RankedPhEvalDiseaseResult \"\"\" return RankedPhEvalDiseaseResult ( disease_name = pheval_disease_result . disease_name , disease_identifier = pheval_disease_result . disease_identifier , score = pheval_disease_result . 
score , rank = rank , ) RankedPhEvalGeneResult dataclass Bases: PhEvalGeneResult PhEval gene result with corresponding rank Args: rank (int): The rank for the result entry Source code in src/pheval/post_processing/post_processing.py 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 @dataclass class RankedPhEvalGeneResult ( PhEvalGeneResult ): \"\"\"PhEval gene result with corresponding rank Args: rank (int): The rank for the result entry \"\"\" rank : int @staticmethod def from_gene_result ( pheval_gene_result : PhEvalGeneResult , rank : int ): \"\"\"Return RankedPhEvalGeneResult from a PhEvalGeneResult and rank Args: pheval_gene_result (PhEvalGeneResult): The gene result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalGeneResult: The result as a RankedPhEvalGeneResult \"\"\" return RankedPhEvalGeneResult ( gene_symbol = pheval_gene_result . gene_symbol , gene_identifier = pheval_gene_result . gene_identifier , score = pheval_gene_result . score , rank = rank , ) from_gene_result ( pheval_gene_result , rank ) staticmethod Return RankedPhEvalGeneResult from a PhEvalGeneResult and rank Args: pheval_gene_result (PhEvalGeneResult): The gene result entry rank (int): The corresponding rank for the result entry Returns: Name Type Description RankedPhEvalGeneResult The result as a RankedPhEvalGeneResult Source code in src/pheval/post_processing/post_processing.py 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 @staticmethod def from_gene_result ( pheval_gene_result : PhEvalGeneResult , rank : int ): \"\"\"Return RankedPhEvalGeneResult from a PhEvalGeneResult and rank Args: pheval_gene_result (PhEvalGeneResult): The gene result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalGeneResult: The result as a RankedPhEvalGeneResult \"\"\" return RankedPhEvalGeneResult ( gene_symbol = pheval_gene_result . gene_symbol , gene_identifier = pheval_gene_result . 
gene_identifier , score = pheval_gene_result . score , rank = rank , ) RankedPhEvalVariantResult dataclass Bases: PhEvalVariantResult PhEval variant result with corresponding rank Args: rank (int): The rank for the result entry Source code in src/pheval/post_processing/post_processing.py 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 @dataclass class RankedPhEvalVariantResult ( PhEvalVariantResult ): \"\"\"PhEval variant result with corresponding rank Args: rank (int): The rank for the result entry \"\"\" rank : int @staticmethod def from_variant_result ( pheval_variant_result : PhEvalVariantResult , rank : int ): \"\"\"Return RankedPhEvalVariantResult from a PhEvalVariantResult and rank Args: pheval_variant_result (PhEvalVariantResult): The variant result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalVariantResult: The result as a RankedPhEvalVariantResult \"\"\" return RankedPhEvalVariantResult ( chromosome = pheval_variant_result . chromosome , start = pheval_variant_result . start , end = pheval_variant_result . end , ref = pheval_variant_result . ref , alt = pheval_variant_result . alt , score = pheval_variant_result . 
score , rank = rank , ) from_variant_result ( pheval_variant_result , rank ) staticmethod Return RankedPhEvalVariantResult from a PhEvalVariantResult and rank Args: pheval_variant_result (PhEvalVariantResult): The variant result entry rank (int): The corresponding rank for the result entry Returns: Name Type Description RankedPhEvalVariantResult The result as a RankedPhEvalVariantResult Source code in src/pheval/post_processing/post_processing.py 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 @staticmethod def from_variant_result ( pheval_variant_result : PhEvalVariantResult , rank : int ): \"\"\"Return RankedPhEvalVariantResult from a PhEvalVariantResult and rank Args: pheval_variant_result (PhEvalVariantResult): The variant result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalVariantResult: The result as a RankedPhEvalVariantResult \"\"\" return RankedPhEvalVariantResult ( chromosome = pheval_variant_result . chromosome , start = pheval_variant_result . start , end = pheval_variant_result . end , ref = pheval_variant_result . ref , alt = pheval_variant_result . alt , score = pheval_variant_result . score , rank = rank , ) ResultSorter Class for sorting PhEvalResult instances based on a given sort order. Source code in src/pheval/post_processing/post_processing.py 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 class ResultSorter : \"\"\"Class for sorting PhEvalResult instances based on a given sort order.\"\"\" def __init__ ( self , pheval_results : [ PhEvalResult ], sort_order : SortOrder ): \"\"\" Initialise ResultSorter Args: pheval_results ([PhEvalResult]): List of PhEvalResult instances to be sorted sort_order (SortOrder): Sorting order to be applied \"\"\" self . pheval_results = pheval_results self . 
sort_order = sort_order def _sort_by_decreasing_score ( self ) -> [ PhEvalResult ]: \"\"\" Sort results in descending order based on the score Returns: [PhEvalResult]: Sorted list of PhEvalResult instances. \"\"\" return sorted ( self . pheval_results , key = operator . attrgetter ( \"score\" ), reverse = True ) def _sort_by_increasing_score ( self ) -> [ PhEvalResult ]: \"\"\" Sort results in ascending order based on the score Returns: [PhEvalResult]: Sorted list of PhEvalResult instances. \"\"\" return sorted ( self . pheval_results , key = operator . attrgetter ( \"score\" ), reverse = False ) def sort_pheval_results ( self ) -> [ PhEvalResult ]: \"\"\" Sort results based on the specified sort order. Returns: [PhEvalResult]: Sorted list of PhEvalResult instances. \"\"\" return ( self . _sort_by_increasing_score () if self . sort_order == SortOrder . ASCENDING else self . _sort_by_decreasing_score () ) __init__ ( pheval_results , sort_order ) Initialise ResultSorter Parameters: Name Type Description Default pheval_results [ PhEvalResult ] List of PhEvalResult instances to be sorted required sort_order SortOrder Sorting order to be applied required Source code in src/pheval/post_processing/post_processing.py 188 189 190 191 192 193 194 195 196 197 def __init__ ( self , pheval_results : [ PhEvalResult ], sort_order : SortOrder ): \"\"\" Initialise ResultSorter Args: pheval_results ([PhEvalResult]): List of PhEvalResult instances to be sorted sort_order (SortOrder): Sorting order to be applied \"\"\" self . pheval_results = pheval_results self . sort_order = sort_order sort_pheval_results () Sort results based on the specified sort order. Returns: Type Description [ PhEvalResult ] [PhEvalResult]: Sorted list of PhEvalResult instances. Source code in src/pheval/post_processing/post_processing.py 217 218 219 220 221 222 223 224 225 226 227 228 def sort_pheval_results ( self ) -> [ PhEvalResult ]: \"\"\" Sort results based on the specified sort order. 
Returns: [PhEvalResult]: Sorted list of PhEvalResult instances. \"\"\" return ( self . _sort_by_increasing_score () if self . sort_order == SortOrder . ASCENDING else self . _sort_by_decreasing_score () ) SortOrder Bases: Enum Enumeration representing sorting orders. Source code in src/pheval/post_processing/post_processing.py 176 177 178 179 180 181 182 class SortOrder ( Enum ): \"\"\"Enumeration representing sorting orders.\"\"\" ASCENDING = 1 \"\"\"Ascending sort order.\"\"\" DESCENDING = 2 \"\"\"Descending sort order.\"\"\" ASCENDING = 1 class-attribute instance-attribute Ascending sort order. DESCENDING = 2 class-attribute instance-attribute Descending sort order. calculate_end_pos ( variant_start , variant_ref ) Calculate the end position for a variant Args: variant_start (int): The start position of the variant variant_ref (str): The reference allele of the variant Returns: Name Type Description int int The end position of the variant Source code in src/pheval/post_processing/post_processing.py 13 14 15 16 17 18 19 20 21 22 def calculate_end_pos ( variant_start : int , variant_ref : str ) -> int : \"\"\"Calculate the end position for a variant Args: variant_start (int): The start position of the variant variant_ref (str): The reference allele of the variant Returns: int: The end position of the variant \"\"\" return variant_start + len ( variant_ref ) - 1 generate_pheval_result ( pheval_result , sort_order_str , output_dir , tool_result_path ) Generate PhEval variant, gene or disease TSV result based on input results. Parameters: Name Type Description Default pheval_result [ PhEvalResult ] List of PhEvalResult instances to be processed. required sort_order_str str String representation of the desired sorting order. required output_dir Path Path to the output directory. required tool_result_path Path Path to the tool-specific result file. 
required Raises: Type Description ValueError If the results are not all the same type or an error occurs during file writing. Source code in src/pheval/post_processing/post_processing.py 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 def generate_pheval_result ( pheval_result : [ PhEvalResult ], sort_order_str : str , output_dir : Path , tool_result_path : Path , ) -> None : \"\"\" Generate PhEval variant, gene or disease TSV result based on input results. Args: pheval_result ([PhEvalResult]): List of PhEvalResult instances to be processed. sort_order_str (str): String representation of the desired sorting order. output_dir (Path): Path to the output directory. tool_result_path (Path): Path to the tool-specific result file. Raises: ValueError: If the results are not all the same type or an error occurs during file writing. \"\"\" if not pheval_result : info_log . warning ( f \"No results found for { tool_result_path . 
name } \" ) return ranked_pheval_result = _create_pheval_result ( pheval_result , sort_order_str ) if all ( isinstance ( result , PhEvalGeneResult ) for result in pheval_result ): _write_pheval_gene_result ( ranked_pheval_result , output_dir , tool_result_path ) elif all ( isinstance ( result , PhEvalVariantResult ) for result in pheval_result ): _write_pheval_variant_result ( ranked_pheval_result , output_dir , tool_result_path ) elif all ( isinstance ( result , PhEvalDiseaseResult ) for result in pheval_result ): _write_pheval_disease_result ( ranked_pheval_result , output_dir , tool_result_path ) else : raise ValueError ( \"Results are not all of the same type.\" )","title":"Post processing"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.PhEvalDiseaseResult","text":"Bases: PhEvalResult Minimal data required from tool-specific output for disease prioritisation Args: disease_name (str): Disease name for the result entry disease_identifier (str): Identifier for the disease result entry in the OMIM namespace score (str): Score for the disease result entry Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. Source code in src/pheval/post_processing/post_processing.py 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 @dataclass class PhEvalDiseaseResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for disease prioritisation Args: disease_name (str): Disease name for the result entry disease_identifier (str): Identifier for the disease result entry in the OMIM namespace score (str): Score for the disease result entry Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. 
\"\"\" disease_name : str disease_identifier : str score : float","title":"PhEvalDiseaseResult"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.PhEvalGeneResult","text":"Bases: PhEvalResult Minimal data required from tool-specific output for gene prioritisation result Args: gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry score (float): The score for the gene result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. Source code in src/pheval/post_processing/post_processing.py 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 @dataclass class PhEvalGeneResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for gene prioritisation result Args: gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry score (float): The score for the gene result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" gene_symbol : Union [ List [ str ], str ] gene_identifier : Union [ List [ str ], str ] score : float","title":"PhEvalGeneResult"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.PhEvalResult","text":"Base class for PhEval results. 
Source code in src/pheval/post_processing/post_processing.py 25 26 27 @dataclass class PhEvalResult : \"\"\"Base class for PhEval results.\"\"\"","title":"PhEvalResult"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.PhEvalVariantResult","text":"Bases: PhEvalResult Minimal data required from tool-specific output for variant prioritisation Args: chromosome (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. start (int): The start position of the variant end (int): The end position of the variant ref (str): The reference allele of the variant alt (str): The alternate allele of the variant score (float): The score for the variant result entry Notes: While we recommend providing the variant's chromosome in the specified format, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. Source code in src/pheval/post_processing/post_processing.py 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 @dataclass class PhEvalVariantResult ( PhEvalResult ): \"\"\"Minimal data required from tool-specific output for variant prioritisation Args: chromosome (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. 
start (int): The start position of the variant end (int): The end position of the variant ref (str): The reference allele of the variant alt (str): The alternate allele of the variant score (float): The score for the variant result entry Notes: While we recommend providing the variant's chromosome in the specified format, any matching format used in Phenopacket interpretations is acceptable for result matching purposes in the analysis. \"\"\" chromosome : str start : int end : int ref : str alt : str score : float","title":"PhEvalVariantResult"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.RankedPhEvalDiseaseResult","text":"Bases: PhEvalDiseaseResult PhEval disease result with corresponding rank Args: rank (int): The rank for the result entry Source code in src/pheval/post_processing/post_processing.py 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 @dataclass class RankedPhEvalDiseaseResult ( PhEvalDiseaseResult ): \"\"\"PhEval disease result with corresponding rank Args: rank (int): The rank for the result entry \"\"\" rank : int @staticmethod def from_disease_result ( pheval_disease_result : PhEvalDiseaseResult , rank : int ): \"\"\"Return RankedPhEvalDiseaseResult from a PhEvalDiseaseResult and rank Args: pheval_disease_result (PhEvalDiseaseResult): The disease result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalDiseaseResult: The result as a RankedPhEvalDiseaseResult \"\"\" return RankedPhEvalDiseaseResult ( disease_name = pheval_disease_result . disease_name , disease_identifier = pheval_disease_result . disease_identifier , score = pheval_disease_result . 
score , rank = rank , )","title":"RankedPhEvalDiseaseResult"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.RankedPhEvalDiseaseResult.from_disease_result","text":"Return RankedPhEvalDiseaseResult from a PhEvalDiseaseResult and rank Args: pheval_disease_result (PhEvalDiseaseResult): The disease result entry rank (int): The corresponding rank for the result entry Returns: Name Type Description RankedPhEvalDiseaseResult The result as a RankedPhEvalDiseaseResult Source code in src/pheval/post_processing/post_processing.py 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 @staticmethod def from_disease_result ( pheval_disease_result : PhEvalDiseaseResult , rank : int ): \"\"\"Return RankedPhEvalDiseaseResult from a PhEvalDiseaseResult and rank Args: pheval_disease_result (PhEvalDiseaseResult): The disease result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalDiseaseResult: The result as a RankedPhEvalDiseaseResult \"\"\" return RankedPhEvalDiseaseResult ( disease_name = pheval_disease_result . disease_name , disease_identifier = pheval_disease_result . disease_identifier , score = pheval_disease_result . 
score , rank = rank , )","title":"from_disease_result"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.RankedPhEvalGeneResult","text":"Bases: PhEvalGeneResult PhEval gene result with corresponding rank Args: rank (int): The rank for the result entry Source code in src/pheval/post_processing/post_processing.py 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 @dataclass class RankedPhEvalGeneResult ( PhEvalGeneResult ): \"\"\"PhEval gene result with corresponding rank Args: rank (int): The rank for the result entry \"\"\" rank : int @staticmethod def from_gene_result ( pheval_gene_result : PhEvalGeneResult , rank : int ): \"\"\"Return RankedPhEvalGeneResult from a PhEvalGeneResult and rank Args: pheval_gene_result (PhEvalGeneResult): The gene result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalGeneResult: The result as a RankedPhEvalGeneResult \"\"\" return RankedPhEvalGeneResult ( gene_symbol = pheval_gene_result . gene_symbol , gene_identifier = pheval_gene_result . gene_identifier , score = pheval_gene_result . 
score , rank = rank , )","title":"RankedPhEvalGeneResult"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.RankedPhEvalGeneResult.from_gene_result","text":"Return RankedPhEvalGeneResult from a PhEvalGeneResult and rank Args: pheval_gene_result (PhEvalGeneResult): The gene result entry rank (int): The corresponding rank for the result entry Returns: Name Type Description RankedPhEvalGeneResult The result as a RankedPhEvalGeneResult Source code in src/pheval/post_processing/post_processing.py 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 @staticmethod def from_gene_result ( pheval_gene_result : PhEvalGeneResult , rank : int ): \"\"\"Return RankedPhEvalGeneResult from a PhEvalGeneResult and rank Args: pheval_gene_result (PhEvalGeneResult): The gene result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalGeneResult: The result as a RankedPhEvalGeneResult \"\"\" return RankedPhEvalGeneResult ( gene_symbol = pheval_gene_result . gene_symbol , gene_identifier = pheval_gene_result . gene_identifier , score = pheval_gene_result . 
score , rank = rank , )","title":"from_gene_result"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.RankedPhEvalVariantResult","text":"Bases: PhEvalVariantResult PhEval variant result with corresponding rank Args: rank (int): The rank for the result entry Source code in src/pheval/post_processing/post_processing.py 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 @dataclass class RankedPhEvalVariantResult ( PhEvalVariantResult ): \"\"\"PhEval variant result with corresponding rank Args: rank (int): The rank for the result entry \"\"\" rank : int @staticmethod def from_variant_result ( pheval_variant_result : PhEvalVariantResult , rank : int ): \"\"\"Return RankedPhEvalVariantResult from a PhEvalVariantResult and rank Args: pheval_variant_result (PhEvalVariantResult): The variant result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalVariantResult: The result as a RankedPhEvalVariantResult \"\"\" return RankedPhEvalVariantResult ( chromosome = pheval_variant_result . chromosome , start = pheval_variant_result . start , end = pheval_variant_result . end , ref = pheval_variant_result . ref , alt = pheval_variant_result . alt , score = pheval_variant_result . 
score , rank = rank , )","title":"RankedPhEvalVariantResult"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.RankedPhEvalVariantResult.from_variant_result","text":"Return RankedPhEvalVariantResult from a PhEvalVariantResult and rank Args: pheval_variant_result (PhEvalVariantResult): The variant result entry rank (int): The corresponding rank for the result entry Returns: Name Type Description RankedPhEvalVariantResult The result as a RankedPhEvalVariantResult Source code in src/pheval/post_processing/post_processing.py 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 @staticmethod def from_variant_result ( pheval_variant_result : PhEvalVariantResult , rank : int ): \"\"\"Return RankedPhEvalVariantResult from a PhEvalVariantResult and rank Args: pheval_variant_result (PhEvalVariantResult): The variant result entry rank (int): The corresponding rank for the result entry Returns: RankedPhEvalVariantResult: The result as a RankedPhEvalVariantResult \"\"\" return RankedPhEvalVariantResult ( chromosome = pheval_variant_result . chromosome , start = pheval_variant_result . start , end = pheval_variant_result . end , ref = pheval_variant_result . ref , alt = pheval_variant_result . alt , score = pheval_variant_result . score , rank = rank , )","title":"from_variant_result"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.ResultSorter","text":"Class for sorting PhEvalResult instances based on a given sort order. 
Source code in src/pheval/post_processing/post_processing.py 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 class ResultSorter : \"\"\"Class for sorting PhEvalResult instances based on a given sort order.\"\"\" def __init__ ( self , pheval_results : [ PhEvalResult ], sort_order : SortOrder ): \"\"\" Initialise ResultSorter Args: pheval_results ([PhEvalResult]): List of PhEvalResult instances to be sorted sort_order (SortOrder): Sorting order to be applied \"\"\" self . pheval_results = pheval_results self . sort_order = sort_order def _sort_by_decreasing_score ( self ) -> [ PhEvalResult ]: \"\"\" Sort results in descending order based on the score Returns: [PhEvalResult]: Sorted list of PhEvalResult instances. \"\"\" return sorted ( self . pheval_results , key = operator . attrgetter ( \"score\" ), reverse = True ) def _sort_by_increasing_score ( self ) -> [ PhEvalResult ]: \"\"\" Sort results in ascending order based on the score Returns: [PhEvalResult]: Sorted list of PhEvalResult instances. \"\"\" return sorted ( self . pheval_results , key = operator . attrgetter ( \"score\" ), reverse = False ) def sort_pheval_results ( self ) -> [ PhEvalResult ]: \"\"\" Sort results based on the specified sort order. Returns: [PhEvalResult]: Sorted list of PhEvalResult instances. \"\"\" return ( self . _sort_by_increasing_score () if self . sort_order == SortOrder . ASCENDING else self . 
_sort_by_decreasing_score () )","title":"ResultSorter"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.ResultSorter.__init__","text":"Initialise ResultSorter Parameters: Name Type Description Default pheval_results [ PhEvalResult ] List of PhEvalResult instances to be sorted required sort_order SortOrder Sorting order to be applied required Source code in src/pheval/post_processing/post_processing.py 188 189 190 191 192 193 194 195 196 197 def __init__ ( self , pheval_results : [ PhEvalResult ], sort_order : SortOrder ): \"\"\" Initialise ResultSorter Args: pheval_results ([PhEvalResult]): List of PhEvalResult instances to be sorted sort_order (SortOrder): Sorting order to be applied \"\"\" self . pheval_results = pheval_results self . sort_order = sort_order","title":"__init__"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.ResultSorter.sort_pheval_results","text":"Sort results based on the specified sort order. Returns: Type Description [ PhEvalResult ] [PhEvalResult]: Sorted list of PhEvalResult instances. Source code in src/pheval/post_processing/post_processing.py 217 218 219 220 221 222 223 224 225 226 227 228 def sort_pheval_results ( self ) -> [ PhEvalResult ]: \"\"\" Sort results based on the specified sort order. Returns: [PhEvalResult]: Sorted list of PhEvalResult instances. \"\"\" return ( self . _sort_by_increasing_score () if self . sort_order == SortOrder . ASCENDING else self . _sort_by_decreasing_score () )","title":"sort_pheval_results"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.SortOrder","text":"Bases: Enum Enumeration representing sorting orders. 
Source code in src/pheval/post_processing/post_processing.py 176 177 178 179 180 181 182 class SortOrder ( Enum ): \"\"\"Enumeration representing sorting orders.\"\"\" ASCENDING = 1 \"\"\"Ascending sort order.\"\"\" DESCENDING = 2 \"\"\"Descending sort order.\"\"\"","title":"SortOrder"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.SortOrder.ASCENDING","text":"Ascending sort order.","title":"ASCENDING"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.SortOrder.DESCENDING","text":"Descending sort order.","title":"DESCENDING"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.calculate_end_pos","text":"Calculate the end position for a variant Args: variant_start (int): The start position of the variant variant_ref (str): The reference allele of the variant Returns: Name Type Description int int The end position of the variant Source code in src/pheval/post_processing/post_processing.py 13 14 15 16 17 18 19 20 21 22 def calculate_end_pos ( variant_start : int , variant_ref : str ) -> int : \"\"\"Calculate the end position for a variant Args: variant_start (int): The start position of the variant variant_ref (str): The reference allele of the variant Returns: int: The end position of the variant \"\"\" return variant_start + len ( variant_ref ) - 1","title":"calculate_end_pos"},{"location":"api/pheval/post_processing/post_processing/#src.pheval.post_processing.post_processing.generate_pheval_result","text":"Generate PhEval variant, gene or disease TSV result based on input results. Parameters: Name Type Description Default pheval_result [ PhEvalResult ] List of PhEvalResult instances to be processed. required sort_order_str str String representation of the desired sorting order. required output_dir Path Path to the output directory. required tool_result_path Path Path to the tool-specific result file. 
required Raises: Type Description ValueError If the results are not all the same type or an error occurs during file writing. Source code in src/pheval/post_processing/post_processing.py 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 def generate_pheval_result ( pheval_result : [ PhEvalResult ], sort_order_str : str , output_dir : Path , tool_result_path : Path , ) -> None : \"\"\" Generate PhEval variant, gene or disease TSV result based on input results. Args: pheval_result ([PhEvalResult]): List of PhEvalResult instances to be processed. sort_order_str (str): String representation of the desired sorting order. output_dir (Path): Path to the output directory. tool_result_path (Path): Path to the tool-specific result file. Raises: ValueError: If the results are not all the same type or an error occurs during file writing. \"\"\" if not pheval_result : info_log . warning ( f \"No results found for { tool_result_path . name } \" ) return ranked_pheval_result = _create_pheval_result ( pheval_result , sort_order_str ) if all ( isinstance ( result , PhEvalGeneResult ) for result in pheval_result ): _write_pheval_gene_result ( ranked_pheval_result , output_dir , tool_result_path ) elif all ( isinstance ( result , PhEvalVariantResult ) for result in pheval_result ): _write_pheval_variant_result ( ranked_pheval_result , output_dir , tool_result_path ) elif all ( isinstance ( result , PhEvalDiseaseResult ) for result in pheval_result ): _write_pheval_disease_result ( ranked_pheval_result , output_dir , tool_result_path ) else : raise ValueError ( \"Results are not all of the same type.\" )","title":"generate_pheval_result"},{"location":"api/pheval/prepare/create_noisy_phenopackets/","text":"HpoRandomiser Class for randomising phenopacket phenotypic features using Human Phenotype Ontology (HPO). 
Source code in src/pheval/prepare/create_noisy_phenopackets.py 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 class HpoRandomiser : \"\"\"Class for randomising phenopacket phenotypic features using Human Phenotype Ontology (HPO).\"\"\" def __init__ ( self , hpo_ontology : ProntoImplementation , scramble_factor : float ): \"\"\" Initialise the HpoRandomiser. Args: hpo_ontology (ProntoImplementation): The instance of the HPO ontology. scramble_factor (float): A factor for scrambling phenotypic features. \"\"\" self . hpo_ontology = hpo_ontology self . phenotypic_abnormalities = set ( hpo_ontology . roots ( predicates = [ \"HP:0000118\" ])) self . scramble_factor = scramble_factor def scramble_factor_proportions ( self , phenotypic_features : list [ PhenotypicFeature ]) -> int : \"\"\" Calculate the proportion of scrambled HPO terms based on the scramble factor. Args: phenotypic_features (list[PhenotypicFeature]): List of phenotypic features. Returns: int: The calculated number of phenotypic features to be scrambled. \"\"\" if len ( phenotypic_features ) == 1 : return 1 else : return int ( round ( len ( phenotypic_features ) * self . 
scramble_factor , 0 )) def retrieve_hpo_term ( self , hpo_id : str ) -> PhenotypicFeature : \"\"\" Retrieve an HPO term based on the provided HPO ID. Args: hpo_id (str): The HPO ID of the term to retrieve. Returns: PhenotypicFeature: The PhenotypicFeature object representing the retrieved HPO term. \"\"\" rels = self . hpo_ontology . entity_alias_map ( hpo_id ) hpo_term = \"\" . join ( rels [( list ( rels . keys ())[ 0 ])]) return PhenotypicFeature ( type = OntologyClass ( id = hpo_id , label = hpo_term )) @staticmethod def retain_real_patient_terms ( phenotypic_features : List [ PhenotypicFeature ], number_of_scrambled_terms : int , ) -> List [ PhenotypicFeature ]: \"\"\" Return a list of real patient HPO terms, retaining a specific number of non-scrambled terms. Args: phenotypic_features (List[PhenotypicFeature]): List of phenotypic features. number_of_scrambled_terms (int): The count of scrambled HPO terms. Returns: List[PhenotypicFeature]: A list of non-scrambled (real patient) HPO terms. \"\"\" if len ( phenotypic_features ) > 1 : number_of_real_id = len ( phenotypic_features ) - number_of_scrambled_terms else : number_of_real_id = 1 return random . sample ( phenotypic_features , number_of_real_id ) def convert_patient_terms_to_parent ( self , phenotypic_features : List [ PhenotypicFeature ], retained_phenotypic_features : List [ PhenotypicFeature ], number_of_scrambled_terms : int , ) -> List [ PhenotypicFeature ]: \"\"\" Convert a subset of patient HPO terms to their respective parent terms. Args: phenotypic_features (List[PhenotypicFeature]): List of all phenotypic features. retained_phenotypic_features (List[PhenotypicFeature]): List of retained non-scrambled phenotypic features. number_of_scrambled_terms (int): The count of scrambled HPO terms. Returns: List[PhenotypicFeature]: A list of HPO terms converted to their parent terms. 
Note: This method identifies a subset of patient HPO terms that are not retained among the non-scrambled phenotypic features and converts them to their respective parent terms. It then returns a list of parent HPO terms based on the provided scrambled terms count. If no remaining HPO terms are available for conversion, no parent terms are returned. \"\"\" remaining_hpo = [ i for i in phenotypic_features if i not in retained_phenotypic_features ] if len ( remaining_hpo ) == 0 : number_of_scrambled_terms = 0 hpo_terms_to_be_changed = list ( random . sample ( remaining_hpo , number_of_scrambled_terms )) parent_terms = [] for term in hpo_terms_to_be_changed : if self . hpo_ontology . label ( term . type . id ) . startswith ( \"obsolete\" ): obsolete_term = self . hpo_ontology . entity_metadata_map ( term . type . id ) updated_term = list ( obsolete_term . values ())[ 0 ][ 0 ] parents = self . hpo_ontology . hierarchical_parents ( updated_term ) else : parents = self . hpo_ontology . hierarchical_parents ( term . type . id ) if not parents : parent_terms . append ( term ) else : parent_terms . append ( self . retrieve_hpo_term ( random . choice ( parents ))) return parent_terms def create_random_hpo_terms ( self , number_of_scrambled_terms : int ) -> List [ PhenotypicFeature ]: \"\"\" Generate a list of random HPO terms. Args: number_of_scrambled_terms (int): The count of random HPO terms to be generated. Returns: List[PhenotypicFeature]: A list of randomly selected HPO terms. \"\"\" random_ids = list ( random . sample ( sorted ( self . phenotypic_abnormalities ), number_of_scrambled_terms ) ) return [ self . retrieve_hpo_term ( random_id ) for random_id in random_ids ] def randomise_hpo_terms ( self , phenotypic_features : List [ PhenotypicFeature ], ) -> List [ PhenotypicFeature ]: \"\"\" Randomise the provided phenotypic features by combining retained, parent-converted, and random HPO terms. 
Args: phenotypic_features (List[PhenotypicFeature]): List of phenotypic features to be randomised. Returns: List[PhenotypicFeature]: A list of randomised HPO terms. Note: This method randomises the provided phenotypic features by incorporating three types of HPO terms: 1. Retained Patient Terms: Non-scrambled (real patient) HPO terms retained based on the scramble factor. 2. Converted to Parent Terms: Subset of HPO terms converted to their respective parent terms. 3. Random HPO Terms: Newly generated random HPO terms based on the scramble factor. The method determines the count of terms for each category and combines them to form a final list of randomised HPO terms to be used in the phenotypic features. \"\"\" number_of_scrambled_terms = self . scramble_factor_proportions ( phenotypic_features ) retained_patient_terms = self . retain_real_patient_terms ( phenotypic_features , number_of_scrambled_terms ) return ( retained_patient_terms + self . convert_patient_terms_to_parent ( phenotypic_features , retained_patient_terms , number_of_scrambled_terms ) + self . create_random_hpo_terms ( number_of_scrambled_terms ) ) def add_noise_to_phenotypic_profile ( self , phenopacket : Union [ Phenopacket , Family ], ) -> Union [ Phenopacket , Family ]: \"\"\" Randomise the phenotypic profile of a Phenopacket or Family. Args: phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family to be randomised. Returns: Union[Phenopacket, Family]: The randomised Phenopacket or Family. \"\"\" phenotypic_features = PhenopacketUtil ( phenopacket ) . observed_phenotypic_features () random_phenotypes = self . randomise_hpo_terms ( phenotypic_features ) randomised_phenopacket = PhenopacketRebuilder ( phenopacket ) . add_randomised_hpo ( random_phenotypes ) return randomised_phenopacket def create_scrambled_phenopacket ( self , output_dir : Path , phenopacket_path : Path , ) -> None : \"\"\" Create a scrambled version of a Phenopacket. 
Args: output_dir (Path): The directory to store the output scrambled Phenopacket. phenopacket_path (Path): The path to the original Phenopacket file. \"\"\" phenopacket = phenopacket_reader ( phenopacket_path ) created_noisy_phenopacket = self . add_noise_to_phenotypic_profile ( phenopacket , ) write_phenopacket ( created_noisy_phenopacket , output_dir . joinpath ( phenopacket_path . name ), ) def create_scrambled_phenopackets ( self , output_dir : Path , phenopacket_dir : Path , ) -> None : \"\"\" Create scrambled versions of Phenopackets within a directory. Args: output_dir (Path): The directory to store the output scrambled Phenopackets. phenopacket_dir (Path): The directory containing the original Phenopacket files. \"\"\" phenopacket_files = files_with_suffix ( phenopacket_dir , \".json\" ) for phenopacket_path in phenopacket_files : phenopacket = phenopacket_reader ( phenopacket_path ) created_noisy_phenopacket = self . add_noise_to_phenotypic_profile ( phenopacket ) write_phenopacket ( created_noisy_phenopacket , output_dir . joinpath ( phenopacket_path . name , ), ) __init__ ( hpo_ontology , scramble_factor ) Initialise the HpoRandomiser. Parameters: Name Type Description Default hpo_ontology ProntoImplementation The instance of the HPO ontology. required scramble_factor float A factor for scrambling phenotypic features. required Source code in src/pheval/prepare/create_noisy_phenopackets.py 37 38 39 40 41 42 43 44 45 46 47 def __init__ ( self , hpo_ontology : ProntoImplementation , scramble_factor : float ): \"\"\" Initialise the HpoRandomiser. Args: hpo_ontology (ProntoImplementation): The instance of the HPO ontology. scramble_factor (float): A factor for scrambling phenotypic features. \"\"\" self . hpo_ontology = hpo_ontology self . phenotypic_abnormalities = set ( hpo_ontology . roots ( predicates = [ \"HP:0000118\" ])) self . 
scramble_factor = scramble_factor add_noise_to_phenotypic_profile ( phenopacket ) Randomise the phenotypic profile of a Phenopacket or Family. Parameters: Name Type Description Default phenopacket Union [ Phenopacket , Family ] The Phenopacket or Family to be randomised. required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family]: The randomised Phenopacket or Family. Source code in src/pheval/prepare/create_noisy_phenopackets.py 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 def add_noise_to_phenotypic_profile ( self , phenopacket : Union [ Phenopacket , Family ], ) -> Union [ Phenopacket , Family ]: \"\"\" Randomise the phenotypic profile of a Phenopacket or Family. Args: phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family to be randomised. Returns: Union[Phenopacket, Family]: The randomised Phenopacket or Family. \"\"\" phenotypic_features = PhenopacketUtil ( phenopacket ) . observed_phenotypic_features () random_phenotypes = self . randomise_hpo_terms ( phenotypic_features ) randomised_phenopacket = PhenopacketRebuilder ( phenopacket ) . add_randomised_hpo ( random_phenotypes ) return randomised_phenopacket convert_patient_terms_to_parent ( phenotypic_features , retained_phenotypic_features , number_of_scrambled_terms ) Convert a subset of patient HPO terms to their respective parent terms. Parameters: Name Type Description Default phenotypic_features List [ PhenotypicFeature ] List of all phenotypic features. required retained_phenotypic_features List [ PhenotypicFeature ] List of retained non-scrambled phenotypic features. required number_of_scrambled_terms int The count of scrambled HPO terms. required Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: A list of HPO terms converted to their parent terms. 
Note This method identifies a subset of patient HPO terms that are not retained among the non-scrambled phenotypic features and converts them to their respective parent terms. It then returns a list of parent HPO terms based on the provided scrambled terms count. If no remaining HPO terms are available for conversion, no parent terms are returned. Source code in src/pheval/prepare/create_noisy_phenopackets.py 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 def convert_patient_terms_to_parent ( self , phenotypic_features : List [ PhenotypicFeature ], retained_phenotypic_features : List [ PhenotypicFeature ], number_of_scrambled_terms : int , ) -> List [ PhenotypicFeature ]: \"\"\" Convert a subset of patient HPO terms to their respective parent terms. Args: phenotypic_features (List[PhenotypicFeature]): List of all phenotypic features. retained_phenotypic_features (List[PhenotypicFeature]): List of retained non-scrambled phenotypic features. number_of_scrambled_terms (int): The count of scrambled HPO terms. Returns: List[PhenotypicFeature]: A list of HPO terms converted to their parent terms. Note: This method identifies a subset of patient HPO terms that are not retained among the non-scrambled phenotypic features and converts them to their respective parent terms. It then returns a list of parent HPO terms based on the provided scrambled terms count. If no remaining HPO terms are available for conversion, no parent terms are returned. \"\"\" remaining_hpo = [ i for i in phenotypic_features if i not in retained_phenotypic_features ] if len ( remaining_hpo ) == 0 : number_of_scrambled_terms = 0 hpo_terms_to_be_changed = list ( random . sample ( remaining_hpo , number_of_scrambled_terms )) parent_terms = [] for term in hpo_terms_to_be_changed : if self . hpo_ontology . label ( term . type . id ) . startswith ( \"obsolete\" ): obsolete_term = self . 
hpo_ontology . entity_metadata_map ( term . type . id ) updated_term = list ( obsolete_term . values ())[ 0 ][ 0 ] parents = self . hpo_ontology . hierarchical_parents ( updated_term ) else : parents = self . hpo_ontology . hierarchical_parents ( term . type . id ) if not parents : parent_terms . append ( term ) else : parent_terms . append ( self . retrieve_hpo_term ( random . choice ( parents ))) return parent_terms create_random_hpo_terms ( number_of_scrambled_terms ) Generate a list of random HPO terms. Parameters: Name Type Description Default number_of_scrambled_terms int The count of random HPO terms to be generated. required Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: A list of randomly selected HPO terms. Source code in src/pheval/prepare/create_noisy_phenopackets.py 140 141 142 143 144 145 146 147 148 149 150 151 152 153 def create_random_hpo_terms ( self , number_of_scrambled_terms : int ) -> List [ PhenotypicFeature ]: \"\"\" Generate a list of random HPO terms. Args: number_of_scrambled_terms (int): The count of random HPO terms to be generated. Returns: List[PhenotypicFeature]: A list of randomly selected HPO terms. \"\"\" random_ids = list ( random . sample ( sorted ( self . phenotypic_abnormalities ), number_of_scrambled_terms ) ) return [ self . retrieve_hpo_term ( random_id ) for random_id in random_ids ] create_scrambled_phenopacket ( output_dir , phenopacket_path ) Create a scrambled version of a Phenopacket. Parameters: Name Type Description Default output_dir Path The directory to store the output scrambled Phenopacket. required phenopacket_path Path The path to the original Phenopacket file. required Source code in src/pheval/prepare/create_noisy_phenopackets.py 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 def create_scrambled_phenopacket ( self , output_dir : Path , phenopacket_path : Path , ) -> None : \"\"\" Create a scrambled version of a Phenopacket. 
Args: output_dir (Path): The directory to store the output scrambled Phenopacket. phenopacket_path (Path): The path to the original Phenopacket file. \"\"\" phenopacket = phenopacket_reader ( phenopacket_path ) created_noisy_phenopacket = self . add_noise_to_phenotypic_profile ( phenopacket , ) write_phenopacket ( created_noisy_phenopacket , output_dir . joinpath ( phenopacket_path . name ), ) create_scrambled_phenopackets ( output_dir , phenopacket_dir ) Create scrambled versions of Phenopackets within a directory. Parameters: Name Type Description Default output_dir Path The directory to store the output scrambled Phenopackets. required phenopacket_dir Path The directory containing the original Phenopacket files. required Source code in src/pheval/prepare/create_noisy_phenopackets.py 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 def create_scrambled_phenopackets ( self , output_dir : Path , phenopacket_dir : Path , ) -> None : \"\"\" Create scrambled versions of Phenopackets within a directory. Args: output_dir (Path): The directory to store the output scrambled Phenopackets. phenopacket_dir (Path): The directory containing the original Phenopacket files. \"\"\" phenopacket_files = files_with_suffix ( phenopacket_dir , \".json\" ) for phenopacket_path in phenopacket_files : phenopacket = phenopacket_reader ( phenopacket_path ) created_noisy_phenopacket = self . add_noise_to_phenotypic_profile ( phenopacket ) write_phenopacket ( created_noisy_phenopacket , output_dir . joinpath ( phenopacket_path . name , ), ) randomise_hpo_terms ( phenotypic_features ) Randomise the provided phenotypic features by combining retained, parent-converted, and random HPO terms. Parameters: Name Type Description Default phenotypic_features List [ PhenotypicFeature ] List of phenotypic features to be randomised. required Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: A list of randomised HPO terms. 
Note This method randomises the provided phenotypic features by incorporating three types of HPO terms: 1. Retained Patient Terms: Non-scrambled (real patient) HPO terms retained based on the scramble factor. 2. Converted to Parent Terms: Subset of HPO terms converted to their respective parent terms. 3. Random HPO Terms: Newly generated random HPO terms based on the scramble factor. The method determines the count of terms for each category and combines them to form a final list of randomised HPO terms to be used in the phenotypic features. Source code in src/pheval/prepare/create_noisy_phenopackets.py 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 def randomise_hpo_terms ( self , phenotypic_features : List [ PhenotypicFeature ], ) -> List [ PhenotypicFeature ]: \"\"\" Randomise the provided phenotypic features by combining retained, parent-converted, and random HPO terms. Args: phenotypic_features (List[PhenotypicFeature]): List of phenotypic features to be randomised. Returns: List[PhenotypicFeature]: A list of randomised HPO terms. Note: This method randomises the provided phenotypic features by incorporating three types of HPO terms: 1. Retained Patient Terms: Non-scrambled (real patient) HPO terms retained based on the scramble factor. 2. Converted to Parent Terms: Subset of HPO terms converted to their respective parent terms. 3. Random HPO Terms: Newly generated random HPO terms based on the scramble factor. The method determines the count of terms for each category and combines them to form a final list of randomised HPO terms to be used in the phenotypic features. \"\"\" number_of_scrambled_terms = self . scramble_factor_proportions ( phenotypic_features ) retained_patient_terms = self . retain_real_patient_terms ( phenotypic_features , number_of_scrambled_terms ) return ( retained_patient_terms + self . 
convert_patient_terms_to_parent ( phenotypic_features , retained_patient_terms , number_of_scrambled_terms ) + self . create_random_hpo_terms ( number_of_scrambled_terms ) ) retain_real_patient_terms ( phenotypic_features , number_of_scrambled_terms ) staticmethod Return a list of real patient HPO terms, retaining a specific number of non-scrambled terms. Parameters: Name Type Description Default phenotypic_features List [ PhenotypicFeature ] List of phenotypic features. required number_of_scrambled_terms int The count of scrambled HPO terms. required Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: A list of non-scrambled (real patient) HPO terms. Source code in src/pheval/prepare/create_noisy_phenopackets.py 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 @staticmethod def retain_real_patient_terms ( phenotypic_features : List [ PhenotypicFeature ], number_of_scrambled_terms : int , ) -> List [ PhenotypicFeature ]: \"\"\" Return a list of real patient HPO terms, retaining a specific number of non-scrambled terms. Args: phenotypic_features (List[PhenotypicFeature]): List of phenotypic features. number_of_scrambled_terms (int): The count of scrambled HPO terms. Returns: List[PhenotypicFeature]: A list of non-scrambled (real patient) HPO terms. \"\"\" if len ( phenotypic_features ) > 1 : number_of_real_id = len ( phenotypic_features ) - number_of_scrambled_terms else : number_of_real_id = 1 return random . sample ( phenotypic_features , number_of_real_id ) retrieve_hpo_term ( hpo_id ) Retrieve an HPO term based on the provided HPO ID. Parameters: Name Type Description Default hpo_id str The HPO ID of the term to retrieve. required Returns: Name Type Description PhenotypicFeature PhenotypicFeature The PhenotypicFeature object representing the retrieved HPO term. 
Source code in src/pheval/prepare/create_noisy_phenopackets.py 64 65 66 67 68 69 70 71 72 73 74 75 76 def retrieve_hpo_term ( self , hpo_id : str ) -> PhenotypicFeature : \"\"\" Retrieve an HPO term based on the provided HPO ID. Args: hpo_id (str): The HPO ID of the term to retrieve. Returns: PhenotypicFeature: The PhenotypicFeature object representing the retrieved HPO term. \"\"\" rels = self . hpo_ontology . entity_alias_map ( hpo_id ) hpo_term = \"\" . join ( rels [( list ( rels . keys ())[ 0 ])]) return PhenotypicFeature ( type = OntologyClass ( id = hpo_id , label = hpo_term )) scramble_factor_proportions ( phenotypic_features ) Calculate the proportion of scrambled HPO terms based on the scramble factor. Parameters: Name Type Description Default phenotypic_features list [ PhenotypicFeature ] List of phenotypic features. required Returns: Name Type Description int int The calculated number of phenotypic features to be scrambled. Source code in src/pheval/prepare/create_noisy_phenopackets.py 49 50 51 52 53 54 55 56 57 58 59 60 61 62 def scramble_factor_proportions ( self , phenotypic_features : list [ PhenotypicFeature ]) -> int : \"\"\" Calculate the proportion of scrambled HPO terms based on the scramble factor. Args: phenotypic_features (list[PhenotypicFeature]): List of phenotypic features. Returns: int: The calculated number of phenotypic features to be scrambled. \"\"\" if len ( phenotypic_features ) == 1 : return 1 else : return int ( round ( len ( phenotypic_features ) * self . scramble_factor , 0 )) load_ontology ( local_cached_ontology = None ) Load the Human Phenotype Ontology (HPO). Args: local_cached_ontology(Path): Path to the local cached ontology. Returns: ProntoImplementation: An instance of ProntoImplementation containing the loaded HPO. 
Source code in src/pheval/prepare/create_noisy_phenopackets.py 18 19 20 21 22 23 24 25 26 27 28 29 30 31 def load_ontology ( local_cached_ontology : Path = None ) -> ProntoImplementation : \"\"\" Load the Human Phenotype Ontology (HPO). Args: local_cached_ontology(Path): Path to the local cached ontology. Returns: ProntoImplementation: An instance of ProntoImplementation containing the loaded HPO. \"\"\" if local_cached_ontology is None : resource = OntologyResource ( slug = \"hp.obo\" , local = False ) return ProntoImplementation ( resource ) else : resource = OntologyResource ( slug = local_cached_ontology , local = True ) return ProntoImplementation ( resource ) scramble_phenopackets ( output_dir , phenopacket_path , phenopacket_dir , scramble_factor , local_cached_ontology ) Create scrambled phenopackets from either a single phenopacket or a directory of phenopackets. Parameters: Name Type Description Default output_dir Path The directory to store the output scrambled Phenopackets. required phenopacket_path Path The path to a single Phenopacket file (if applicable). required phenopacket_dir Path The directory containing multiple Phenopacket files (if applicable). required scramble_factor float A factor determining the level of scrambling for phenotypic features. required local_cached_ontology Path The path to the local cached ontology. required Source code in src/pheval/prepare/create_noisy_phenopackets.py 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 def scramble_phenopackets ( output_dir : Path , phenopacket_path : Path , phenopacket_dir : Path , scramble_factor : float , local_cached_ontology : Path , ) -> None : \"\"\" Create scrambled phenopackets from either a single phenopacket or a directory of phenopackets. Args: output_dir (Path): The directory to store the output scrambled Phenopackets. phenopacket_path (Path): The path to a single Phenopacket file (if applicable). 
phenopacket_dir (Path): The directory containing multiple Phenopacket files (if applicable). scramble_factor (float): A factor determining the level of scrambling for phenotypic features. local_cached_ontology (Path): The path to the local cached ontology. \"\"\" output_dir . mkdir ( exist_ok = True ) ontology = load_ontology ( local_cached_ontology ) if phenopacket_path is not None : HpoRandomiser ( ontology , scramble_factor ) . create_scrambled_phenopacket ( output_dir , phenopacket_path ) elif phenopacket_dir is not None : HpoRandomiser ( ontology , scramble_factor ) . create_scrambled_phenopackets ( output_dir , phenopacket_dir , )","title":"Create noisy phenopackets"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser","text":"Class for randomising phenopacket phenotypic features using Human Phenotype Ontology (HPO). Source code in src/pheval/prepare/create_noisy_phenopackets.py 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 class HpoRandomiser : \"\"\"Class for randomising phenopacket phenotypic features using Human Phenotype Ontology (HPO).\"\"\" def __init__ ( self , hpo_ontology : ProntoImplementation , scramble_factor : float ): \"\"\" 
Initialise the HpoRandomiser. Args: hpo_ontology (ProntoImplementation): The instance of the HPO ontology. scramble_factor (float): A factor for scrambling phenotypic features. \"\"\" self . hpo_ontology = hpo_ontology self . phenotypic_abnormalities = set ( hpo_ontology . roots ( predicates = [ \"HP:0000118\" ])) self . scramble_factor = scramble_factor def scramble_factor_proportions ( self , phenotypic_features : list [ PhenotypicFeature ]) -> int : \"\"\" Calculate the proportion of scrambled HPO terms based on the scramble factor. Args: phenotypic_features (list[PhenotypicFeature]): List of phenotypic features. Returns: int: The calculated number of phenotypic features to be scrambled. \"\"\" if len ( phenotypic_features ) == 1 : return 1 else : return int ( round ( len ( phenotypic_features ) * self . scramble_factor , 0 )) def retrieve_hpo_term ( self , hpo_id : str ) -> PhenotypicFeature : \"\"\" Retrieve an HPO term based on the provided HPO ID. Args: hpo_id (str): The HPO ID of the term to retrieve. Returns: PhenotypicFeature: The PhenotypicFeature object representing the retrieved HPO term. \"\"\" rels = self . hpo_ontology . entity_alias_map ( hpo_id ) hpo_term = \"\" . join ( rels [( list ( rels . keys ())[ 0 ])]) return PhenotypicFeature ( type = OntologyClass ( id = hpo_id , label = hpo_term )) @staticmethod def retain_real_patient_terms ( phenotypic_features : List [ PhenotypicFeature ], number_of_scrambled_terms : int , ) -> List [ PhenotypicFeature ]: \"\"\" Return a list of real patient HPO terms, retaining a specific number of non-scrambled terms. Args: phenotypic_features (List[PhenotypicFeature]): List of phenotypic features. number_of_scrambled_terms (int): The count of scrambled HPO terms. Returns: List[PhenotypicFeature]: A list of non-scrambled (real patient) HPO terms. \"\"\" if len ( phenotypic_features ) > 1 : number_of_real_id = len ( phenotypic_features ) - number_of_scrambled_terms else : number_of_real_id = 1 return random . 
sample ( phenotypic_features , number_of_real_id ) def convert_patient_terms_to_parent ( self , phenotypic_features : List [ PhenotypicFeature ], retained_phenotypic_features : List [ PhenotypicFeature ], number_of_scrambled_terms : int , ) -> List [ PhenotypicFeature ]: \"\"\" Convert a subset of patient HPO terms to their respective parent terms. Args: phenotypic_features (List[PhenotypicFeature]): List of all phenotypic features. retained_phenotypic_features (List[PhenotypicFeature]): List of retained non-scrambled phenotypic features. number_of_scrambled_terms (int): The count of scrambled HPO terms. Returns: List[PhenotypicFeature]: A list of HPO terms converted to their parent terms. Note: This method identifies a subset of patient HPO terms that are not retained among the non-scrambled phenotypic features and converts them to their respective parent terms. It then returns a list of parent HPO terms based on the provided scrambled terms count. If no remaining HPO terms are available for conversion, no parent terms are returned. \"\"\" remaining_hpo = [ i for i in phenotypic_features if i not in retained_phenotypic_features ] if len ( remaining_hpo ) == 0 : number_of_scrambled_terms = 0 hpo_terms_to_be_changed = list ( random . sample ( remaining_hpo , number_of_scrambled_terms )) parent_terms = [] for term in hpo_terms_to_be_changed : if self . hpo_ontology . label ( term . type . id ) . startswith ( \"obsolete\" ): obsolete_term = self . hpo_ontology . entity_metadata_map ( term . type . id ) updated_term = list ( obsolete_term . values ())[ 0 ][ 0 ] parents = self . hpo_ontology . hierarchical_parents ( updated_term ) else : parents = self . hpo_ontology . hierarchical_parents ( term . type . id ) if not parents : parent_terms . append ( term ) else : parent_terms . append ( self . retrieve_hpo_term ( random . 
choice ( parents ))) return parent_terms def create_random_hpo_terms ( self , number_of_scrambled_terms : int ) -> List [ PhenotypicFeature ]: \"\"\" Generate a list of random HPO terms. Args: number_of_scrambled_terms (int): The count of random HPO terms to be generated. Returns: List[PhenotypicFeature]: A list of randomly selected HPO terms. \"\"\" random_ids = list ( random . sample ( sorted ( self . phenotypic_abnormalities ), number_of_scrambled_terms ) ) return [ self . retrieve_hpo_term ( random_id ) for random_id in random_ids ] def randomise_hpo_terms ( self , phenotypic_features : List [ PhenotypicFeature ], ) -> List [ PhenotypicFeature ]: \"\"\" Randomise the provided phenotypic features by combining retained, parent-converted, and random HPO terms. Args: phenotypic_features (List[PhenotypicFeature]): List of phenotypic features to be randomised. Returns: List[PhenotypicFeature]: A list of randomised HPO terms. Note: This method randomises the provided phenotypic features by incorporating three types of HPO terms: 1. Retained Patient Terms: Non-scrambled (real patient) HPO terms retained based on the scramble factor. 2. Converted to Parent Terms: Subset of HPO terms converted to their respective parent terms. 3. Random HPO Terms: Newly generated random HPO terms based on the scramble factor. The method determines the count of terms for each category and combines them to form a final list of randomised HPO terms to be used in the phenotypic features. \"\"\" number_of_scrambled_terms = self . scramble_factor_proportions ( phenotypic_features ) retained_patient_terms = self . retain_real_patient_terms ( phenotypic_features , number_of_scrambled_terms ) return ( retained_patient_terms + self . convert_patient_terms_to_parent ( phenotypic_features , retained_patient_terms , number_of_scrambled_terms ) + self . 
create_random_hpo_terms ( number_of_scrambled_terms ) ) def add_noise_to_phenotypic_profile ( self , phenopacket : Union [ Phenopacket , Family ], ) -> Union [ Phenopacket , Family ]: \"\"\" Randomise the phenotypic profile of a Phenopacket or Family. Args: phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family to be randomised. Returns: Union[Phenopacket, Family]: The randomised Phenopacket or Family. \"\"\" phenotypic_features = PhenopacketUtil ( phenopacket ) . observed_phenotypic_features () random_phenotypes = self . randomise_hpo_terms ( phenotypic_features ) randomised_phenopacket = PhenopacketRebuilder ( phenopacket ) . add_randomised_hpo ( random_phenotypes ) return randomised_phenopacket def create_scrambled_phenopacket ( self , output_dir : Path , phenopacket_path : Path , ) -> None : \"\"\" Create a scrambled version of a Phenopacket. Args: output_dir (Path): The directory to store the output scrambled Phenopacket. phenopacket_path (Path): The path to the original Phenopacket file. \"\"\" phenopacket = phenopacket_reader ( phenopacket_path ) created_noisy_phenopacket = self . add_noise_to_phenotypic_profile ( phenopacket , ) write_phenopacket ( created_noisy_phenopacket , output_dir . joinpath ( phenopacket_path . name ), ) def create_scrambled_phenopackets ( self , output_dir : Path , phenopacket_dir : Path , ) -> None : \"\"\" Create scrambled versions of Phenopackets within a directory. Args: output_dir (Path): The directory to store the output scrambled Phenopackets. phenopacket_dir (Path): The directory containing the original Phenopacket files. \"\"\" phenopacket_files = files_with_suffix ( phenopacket_dir , \".json\" ) for phenopacket_path in phenopacket_files : phenopacket = phenopacket_reader ( phenopacket_path ) created_noisy_phenopacket = self . add_noise_to_phenotypic_profile ( phenopacket ) write_phenopacket ( created_noisy_phenopacket , output_dir . joinpath ( phenopacket_path . 
name , ), )","title":"HpoRandomiser"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.__init__","text":"Initialise the HpoRandomiser. Parameters: Name Type Description Default hpo_ontology ProntoImplementation The instance of the HPO ontology. required scramble_factor float A factor for scrambling phenotypic features. required Source code in src/pheval/prepare/create_noisy_phenopackets.py 37 38 39 40 41 42 43 44 45 46 47 def __init__ ( self , hpo_ontology : ProntoImplementation , scramble_factor : float ): \"\"\" Initialise the HpoRandomiser. Args: hpo_ontology (ProntoImplementation): The instance of the HPO ontology. scramble_factor (float): A factor for scrambling phenotypic features. \"\"\" self . hpo_ontology = hpo_ontology self . phenotypic_abnormalities = set ( hpo_ontology . roots ( predicates = [ \"HP:0000118\" ])) self . scramble_factor = scramble_factor","title":"__init__"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.add_noise_to_phenotypic_profile","text":"Randomise the phenotypic profile of a Phenopacket or Family. Parameters: Name Type Description Default phenopacket Union [ Phenopacket , Family ] The Phenopacket or Family to be randomised. required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family]: The randomised Phenopacket or Family. Source code in src/pheval/prepare/create_noisy_phenopackets.py 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 def add_noise_to_phenotypic_profile ( self , phenopacket : Union [ Phenopacket , Family ], ) -> Union [ Phenopacket , Family ]: \"\"\" Randomise the phenotypic profile of a Phenopacket or Family. Args: phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family to be randomised. Returns: Union[Phenopacket, Family]: The randomised Phenopacket or Family. 
\"\"\" phenotypic_features = PhenopacketUtil ( phenopacket ) . observed_phenotypic_features () random_phenotypes = self . randomise_hpo_terms ( phenotypic_features ) randomised_phenopacket = PhenopacketRebuilder ( phenopacket ) . add_randomised_hpo ( random_phenotypes ) return randomised_phenopacket","title":"add_noise_to_phenotypic_profile"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.convert_patient_terms_to_parent","text":"Convert a subset of patient HPO terms to their respective parent terms. Parameters: Name Type Description Default phenotypic_features List [ PhenotypicFeature ] List of all phenotypic features. required retained_phenotypic_features List [ PhenotypicFeature ] List of retained non-scrambled phenotypic features. required number_of_scrambled_terms int The count of scrambled HPO terms. required Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: A list of HPO terms converted to their parent terms. Note This method identifies a subset of patient HPO terms that are not retained among the non-scrambled phenotypic features and converts them to their respective parent terms. It then returns a list of parent HPO terms based on the provided scrambled terms count. If no remaining HPO terms are available for conversion, no parent terms are returned. Source code in src/pheval/prepare/create_noisy_phenopackets.py 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 def convert_patient_terms_to_parent ( self , phenotypic_features : List [ PhenotypicFeature ], retained_phenotypic_features : List [ PhenotypicFeature ], number_of_scrambled_terms : int , ) -> List [ PhenotypicFeature ]: \"\"\" Convert a subset of patient HPO terms to their respective parent terms. Args: phenotypic_features (List[PhenotypicFeature]): List of all phenotypic features. 
retained_phenotypic_features (List[PhenotypicFeature]): List of retained non-scrambled phenotypic features. number_of_scrambled_terms (int): The count of scrambled HPO terms. Returns: List[PhenotypicFeature]: A list of HPO terms converted to their parent terms. Note: This method identifies a subset of patient HPO terms that are not retained among the non-scrambled phenotypic features and converts them to their respective parent terms. It then returns a list of parent HPO terms based on the provided scrambled terms count. If no remaining HPO terms are available for conversion, no parent terms are returned. \"\"\" remaining_hpo = [ i for i in phenotypic_features if i not in retained_phenotypic_features ] if len ( remaining_hpo ) == 0 : number_of_scrambled_terms = 0 hpo_terms_to_be_changed = list ( random . sample ( remaining_hpo , number_of_scrambled_terms )) parent_terms = [] for term in hpo_terms_to_be_changed : if self . hpo_ontology . label ( term . type . id ) . startswith ( \"obsolete\" ): obsolete_term = self . hpo_ontology . entity_metadata_map ( term . type . id ) updated_term = list ( obsolete_term . values ())[ 0 ][ 0 ] parents = self . hpo_ontology . hierarchical_parents ( updated_term ) else : parents = self . hpo_ontology . hierarchical_parents ( term . type . id ) if not parents : parent_terms . append ( term ) else : parent_terms . append ( self . retrieve_hpo_term ( random . choice ( parents ))) return parent_terms","title":"convert_patient_terms_to_parent"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.create_random_hpo_terms","text":"Generate a list of random HPO terms. Parameters: Name Type Description Default number_of_scrambled_terms int The count of random HPO terms to be generated. required Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: A list of randomly selected HPO terms. 
Source code in src/pheval/prepare/create_noisy_phenopackets.py 140 141 142 143 144 145 146 147 148 149 150 151 152 153 def create_random_hpo_terms ( self , number_of_scrambled_terms : int ) -> List [ PhenotypicFeature ]: \"\"\" Generate a list of random HPO terms. Args: number_of_scrambled_terms (int): The count of random HPO terms to be generated. Returns: List[PhenotypicFeature]: A list of randomly selected HPO terms. \"\"\" random_ids = list ( random . sample ( sorted ( self . phenotypic_abnormalities ), number_of_scrambled_terms ) ) return [ self . retrieve_hpo_term ( random_id ) for random_id in random_ids ]","title":"create_random_hpo_terms"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.create_scrambled_phenopacket","text":"Create a scrambled version of a Phenopacket. Parameters: Name Type Description Default output_dir Path The directory to store the output scrambled Phenopacket. required phenopacket_path Path The path to the original Phenopacket file. required Source code in src/pheval/prepare/create_noisy_phenopackets.py 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 def create_scrambled_phenopacket ( self , output_dir : Path , phenopacket_path : Path , ) -> None : \"\"\" Create a scrambled version of a Phenopacket. Args: output_dir (Path): The directory to store the output scrambled Phenopacket. phenopacket_path (Path): The path to the original Phenopacket file. \"\"\" phenopacket = phenopacket_reader ( phenopacket_path ) created_noisy_phenopacket = self . add_noise_to_phenotypic_profile ( phenopacket , ) write_phenopacket ( created_noisy_phenopacket , output_dir . joinpath ( phenopacket_path . 
name ), )","title":"create_scrambled_phenopacket"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.create_scrambled_phenopackets","text":"Create scrambled versions of Phenopackets within a directory. Parameters: Name Type Description Default output_dir Path The directory to store the output scrambled Phenopackets. required phenopacket_dir Path The directory containing the original Phenopacket files. required Source code in src/pheval/prepare/create_noisy_phenopackets.py 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 def create_scrambled_phenopackets ( self , output_dir : Path , phenopacket_dir : Path , ) -> None : \"\"\" Create scrambled versions of Phenopackets within a directory. Args: output_dir (Path): The directory to store the output scrambled Phenopackets. phenopacket_dir (Path): The directory containing the original Phenopacket files. \"\"\" phenopacket_files = files_with_suffix ( phenopacket_dir , \".json\" ) for phenopacket_path in phenopacket_files : phenopacket = phenopacket_reader ( phenopacket_path ) created_noisy_phenopacket = self . add_noise_to_phenotypic_profile ( phenopacket ) write_phenopacket ( created_noisy_phenopacket , output_dir . joinpath ( phenopacket_path . name , ), )","title":"create_scrambled_phenopackets"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.randomise_hpo_terms","text":"Randomise the provided phenotypic features by combining retained, parent-converted, and random HPO terms. Parameters: Name Type Description Default phenotypic_features List [ PhenotypicFeature ] List of phenotypic features to be randomised. required Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: A list of randomised HPO terms. Note This method randomises the provided phenotypic features by incorporating three types of HPO terms: 1. 
Retained Patient Terms: Non-scrambled (real patient) HPO terms retained based on the scramble factor. 2. Converted to Parent Terms: Subset of HPO terms converted to their respective parent terms. 3. Random HPO Terms: Newly generated random HPO terms based on the scramble factor. The method determines the count of terms for each category and combines them to form a final list of randomised HPO terms to be used in the phenotypic features. Source code in src/pheval/prepare/create_noisy_phenopackets.py 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 def randomise_hpo_terms ( self , phenotypic_features : List [ PhenotypicFeature ], ) -> List [ PhenotypicFeature ]: \"\"\" Randomise the provided phenotypic features by combining retained, parent-converted, and random HPO terms. Args: phenotypic_features (List[PhenotypicFeature]): List of phenotypic features to be randomised. Returns: List[PhenotypicFeature]: A list of randomised HPO terms. Note: This method randomises the provided phenotypic features by incorporating three types of HPO terms: 1. Retained Patient Terms: Non-scrambled (real patient) HPO terms retained based on the scramble factor. 2. Converted to Parent Terms: Subset of HPO terms converted to their respective parent terms. 3. Random HPO Terms: Newly generated random HPO terms based on the scramble factor. The method determines the count of terms for each category and combines them to form a final list of randomised HPO terms to be used in the phenotypic features. \"\"\" number_of_scrambled_terms = self . scramble_factor_proportions ( phenotypic_features ) retained_patient_terms = self . retain_real_patient_terms ( phenotypic_features , number_of_scrambled_terms ) return ( retained_patient_terms + self . convert_patient_terms_to_parent ( phenotypic_features , retained_patient_terms , number_of_scrambled_terms ) + self . 
create_random_hpo_terms ( number_of_scrambled_terms ) )","title":"randomise_hpo_terms"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.retain_real_patient_terms","text":"Return a list of real patient HPO terms, retaining a specific number of non-scrambled terms. Parameters: Name Type Description Default phenotypic_features List [ PhenotypicFeature ] List of phenotypic features. required number_of_scrambled_terms int The count of scrambled HPO terms. required Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: A list of non-scrambled (real patient) HPO terms. Source code in src/pheval/prepare/create_noisy_phenopackets.py 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 @staticmethod def retain_real_patient_terms ( phenotypic_features : List [ PhenotypicFeature ], number_of_scrambled_terms : int , ) -> List [ PhenotypicFeature ]: \"\"\" Return a list of real patient HPO terms, retaining a specific number of non-scrambled terms. Args: phenotypic_features (List[PhenotypicFeature]): List of phenotypic features. number_of_scrambled_terms (int): The count of scrambled HPO terms. Returns: List[PhenotypicFeature]: A list of non-scrambled (real patient) HPO terms. \"\"\" if len ( phenotypic_features ) > 1 : number_of_real_id = len ( phenotypic_features ) - number_of_scrambled_terms else : number_of_real_id = 1 return random . sample ( phenotypic_features , number_of_real_id )","title":"retain_real_patient_terms"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.retrieve_hpo_term","text":"Retrieve an HPO term based on the provided HPO ID. Parameters: Name Type Description Default hpo_id str The HPO ID of the term to retrieve. required Returns: Name Type Description PhenotypicFeature PhenotypicFeature The PhenotypicFeature object representing the retrieved HPO term. 
Source code in src/pheval/prepare/create_noisy_phenopackets.py 64 65 66 67 68 69 70 71 72 73 74 75 76 def retrieve_hpo_term ( self , hpo_id : str ) -> PhenotypicFeature : \"\"\" Retrieve an HPO term based on the provided HPO ID. Args: hpo_id (str): The HPO ID of the term to retrieve. Returns: PhenotypicFeature: The PhenotypicFeature object representing the retrieved HPO term. \"\"\" rels = self . hpo_ontology . entity_alias_map ( hpo_id ) hpo_term = \"\" . join ( rels [( list ( rels . keys ())[ 0 ])]) return PhenotypicFeature ( type = OntologyClass ( id = hpo_id , label = hpo_term ))","title":"retrieve_hpo_term"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.HpoRandomiser.scramble_factor_proportions","text":"Calculate the proportion of scrambled HPO terms based on the scramble factor. Parameters: Name Type Description Default phenotypic_features list [ PhenotypicFeature ] List of phenotypic features. required Returns: Name Type Description int int The calculated number of phenotypic features to be scrambled. Source code in src/pheval/prepare/create_noisy_phenopackets.py 49 50 51 52 53 54 55 56 57 58 59 60 61 62 def scramble_factor_proportions ( self , phenotypic_features : list [ PhenotypicFeature ]) -> int : \"\"\" Calculate the proportion of scrambled HPO terms based on the scramble factor. Args: phenotypic_features (list[PhenotypicFeature]): List of phenotypic features. Returns: int: The calculated number of phenotypic features to be scrambled. \"\"\" if len ( phenotypic_features ) == 1 : return 1 else : return int ( round ( len ( phenotypic_features ) * self . scramble_factor , 0 ))","title":"scramble_factor_proportions"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.load_ontology","text":"Load the Human Phenotype Ontology (HPO). Args: local_cached_ontology(Path): Path to the local cached ontology. 
Returns: ProntoImplementation: An instance of ProntoImplementation containing the loaded HPO. Source code in src/pheval/prepare/create_noisy_phenopackets.py 18 19 20 21 22 23 24 25 26 27 28 29 30 31 def load_ontology ( local_cached_ontology : Path = None ) -> ProntoImplementation : \"\"\" Load the Human Phenotype Ontology (HPO). Args: local_cached_ontology(Path): Path to the local cached ontology. Returns: ProntoImplementation: An instance of ProntoImplementation containing the loaded HPO. \"\"\" if local_cached_ontology is None : resource = OntologyResource ( slug = \"hp.obo\" , local = False ) return ProntoImplementation ( resource ) else : resource = OntologyResource ( slug = local_cached_ontology , local = True ) return ProntoImplementation ( resource )","title":"load_ontology"},{"location":"api/pheval/prepare/create_noisy_phenopackets/#src.pheval.prepare.create_noisy_phenopackets.scramble_phenopackets","text":"Create scrambled phenopackets from either a single phenopacket or a directory of phenopackets. Parameters: Name Type Description Default output_dir Path The directory to store the output scrambled Phenopackets. required phenopacket_path Path The path to a single Phenopacket file (if applicable). required phenopacket_dir Path The directory containing multiple Phenopacket files (if applicable). required scramble_factor float A factor determining the level of scrambling for phenotypic features. required local_cached_ontology Path The path to the local cached ontology. required Source code in src/pheval/prepare/create_noisy_phenopackets.py 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 def scramble_phenopackets ( output_dir : Path , phenopacket_path : Path , phenopacket_dir : Path , scramble_factor : float , local_cached_ontology : Path , ) -> None : \"\"\" Create scrambled phenopackets from either a single phenopacket or a directory of phenopackets. 
Args: output_dir (Path): The directory to store the output scrambled Phenopackets. phenopacket_path (Path): The path to a single Phenopacket file (if applicable). phenopacket_dir (Path): The directory containing multiple Phenopacket files (if applicable). scramble_factor (float): A factor determining the level of scrambling for phenotypic features. local_cached_ontology (Path): The path to the local cached ontology. \"\"\" output_dir . mkdir ( exist_ok = True ) ontology = load_ontology ( local_cached_ontology ) if phenopacket_path is not None : HpoRandomiser ( ontology , scramble_factor ) . create_scrambled_phenopacket ( output_dir , phenopacket_path ) elif phenopacket_dir is not None : HpoRandomiser ( ontology , scramble_factor ) . create_scrambled_phenopackets ( output_dir , phenopacket_dir , )","title":"scramble_phenopackets"},{"location":"api/pheval/prepare/create_spiked_vcf/","text":"VcfFile dataclass Represents a VCF file with its name, contents, and header information. Attributes: Name Type Description vcf_file_name str The name of the VCF file. vcf_contents List [ str ] The contents of the VCF file. vcf_header VcfHeader The parsed header information of the VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 @dataclass class VcfFile : \"\"\" Represents a VCF file with its name, contents, and header information. Attributes: vcf_file_name (str): The name of the VCF file. vcf_contents (List[str]): The contents of the VCF file. vcf_header (VcfHeader): The parsed header information of the VCF file. \"\"\" vcf_file_name : str = None vcf_contents : List [ str ] = None vcf_header : VcfHeader = None @staticmethod def populate_fields ( template_vcf : Path ): \"\"\" Populate the fields of the VcfFile instance using the contents of a template VCF file. Args: template_vcf (Path): The path to the template VCF file. 
Returns: VcfFile: An instance of VcfFile with populated fields. \"\"\" contents = read_vcf ( template_vcf ) return VcfFile ( template_vcf . name , contents , VcfHeaderParser ( contents ) . parse_vcf_header ()) populate_fields ( template_vcf ) staticmethod Populate the fields of the VcfFile instance using the contents of a template VCF file. Parameters: Name Type Description Default template_vcf Path The path to the template VCF file. required Returns: Name Type Description VcfFile An instance of VcfFile with populated fields. Source code in src/pheval/prepare/create_spiked_vcf.py 190 191 192 193 194 195 196 197 198 199 200 201 202 203 @staticmethod def populate_fields ( template_vcf : Path ): \"\"\" Populate the fields of the VcfFile instance using the contents of a template VCF file. Args: template_vcf (Path): The path to the template VCF file. Returns: VcfFile: An instance of VcfFile with populated fields. \"\"\" contents = read_vcf ( template_vcf ) return VcfFile ( template_vcf . name , contents , VcfHeaderParser ( contents ) . parse_vcf_header ()) VcfHeader dataclass Data obtained from VCF header. Parameters: Name Type Description Default sample_id str The sample identifier from the VCF header. required assembly str The assembly information obtained from the VCF header. required chr_status bool A boolean indicating whether the VCF denotes chromosomes as chr or not. required Source code in src/pheval/prepare/create_spiked_vcf.py 78 79 80 81 82 83 84 85 86 87 88 89 90 @dataclass class VcfHeader : \"\"\"Data obtained from VCF header. Args: sample_id (str): The sample identifier from the VCF header. assembly (str): The assembly information obtained from the VCF header. chr_status (bool): A boolean indicating whether the VCF denotes chromosomes as chr or not. \"\"\" sample_id : str assembly : str chr_status : bool VcfHeaderParser Class for parsing the header of a VCF file. 
Source code in src/pheval/prepare/create_spiked_vcf.py 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 class VcfHeaderParser : \"\"\"Class for parsing the header of a VCF file.\"\"\" def __init__ ( self , vcf_contents : list [ str ]): \"\"\" Initialise the VcfHeaderParser. Args: vcf_contents (list[str]): The contents of the VCF file as a list of strings. \"\"\" self . vcf_contents = vcf_contents def parse_assembly ( self ) -> tuple [ str , bool ]: \"\"\" Parse the genome assembly and format of vcf_records. Returns: Tuple[str, bool]: A tuple containing the assembly and chromosome status (True/False). \"\"\" vcf_assembly = {} chr_status = False for line in self . vcf_contents : if line . startswith ( \"##contig= str : \"\"\" Parse the sample ID of the VCF. Returns: str: The sample ID extracted from the VCF header. \"\"\" for line in self . vcf_contents : if line . startswith ( \"#CHROM\" ): return line . split ( \" \\t \" )[ 9 ] . rstrip () def parse_vcf_header ( self ) -> VcfHeader : \"\"\" Parse the header of the VCF. Returns: VcfHeader: An instance of VcfHeader containing sample ID, assembly, and chromosome status. \"\"\" assembly , chr_status = self . parse_assembly () sample_id = self . parse_sample_id () return VcfHeader ( sample_id , assembly , chr_status ) __init__ ( vcf_contents ) Initialise the VcfHeaderParser. Parameters: Name Type Description Default vcf_contents list [ str ] The contents of the VCF file as a list of strings. required Source code in src/pheval/prepare/create_spiked_vcf.py 115 116 117 118 119 120 121 122 def __init__ ( self , vcf_contents : list [ str ]): \"\"\" Initialise the VcfHeaderParser. Args: vcf_contents (list[str]): The contents of the VCF file as a list of strings. \"\"\" self . 
vcf_contents = vcf_contents parse_assembly () Parse the genome assembly and format of vcf_records. Returns: Type Description tuple [ str , bool ] Tuple[str, bool]: A tuple containing the assembly and chromosome status (True/False). Source code in src/pheval/prepare/create_spiked_vcf.py 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 def parse_assembly ( self ) -> tuple [ str , bool ]: \"\"\" Parse the genome assembly and format of vcf_records. Returns: Tuple[str, bool]: A tuple containing the assembly and chromosome status (True/False). \"\"\" vcf_assembly = {} chr_status = False for line in self . vcf_contents : if line . startswith ( \"##contig= str : \"\"\" Parse the sample ID of the VCF. Returns: str: The sample ID extracted from the VCF header. \"\"\" for line in self . vcf_contents : if line . startswith ( \"#CHROM\" ): return line . split ( \" \\t \" )[ 9 ] . rstrip () parse_vcf_header () Parse the header of the VCF. Returns: Name Type Description VcfHeader VcfHeader An instance of VcfHeader containing sample ID, assembly, and chromosome status. Source code in src/pheval/prepare/create_spiked_vcf.py 163 164 165 166 167 168 169 170 171 172 def parse_vcf_header ( self ) -> VcfHeader : \"\"\" Parse the header of the VCF. Returns: VcfHeader: An instance of VcfHeader containing sample ID, assembly, and chromosome status. \"\"\" assembly , chr_status = self . parse_assembly () sample_id = self . parse_sample_id () return VcfHeader ( sample_id , assembly , chr_status ) VcfSpiker Class for spiking proband variants into template VCF file contents. 
Source code in src/pheval/prepare/create_spiked_vcf.py 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 class VcfSpiker : \"\"\"Class for spiking proband variants into template VCF file contents.\"\"\" def __init__ ( self , vcf_contents : list [ str ], proband_causative_variants : list [ ProbandCausativeVariant ], vcf_header : VcfHeader , ): \"\"\" Initialise the VcfSpiker. Args: vcf_contents (List[str]): Contents of the template VCF file. proband_causative_variants (List[ProbandCausativeVariant]): List of proband causative variants. vcf_header (VcfHeader): The VCF header information. \"\"\" self . vcf_contents = vcf_contents self . proband_causative_variants = proband_causative_variants self . vcf_header = vcf_header def construct_variant_entry ( self , proband_variant_data : ProbandCausativeVariant ) -> List [ str ]: \"\"\" Construct variant entries. Args: proband_variant_data (ProbandCausativeVariant): Data for the proband variant. Returns: List[str]: Constructed variant entry as a list of strings. \"\"\" genotype_codes = { \"hemizygous\" : \"0/1\" , \"homozygous\" : \"1/1\" , \"heterozygous\" : \"0/1\" , \"compound heterozygous\" : \"0/1\" , } if self . vcf_header . chr_status is True and \"chr\" not in proband_variant_data . variant . chrom : proband_variant_data . variant . chrom = \"chr\" + proband_variant_data . variant . chrom return [ proband_variant_data . variant . chrom , str ( proband_variant_data . variant . pos ), \".\" , proband_variant_data . variant . ref , ( f \"< { proband_variant_data . variant . 
alt } >\" if proband_variant_data . variant . ref == \"N\" else proband_variant_data . variant . alt ), \"100\" , \"PASS\" , proband_variant_data . info if proband_variant_data . info else \".\" , \"GT\" , genotype_codes [ proband_variant_data . genotype . lower ()] + \" \\n \" , ] def construct_vcf_records ( self , template_vcf_name : str ) -> List [ str ]: \"\"\" Construct updated VCF records by inserting spiked variants into the correct positions within the VCF. Args: template_vcf_name (str): Name of the template VCF file. Returns: List[str]: Updated VCF records containing the spiked variants. \"\"\" updated_vcf_records = copy ( self . vcf_contents ) for variant in self . proband_causative_variants : variant_entry = self . construct_variant_entry ( variant ) matching_indices = [ i for i , val in enumerate ( updated_vcf_records ) if val . split ( \" \\t \" )[ 0 ] == variant_entry [ 0 ] and int ( val . split ( \" \\t \" )[ 1 ]) < int ( variant_entry [ 1 ]) ] if matching_indices : variant_entry_position = matching_indices [ - 1 ] + 1 else : info_log . warning ( f \"Could not find entry position for { variant . variant . chrom } - { variant . variant . pos } -\" f \" { variant . variant . ref } - { variant . variant . alt } in { template_vcf_name } , \" \"inserting at end of VCF contents.\" ) variant_entry_position = len ( updated_vcf_records ) updated_vcf_records . insert ( variant_entry_position , \" \\t \" . join ( variant_entry )) return updated_vcf_records def construct_header ( self , updated_vcf_records : List [ str ]) -> List [ str ]: \"\"\" Construct the header of the VCF. Args: updated_vcf_records (List[str]): Updated VCF records. Returns: List[str]: Constructed header as a list of strings. \"\"\" updated_vcf_file = [] for line in updated_vcf_records : if line . startswith ( \"#\" ): text = line . replace ( self . vcf_header . sample_id , self . proband_causative_variants [ 0 ] . proband_id , ) else : text = line updated_vcf_file . 
append ( text ) return updated_vcf_file def construct_vcf ( self , template_vcf_name : str ) -> List [ str ]: \"\"\" Construct the entire spiked VCF file by incorporating the spiked variants into the VCF. Args: template_vcf_name (str): Name of the template VCF file. Returns: List[str]: The complete spiked VCF file content as a list of strings. \"\"\" return self . construct_header ( self . construct_vcf_records ( template_vcf_name )) __init__ ( vcf_contents , proband_causative_variants , vcf_header ) Initialise the VcfSpiker. Parameters: Name Type Description Default vcf_contents List [ str ] Contents of the template VCF file. required proband_causative_variants List [ ProbandCausativeVariant ] List of proband causative variants. required vcf_header VcfHeader The VCF header information. required Source code in src/pheval/prepare/create_spiked_vcf.py 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 def __init__ ( self , vcf_contents : list [ str ], proband_causative_variants : list [ ProbandCausativeVariant ], vcf_header : VcfHeader , ): \"\"\" Initialise the VcfSpiker. Args: vcf_contents (List[str]): Contents of the template VCF file. proband_causative_variants (List[ProbandCausativeVariant]): List of proband causative variants. vcf_header (VcfHeader): The VCF header information. \"\"\" self . vcf_contents = vcf_contents self . proband_causative_variants = proband_causative_variants self . vcf_header = vcf_header construct_header ( updated_vcf_records ) Construct the header of the VCF. Parameters: Name Type Description Default updated_vcf_records List [ str ] Updated VCF records. required Returns: Type Description List [ str ] List[str]: Constructed header as a list of strings. Source code in src/pheval/prepare/create_spiked_vcf.py 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 def construct_header ( self , updated_vcf_records : List [ str ]) -> List [ str ]: \"\"\" Construct the header of the VCF. 
Args: updated_vcf_records (List[str]): Updated VCF records. Returns: List[str]: Constructed header as a list of strings. \"\"\" updated_vcf_file = [] for line in updated_vcf_records : if line . startswith ( \"#\" ): text = line . replace ( self . vcf_header . sample_id , self . proband_causative_variants [ 0 ] . proband_id , ) else : text = line updated_vcf_file . append ( text ) return updated_vcf_file construct_variant_entry ( proband_variant_data ) Construct variant entries. Parameters: Name Type Description Default proband_variant_data ProbandCausativeVariant Data for the proband variant. required Returns: Type Description List [ str ] List[str]: Constructed variant entry as a list of strings. Source code in src/pheval/prepare/create_spiked_vcf.py 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 def construct_variant_entry ( self , proband_variant_data : ProbandCausativeVariant ) -> List [ str ]: \"\"\" Construct variant entries. Args: proband_variant_data (ProbandCausativeVariant): Data for the proband variant. Returns: List[str]: Constructed variant entry as a list of strings. \"\"\" genotype_codes = { \"hemizygous\" : \"0/1\" , \"homozygous\" : \"1/1\" , \"heterozygous\" : \"0/1\" , \"compound heterozygous\" : \"0/1\" , } if self . vcf_header . chr_status is True and \"chr\" not in proband_variant_data . variant . chrom : proband_variant_data . variant . chrom = \"chr\" + proband_variant_data . variant . chrom return [ proband_variant_data . variant . chrom , str ( proband_variant_data . variant . pos ), \".\" , proband_variant_data . variant . ref , ( f \"< { proband_variant_data . variant . alt } >\" if proband_variant_data . variant . ref == \"N\" else proband_variant_data . variant . alt ), \"100\" , \"PASS\" , proband_variant_data . info if proband_variant_data . info else \".\" , \"GT\" , genotype_codes [ proband_variant_data . genotype . 
lower ()] + \" \\n \" , ] construct_vcf ( template_vcf_name ) Construct the entire spiked VCF file by incorporating the spiked variants into the VCF. Parameters: Name Type Description Default template_vcf_name str Name of the template VCF file. required Returns: Type Description List [ str ] List[str]: The complete spiked VCF file content as a list of strings. Source code in src/pheval/prepare/create_spiked_vcf.py 393 394 395 396 397 398 399 400 401 402 403 def construct_vcf ( self , template_vcf_name : str ) -> List [ str ]: \"\"\" Construct the entire spiked VCF file by incorporating the spiked variants into the VCF. Args: template_vcf_name (str): Name of the template VCF file. Returns: List[str]: The complete spiked VCF file content as a list of strings. \"\"\" return self . construct_header ( self . construct_vcf_records ( template_vcf_name )) construct_vcf_records ( template_vcf_name ) Construct updated VCF records by inserting spiked variants into the correct positions within the VCF. Parameters: Name Type Description Default template_vcf_name str Name of the template VCF file. required Returns: Type Description List [ str ] List[str]: Updated VCF records containing the spiked variants. Source code in src/pheval/prepare/create_spiked_vcf.py 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 def construct_vcf_records ( self , template_vcf_name : str ) -> List [ str ]: \"\"\" Construct updated VCF records by inserting spiked variants into the correct positions within the VCF. Args: template_vcf_name (str): Name of the template VCF file. Returns: List[str]: Updated VCF records containing the spiked variants. \"\"\" updated_vcf_records = copy ( self . vcf_contents ) for variant in self . proband_causative_variants : variant_entry = self . construct_variant_entry ( variant ) matching_indices = [ i for i , val in enumerate ( updated_vcf_records ) if val . 
split ( \" \\t \" )[ 0 ] == variant_entry [ 0 ] and int ( val . split ( \" \\t \" )[ 1 ]) < int ( variant_entry [ 1 ]) ] if matching_indices : variant_entry_position = matching_indices [ - 1 ] + 1 else : info_log . warning ( f \"Could not find entry position for { variant . variant . chrom } - { variant . variant . pos } -\" f \" { variant . variant . ref } - { variant . variant . alt } in { template_vcf_name } , \" \"inserting at end of VCF contents.\" ) variant_entry_position = len ( updated_vcf_records ) updated_vcf_records . insert ( variant_entry_position , \" \\t \" . join ( variant_entry )) return updated_vcf_records VcfWriter Class for writing VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 class VcfWriter : \"\"\"Class for writing VCF file.\"\"\" def __init__ ( self , vcf_contents : List [ str ], spiked_vcf_file_path : Path , ): \"\"\" Initialise the VcfWriter class. Args: vcf_contents (List[str]): Contents of the VCF file to be written. spiked_vcf_file_path (Path): Path to the spiked VCF file to be created. \"\"\" self . vcf_contents = vcf_contents self . spiked_vcf_file_path = spiked_vcf_file_path def write_gzip ( self ) -> None : \"\"\" Write the VCF contents to a gzipped VCF file. \"\"\" encoded_contents = [ line . encode () for line in self . vcf_contents ] with gzip . open ( self . spiked_vcf_file_path , \"wb\" ) as f : for line in encoded_contents : f . write ( line ) f . close () def write_uncompressed ( self ) -> None : \"\"\" Write the VCF contents to an uncompressed VCF file. \"\"\" with open ( self . spiked_vcf_file_path , \"w\" ) as file : file . writelines ( self . vcf_contents ) file . close () def write_vcf_file ( self ) -> None : \"\"\" Write the VCF file based on compression type. 
Determines the file writing method based on the compression type of the spiked VCF file path. Writes the VCF contents to the corresponding file format (gzip or uncompressed). \"\"\" self . write_gzip () if is_gzipped ( self . spiked_vcf_file_path ) else self . write_uncompressed () __init__ ( vcf_contents , spiked_vcf_file_path ) Initialise the VcfWriter class. Parameters: Name Type Description Default vcf_contents List [ str ] Contents of the VCF file to be written. required spiked_vcf_file_path Path Path to the spiked VCF file to be created. required Source code in src/pheval/prepare/create_spiked_vcf.py 409 410 411 412 413 414 415 416 417 418 419 420 421 422 def __init__ ( self , vcf_contents : List [ str ], spiked_vcf_file_path : Path , ): \"\"\" Initialise the VcfWriter class. Args: vcf_contents (List[str]): Contents of the VCF file to be written. spiked_vcf_file_path (Path): Path to the spiked VCF file to be created. \"\"\" self . vcf_contents = vcf_contents self . spiked_vcf_file_path = spiked_vcf_file_path write_gzip () Write the VCF contents to a gzipped VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 424 425 426 427 428 429 430 431 432 def write_gzip ( self ) -> None : \"\"\" Write the VCF contents to a gzipped VCF file. \"\"\" encoded_contents = [ line . encode () for line in self . vcf_contents ] with gzip . open ( self . spiked_vcf_file_path , \"wb\" ) as f : for line in encoded_contents : f . write ( line ) f . close () write_uncompressed () Write the VCF contents to an uncompressed VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 434 435 436 437 438 439 440 def write_uncompressed ( self ) -> None : \"\"\" Write the VCF contents to an uncompressed VCF file. \"\"\" with open ( self . spiked_vcf_file_path , \"w\" ) as file : file . writelines ( self . vcf_contents ) file . close () write_vcf_file () Write the VCF file based on compression type. 
Determines the file writing method based on the compression type of the spiked VCF file path. Writes the VCF contents to the corresponding file format (gzip or uncompressed). Source code in src/pheval/prepare/create_spiked_vcf.py 442 443 444 445 446 447 448 449 def write_vcf_file ( self ) -> None : \"\"\" Write the VCF file based on compression type. Determines the file writing method based on the compression type of the spiked VCF file path. Writes the VCF contents to the corresponding file format (gzip or uncompressed). \"\"\" self . write_gzip () if is_gzipped ( self . spiked_vcf_file_path ) else self . write_uncompressed () check_variant_assembly ( proband_causative_variants , vcf_header , phenopacket_path ) Check the assembly of the variant assembly against the VCF. Parameters: Name Type Description Default proband_causative_variants List [ ProbandCausativeVariant ] A list of causative variants from the proband. required vcf_header VcfHeader An instance of VcfHeader representing the VCF file's header. required phenopacket_path Path The path to the Phenopacket file. required Raises: Type Description ValueError If there are too many or incompatible genome assemblies found. IncompatibleGenomeAssemblyError If the assembly in the Phenopacket does not match the VCF assembly. Source code in src/pheval/prepare/create_spiked_vcf.py 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 def check_variant_assembly ( proband_causative_variants : list [ ProbandCausativeVariant ], vcf_header : VcfHeader , phenopacket_path : Path , ) -> None : \"\"\" Check the assembly of the variant assembly against the VCF. Args: proband_causative_variants (List[ProbandCausativeVariant]): A list of causative variants from the proband. vcf_header (VcfHeader): An instance of VcfHeader representing the VCF file's header. phenopacket_path (Path): The path to the Phenopacket file. 
Raises: ValueError: If there are too many or incompatible genome assemblies found. IncompatibleGenomeAssemblyError: If the assembly in the Phenopacket does not match the VCF assembly. \"\"\" compatible_genome_assembly = { \"GRCh37\" , \"hg19\" , \"GRCh38\" , \"hg38\" } phenopacket_assembly = list ({ variant . assembly for variant in proband_causative_variants }) if len ( phenopacket_assembly ) > 1 : raise ValueError ( \"Too many genome assemblies!\" ) if phenopacket_assembly [ 0 ] not in compatible_genome_assembly : raise IncompatibleGenomeAssemblyError ( phenopacket_assembly , phenopacket_path ) if ( phenopacket_assembly [ 0 ] in { \"hg19\" , \"GRCh37\" } and vcf_header . assembly not in { \"hg19\" , \"GRCh37\" } ) or ( phenopacket_assembly [ 0 ] in { \"hg38\" , \"GRCh38\" } and vcf_header . assembly not in { \"hg38\" , \"GRCh38\" } ): raise IncompatibleGenomeAssemblyError ( assembly = phenopacket_assembly , phenopacket = phenopacket_path ) create_spiked_vcf ( output_dir , phenopacket_path , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir ) Create a spiked VCF for a Phenopacket. Parameters: Name Type Description Default output_dir Path The directory to store the generated spiked VCF file. required phenopacket_path Path Path to the Phenopacket file. required hg19_template_vcf Path Path to the hg19 template VCF file (optional). required hg38_template_vcf Path Path to the hg38 template VCF file (optional). required hg19_vcf_dir Path The directory containing the hg19 VCF files (optional). required hg38_vcf_dir Path The directory containing the hg38 VCF files (optional). required Raises: Type Description InputError If both hg19_template_vcf and hg38_template_vcf are None. 
Source code in src/pheval/prepare/create_spiked_vcf.py 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 def create_spiked_vcf ( output_dir : Path , phenopacket_path : Path , hg19_template_vcf : Path , hg38_template_vcf : Path , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> None : \"\"\" Create a spiked VCF for a Phenopacket. Args: output_dir (Path): The directory to store the generated spiked VCF file. phenopacket_path (Path): Path to the Phenopacket file. hg19_template_vcf (Path): Path to the hg19 template VCF file (optional). hg38_template_vcf (Path): Path to the hg38 template VCF file (optional). hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional). Raises: InputError: If both hg19_template_vcf and hg38_template_vcf are None. \"\"\" if hg19_template_vcf is None and hg38_template_vcf is None : raise InputError ( \"Either a hg19 template vcf or hg38 template vcf must be specified\" ) hg19_vcf_info = VcfFile . populate_fields ( hg19_template_vcf ) if hg19_template_vcf else None hg38_vcf_info = VcfFile . populate_fields ( hg38_template_vcf ) if hg38_template_vcf else None spike_and_update_phenopacket ( hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir , output_dir , phenopacket_path ) create_spiked_vcfs ( output_dir , phenopacket_dir , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir ) Create a spiked VCF for a directory of Phenopackets. Parameters: Name Type Description Default output_dir Path The directory to store the generated spiked VCF file. required phenopacket_dir Path Path to the Phenopacket directory. required hg19_template_vcf Path Path to the template hg19 VCF file (optional). required hg38_template_vcf Path Path to the template hg19 VCF file (optional). required hg19_vcf_dir Path The directory containing the hg19 VCF files (optional). 
required hg38_vcf_dir Path The directory containing the hg38 VCF files (optional). required Raises: Type Description InputError If both hg19_template_vcf and hg38_template_vcf are None. Source code in src/pheval/prepare/create_spiked_vcf.py 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 def create_spiked_vcfs ( output_dir : Path , phenopacket_dir : Path , hg19_template_vcf : Path , hg38_template_vcf : Path , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> None : \"\"\" Create a spiked VCF for a directory of Phenopackets. Args: output_dir (Path): The directory to store the generated spiked VCF file. phenopacket_dir (Path): Path to the Phenopacket directory. hg19_template_vcf (Path): Path to the template hg19 VCF file (optional). hg38_template_vcf (Path): Path to the template hg19 VCF file (optional). hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional). Raises: InputError: If both hg19_template_vcf and hg38_template_vcf are None. \"\"\" if ( hg19_template_vcf is None and hg38_template_vcf is None and hg19_vcf_dir is None and hg38_vcf_dir is None ): raise InputError ( \"Need to specify a VCF!\" ) hg19_vcf_info = VcfFile . populate_fields ( hg19_template_vcf ) if hg19_template_vcf else None hg38_vcf_info = VcfFile . populate_fields ( hg38_template_vcf ) if hg38_template_vcf else None for phenopacket_path in files_with_suffix ( phenopacket_dir , \".json\" ): spike_and_update_phenopacket ( hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir , output_dir , phenopacket_path ) generate_spiked_vcf_file ( output_dir , phenopacket , phenopacket_path , hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir ) Write spiked VCF contents to a new file. Parameters: Name Type Description Default output_dir Path Path to the directory to store the generated file. 
required phenopacket Union [ Phenopacket , Family ] Phenopacket or Family containing causative variants. required phenopacket_path Path Path to the Phenopacket file. required hg19_vcf_info VcfFile VCF file info for hg19 template vcf. required hg38_vcf_info VcfFile VCF file info for hg38 template vcf. required hg19_vcf_dir Path The directory containing the hg19 VCF files. required hg38_vcf_dir Path The directory containing the hg38 VCF files. required Returns: File: The generated File object representing the newly created spiked VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 def generate_spiked_vcf_file ( output_dir : Path , phenopacket : Union [ Phenopacket , Family ], phenopacket_path : Path , hg19_vcf_info : VcfFile , hg38_vcf_info : VcfFile , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> File : \"\"\" Write spiked VCF contents to a new file. Args: output_dir (Path): Path to the directory to store the generated file. phenopacket (Union[Phenopacket, Family]): Phenopacket or Family containing causative variants. phenopacket_path (Path): Path to the Phenopacket file. hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf. hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf. hg19_vcf_dir (Path): The directory containing the hg19 VCF files. hg38_vcf_dir (Path): The directory containing the hg38 VCF files. Returns: File: The generated File object representing the newly created spiked VCF file. \"\"\" output_dir . mkdir ( exist_ok = True ) info_log . info ( f \" Created a directory { output_dir } \" ) vcf_assembly , spiked_vcf = spike_vcf_contents ( phenopacket , phenopacket_path , hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir ) spiked_vcf_path = output_dir . joinpath ( phenopacket_path . name . replace ( \".json\" , \".vcf.gz\" )) VcfWriter ( spiked_vcf , spiked_vcf_path ) . 
write_vcf_file () return File ( uri = urllib . parse . unquote ( spiked_vcf_path . as_uri ()), file_attributes = { \"fileFormat\" : \"vcf\" , \"genomeAssembly\" : vcf_assembly }, ) read_vcf ( vcf_file ) Read the contents of a VCF file into memory, handling both uncompressed and gzipped files. Parameters: Name Type Description Default vcf_file Path The path to the VCF file to be read. required Returns: Type Description List [ str ] List[str]: A list containing the lines of the VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 def read_vcf ( vcf_file : Path ) -> List [ str ]: \"\"\" Read the contents of a VCF file into memory, handling both uncompressed and gzipped files. Args: vcf_file (Path): The path to the VCF file to be read. Returns: List[str]: A list containing the lines of the VCF file. \"\"\" open_fn = gzip . open if is_gzipped ( vcf_file ) else open vcf = open_fn ( vcf_file ) vcf_contents = ( [ line . decode () for line in vcf . readlines ()] if is_gzipped ( vcf_file ) else vcf . readlines () ) vcf . close () return vcf_contents select_vcf_template ( phenopacket_path , proband_causative_variants , hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir ) Select the appropriate VCF template based on the assembly information of the proband causative variants. Parameters: Name Type Description Default phenopacket_path Path The path to the Phenopacket file. required proband_causative_variants List [ ProbandCausativeVariant ] A list of causative variants from the proband. required hg19_vcf_info VcfFile VCF file info for hg19 template vcf. required hg38_vcf_info VcfFile CF file info for hg38 template vcf. required hg19_vcf_dir Path The directory containing the hg19 VCF files. required hg38_vcf_dir Path The directory containing the hg38 VCF files. 
required Returns: Name Type Description VcfFile VcfFile The selected VCF template file based on the assembly information of the proband causative variants. Source code in src/pheval/prepare/create_spiked_vcf.py 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 def select_vcf_template ( phenopacket_path : Path , proband_causative_variants : List [ ProbandCausativeVariant ], hg19_vcf_info : VcfFile , hg38_vcf_info : VcfFile , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> VcfFile : \"\"\" Select the appropriate VCF template based on the assembly information of the proband causative variants. Args: phenopacket_path (Path): The path to the Phenopacket file. proband_causative_variants (List[ProbandCausativeVariant]): A list of causative variants from the proband. hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf. hg38_vcf_info (VcfFile): CF file info for hg38 template vcf. hg19_vcf_dir (Path): The directory containing the hg19 VCF files. hg38_vcf_dir (Path): The directory containing the hg38 VCF files. Returns: VcfFile: The selected VCF template file based on the assembly information of the proband causative variants. \"\"\" if proband_causative_variants [ 0 ] . assembly in [ \"hg19\" , \"GRCh37\" ]: if hg19_vcf_info : return hg19_vcf_info elif hg19_vcf_dir : return VcfFile . populate_fields ( random . choice ( all_files ( hg19_vcf_dir ))) else : raise InputError ( \"Must specify hg19 template VCF!\" ) elif proband_causative_variants [ 0 ] . assembly in [ \"hg38\" , \"GRCh38\" ]: if hg38_vcf_info : return hg38_vcf_info elif hg38_vcf_dir : return VcfFile . populate_fields ( random . choice ( all_files ( hg38_vcf_dir ))) else : raise InputError ( \"Must specify hg38 template VCF!\" ) else : raise IncompatibleGenomeAssemblyError ( proband_causative_variants [ 0 ] . 
assembly , phenopacket_path ) spike_and_update_phenopacket ( hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir , output_dir , phenopacket_path ) Spike the VCF files with genetic variants relevant to the provided Phenopacket, update the Phenopacket accordingly, and write the updated Phenopacket to the specified output directory. Parameters: Name Type Description Default hg19_vcf_info VcfFile VCF file info for hg19 template vcf. required hg38_vcf_info VcfFile VCF file info for hg38 template vcf. required hg19_vcf_dir Path The directory containing the hg19 VCF files. required hg38_vcf_dir Path The directory containing the hg38 VCF files. required output_dir Path Directory where the updated Phenopacket will be saved. required phenopacket_path Path Path to the original Phenopacket file. required Returns: Type Description None None Source code in src/pheval/prepare/create_spiked_vcf.py 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 def spike_and_update_phenopacket ( hg19_vcf_info : VcfFile , hg38_vcf_info : VcfFile , hg19_vcf_dir : Path , hg38_vcf_dir : Path , output_dir : Path , phenopacket_path : Path , ) -> None : \"\"\" Spike the VCF files with genetic variants relevant to the provided Phenopacket, update the Phenopacket accordingly, and write the updated Phenopacket to the specified output directory. Args: hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf. hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf. hg19_vcf_dir (Path): The directory containing the hg19 VCF files. hg38_vcf_dir (Path): The directory containing the hg38 VCF files. output_dir (Path): Directory where the updated Phenopacket will be saved. phenopacket_path (Path): Path to the original Phenopacket file. 
Returns: None \"\"\" phenopacket = phenopacket_reader ( phenopacket_path ) spiked_vcf_file_message = generate_spiked_vcf_file ( output_dir , phenopacket , phenopacket_path , hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir , ) updated_phenopacket = PhenopacketRebuilder ( phenopacket ) . add_spiked_vcf_path ( spiked_vcf_file_message ) write_phenopacket ( updated_phenopacket , phenopacket_path ) spike_vcf_contents ( phenopacket , phenopacket_path , hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir ) Spike VCF records with variants obtained from a Phenopacket or Family. Parameters: Name Type Description Default phenopacket Union [ Phenopacket , Family ] Phenopacket or Family containing causative variants. required phenopacket_path Path Path to the Phenopacket file. required hg19_vcf_info VcfFile VCF file info for hg19 template vcf. required hg38_vcf_info VcfFile VCF file info for hg38 template vcf. required hg19_vcf_dir Path The directory containing the hg19 VCF files. required hg38_vcf_dir Path The directory containing the hg38 VCF files. required Returns: Type Description tuple [ str , List [ str ]] A tuple containing: assembly (str): The genome assembly information extracted from VCF header. modified_vcf_contents (List[str]): Modified VCF records with spiked variants. Source code in src/pheval/prepare/create_spiked_vcf.py 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 def spike_vcf_contents ( phenopacket : Union [ Phenopacket , Family ], phenopacket_path : Path , hg19_vcf_info : VcfFile , hg38_vcf_info : VcfFile , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> tuple [ str , List [ str ]]: \"\"\" Spike VCF records with variants obtained from a Phenopacket or Family. Args: phenopacket (Union[Phenopacket, Family]): Phenopacket or Family containing causative variants. 
phenopacket_path (Path): Path to the Phenopacket file. hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf. hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf. hg19_vcf_dir (Path): The directory containing the hg19 VCF files. hg38_vcf_dir (Path): The directory containing the hg38 VCF files. Returns: A tuple containing: assembly (str): The genome assembly information extracted from VCF header. modified_vcf_contents (List[str]): Modified VCF records with spiked variants. \"\"\" phenopacket_causative_variants = PhenopacketUtil ( phenopacket ) . causative_variants () chosen_template_vcf = select_vcf_template ( phenopacket_path , phenopacket_causative_variants , hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir , ) check_variant_assembly ( phenopacket_causative_variants , chosen_template_vcf . vcf_header , phenopacket_path ) return ( chosen_template_vcf . vcf_header . assembly , VcfSpiker ( chosen_template_vcf . vcf_contents , phenopacket_causative_variants , chosen_template_vcf . vcf_header , ) . construct_vcf ( chosen_template_vcf . vcf_file_name ), ) spike_vcfs ( output_dir , phenopacket_path , phenopacket_dir , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir ) Create spiked VCF from either a Phenopacket or a Phenopacket directory. Parameters: Name Type Description Default output_dir Path The directory to store the generated spiked VCF file(s). required phenopacket_path Path Path to a single Phenopacket file (optional). required phenopacket_dir Path Path to a directory containing Phenopacket files (optional). required hg19_template_vcf Path Path to the hg19 template VCF file (optional). required hg38_template_vcf Path Path to the hg38 template VCF file (optional). required hg19_vcf_dir Path The directory containing the hg19 VCF files (optional). required hg38_vcf_dir Path The directory containing the hg38 VCF files (optional). 
required Source code in src/pheval/prepare/create_spiked_vcf.py 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 def spike_vcfs ( output_dir : Path , phenopacket_path : Path , phenopacket_dir : Path , hg19_template_vcf : Path , hg38_template_vcf : Path , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> None : \"\"\" Create spiked VCF from either a Phenopacket or a Phenopacket directory. Args: output_dir (Path): The directory to store the generated spiked VCF file(s). phenopacket_path (Path): Path to a single Phenopacket file (optional). phenopacket_dir (Path): Path to a directory containing Phenopacket files (optional). hg19_template_vcf (Path): Path to the hg19 template VCF file (optional). hg38_template_vcf (Path): Path to the hg38 template VCF file (optional). hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional). \"\"\" if phenopacket_path is not None : create_spiked_vcf ( output_dir , phenopacket_path , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir , ) elif phenopacket_dir is not None : create_spiked_vcfs ( output_dir , phenopacket_dir , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir , )","title":"Create spiked vcf"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfFile","text":"Represents a VCF file with its name, contents, and header information. Attributes: Name Type Description vcf_file_name str The name of the VCF file. vcf_contents List [ str ] The contents of the VCF file. vcf_header VcfHeader The parsed header information of the VCF file. 
Source code in src/pheval/prepare/create_spiked_vcf.py 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 @dataclass class VcfFile : \"\"\" Represents a VCF file with its name, contents, and header information. Attributes: vcf_file_name (str): The name of the VCF file. vcf_contents (List[str]): The contents of the VCF file. vcf_header (VcfHeader): The parsed header information of the VCF file. \"\"\" vcf_file_name : str = None vcf_contents : List [ str ] = None vcf_header : VcfHeader = None @staticmethod def populate_fields ( template_vcf : Path ): \"\"\" Populate the fields of the VcfFile instance using the contents of a template VCF file. Args: template_vcf (Path): The path to the template VCF file. Returns: VcfFile: An instance of VcfFile with populated fields. \"\"\" contents = read_vcf ( template_vcf ) return VcfFile ( template_vcf . name , contents , VcfHeaderParser ( contents ) . parse_vcf_header ())","title":"VcfFile"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfFile.populate_fields","text":"Populate the fields of the VcfFile instance using the contents of a template VCF file. Parameters: Name Type Description Default template_vcf Path The path to the template VCF file. required Returns: Name Type Description VcfFile An instance of VcfFile with populated fields. Source code in src/pheval/prepare/create_spiked_vcf.py 190 191 192 193 194 195 196 197 198 199 200 201 202 203 @staticmethod def populate_fields ( template_vcf : Path ): \"\"\" Populate the fields of the VcfFile instance using the contents of a template VCF file. Args: template_vcf (Path): The path to the template VCF file. Returns: VcfFile: An instance of VcfFile with populated fields. \"\"\" contents = read_vcf ( template_vcf ) return VcfFile ( template_vcf . name , contents , VcfHeaderParser ( contents ) . 
parse_vcf_header ())","title":"populate_fields"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfHeader","text":"Data obtained from VCF header. Parameters: Name Type Description Default sample_id str The sample identifier from the VCF header. required assembly str The assembly information obtained from the VCF header. required chr_status bool A boolean indicating whether the VCF denotes chromosomes as chr or not. required Source code in src/pheval/prepare/create_spiked_vcf.py 78 79 80 81 82 83 84 85 86 87 88 89 90 @dataclass class VcfHeader : \"\"\"Data obtained from VCF header. Args: sample_id (str): The sample identifier from the VCF header. assembly (str): The assembly information obtained from the VCF header. chr_status (bool): A boolean indicating whether the VCF denotes chromosomes as chr or not. \"\"\" sample_id : str assembly : str chr_status : bool","title":"VcfHeader"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfHeaderParser","text":"Class for parsing the header of a VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 class VcfHeaderParser : \"\"\"Class for parsing the header of a VCF file.\"\"\" def __init__ ( self , vcf_contents : list [ str ]): \"\"\" Initialise the VcfHeaderParser. Args: vcf_contents (list[str]): The contents of the VCF file as a list of strings. \"\"\" self . vcf_contents = vcf_contents def parse_assembly ( self ) -> tuple [ str , bool ]: \"\"\" Parse the genome assembly and format of vcf_records. Returns: Tuple[str, bool]: A tuple containing the assembly and chromosome status (True/False). \"\"\" vcf_assembly = {} chr_status = False for line in self . vcf_contents : if line . 
startswith ( \"##contig= str : \"\"\" Parse the sample ID of the VCF. Returns: str: The sample ID extracted from the VCF header. \"\"\" for line in self . vcf_contents : if line . startswith ( \"#CHROM\" ): return line . split ( \" \\t \" )[ 9 ] . rstrip () def parse_vcf_header ( self ) -> VcfHeader : \"\"\" Parse the header of the VCF. Returns: VcfHeader: An instance of VcfHeader containing sample ID, assembly, and chromosome status. \"\"\" assembly , chr_status = self . parse_assembly () sample_id = self . parse_sample_id () return VcfHeader ( sample_id , assembly , chr_status )","title":"VcfHeaderParser"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfHeaderParser.__init__","text":"Initialise the VcfHeaderParser. Parameters: Name Type Description Default vcf_contents list [ str ] The contents of the VCF file as a list of strings. required Source code in src/pheval/prepare/create_spiked_vcf.py 115 116 117 118 119 120 121 122 def __init__ ( self , vcf_contents : list [ str ]): \"\"\" Initialise the VcfHeaderParser. Args: vcf_contents (list[str]): The contents of the VCF file as a list of strings. \"\"\" self . vcf_contents = vcf_contents","title":"__init__"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfHeaderParser.parse_assembly","text":"Parse the genome assembly and format of vcf_records. Returns: Type Description tuple [ str , bool ] Tuple[str, bool]: A tuple containing the assembly and chromosome status (True/False). Source code in src/pheval/prepare/create_spiked_vcf.py 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 def parse_assembly ( self ) -> tuple [ str , bool ]: \"\"\" Parse the genome assembly and format of vcf_records. Returns: Tuple[str, bool]: A tuple containing the assembly and chromosome status (True/False). \"\"\" vcf_assembly = {} chr_status = False for line in self . vcf_contents : if line . 
startswith ( \"##contig= str : \"\"\" Parse the sample ID of the VCF. Returns: str: The sample ID extracted from the VCF header. \"\"\" for line in self . vcf_contents : if line . startswith ( \"#CHROM\" ): return line . split ( \" \\t \" )[ 9 ] . rstrip ()","title":"parse_sample_id"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfHeaderParser.parse_vcf_header","text":"Parse the header of the VCF. Returns: Name Type Description VcfHeader VcfHeader An instance of VcfHeader containing sample ID, assembly, and chromosome status. Source code in src/pheval/prepare/create_spiked_vcf.py 163 164 165 166 167 168 169 170 171 172 def parse_vcf_header ( self ) -> VcfHeader : \"\"\" Parse the header of the VCF. Returns: VcfHeader: An instance of VcfHeader containing sample ID, assembly, and chromosome status. \"\"\" assembly , chr_status = self . parse_assembly () sample_id = self . parse_sample_id () return VcfHeader ( sample_id , assembly , chr_status )","title":"parse_vcf_header"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfSpiker","text":"Class for spiking proband variants into template VCF file contents. 
Source code in src/pheval/prepare/create_spiked_vcf.py 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 class VcfSpiker : \"\"\"Class for spiking proband variants into template VCF file contents.\"\"\" def __init__ ( self , vcf_contents : list [ str ], proband_causative_variants : list [ ProbandCausativeVariant ], vcf_header : VcfHeader , ): \"\"\" Initialise the VcfSpiker. Args: vcf_contents (List[str]): Contents of the template VCF file. proband_causative_variants (List[ProbandCausativeVariant]): List of proband causative variants. vcf_header (VcfHeader): The VCF header information. \"\"\" self . vcf_contents = vcf_contents self . proband_causative_variants = proband_causative_variants self . vcf_header = vcf_header def construct_variant_entry ( self , proband_variant_data : ProbandCausativeVariant ) -> List [ str ]: \"\"\" Construct variant entries. Args: proband_variant_data (ProbandCausativeVariant): Data for the proband variant. Returns: List[str]: Constructed variant entry as a list of strings. \"\"\" genotype_codes = { \"hemizygous\" : \"0/1\" , \"homozygous\" : \"1/1\" , \"heterozygous\" : \"0/1\" , \"compound heterozygous\" : \"0/1\" , } if self . vcf_header . chr_status is True and \"chr\" not in proband_variant_data . variant . chrom : proband_variant_data . variant . chrom = \"chr\" + proband_variant_data . variant . chrom return [ proband_variant_data . variant . chrom , str ( proband_variant_data . variant . pos ), \".\" , proband_variant_data . variant . ref , ( f \"< { proband_variant_data . variant . 
alt } >\" if proband_variant_data . variant . ref == \"N\" else proband_variant_data . variant . alt ), \"100\" , \"PASS\" , proband_variant_data . info if proband_variant_data . info else \".\" , \"GT\" , genotype_codes [ proband_variant_data . genotype . lower ()] + \" \\n \" , ] def construct_vcf_records ( self , template_vcf_name : str ) -> List [ str ]: \"\"\" Construct updated VCF records by inserting spiked variants into the correct positions within the VCF. Args: template_vcf_name (str): Name of the template VCF file. Returns: List[str]: Updated VCF records containing the spiked variants. \"\"\" updated_vcf_records = copy ( self . vcf_contents ) for variant in self . proband_causative_variants : variant_entry = self . construct_variant_entry ( variant ) matching_indices = [ i for i , val in enumerate ( updated_vcf_records ) if val . split ( \" \\t \" )[ 0 ] == variant_entry [ 0 ] and int ( val . split ( \" \\t \" )[ 1 ]) < int ( variant_entry [ 1 ]) ] if matching_indices : variant_entry_position = matching_indices [ - 1 ] + 1 else : info_log . warning ( f \"Could not find entry position for { variant . variant . chrom } - { variant . variant . pos } -\" f \" { variant . variant . ref } - { variant . variant . alt } in { template_vcf_name } , \" \"inserting at end of VCF contents.\" ) variant_entry_position = len ( updated_vcf_records ) updated_vcf_records . insert ( variant_entry_position , \" \\t \" . join ( variant_entry )) return updated_vcf_records def construct_header ( self , updated_vcf_records : List [ str ]) -> List [ str ]: \"\"\" Construct the header of the VCF. Args: updated_vcf_records (List[str]): Updated VCF records. Returns: List[str]: Constructed header as a list of strings. \"\"\" updated_vcf_file = [] for line in updated_vcf_records : if line . startswith ( \"#\" ): text = line . replace ( self . vcf_header . sample_id , self . proband_causative_variants [ 0 ] . proband_id , ) else : text = line updated_vcf_file . 
append ( text ) return updated_vcf_file def construct_vcf ( self , template_vcf_name : str ) -> List [ str ]: \"\"\" Construct the entire spiked VCF file by incorporating the spiked variants into the VCF. Args: template_vcf_name (str): Name of the template VCF file. Returns: List[str]: The complete spiked VCF file content as a list of strings. \"\"\" return self . construct_header ( self . construct_vcf_records ( template_vcf_name ))","title":"VcfSpiker"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfSpiker.__init__","text":"Initialise the VcfSpiker. Parameters: Name Type Description Default vcf_contents List [ str ] Contents of the template VCF file. required proband_causative_variants List [ ProbandCausativeVariant ] List of proband causative variants. required vcf_header VcfHeader The VCF header information. required Source code in src/pheval/prepare/create_spiked_vcf.py 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 def __init__ ( self , vcf_contents : list [ str ], proband_causative_variants : list [ ProbandCausativeVariant ], vcf_header : VcfHeader , ): \"\"\" Initialise the VcfSpiker. Args: vcf_contents (List[str]): Contents of the template VCF file. proband_causative_variants (List[ProbandCausativeVariant]): List of proband causative variants. vcf_header (VcfHeader): The VCF header information. \"\"\" self . vcf_contents = vcf_contents self . proband_causative_variants = proband_causative_variants self . vcf_header = vcf_header","title":"__init__"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfSpiker.construct_header","text":"Construct the header of the VCF. Parameters: Name Type Description Default updated_vcf_records List [ str ] Updated VCF records. required Returns: Type Description List [ str ] List[str]: Constructed header as a list of strings. 
Source code in src/pheval/prepare/create_spiked_vcf.py 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 def construct_header ( self , updated_vcf_records : List [ str ]) -> List [ str ]: \"\"\" Construct the header of the VCF. Args: updated_vcf_records (List[str]): Updated VCF records. Returns: List[str]: Constructed header as a list of strings. \"\"\" updated_vcf_file = [] for line in updated_vcf_records : if line . startswith ( \"#\" ): text = line . replace ( self . vcf_header . sample_id , self . proband_causative_variants [ 0 ] . proband_id , ) else : text = line updated_vcf_file . append ( text ) return updated_vcf_file","title":"construct_header"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfSpiker.construct_variant_entry","text":"Construct variant entries. Parameters: Name Type Description Default proband_variant_data ProbandCausativeVariant Data for the proband variant. required Returns: Type Description List [ str ] List[str]: Constructed variant entry as a list of strings. Source code in src/pheval/prepare/create_spiked_vcf.py 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 def construct_variant_entry ( self , proband_variant_data : ProbandCausativeVariant ) -> List [ str ]: \"\"\" Construct variant entries. Args: proband_variant_data (ProbandCausativeVariant): Data for the proband variant. Returns: List[str]: Constructed variant entry as a list of strings. \"\"\" genotype_codes = { \"hemizygous\" : \"0/1\" , \"homozygous\" : \"1/1\" , \"heterozygous\" : \"0/1\" , \"compound heterozygous\" : \"0/1\" , } if self . vcf_header . chr_status is True and \"chr\" not in proband_variant_data . variant . chrom : proband_variant_data . variant . chrom = \"chr\" + proband_variant_data . variant . chrom return [ proband_variant_data . variant . chrom , str ( proband_variant_data . variant . 
pos ), \".\" , proband_variant_data . variant . ref , ( f \"< { proband_variant_data . variant . alt } >\" if proband_variant_data . variant . ref == \"N\" else proband_variant_data . variant . alt ), \"100\" , \"PASS\" , proband_variant_data . info if proband_variant_data . info else \".\" , \"GT\" , genotype_codes [ proband_variant_data . genotype . lower ()] + \" \\n \" , ]","title":"construct_variant_entry"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfSpiker.construct_vcf","text":"Construct the entire spiked VCF file by incorporating the spiked variants into the VCF. Parameters: Name Type Description Default template_vcf_name str Name of the template VCF file. required Returns: Type Description List [ str ] List[str]: The complete spiked VCF file content as a list of strings. Source code in src/pheval/prepare/create_spiked_vcf.py 393 394 395 396 397 398 399 400 401 402 403 def construct_vcf ( self , template_vcf_name : str ) -> List [ str ]: \"\"\" Construct the entire spiked VCF file by incorporating the spiked variants into the VCF. Args: template_vcf_name (str): Name of the template VCF file. Returns: List[str]: The complete spiked VCF file content as a list of strings. \"\"\" return self . construct_header ( self . construct_vcf_records ( template_vcf_name ))","title":"construct_vcf"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfSpiker.construct_vcf_records","text":"Construct updated VCF records by inserting spiked variants into the correct positions within the VCF. Parameters: Name Type Description Default template_vcf_name str Name of the template VCF file. required Returns: Type Description List [ str ] List[str]: Updated VCF records containing the spiked variants. 
Source code in src/pheval/prepare/create_spiked_vcf.py 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 def construct_vcf_records ( self , template_vcf_name : str ) -> List [ str ]: \"\"\" Construct updated VCF records by inserting spiked variants into the correct positions within the VCF. Args: template_vcf_name (str): Name of the template VCF file. Returns: List[str]: Updated VCF records containing the spiked variants. \"\"\" updated_vcf_records = copy ( self . vcf_contents ) for variant in self . proband_causative_variants : variant_entry = self . construct_variant_entry ( variant ) matching_indices = [ i for i , val in enumerate ( updated_vcf_records ) if val . split ( \" \\t \" )[ 0 ] == variant_entry [ 0 ] and int ( val . split ( \" \\t \" )[ 1 ]) < int ( variant_entry [ 1 ]) ] if matching_indices : variant_entry_position = matching_indices [ - 1 ] + 1 else : info_log . warning ( f \"Could not find entry position for { variant . variant . chrom } - { variant . variant . pos } -\" f \" { variant . variant . ref } - { variant . variant . alt } in { template_vcf_name } , \" \"inserting at end of VCF contents.\" ) variant_entry_position = len ( updated_vcf_records ) updated_vcf_records . insert ( variant_entry_position , \" \\t \" . join ( variant_entry )) return updated_vcf_records","title":"construct_vcf_records"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfWriter","text":"Class for writing VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 class VcfWriter : \"\"\"Class for writing VCF file.\"\"\" def __init__ ( self , vcf_contents : List [ str ], spiked_vcf_file_path : Path , ): \"\"\" Initialise the VcfWriter class. 
Args: vcf_contents (List[str]): Contents of the VCF file to be written. spiked_vcf_file_path (Path): Path to the spiked VCF file to be created. \"\"\" self . vcf_contents = vcf_contents self . spiked_vcf_file_path = spiked_vcf_file_path def write_gzip ( self ) -> None : \"\"\" Write the VCF contents to a gzipped VCF file. \"\"\" encoded_contents = [ line . encode () for line in self . vcf_contents ] with gzip . open ( self . spiked_vcf_file_path , \"wb\" ) as f : for line in encoded_contents : f . write ( line ) f . close () def write_uncompressed ( self ) -> None : \"\"\" Write the VCF contents to an uncompressed VCF file. \"\"\" with open ( self . spiked_vcf_file_path , \"w\" ) as file : file . writelines ( self . vcf_contents ) file . close () def write_vcf_file ( self ) -> None : \"\"\" Write the VCF file based on compression type. Determines the file writing method based on the compression type of the spiked VCF file path. Writes the VCF contents to the corresponding file format (gzip or uncompressed). \"\"\" self . write_gzip () if is_gzipped ( self . spiked_vcf_file_path ) else self . write_uncompressed ()","title":"VcfWriter"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfWriter.__init__","text":"Initialise the VcfWriter class. Parameters: Name Type Description Default vcf_contents List [ str ] Contents of the VCF file to be written. required spiked_vcf_file_path Path Path to the spiked VCF file to be created. required Source code in src/pheval/prepare/create_spiked_vcf.py 409 410 411 412 413 414 415 416 417 418 419 420 421 422 def __init__ ( self , vcf_contents : List [ str ], spiked_vcf_file_path : Path , ): \"\"\" Initialise the VcfWriter class. Args: vcf_contents (List[str]): Contents of the VCF file to be written. spiked_vcf_file_path (Path): Path to the spiked VCF file to be created. \"\"\" self . vcf_contents = vcf_contents self . 
spiked_vcf_file_path = spiked_vcf_file_path","title":"__init__"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfWriter.write_gzip","text":"Write the VCF contents to a gzipped VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 424 425 426 427 428 429 430 431 432 def write_gzip ( self ) -> None : \"\"\" Write the VCF contents to a gzipped VCF file. \"\"\" encoded_contents = [ line . encode () for line in self . vcf_contents ] with gzip . open ( self . spiked_vcf_file_path , \"wb\" ) as f : for line in encoded_contents : f . write ( line ) f . close ()","title":"write_gzip"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfWriter.write_uncompressed","text":"Write the VCF contents to an uncompressed VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 434 435 436 437 438 439 440 def write_uncompressed ( self ) -> None : \"\"\" Write the VCF contents to an uncompressed VCF file. \"\"\" with open ( self . spiked_vcf_file_path , \"w\" ) as file : file . writelines ( self . vcf_contents ) file . close ()","title":"write_uncompressed"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.VcfWriter.write_vcf_file","text":"Write the VCF file based on compression type. Determines the file writing method based on the compression type of the spiked VCF file path. Writes the VCF contents to the corresponding file format (gzip or uncompressed). Source code in src/pheval/prepare/create_spiked_vcf.py 442 443 444 445 446 447 448 449 def write_vcf_file ( self ) -> None : \"\"\" Write the VCF file based on compression type. Determines the file writing method based on the compression type of the spiked VCF file path. Writes the VCF contents to the corresponding file format (gzip or uncompressed). \"\"\" self . write_gzip () if is_gzipped ( self . spiked_vcf_file_path ) else self . 
write_uncompressed ()","title":"write_vcf_file"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.check_variant_assembly","text":"Check the assembly of the variant assembly against the VCF. Parameters: Name Type Description Default proband_causative_variants List [ ProbandCausativeVariant ] A list of causative variants from the proband. required vcf_header VcfHeader An instance of VcfHeader representing the VCF file's header. required phenopacket_path Path The path to the Phenopacket file. required Raises: Type Description ValueError If there are too many or incompatible genome assemblies found. IncompatibleGenomeAssemblyError If the assembly in the Phenopacket does not match the VCF assembly. Source code in src/pheval/prepare/create_spiked_vcf.py 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 def check_variant_assembly ( proband_causative_variants : list [ ProbandCausativeVariant ], vcf_header : VcfHeader , phenopacket_path : Path , ) -> None : \"\"\" Check the assembly of the variant assembly against the VCF. Args: proband_causative_variants (List[ProbandCausativeVariant]): A list of causative variants from the proband. vcf_header (VcfHeader): An instance of VcfHeader representing the VCF file's header. phenopacket_path (Path): The path to the Phenopacket file. Raises: ValueError: If there are too many or incompatible genome assemblies found. IncompatibleGenomeAssemblyError: If the assembly in the Phenopacket does not match the VCF assembly. \"\"\" compatible_genome_assembly = { \"GRCh37\" , \"hg19\" , \"GRCh38\" , \"hg38\" } phenopacket_assembly = list ({ variant . 
assembly for variant in proband_causative_variants }) if len ( phenopacket_assembly ) > 1 : raise ValueError ( \"Too many genome assemblies!\" ) if phenopacket_assembly [ 0 ] not in compatible_genome_assembly : raise IncompatibleGenomeAssemblyError ( phenopacket_assembly , phenopacket_path ) if ( phenopacket_assembly [ 0 ] in { \"hg19\" , \"GRCh37\" } and vcf_header . assembly not in { \"hg19\" , \"GRCh37\" } ) or ( phenopacket_assembly [ 0 ] in { \"hg38\" , \"GRCh38\" } and vcf_header . assembly not in { \"hg38\" , \"GRCh38\" } ): raise IncompatibleGenomeAssemblyError ( assembly = phenopacket_assembly , phenopacket = phenopacket_path )","title":"check_variant_assembly"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.create_spiked_vcf","text":"Create a spiked VCF for a Phenopacket. Parameters: Name Type Description Default output_dir Path The directory to store the generated spiked VCF file. required phenopacket_path Path Path to the Phenopacket file. required hg19_template_vcf Path Path to the hg19 template VCF file (optional). required hg38_template_vcf Path Path to the hg38 template VCF file (optional). required hg19_vcf_dir Path The directory containing the hg19 VCF files (optional). required hg38_vcf_dir Path The directory containing the hg38 VCF files (optional). required Raises: Type Description InputError If both hg19_template_vcf and hg38_template_vcf are None. Source code in src/pheval/prepare/create_spiked_vcf.py 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 def create_spiked_vcf ( output_dir : Path , phenopacket_path : Path , hg19_template_vcf : Path , hg38_template_vcf : Path , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> None : \"\"\" Create a spiked VCF for a Phenopacket. Args: output_dir (Path): The directory to store the generated spiked VCF file. phenopacket_path (Path): Path to the Phenopacket file. 
hg19_template_vcf (Path): Path to the hg19 template VCF file (optional). hg38_template_vcf (Path): Path to the hg38 template VCF file (optional). hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional). Raises: InputError: If both hg19_template_vcf and hg38_template_vcf are None. \"\"\" if hg19_template_vcf is None and hg38_template_vcf is None : raise InputError ( \"Either a hg19 template vcf or hg38 template vcf must be specified\" ) hg19_vcf_info = VcfFile . populate_fields ( hg19_template_vcf ) if hg19_template_vcf else None hg38_vcf_info = VcfFile . populate_fields ( hg38_template_vcf ) if hg38_template_vcf else None spike_and_update_phenopacket ( hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir , output_dir , phenopacket_path )","title":"create_spiked_vcf"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.create_spiked_vcfs","text":"Create a spiked VCF for a directory of Phenopackets. Parameters: Name Type Description Default output_dir Path The directory to store the generated spiked VCF file. required phenopacket_dir Path Path to the Phenopacket directory. required hg19_template_vcf Path Path to the template hg19 VCF file (optional). required hg38_template_vcf Path Path to the template hg19 VCF file (optional). required hg19_vcf_dir Path The directory containing the hg19 VCF files (optional). required hg38_vcf_dir Path The directory containing the hg38 VCF files (optional). required Raises: Type Description InputError If both hg19_template_vcf and hg38_template_vcf are None. 
Source code in src/pheval/prepare/create_spiked_vcf.py 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 def create_spiked_vcfs ( output_dir : Path , phenopacket_dir : Path , hg19_template_vcf : Path , hg38_template_vcf : Path , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> None : \"\"\" Create a spiked VCF for a directory of Phenopackets. Args: output_dir (Path): The directory to store the generated spiked VCF file. phenopacket_dir (Path): Path to the Phenopacket directory. hg19_template_vcf (Path): Path to the template hg19 VCF file (optional). hg38_template_vcf (Path): Path to the template hg19 VCF file (optional). hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional). Raises: InputError: If both hg19_template_vcf and hg38_template_vcf are None. \"\"\" if ( hg19_template_vcf is None and hg38_template_vcf is None and hg19_vcf_dir is None and hg38_vcf_dir is None ): raise InputError ( \"Need to specify a VCF!\" ) hg19_vcf_info = VcfFile . populate_fields ( hg19_template_vcf ) if hg19_template_vcf else None hg38_vcf_info = VcfFile . populate_fields ( hg38_template_vcf ) if hg38_template_vcf else None for phenopacket_path in files_with_suffix ( phenopacket_dir , \".json\" ): spike_and_update_phenopacket ( hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir , output_dir , phenopacket_path )","title":"create_spiked_vcfs"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.generate_spiked_vcf_file","text":"Write spiked VCF contents to a new file. Parameters: Name Type Description Default output_dir Path Path to the directory to store the generated file. required phenopacket Union [ Phenopacket , Family ] Phenopacket or Family containing causative variants. required phenopacket_path Path Path to the Phenopacket file. 
required hg19_vcf_info VcfFile VCF file info for hg19 template vcf. required hg38_vcf_info VcfFile VCF file info for hg38 template vcf. required hg19_vcf_dir Path The directory containing the hg19 VCF files. required hg38_vcf_dir Path The directory containing the hg38 VCF files. required Returns: File: The generated File object representing the newly created spiked VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 def generate_spiked_vcf_file ( output_dir : Path , phenopacket : Union [ Phenopacket , Family ], phenopacket_path : Path , hg19_vcf_info : VcfFile , hg38_vcf_info : VcfFile , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> File : \"\"\" Write spiked VCF contents to a new file. Args: output_dir (Path): Path to the directory to store the generated file. phenopacket (Union[Phenopacket, Family]): Phenopacket or Family containing causative variants. phenopacket_path (Path): Path to the Phenopacket file. hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf. hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf. hg19_vcf_dir (Path): The directory containing the hg19 VCF files. hg38_vcf_dir (Path): The directory containing the hg38 VCF files. Returns: File: The generated File object representing the newly created spiked VCF file. \"\"\" output_dir . mkdir ( exist_ok = True ) info_log . info ( f \" Created a directory { output_dir } \" ) vcf_assembly , spiked_vcf = spike_vcf_contents ( phenopacket , phenopacket_path , hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir ) spiked_vcf_path = output_dir . joinpath ( phenopacket_path . name . replace ( \".json\" , \".vcf.gz\" )) VcfWriter ( spiked_vcf , spiked_vcf_path ) . write_vcf_file () return File ( uri = urllib . parse . unquote ( spiked_vcf_path . 
as_uri ()), file_attributes = { \"fileFormat\" : \"vcf\" , \"genomeAssembly\" : vcf_assembly }, )","title":"generate_spiked_vcf_file"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.read_vcf","text":"Read the contents of a VCF file into memory, handling both uncompressed and gzipped files. Parameters: Name Type Description Default vcf_file Path The path to the VCF file to be read. required Returns: Type Description List [ str ] List[str]: A list containing the lines of the VCF file. Source code in src/pheval/prepare/create_spiked_vcf.py 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 def read_vcf ( vcf_file : Path ) -> List [ str ]: \"\"\" Read the contents of a VCF file into memory, handling both uncompressed and gzipped files. Args: vcf_file (Path): The path to the VCF file to be read. Returns: List[str]: A list containing the lines of the VCF file. \"\"\" open_fn = gzip . open if is_gzipped ( vcf_file ) else open vcf = open_fn ( vcf_file ) vcf_contents = ( [ line . decode () for line in vcf . readlines ()] if is_gzipped ( vcf_file ) else vcf . readlines () ) vcf . close () return vcf_contents","title":"read_vcf"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.select_vcf_template","text":"Select the appropriate VCF template based on the assembly information of the proband causative variants. Parameters: Name Type Description Default phenopacket_path Path The path to the Phenopacket file. required proband_causative_variants List [ ProbandCausativeVariant ] A list of causative variants from the proband. required hg19_vcf_info VcfFile VCF file info for hg19 template vcf. required hg38_vcf_info VcfFile CF file info for hg38 template vcf. required hg19_vcf_dir Path The directory containing the hg19 VCF files. required hg38_vcf_dir Path The directory containing the hg38 VCF files. 
required Returns: Name Type Description VcfFile VcfFile The selected VCF template file based on the assembly information of the proband causative variants. Source code in src/pheval/prepare/create_spiked_vcf.py 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 def select_vcf_template ( phenopacket_path : Path , proband_causative_variants : List [ ProbandCausativeVariant ], hg19_vcf_info : VcfFile , hg38_vcf_info : VcfFile , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> VcfFile : \"\"\" Select the appropriate VCF template based on the assembly information of the proband causative variants. Args: phenopacket_path (Path): The path to the Phenopacket file. proband_causative_variants (List[ProbandCausativeVariant]): A list of causative variants from the proband. hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf. hg38_vcf_info (VcfFile): CF file info for hg38 template vcf. hg19_vcf_dir (Path): The directory containing the hg19 VCF files. hg38_vcf_dir (Path): The directory containing the hg38 VCF files. Returns: VcfFile: The selected VCF template file based on the assembly information of the proband causative variants. \"\"\" if proband_causative_variants [ 0 ] . assembly in [ \"hg19\" , \"GRCh37\" ]: if hg19_vcf_info : return hg19_vcf_info elif hg19_vcf_dir : return VcfFile . populate_fields ( random . choice ( all_files ( hg19_vcf_dir ))) else : raise InputError ( \"Must specify hg19 template VCF!\" ) elif proband_causative_variants [ 0 ] . assembly in [ \"hg38\" , \"GRCh38\" ]: if hg38_vcf_info : return hg38_vcf_info elif hg38_vcf_dir : return VcfFile . populate_fields ( random . choice ( all_files ( hg38_vcf_dir ))) else : raise InputError ( \"Must specify hg38 template VCF!\" ) else : raise IncompatibleGenomeAssemblyError ( proband_causative_variants [ 0 ] . 
assembly , phenopacket_path )","title":"select_vcf_template"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.spike_and_update_phenopacket","text":"Spike the VCF files with genetic variants relevant to the provided Phenopacket, update the Phenopacket accordingly, and write the updated Phenopacket to the specified output directory. Parameters: Name Type Description Default hg19_vcf_info VcfFile VCF file info for hg19 template vcf. required hg38_vcf_info VcfFile VCF file info for hg38 template vcf. required hg19_vcf_dir Path The directory containing the hg19 VCF files. required hg38_vcf_dir Path The directory containing the hg38 VCF files. required output_dir Path Directory where the updated Phenopacket will be saved. required phenopacket_path Path Path to the original Phenopacket file. required Returns: Type Description None None Source code in src/pheval/prepare/create_spiked_vcf.py 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 def spike_and_update_phenopacket ( hg19_vcf_info : VcfFile , hg38_vcf_info : VcfFile , hg19_vcf_dir : Path , hg38_vcf_dir : Path , output_dir : Path , phenopacket_path : Path , ) -> None : \"\"\" Spike the VCF files with genetic variants relevant to the provided Phenopacket, update the Phenopacket accordingly, and write the updated Phenopacket to the specified output directory. Args: hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf. hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf. hg19_vcf_dir (Path): The directory containing the hg19 VCF files. hg38_vcf_dir (Path): The directory containing the hg38 VCF files. output_dir (Path): Directory where the updated Phenopacket will be saved. phenopacket_path (Path): Path to the original Phenopacket file. 
Returns: None \"\"\" phenopacket = phenopacket_reader ( phenopacket_path ) spiked_vcf_file_message = generate_spiked_vcf_file ( output_dir , phenopacket , phenopacket_path , hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir , ) updated_phenopacket = PhenopacketRebuilder ( phenopacket ) . add_spiked_vcf_path ( spiked_vcf_file_message ) write_phenopacket ( updated_phenopacket , phenopacket_path )","title":"spike_and_update_phenopacket"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.spike_vcf_contents","text":"Spike VCF records with variants obtained from a Phenopacket or Family. Parameters: Name Type Description Default phenopacket Union [ Phenopacket , Family ] Phenopacket or Family containing causative variants. required phenopacket_path Path Path to the Phenopacket file. required hg19_vcf_info VcfFile VCF file info for hg19 template vcf. required hg38_vcf_info VcfFile VCF file info for hg38 template vcf. required hg19_vcf_dir Path The directory containing the hg19 VCF files. required hg38_vcf_dir Path The directory containing the hg38 VCF files. required Returns: Type Description tuple [ str , List [ str ]] A tuple containing: assembly (str): The genome assembly information extracted from VCF header. modified_vcf_contents (List[str]): Modified VCF records with spiked variants. Source code in src/pheval/prepare/create_spiked_vcf.py 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 def spike_vcf_contents ( phenopacket : Union [ Phenopacket , Family ], phenopacket_path : Path , hg19_vcf_info : VcfFile , hg38_vcf_info : VcfFile , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> tuple [ str , List [ str ]]: \"\"\" Spike VCF records with variants obtained from a Phenopacket or Family. Args: phenopacket (Union[Phenopacket, Family]): Phenopacket or Family containing causative variants. 
phenopacket_path (Path): Path to the Phenopacket file. hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf. hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf. hg19_vcf_dir (Path): The directory containing the hg19 VCF files. hg38_vcf_dir (Path): The directory containing the hg38 VCF files. Returns: A tuple containing: assembly (str): The genome assembly information extracted from VCF header. modified_vcf_contents (List[str]): Modified VCF records with spiked variants. \"\"\" phenopacket_causative_variants = PhenopacketUtil ( phenopacket ) . causative_variants () chosen_template_vcf = select_vcf_template ( phenopacket_path , phenopacket_causative_variants , hg19_vcf_info , hg38_vcf_info , hg19_vcf_dir , hg38_vcf_dir , ) check_variant_assembly ( phenopacket_causative_variants , chosen_template_vcf . vcf_header , phenopacket_path ) return ( chosen_template_vcf . vcf_header . assembly , VcfSpiker ( chosen_template_vcf . vcf_contents , phenopacket_causative_variants , chosen_template_vcf . vcf_header , ) . construct_vcf ( chosen_template_vcf . vcf_file_name ), )","title":"spike_vcf_contents"},{"location":"api/pheval/prepare/create_spiked_vcf/#src.pheval.prepare.create_spiked_vcf.spike_vcfs","text":"Create spiked VCF from either a Phenopacket or a Phenopacket directory. Parameters: Name Type Description Default output_dir Path The directory to store the generated spiked VCF file(s). required phenopacket_path Path Path to a single Phenopacket file (optional). required phenopacket_dir Path Path to a directory containing Phenopacket files (optional). required hg19_template_vcf Path Path to the hg19 template VCF file (optional). required hg38_template_vcf Path Path to the hg38 template VCF file (optional). required hg19_vcf_dir Path The directory containing the hg19 VCF files (optional). required hg38_vcf_dir Path The directory containing the hg38 VCF files (optional). 
required Source code in src/pheval/prepare/create_spiked_vcf.py 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 def spike_vcfs ( output_dir : Path , phenopacket_path : Path , phenopacket_dir : Path , hg19_template_vcf : Path , hg38_template_vcf : Path , hg19_vcf_dir : Path , hg38_vcf_dir : Path , ) -> None : \"\"\" Create spiked VCF from either a Phenopacket or a Phenopacket directory. Args: output_dir (Path): The directory to store the generated spiked VCF file(s). phenopacket_path (Path): Path to a single Phenopacket file (optional). phenopacket_dir (Path): Path to a directory containing Phenopacket files (optional). hg19_template_vcf (Path): Path to the hg19 template VCF file (optional). hg38_template_vcf (Path): Path to the hg38 template VCF file (optional). hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional). hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional). \"\"\" if phenopacket_path is not None : create_spiked_vcf ( output_dir , phenopacket_path , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir , ) elif phenopacket_dir is not None : create_spiked_vcfs ( output_dir , phenopacket_dir , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir , )","title":"spike_vcfs"},{"location":"api/pheval/prepare/custom_exceptions/","text":"InputError Bases: Exception Exception raised for missing required inputs. Source code in src/pheval/prepare/custom_exceptions.py 4 5 6 7 8 9 10 11 12 13 class InputError ( Exception ): \"\"\"Exception raised for missing required inputs.\"\"\" def __init__ ( self , file , message = \"Missing required input\" ): self . file : str = file self . message : str = message super () . __init__ ( self . message ) def __str__ ( self ): return f \" { self . message } -> { self . 
file } \" MutuallyExclusiveOptionError Bases: Option Exception raised for when Source code in src/pheval/prepare/custom_exceptions.py 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 class MutuallyExclusiveOptionError ( Option ): \"\"\"Exception raised for when\"\"\" def __init__ ( self , * args , ** kwargs ): self . mutually_exclusive = set ( kwargs . pop ( \"mutually_exclusive\" , [])) help_ = kwargs . get ( \"help\" , \"\" ) if self . mutually_exclusive : ex_str = \", \" . join ( self . mutually_exclusive ) kwargs [ \"help\" ] = help_ + ( \" NOTE: This argument is mutually exclusive with \" \" arguments: [\" + ex_str + \"].\" ) super ( MutuallyExclusiveOptionError , self ) . __init__ ( * args , ** kwargs ) def handle_parse_result ( self , ctx , opts , args ): if self . mutually_exclusive . intersection ( opts ) and self . name in opts : raise UsageError ( \"Illegal usage: ` {} ` is mutually exclusive with \" \"arguments ` {} `.\" . format ( self . name , \", \" . join ( self . mutually_exclusive )) ) return super ( MutuallyExclusiveOptionError , self ) . handle_parse_result ( ctx , opts , args )","title":"Custom exceptions"},{"location":"api/pheval/prepare/custom_exceptions/#src.pheval.prepare.custom_exceptions.InputError","text":"Bases: Exception Exception raised for missing required inputs. Source code in src/pheval/prepare/custom_exceptions.py 4 5 6 7 8 9 10 11 12 13 class InputError ( Exception ): \"\"\"Exception raised for missing required inputs.\"\"\" def __init__ ( self , file , message = \"Missing required input\" ): self . file : str = file self . message : str = message super () . __init__ ( self . message ) def __str__ ( self ): return f \" { self . message } -> { self . 
file } \"","title":"InputError"},{"location":"api/pheval/prepare/custom_exceptions/#src.pheval.prepare.custom_exceptions.MutuallyExclusiveOptionError","text":"Bases: Option Exception raised for when Source code in src/pheval/prepare/custom_exceptions.py 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 class MutuallyExclusiveOptionError ( Option ): \"\"\"Exception raised for when\"\"\" def __init__ ( self , * args , ** kwargs ): self . mutually_exclusive = set ( kwargs . pop ( \"mutually_exclusive\" , [])) help_ = kwargs . get ( \"help\" , \"\" ) if self . mutually_exclusive : ex_str = \", \" . join ( self . mutually_exclusive ) kwargs [ \"help\" ] = help_ + ( \" NOTE: This argument is mutually exclusive with \" \" arguments: [\" + ex_str + \"].\" ) super ( MutuallyExclusiveOptionError , self ) . __init__ ( * args , ** kwargs ) def handle_parse_result ( self , ctx , opts , args ): if self . mutually_exclusive . intersection ( opts ) and self . name in opts : raise UsageError ( \"Illegal usage: ` {} ` is mutually exclusive with \" \"arguments ` {} `.\" . format ( self . name , \", \" . join ( self . mutually_exclusive )) ) return super ( MutuallyExclusiveOptionError , self ) . handle_parse_result ( ctx , opts , args )","title":"MutuallyExclusiveOptionError"},{"location":"api/pheval/prepare/prepare_corpus/","text":"prepare_corpus ( phenopacket_dir , variant_analysis , gene_analysis , disease_analysis , gene_identifier , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir , output_dir ) Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating gene identifiers. Parameters: Name Type Description Default phenopacket_dir Path The path to the directory containing Phenopackets. required variant_analysis bool If True, check for complete variant records in the Phenopackets. required gene_analysis bool If True, check for complete gene records in the Phenopackets. 
required disease_analysis bool If True, check for complete disease records in the Phenopackets. required gene_identifier str Identifier for updating gene identifiers, if applicable. required hg19_template_vcf Path Path to the hg19 template VCF file (optional), to spike variants into required hg38_template_vcf Path Path to the hg38 template VCF file (optional), to spike variants into required hg19_vcf_dir Path Path to the directory containing hg19 template VCF files (optional). required hg38_vcf_dir Path Path to the directory containing hg38 template VCF files (optional). required output_dir Path The directory to save the prepared Phenopackets and, optionally, VCF files. required Notes: To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir or hg38_vcf_dir is required. Source code in src/pheval/prepare/prepare_corpus.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 def prepare_corpus ( phenopacket_dir : Path , variant_analysis : bool , gene_analysis : bool , disease_analysis : bool , gene_identifier : str , hg19_template_vcf : Path , hg38_template_vcf : Path , hg19_vcf_dir : Path , hg38_vcf_dir : Path , output_dir : Path , ) -> None : \"\"\" Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating gene identifiers. Args: phenopacket_dir (Path): The path to the directory containing Phenopackets. variant_analysis (bool): If True, check for complete variant records in the Phenopackets. gene_analysis (bool): If True, check for complete gene records in the Phenopackets. disease_analysis (bool): If True, check for complete disease records in the Phenopackets. gene_identifier (str): Identifier for updating gene identifiers, if applicable. 
hg19_template_vcf (Path): Path to the hg19 template VCF file (optional), to spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required. hg38_template_vcf (Path): Path to the hg38 template VCF file (optional), to spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required. hg19_vcf_dir (Path): Path to the directory containing hg19 template VCF files (optional). hg38_vcf_dir (Path): Path to the directory containing hg38 template VCF files (optional). output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files. Notes: To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir or hg38_vcf_dir is required. \"\"\" output_dir . joinpath ( \"phenopackets\" ) . mkdir ( exist_ok = True , parents = True ) for phenopacket_path in all_files ( phenopacket_dir ): phenopacket_util = PhenopacketUtil ( phenopacket_reader ( phenopacket_path )) if not phenopacket_util . observed_phenotypic_features (): info_log . warning ( f \"Removed { phenopacket_path . name } from the corpus due to no observed phenotypic features.\" ) continue if variant_analysis : if phenopacket_util . check_incomplete_variant_record (): info_log . warning ( f \"Removed { phenopacket_path . name } from the corpus due to missing variant fields.\" ) continue elif phenopacket_util . check_variant_alleles (): info_log . warning ( f \"Removed { phenopacket_path . name } from the corpus due to identical \" \"reference and alternate allele fields.\" ) if gene_analysis : if phenopacket_util . check_incomplete_gene_record (): info_log . warning ( f \"Removed { phenopacket_path . name } from the corpus due to missing gene fields.\" ) continue if disease_analysis : if phenopacket_util . check_incomplete_disease_record (): info_log . warning ( f \"Removed { phenopacket_path . 
name } from the corpus due to missing disease fields.\" ) continue if hg19_template_vcf or hg38_template_vcf : output_dir . joinpath ( \"vcf\" ) . mkdir ( exist_ok = True ) create_spiked_vcf ( output_dir . joinpath ( \"vcf\" ), phenopacket_path , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir , ) if gene_identifier : create_updated_phenopacket ( gene_identifier , phenopacket_path , output_dir . joinpath ( \"phenopackets\" ) ) else : # if not updating phenopacket gene identifiers then copy phenopacket as is to output directory shutil . copy ( phenopacket_path , output_dir . joinpath ( f \"phenopackets/ { phenopacket_path . name } \" ) )","title":"Prepare corpus"},{"location":"api/pheval/prepare/prepare_corpus/#src.pheval.prepare.prepare_corpus.prepare_corpus","text":"Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating gene identifiers. Parameters: Name Type Description Default phenopacket_dir Path The path to the directory containing Phenopackets. required variant_analysis bool If True, check for complete variant records in the Phenopackets. required gene_analysis bool If True, check for complete gene records in the Phenopackets. required disease_analysis bool If True, check for complete disease records in the Phenopackets. required gene_identifier str Identifier for updating gene identifiers, if applicable. required hg19_template_vcf Path Path to the hg19 template VCF file (optional), to spike variants into required hg38_template_vcf Path Path to the hg38 template VCF file (optional), to spike variants into required hg19_vcf_dir Path Path to the directory containing hg19 template VCF files (optional). required hg38_vcf_dir Path Path to the directory containing hg38 template VCF files (optional). required output_dir Path The directory to save the prepared Phenopackets and, optionally, VCF files. 
required Notes: To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir or hg38_vcf_dir is required. Source code in src/pheval/prepare/prepare_corpus.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 def prepare_corpus ( phenopacket_dir : Path , variant_analysis : bool , gene_analysis : bool , disease_analysis : bool , gene_identifier : str , hg19_template_vcf : Path , hg38_template_vcf : Path , hg19_vcf_dir : Path , hg38_vcf_dir : Path , output_dir : Path , ) -> None : \"\"\" Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating gene identifiers. Args: phenopacket_dir (Path): The path to the directory containing Phenopackets. variant_analysis (bool): If True, check for complete variant records in the Phenopackets. gene_analysis (bool): If True, check for complete gene records in the Phenopackets. disease_analysis (bool): If True, check for complete disease records in the Phenopackets. gene_identifier (str): Identifier for updating gene identifiers, if applicable. hg19_template_vcf (Path): Path to the hg19 template VCF file (optional), to spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required. hg38_template_vcf (Path): Path to the hg38 template VCF file (optional), to spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required. hg19_vcf_dir (Path): Path to the directory containing hg19 template VCF files (optional). hg38_vcf_dir (Path): Path to the directory containing hg38 template VCF files (optional). output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files. 
Notes: To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir or hg38_vcf_dir is required. \"\"\" output_dir . joinpath ( \"phenopackets\" ) . mkdir ( exist_ok = True , parents = True ) for phenopacket_path in all_files ( phenopacket_dir ): phenopacket_util = PhenopacketUtil ( phenopacket_reader ( phenopacket_path )) if not phenopacket_util . observed_phenotypic_features (): info_log . warning ( f \"Removed { phenopacket_path . name } from the corpus due to no observed phenotypic features.\" ) continue if variant_analysis : if phenopacket_util . check_incomplete_variant_record (): info_log . warning ( f \"Removed { phenopacket_path . name } from the corpus due to missing variant fields.\" ) continue elif phenopacket_util . check_variant_alleles (): info_log . warning ( f \"Removed { phenopacket_path . name } from the corpus due to identical \" \"reference and alternate allele fields.\" ) if gene_analysis : if phenopacket_util . check_incomplete_gene_record (): info_log . warning ( f \"Removed { phenopacket_path . name } from the corpus due to missing gene fields.\" ) continue if disease_analysis : if phenopacket_util . check_incomplete_disease_record (): info_log . warning ( f \"Removed { phenopacket_path . name } from the corpus due to missing disease fields.\" ) continue if hg19_template_vcf or hg38_template_vcf : output_dir . joinpath ( \"vcf\" ) . mkdir ( exist_ok = True ) create_spiked_vcf ( output_dir . joinpath ( \"vcf\" ), phenopacket_path , hg19_template_vcf , hg38_template_vcf , hg19_vcf_dir , hg38_vcf_dir , ) if gene_identifier : create_updated_phenopacket ( gene_identifier , phenopacket_path , output_dir . joinpath ( \"phenopackets\" ) ) else : # if not updating phenopacket gene identifiers then copy phenopacket as is to output directory shutil . copy ( phenopacket_path , output_dir . joinpath ( f \"phenopackets/ { phenopacket_path . 
name } \" ) )","title":"prepare_corpus"},{"location":"api/pheval/prepare/update_phenopacket/","text":"create_updated_phenopacket ( gene_identifier , phenopacket_path , output_dir ) Update the gene context within the interpretations for a Phenopacket and writes the updated Phenopacket. Parameters: Name Type Description Default gene_identifier str Identifier used to update the gene context. required phenopacket_path Path The path to the input Phenopacket file. required output_dir Path The directory where the updated Phenopacket will be written. required Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. Source code in src/pheval/prepare/update_phenopacket.py 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 def create_updated_phenopacket ( gene_identifier : str , phenopacket_path : Path , output_dir : Path ) -> None : \"\"\" Update the gene context within the interpretations for a Phenopacket and writes the updated Phenopacket. Args: gene_identifier (str): Identifier used to update the gene context. phenopacket_path (Path): The path to the input Phenopacket file. output_dir (Path): The directory where the updated Phenopacket will be written. Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. \"\"\" hgnc_data = create_hgnc_dict () updated_phenopacket = update_outdated_gene_context ( phenopacket_path , gene_identifier , hgnc_data ) write_phenopacket ( updated_phenopacket , output_dir . joinpath ( phenopacket_path . 
name )) create_updated_phenopackets ( gene_identifier , phenopacket_dir , output_dir ) Update the gene context within the interpretations for a directory of Phenopackets and writes the updated Phenopackets. Parameters: Name Type Description Default gene_identifier str Identifier used to update the gene context. required phenopacket_dir Path The path to the input Phenopacket directory. required output_dir Path The directory where the updated Phenopackets will be written. required Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. Source code in src/pheval/prepare/update_phenopacket.py 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 def create_updated_phenopackets ( gene_identifier : str , phenopacket_dir : Path , output_dir : Path ) -> None : \"\"\" Update the gene context within the interpretations for a directory of Phenopackets and writes the updated Phenopackets. Args: gene_identifier (str): Identifier used to update the gene context. phenopacket_dir (Path): The path to the input Phenopacket directory. output_dir (Path): The directory where the updated Phenopackets will be written. Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. \"\"\" hgnc_data = create_hgnc_dict () for phenopacket_path in all_files ( phenopacket_dir ): updated_phenopacket = update_outdated_gene_context ( phenopacket_path , gene_identifier , hgnc_data ) write_phenopacket ( updated_phenopacket , output_dir . joinpath ( phenopacket_path . name )) update_outdated_gene_context ( phenopacket_path , gene_identifier , hgnc_data ) Update the gene context of the Phenopacket. 
Parameters: Name Type Description Default phenopacket_path Path The path to the Phenopacket file. required gene_identifier str Identifier to update the gene context. required hgnc_data defaultdict The HGNC data used for updating. required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family]: The updated Phenopacket or Family. Notes: This function updates the gene context within the Phenopacket or Family instance. The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. Source code in src/pheval/prepare/update_phenopacket.py 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 def update_outdated_gene_context ( phenopacket_path : Path , gene_identifier : str , hgnc_data : defaultdict ) -> Union [ Phenopacket , Family ]: \"\"\" Update the gene context of the Phenopacket. Args: phenopacket_path (Path): The path to the Phenopacket file. gene_identifier (str): Identifier to update the gene context. hgnc_data (defaultdict): The HGNC data used for updating. Returns: Union[Phenopacket, Family]: The updated Phenopacket or Family. Notes: This function updates the gene context within the Phenopacket or Family instance. The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. \"\"\" phenopacket = phenopacket_reader ( phenopacket_path ) interpretations = PhenopacketUtil ( phenopacket ) . interpretations () updated_interpretations = GeneIdentifierUpdater ( hgnc_data = hgnc_data , gene_identifier = gene_identifier ) . update_genomic_interpretations_gene_identifier ( interpretations , phenopacket_path ) return PhenopacketRebuilder ( phenopacket ) . 
update_interpretations ( updated_interpretations ) update_phenopackets ( gene_identifier , phenopacket_path , phenopacket_dir , output_dir ) Update the gene identifiers in either a single phenopacket or a directory of phenopackets. Parameters: Name Type Description Default gene_identifier str The gene identifier to be updated. required phenopacket_path Path The path to a single Phenopacket file. required phenopacket_dir Path The directory containing multiple Phenopacket files. required output_dir Path The output directory to save the updated Phenopacket files. required Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. Source code in src/pheval/prepare/update_phenopacket.py 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 def update_phenopackets ( gene_identifier : str , phenopacket_path : Path , phenopacket_dir : Path , output_dir : Path ) -> None : \"\"\" Update the gene identifiers in either a single phenopacket or a directory of phenopackets. Args: gene_identifier (str): The gene identifier to be updated. phenopacket_path (Path): The path to a single Phenopacket file. phenopacket_dir (Path): The directory containing multiple Phenopacket files. output_dir (Path): The output directory to save the updated Phenopacket files. Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. \"\"\" output_dir . 
mkdir ( exist_ok = True ) if phenopacket_path is not None : create_updated_phenopacket ( gene_identifier , phenopacket_path , output_dir ) elif phenopacket_dir is not None : create_updated_phenopackets ( gene_identifier , phenopacket_dir , output_dir )","title":"Update phenopacket"},{"location":"api/pheval/prepare/update_phenopacket/#src.pheval.prepare.update_phenopacket.create_updated_phenopacket","text":"Update the gene context within the interpretations for a Phenopacket and writes the updated Phenopacket. Parameters: Name Type Description Default gene_identifier str Identifier used to update the gene context. required phenopacket_path Path The path to the input Phenopacket file. required output_dir Path The directory where the updated Phenopacket will be written. required Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. Source code in src/pheval/prepare/update_phenopacket.py 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 def create_updated_phenopacket ( gene_identifier : str , phenopacket_path : Path , output_dir : Path ) -> None : \"\"\" Update the gene context within the interpretations for a Phenopacket and writes the updated Phenopacket. Args: gene_identifier (str): Identifier used to update the gene context. phenopacket_path (Path): The path to the input Phenopacket file. output_dir (Path): The directory where the updated Phenopacket will be written. Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. \"\"\" hgnc_data = create_hgnc_dict () updated_phenopacket = update_outdated_gene_context ( phenopacket_path , gene_identifier , hgnc_data ) write_phenopacket ( updated_phenopacket , output_dir . 
joinpath ( phenopacket_path . name ))","title":"create_updated_phenopacket"},{"location":"api/pheval/prepare/update_phenopacket/#src.pheval.prepare.update_phenopacket.create_updated_phenopackets","text":"Update the gene context within the interpretations for a directory of Phenopackets and writes the updated Phenopackets. Parameters: Name Type Description Default gene_identifier str Identifier used to update the gene context. required phenopacket_dir Path The path to the input Phenopacket directory. required output_dir Path The directory where the updated Phenopackets will be written. required Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. Source code in src/pheval/prepare/update_phenopacket.py 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 def create_updated_phenopackets ( gene_identifier : str , phenopacket_dir : Path , output_dir : Path ) -> None : \"\"\" Update the gene context within the interpretations for a directory of Phenopackets and writes the updated Phenopackets. Args: gene_identifier (str): Identifier used to update the gene context. phenopacket_dir (Path): The path to the input Phenopacket directory. output_dir (Path): The directory where the updated Phenopackets will be written. Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. \"\"\" hgnc_data = create_hgnc_dict () for phenopacket_path in all_files ( phenopacket_dir ): updated_phenopacket = update_outdated_gene_context ( phenopacket_path , gene_identifier , hgnc_data ) write_phenopacket ( updated_phenopacket , output_dir . joinpath ( phenopacket_path . 
name ))","title":"create_updated_phenopackets"},{"location":"api/pheval/prepare/update_phenopacket/#src.pheval.prepare.update_phenopacket.update_outdated_gene_context","text":"Update the gene context of the Phenopacket. Parameters: Name Type Description Default phenopacket_path Path The path to the Phenopacket file. required gene_identifier str Identifier to update the gene context. required hgnc_data defaultdict The HGNC data used for updating. required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family]: The updated Phenopacket or Family. Notes: This function updates the gene context within the Phenopacket or Family instance. The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. Source code in src/pheval/prepare/update_phenopacket.py 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 def update_outdated_gene_context ( phenopacket_path : Path , gene_identifier : str , hgnc_data : defaultdict ) -> Union [ Phenopacket , Family ]: \"\"\" Update the gene context of the Phenopacket. Args: phenopacket_path (Path): The path to the Phenopacket file. gene_identifier (str): Identifier to update the gene context. hgnc_data (defaultdict): The HGNC data used for updating. Returns: Union[Phenopacket, Family]: The updated Phenopacket or Family. Notes: This function updates the gene context within the Phenopacket or Family instance. The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. \"\"\" phenopacket = phenopacket_reader ( phenopacket_path ) interpretations = PhenopacketUtil ( phenopacket ) . 
interpretations () updated_interpretations = GeneIdentifierUpdater ( hgnc_data = hgnc_data , gene_identifier = gene_identifier ) . update_genomic_interpretations_gene_identifier ( interpretations , phenopacket_path ) return PhenopacketRebuilder ( phenopacket ) . update_interpretations ( updated_interpretations )","title":"update_outdated_gene_context"},{"location":"api/pheval/prepare/update_phenopacket/#src.pheval.prepare.update_phenopacket.update_phenopackets","text":"Update the gene identifiers in either a single phenopacket or a directory of phenopackets. Parameters: Name Type Description Default gene_identifier str The gene identifier to be updated. required phenopacket_path Path The path to a single Phenopacket file. required phenopacket_dir Path The directory containing multiple Phenopacket files. required output_dir Path The output directory to save the updated Phenopacket files. required Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace to describe the gene identifiers. Source code in src/pheval/prepare/update_phenopacket.py 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 def update_phenopackets ( gene_identifier : str , phenopacket_path : Path , phenopacket_dir : Path , output_dir : Path ) -> None : \"\"\" Update the gene identifiers in either a single phenopacket or a directory of phenopackets. Args: gene_identifier (str): The gene identifier to be updated. phenopacket_path (Path): The path to a single Phenopacket file. phenopacket_dir (Path): The directory containing multiple Phenopacket files. output_dir (Path): The output directory to save the updated Phenopacket files. Notes: The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id to update to the current gene identifier in the Phenopacket. 
We recommend using the ENSEMBL namespace to describe the gene identifiers. \"\"\" output_dir . mkdir ( exist_ok = True ) if phenopacket_path is not None : create_updated_phenopacket ( gene_identifier , phenopacket_path , output_dir ) elif phenopacket_dir is not None : create_updated_phenopackets ( gene_identifier , phenopacket_dir , output_dir )","title":"update_phenopackets"},{"location":"api/pheval/runners/runner/","text":"Runners Module DefaultPhEvalRunner Bases: PhEvalRunner DefaultPhEvalRunner Parameters: Name Type Description Default PhEvalRunner PhEvalRunner Abstract PhEvalRunnerClass required Source code in src/pheval/runners/runner.py 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 class DefaultPhEvalRunner ( PhEvalRunner ): \"\"\"DefaultPhEvalRunner Args: PhEvalRunner (PhEvalRunner): Abstract PhEvalRunnerClass \"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str def prepare ( self ): print ( \"preparing\" ) def run ( self ): print ( \"running\" ) def post_process ( self ): print ( \"post processing\" ) PhEvalRunner dataclass Bases: ABC PhEvalRunner Class Source code in src/pheval/runners/runner.py 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 @dataclass class PhEvalRunner ( ABC ): \"\"\"PhEvalRunner Class\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str directory_path = None input_dir_config = None _meta_data = None __raw_results_dir = \"raw_results/\" __pheval_gene_results_dir = \"pheval_gene_results/\" __pheval_variant_results_dir = \"pheval_variant_results/\" 
__pheval_disease_results_dir = \"pheval_disease_results/\" __tool_input_commands_dir = \"tool_input_commands/\" __run_meta_data_file = \"results.yml\" def __post_init__ ( self ): self . input_dir_config = parse_input_dir_config ( self . input_dir ) def _get_tool ( self ): return self . input_dir_config . tool def _get_variant_analysis ( self ): return self . input_dir_config . variant_analysis def _get_gene_analysis ( self ): return self . input_dir_config . gene_analysis def _get_disease_analysis ( self ): return self . input_dir_config . disease_analysis @property def tool_input_commands_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __tool_input_commands_dir ) @tool_input_commands_dir . setter def tool_input_commands_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def raw_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __raw_results_dir ) @raw_results_dir . setter def raw_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_gene_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_gene_results_dir ) @pheval_gene_results_dir . setter def pheval_gene_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_variant_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_variant_results_dir ) @pheval_variant_results_dir . setter def pheval_variant_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_disease_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_disease_results_dir ) @pheval_disease_results_dir . setter def pheval_disease_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) def build_output_directory_structure ( self ): \"\"\"build output directory structure\"\"\" self . 
tool_input_commands_dir . mkdir ( exist_ok = True ) self . raw_results_dir . mkdir ( exist_ok = True ) if self . _get_variant_analysis (): self . pheval_variant_results_dir . mkdir ( exist_ok = True ) if self . _get_gene_analysis (): self . pheval_gene_results_dir . mkdir ( exist_ok = True ) if self . _get_disease_analysis (): self . pheval_disease_results_dir . mkdir ( exist_ok = True ) @property def meta_data ( self ): self . _meta_data = BasicOutputRunMetaData ( tool = self . input_dir_config . tool , tool_version = self . version , config = f \" { Path ( self . input_dir ) . parent . name } / { Path ( self . input_dir ) . name } \" , run_timestamp = datetime . now () . timestamp (), corpus = f \" { Path ( self . testdata_dir ) . parent . name } / { Path ( self . testdata_dir ) . name } \" , ) return self . _meta_data @meta_data . setter def meta_data ( self , meta_data ): self . _meta_data = meta_data @abstractmethod def prepare ( self ) -> str : \"\"\"prepare\"\"\" @abstractmethod def run ( self ): \"\"\"run\"\"\" @abstractmethod def post_process ( self ): \"\"\"post_process\"\"\" def construct_meta_data ( self ): \"\"\"Construct run output meta data\"\"\" return self . meta_data build_output_directory_structure () build output directory structure Source code in src/pheval/runners/runner.py 87 88 89 90 91 92 93 94 95 96 def build_output_directory_structure ( self ): \"\"\"build output directory structure\"\"\" self . tool_input_commands_dir . mkdir ( exist_ok = True ) self . raw_results_dir . mkdir ( exist_ok = True ) if self . _get_variant_analysis (): self . pheval_variant_results_dir . mkdir ( exist_ok = True ) if self . _get_gene_analysis (): self . pheval_gene_results_dir . mkdir ( exist_ok = True ) if self . _get_disease_analysis (): self . pheval_disease_results_dir . 
mkdir ( exist_ok = True ) construct_meta_data () Construct run output meta data Source code in src/pheval/runners/runner.py 125 126 127 def construct_meta_data ( self ): \"\"\"Construct run output meta data\"\"\" return self . meta_data post_process () abstractmethod post_process Source code in src/pheval/runners/runner.py 121 122 123 @abstractmethod def post_process ( self ): \"\"\"post_process\"\"\" prepare () abstractmethod prepare Source code in src/pheval/runners/runner.py 113 114 115 @abstractmethod def prepare ( self ) -> str : \"\"\"prepare\"\"\" run () abstractmethod run Source code in src/pheval/runners/runner.py 117 118 119 @abstractmethod def run ( self ): \"\"\"run\"\"\"","title":"Runner"},{"location":"api/pheval/runners/runner/#src.pheval.runners.runner.DefaultPhEvalRunner","text":"Bases: PhEvalRunner DefaultPhEvalRunner Parameters: Name Type Description Default PhEvalRunner PhEvalRunner Abstract PhEvalRunnerClass required Source code in src/pheval/runners/runner.py 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 class DefaultPhEvalRunner ( PhEvalRunner ): \"\"\"DefaultPhEvalRunner Args: PhEvalRunner (PhEvalRunner): Abstract PhEvalRunnerClass \"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str def prepare ( self ): print ( \"preparing\" ) def run ( self ): print ( \"running\" ) def post_process ( self ): print ( \"post processing\" )","title":"DefaultPhEvalRunner"},{"location":"api/pheval/runners/runner/#src.pheval.runners.runner.PhEvalRunner","text":"Bases: ABC PhEvalRunner Class Source code in src/pheval/runners/runner.py 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 
118 119 120 121 122 123 124 125 126 127 @dataclass class PhEvalRunner ( ABC ): \"\"\"PhEvalRunner Class\"\"\" input_dir : Path testdata_dir : Path tmp_dir : Path output_dir : Path config_file : Path version : str directory_path = None input_dir_config = None _meta_data = None __raw_results_dir = \"raw_results/\" __pheval_gene_results_dir = \"pheval_gene_results/\" __pheval_variant_results_dir = \"pheval_variant_results/\" __pheval_disease_results_dir = \"pheval_disease_results/\" __tool_input_commands_dir = \"tool_input_commands/\" __run_meta_data_file = \"results.yml\" def __post_init__ ( self ): self . input_dir_config = parse_input_dir_config ( self . input_dir ) def _get_tool ( self ): return self . input_dir_config . tool def _get_variant_analysis ( self ): return self . input_dir_config . variant_analysis def _get_gene_analysis ( self ): return self . input_dir_config . gene_analysis def _get_disease_analysis ( self ): return self . input_dir_config . disease_analysis @property def tool_input_commands_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __tool_input_commands_dir ) @tool_input_commands_dir . setter def tool_input_commands_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def raw_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __raw_results_dir ) @raw_results_dir . setter def raw_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_gene_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_gene_results_dir ) @pheval_gene_results_dir . setter def pheval_gene_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) @property def pheval_variant_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_variant_results_dir ) @pheval_variant_results_dir . setter def pheval_variant_results_dir ( self , directory_path ): self . 
directory_path = Path ( directory_path ) @property def pheval_disease_results_dir ( self ): return Path ( self . output_dir ) . joinpath ( self . __pheval_disease_results_dir ) @pheval_disease_results_dir . setter def pheval_disease_results_dir ( self , directory_path ): self . directory_path = Path ( directory_path ) def build_output_directory_structure ( self ): \"\"\"build output directory structure\"\"\" self . tool_input_commands_dir . mkdir ( exist_ok = True ) self . raw_results_dir . mkdir ( exist_ok = True ) if self . _get_variant_analysis (): self . pheval_variant_results_dir . mkdir ( exist_ok = True ) if self . _get_gene_analysis (): self . pheval_gene_results_dir . mkdir ( exist_ok = True ) if self . _get_disease_analysis (): self . pheval_disease_results_dir . mkdir ( exist_ok = True ) @property def meta_data ( self ): self . _meta_data = BasicOutputRunMetaData ( tool = self . input_dir_config . tool , tool_version = self . version , config = f \" { Path ( self . input_dir ) . parent . name } / { Path ( self . input_dir ) . name } \" , run_timestamp = datetime . now () . timestamp (), corpus = f \" { Path ( self . testdata_dir ) . parent . name } / { Path ( self . testdata_dir ) . name } \" , ) return self . _meta_data @meta_data . setter def meta_data ( self , meta_data ): self . _meta_data = meta_data @abstractmethod def prepare ( self ) -> str : \"\"\"prepare\"\"\" @abstractmethod def run ( self ): \"\"\"run\"\"\" @abstractmethod def post_process ( self ): \"\"\"post_process\"\"\" def construct_meta_data ( self ): \"\"\"Construct run output meta data\"\"\" return self . meta_data","title":"PhEvalRunner"},{"location":"api/pheval/runners/runner/#src.pheval.runners.runner.PhEvalRunner.build_output_directory_structure","text":"build output directory structure Source code in src/pheval/runners/runner.py 87 88 89 90 91 92 93 94 95 96 def build_output_directory_structure ( self ): \"\"\"build output directory structure\"\"\" self . 
tool_input_commands_dir . mkdir ( exist_ok = True ) self . raw_results_dir . mkdir ( exist_ok = True ) if self . _get_variant_analysis (): self . pheval_variant_results_dir . mkdir ( exist_ok = True ) if self . _get_gene_analysis (): self . pheval_gene_results_dir . mkdir ( exist_ok = True ) if self . _get_disease_analysis (): self . pheval_disease_results_dir . mkdir ( exist_ok = True )","title":"build_output_directory_structure"},{"location":"api/pheval/runners/runner/#src.pheval.runners.runner.PhEvalRunner.construct_meta_data","text":"Construct run output meta data Source code in src/pheval/runners/runner.py 125 126 127 def construct_meta_data ( self ): \"\"\"Construct run output meta data\"\"\" return self . meta_data","title":"construct_meta_data"},{"location":"api/pheval/runners/runner/#src.pheval.runners.runner.PhEvalRunner.post_process","text":"post_process Source code in src/pheval/runners/runner.py 121 122 123 @abstractmethod def post_process ( self ): \"\"\"post_process\"\"\"","title":"post_process"},{"location":"api/pheval/runners/runner/#src.pheval.runners.runner.PhEvalRunner.prepare","text":"prepare Source code in src/pheval/runners/runner.py 113 114 115 @abstractmethod def prepare ( self ) -> str : \"\"\"prepare\"\"\"","title":"prepare"},{"location":"api/pheval/runners/runner/#src.pheval.runners.runner.PhEvalRunner.run","text":"run Source code in src/pheval/runners/runner.py 117 118 119 @abstractmethod def run ( self ): \"\"\"run\"\"\"","title":"run"},{"location":"api/pheval/utils/exomiser/","text":"semsim_to_exomiserdb ( input_path , object_prefix , subject_prefix , db_path ) ingests semsim file into exomiser phenotypic database Parameters: Name Type Description Default input_path Path semsim input file. e.g phenio-plus-hp-mp.0.semsimian.tsv required object_prefix str object prefix. e.g. MP required subject_prefix str subject prefix e.g HP required db_path Path Exomiser Phenotypic Database Folder Path. (e.g. 
/exomiser_folder/2209_phenotype/2209_phenotype/) required Source code in src/pheval/utils/exomiser.py 6 7 8 9 10 11 12 13 14 15 16 def semsim_to_exomiserdb ( input_path : Path , object_prefix : str , subject_prefix : str , db_path : Path ): \"\"\"ingests semsim file into exomiser phenotypic database Args: input_path (Path): semsim input file. e.g phenio-plus-hp-mp.0.semsimian.tsv object_prefix (str): object prefix. e.g. MP subject_prefix (str): subject prefix e.g HP db_path (Path): Exomiser Phenotypic Database Folder Path. (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/) \"\"\" exomiserdb = ExomiserDB ( db_path ) exomiserdb . import_from_semsim_file ( input_path , object_prefix , subject_prefix )","title":"Exomiser"},{"location":"api/pheval/utils/exomiser/#src.pheval.utils.exomiser.semsim_to_exomiserdb","text":"ingests semsim file into exomiser phenotypic database Parameters: Name Type Description Default input_path Path semsim input file. e.g phenio-plus-hp-mp.0.semsimian.tsv required object_prefix str object prefix. e.g. MP required subject_prefix str subject prefix e.g HP required db_path Path Exomiser Phenotypic Database Folder Path. (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/) required Source code in src/pheval/utils/exomiser.py 6 7 8 9 10 11 12 13 14 15 16 def semsim_to_exomiserdb ( input_path : Path , object_prefix : str , subject_prefix : str , db_path : Path ): \"\"\"ingests semsim file into exomiser phenotypic database Args: input_path (Path): semsim input file. e.g phenio-plus-hp-mp.0.semsimian.tsv object_prefix (str): object prefix. e.g. MP subject_prefix (str): subject prefix e.g HP db_path (Path): Exomiser Phenotypic Database Folder Path. (e.g. /exomiser_folder/2209_phenotype/2209_phenotype/) \"\"\" exomiserdb = ExomiserDB ( db_path ) exomiserdb . 
import_from_semsim_file ( input_path , object_prefix , subject_prefix )","title":"semsim_to_exomiserdb"},{"location":"api/pheval/utils/file_utils/","text":"all_files ( directory ) Obtains all files from a given directory. Parameters: Name Type Description Default directory Path The directory path. required Returns: Type Description list [ Path ] list[Path]: A list of Path objects representing all files in the directory. Source code in src/pheval/utils/file_utils.py 31 32 33 34 35 36 37 38 39 40 41 42 43 def all_files ( directory : Path ) -> list [ Path ]: \"\"\" Obtains all files from a given directory. Args: directory (Path): The directory path. Returns: list[Path]: A list of Path objects representing all files in the directory. \"\"\" files = [ file_path for file_path in directory . iterdir ()] files . sort () return files ensure_columns_exists ( cols , dataframes , err_message = '' ) Ensures the columns exist in dataframes passed as argument (e.g) \" ensure_columns_exists( cols=['column_a', 'column_b, 'column_c'], err_message=\"Custom error message if any column doesn't exist in any dataframe passed as argument\", dataframes=[data_frame1, data_frame2], ) \" Source code in src/pheval/utils/file_utils.py 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 def ensure_columns_exists ( cols : list , dataframes : List [ pd . DataFrame ], err_message : str = \"\" ): \"\"\"Ensures the columns exist in dataframes passed as argument (e.g) \" ensure_columns_exists( cols=['column_a', 'column_b, 'column_c'], err_message=\"Custom error message if any column doesn't exist in any dataframe passed as argument\", dataframes=[data_frame1, data_frame2], ) \" \"\"\" flat_cols = list ( itertools . chain ( cols )) if not dataframes or not flat_cols : return if err_message : err_msg = f \"\"\"columns: { \", \" . join ( flat_cols [: - 1 ]) } and { flat_cols [ - 1 ] } { err_message } \"\"\" else : err_msg = f \"\"\"columns: { \", \" . 
join ( flat_cols [: - 1 ]) } and { flat_cols [ - 1 ] } \\ - must be present in both left and right files\"\"\" for dataframe in dataframes : if not all ( x in dataframe . columns for x in flat_cols ): raise ValueError ( err_msg ) ensure_file_exists ( * files ) Ensures the existence of files passed as parameter Raises: FileNotFoundError: If any file passed as a parameter doesn't exist a FileNotFound Exception will be raised Source code in src/pheval/utils/file_utils.py 73 74 75 76 77 78 79 80 def ensure_file_exists ( * files : str ): \"\"\"Ensures the existence of files passed as parameter Raises: FileNotFoundError: If any file passed as a parameter doesn't exist a FileNotFound Exception will be raised \"\"\" for file in files : if not path . isfile ( file ): raise FileNotFoundError ( f \"File { file } not found\" ) files_with_suffix ( directory , suffix ) Obtains all files ending in a specified suffix from a given directory. Parameters: Name Type Description Default directory Path The directory path. required suffix str The specified suffix to filter files. required Returns: Type Description list [ Path ] list[Path]: A list of Path objects representing files with the specified suffix. Source code in src/pheval/utils/file_utils.py 15 16 17 18 19 20 21 22 23 24 25 26 27 28 def files_with_suffix ( directory : Path , suffix : str ) -> list [ Path ]: \"\"\" Obtains all files ending in a specified suffix from a given directory. Args: directory (Path): The directory path. suffix (str): The specified suffix to filter files. Returns: list[Path]: A list of Path objects representing files with the specified suffix. \"\"\" files = [ file_path for file_path in directory . iterdir () if file_path . suffix == suffix ] files . sort () return files is_gzipped ( file_path ) Confirms whether a file is gzipped. Parameters: Name Type Description Default file_path Path The path to the file. required Returns: Name Type Description bool bool True if the file is gzipped, False otherwise. 
Source code in src/pheval/utils/file_utils.py 46 47 48 49 50 51 52 53 54 55 56 def is_gzipped ( file_path : Path ) -> bool : \"\"\" Confirms whether a file is gzipped. Args: file_path (Path): The path to the file. Returns: bool: True if the file is gzipped, False otherwise. \"\"\" return file_path . name . endswith ( \".gz\" ) normalise_file_name ( file_path ) Normalises the file name by removing diacritical marks (accents) from Unicode characters. Parameters: Name Type Description Default file_path Path The path to the file. required Returns: Name Type Description str str The normalised file name without diacritical marks. Source code in src/pheval/utils/file_utils.py 59 60 61 62 63 64 65 66 67 68 69 70 def normalise_file_name ( file_path : Path ) -> str : \"\"\" Normalises the file name by removing diacritical marks (accents) from Unicode characters. Args: file_path (Path): The path to the file. Returns: str: The normalised file name without diacritical marks. \"\"\" normalised_file_name = unicodedata . normalize ( \"NFD\" , str ( file_path )) return re . sub ( \"[ \\u0300 - \\u036f ]\" , \"\" , normalised_file_name ) write_metadata ( output_dir , meta_data ) Write the metadata for a run to a YAML file. Parameters: Name Type Description Default output_dir Path The directory where the metadata file will be saved. required meta_data BasicOutputRunMetaData The metadata to be written. required Source code in src/pheval/utils/file_utils.py 108 109 110 111 112 113 114 115 116 117 118 def write_metadata ( output_dir : Path , meta_data : BasicOutputRunMetaData ) -> None : \"\"\" Write the metadata for a run to a YAML file. Args: output_dir (Path): The directory where the metadata file will be saved. meta_data (BasicOutputRunMetaData): The metadata to be written. \"\"\" with open ( Path ( output_dir ) . joinpath ( \"results.yml\" ), \"w\" ) as metadata_file : yaml . dump ( to_dict ( meta_data ), metadata_file , sort_keys = False , default_style = \"\" ) metadata_file . 
close ()","title":"File utils"},{"location":"api/pheval/utils/file_utils/#src.pheval.utils.file_utils.all_files","text":"Obtains all files from a given directory. Parameters: Name Type Description Default directory Path The directory path. required Returns: Type Description list [ Path ] list[Path]: A list of Path objects representing all files in the directory. Source code in src/pheval/utils/file_utils.py 31 32 33 34 35 36 37 38 39 40 41 42 43 def all_files ( directory : Path ) -> list [ Path ]: \"\"\" Obtains all files from a given directory. Args: directory (Path): The directory path. Returns: list[Path]: A list of Path objects representing all files in the directory. \"\"\" files = [ file_path for file_path in directory . iterdir ()] files . sort () return files","title":"all_files"},{"location":"api/pheval/utils/file_utils/#src.pheval.utils.file_utils.ensure_columns_exists","text":"Ensures the columns exist in dataframes passed as argument (e.g) \" ensure_columns_exists( cols=['column_a', 'column_b, 'column_c'], err_message=\"Custom error message if any column doesn't exist in any dataframe passed as argument\", dataframes=[data_frame1, data_frame2], ) \" Source code in src/pheval/utils/file_utils.py 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 def ensure_columns_exists ( cols : list , dataframes : List [ pd . DataFrame ], err_message : str = \"\" ): \"\"\"Ensures the columns exist in dataframes passed as argument (e.g) \" ensure_columns_exists( cols=['column_a', 'column_b, 'column_c'], err_message=\"Custom error message if any column doesn't exist in any dataframe passed as argument\", dataframes=[data_frame1, data_frame2], ) \" \"\"\" flat_cols = list ( itertools . chain ( cols )) if not dataframes or not flat_cols : return if err_message : err_msg = f \"\"\"columns: { \", \" . join ( flat_cols [: - 1 ]) } and { flat_cols [ - 1 ] } { err_message } \"\"\" else : err_msg = f \"\"\"columns: { \", \" . 
join ( flat_cols [: - 1 ]) } and { flat_cols [ - 1 ] } \\ - must be present in both left and right files\"\"\" for dataframe in dataframes : if not all ( x in dataframe . columns for x in flat_cols ): raise ValueError ( err_msg )","title":"ensure_columns_exists"},{"location":"api/pheval/utils/file_utils/#src.pheval.utils.file_utils.ensure_file_exists","text":"Ensures the existence of files passed as parameter Raises: FileNotFoundError: If any file passed as a parameter doesn't exist a FileNotFound Exception will be raised Source code in src/pheval/utils/file_utils.py 73 74 75 76 77 78 79 80 def ensure_file_exists ( * files : str ): \"\"\"Ensures the existence of files passed as parameter Raises: FileNotFoundError: If any file passed as a parameter doesn't exist a FileNotFound Exception will be raised \"\"\" for file in files : if not path . isfile ( file ): raise FileNotFoundError ( f \"File { file } not found\" )","title":"ensure_file_exists"},{"location":"api/pheval/utils/file_utils/#src.pheval.utils.file_utils.files_with_suffix","text":"Obtains all files ending in a specified suffix from a given directory. Parameters: Name Type Description Default directory Path The directory path. required suffix str The specified suffix to filter files. required Returns: Type Description list [ Path ] list[Path]: A list of Path objects representing files with the specified suffix. Source code in src/pheval/utils/file_utils.py 15 16 17 18 19 20 21 22 23 24 25 26 27 28 def files_with_suffix ( directory : Path , suffix : str ) -> list [ Path ]: \"\"\" Obtains all files ending in a specified suffix from a given directory. Args: directory (Path): The directory path. suffix (str): The specified suffix to filter files. Returns: list[Path]: A list of Path objects representing files with the specified suffix. \"\"\" files = [ file_path for file_path in directory . iterdir () if file_path . suffix == suffix ] files . 
sort () return files","title":"files_with_suffix"},{"location":"api/pheval/utils/file_utils/#src.pheval.utils.file_utils.is_gzipped","text":"Confirms whether a file is gzipped. Parameters: Name Type Description Default file_path Path The path to the file. required Returns: Name Type Description bool bool True if the file is gzipped, False otherwise. Source code in src/pheval/utils/file_utils.py 46 47 48 49 50 51 52 53 54 55 56 def is_gzipped ( file_path : Path ) -> bool : \"\"\" Confirms whether a file is gzipped. Args: file_path (Path): The path to the file. Returns: bool: True if the file is gzipped, False otherwise. \"\"\" return file_path . name . endswith ( \".gz\" )","title":"is_gzipped"},{"location":"api/pheval/utils/file_utils/#src.pheval.utils.file_utils.normalise_file_name","text":"Normalises the file name by removing diacritical marks (accents) from Unicode characters. Parameters: Name Type Description Default file_path Path The path to the file. required Returns: Name Type Description str str The normalised file name without diacritical marks. Source code in src/pheval/utils/file_utils.py 59 60 61 62 63 64 65 66 67 68 69 70 def normalise_file_name ( file_path : Path ) -> str : \"\"\" Normalises the file name by removing diacritical marks (accents) from Unicode characters. Args: file_path (Path): The path to the file. Returns: str: The normalised file name without diacritical marks. \"\"\" normalised_file_name = unicodedata . normalize ( \"NFD\" , str ( file_path )) return re . sub ( \"[ \\u0300 - \\u036f ]\" , \"\" , normalised_file_name )","title":"normalise_file_name"},{"location":"api/pheval/utils/file_utils/#src.pheval.utils.file_utils.write_metadata","text":"Write the metadata for a run to a YAML file. Parameters: Name Type Description Default output_dir Path The directory where the metadata file will be saved. required meta_data BasicOutputRunMetaData The metadata to be written. 
required Source code in src/pheval/utils/file_utils.py 108 109 110 111 112 113 114 115 116 117 118 def write_metadata ( output_dir : Path , meta_data : BasicOutputRunMetaData ) -> None : \"\"\" Write the metadata for a run to a YAML file. Args: output_dir (Path): The directory where the metadata file will be saved. meta_data (BasicOutputRunMetaData): The metadata to be written. \"\"\" with open ( Path ( output_dir ) . joinpath ( \"results.yml\" ), \"w\" ) as metadata_file : yaml . dump ( to_dict ( meta_data ), metadata_file , sort_keys = False , default_style = \"\" ) metadata_file . close ()","title":"write_metadata"},{"location":"api/pheval/utils/phenopacket_utils/","text":"GeneIdentifierUpdater Class for updating gene identifiers within genomic interpretations. Source code in src/pheval/utils/phenopacket_utils.py 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 class GeneIdentifierUpdater : \"\"\"Class for updating gene identifiers within genomic interpretations.\"\"\" def __init__ ( self , gene_identifier : str , hgnc_data : dict = None , identifier_map : dict = None ): \"\"\" Initialise the GeneIdentifierUpdater. Args: gene_identifier (str): The gene identifier to update to. hgnc_data (dict): A dictionary containing HGNC data (default: None). identifier_map (dict): A dictionary mapping gene identifiers (default: None). \"\"\" self . hgnc_data = hgnc_data self . gene_identifier = gene_identifier self . identifier_map = identifier_map def find_identifier ( self , gene_symbol : str ) -> str : \"\"\" Find the specified gene identifier for a gene symbol. 
Args: gene_symbol (str): The gene symbol to find the identifier for. Returns: str: The identified gene identifier. \"\"\" if gene_symbol in self . hgnc_data . keys (): return self . hgnc_data [ gene_symbol ][ self . gene_identifier ] else : for _symbol , data in self . hgnc_data . items (): for prev_symbol in data [ \"previous_symbol\" ]: if prev_symbol == gene_symbol : return data [ self . gene_identifier ] def obtain_gene_symbol_from_identifier ( self , query_gene_identifier : str ) -> str : \"\"\" Obtain gene symbol from a gene identifier. Args: query_gene_identifier (str): The gene identifier. Returns: str: The gene symbol corresponding to the identifier. \"\"\" return self . identifier_map [ query_gene_identifier ] def _find_alternate_ids ( self , gene_symbol : str ) -> List [ str ]: \"\"\" Find the alternate IDs for a gene symbol. Args: gene_symbol (str): The gene symbol to find alternate IDs for. Returns: List[str]: List of alternate IDs for the gene symbol. \"\"\" if gene_symbol in self . hgnc_data . keys (): return [ self . hgnc_data [ gene_symbol ][ \"hgnc_id\" ], \"ncbigene:\" + self . hgnc_data [ gene_symbol ][ \"entrez_id\" ], \"ensembl:\" + self . hgnc_data [ gene_symbol ][ \"ensembl_id\" ], \"symbol:\" + gene_symbol , ] else : for symbol , data in self . hgnc_data . items (): for prev_symbol in data [ \"previous_symbol\" ]: if prev_symbol == gene_symbol : return [ data [ \"hgnc_id\" ], \"ncbigene:\" + data [ \"entrez_id\" ], \"ensembl:\" + data [ \"ensembl_id\" ], \"symbol:\" + symbol , ] def update_genomic_interpretations_gene_identifier ( self , interpretations : List [ Interpretation ], phenopacket_path : Path ) -> List [ Interpretation ]: \"\"\" Update the genomic interpretations of a Phenopacket. Args: interpretations (List[Interpretation]): List of Interpretation objects. Returns: List[Interpretation]: Updated list of Interpretation objects. 
\"\"\" updated_interpretations = copy ( list ( interpretations )) for updated_interpretation in updated_interpretations : for g in updated_interpretation . diagnosis . genomic_interpretations : updated_gene_identifier = self . find_identifier ( g . variant_interpretation . variation_descriptor . gene_context . symbol ) info_log . info ( f \"Updating gene identifier in { phenopacket_path } from \" f \" { g . variant_interpretation . variation_descriptor . gene_context . value_id } \" f \"to { updated_gene_identifier } \" ) g . variant_interpretation . variation_descriptor . gene_context . value_id = ( updated_gene_identifier ) del g . variant_interpretation . variation_descriptor . gene_context . alternate_ids [:] g . variant_interpretation . variation_descriptor . gene_context . alternate_ids . extend ( self . _find_alternate_ids ( g . variant_interpretation . variation_descriptor . gene_context . symbol ) ) return updated_interpretations __init__ ( gene_identifier , hgnc_data = None , identifier_map = None ) Initialise the GeneIdentifierUpdater. Parameters: Name Type Description Default gene_identifier str The gene identifier to update to. required hgnc_data dict A dictionary containing HGNC data (default: None). None identifier_map dict A dictionary mapping gene identifiers (default: None). None Source code in src/pheval/utils/phenopacket_utils.py 654 655 656 657 658 659 660 661 662 663 664 665 666 def __init__ ( self , gene_identifier : str , hgnc_data : dict = None , identifier_map : dict = None ): \"\"\" Initialise the GeneIdentifierUpdater. Args: gene_identifier (str): The gene identifier to update to. hgnc_data (dict): A dictionary containing HGNC data (default: None). identifier_map (dict): A dictionary mapping gene identifiers (default: None). \"\"\" self . hgnc_data = hgnc_data self . gene_identifier = gene_identifier self . identifier_map = identifier_map find_identifier ( gene_symbol ) Find the specified gene identifier for a gene symbol. 
Parameters: Name Type Description Default gene_symbol str The gene symbol to find the identifier for. required Returns: Name Type Description str str The identified gene identifier. Source code in src/pheval/utils/phenopacket_utils.py 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 def find_identifier ( self , gene_symbol : str ) -> str : \"\"\" Find the specified gene identifier for a gene symbol. Args: gene_symbol (str): The gene symbol to find the identifier for. Returns: str: The identified gene identifier. \"\"\" if gene_symbol in self . hgnc_data . keys (): return self . hgnc_data [ gene_symbol ][ self . gene_identifier ] else : for _symbol , data in self . hgnc_data . items (): for prev_symbol in data [ \"previous_symbol\" ]: if prev_symbol == gene_symbol : return data [ self . gene_identifier ] obtain_gene_symbol_from_identifier ( query_gene_identifier ) Obtain gene symbol from a gene identifier. Parameters: Name Type Description Default query_gene_identifier str The gene identifier. required Returns: Name Type Description str str The gene symbol corresponding to the identifier. Source code in src/pheval/utils/phenopacket_utils.py 686 687 688 689 690 691 692 693 694 695 696 def obtain_gene_symbol_from_identifier ( self , query_gene_identifier : str ) -> str : \"\"\" Obtain gene symbol from a gene identifier. Args: query_gene_identifier (str): The gene identifier. Returns: str: The gene symbol corresponding to the identifier. \"\"\" return self . identifier_map [ query_gene_identifier ] update_genomic_interpretations_gene_identifier ( interpretations , phenopacket_path ) Update the genomic interpretations of a Phenopacket. Parameters: Name Type Description Default interpretations List [ Interpretation ] List of Interpretation objects. required Returns: Type Description List [ Interpretation ] List[Interpretation]: Updated list of Interpretation objects. 
Source code in src/pheval/utils/phenopacket_utils.py 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 def update_genomic_interpretations_gene_identifier ( self , interpretations : List [ Interpretation ], phenopacket_path : Path ) -> List [ Interpretation ]: \"\"\" Update the genomic interpretations of a Phenopacket. Args: interpretations (List[Interpretation]): List of Interpretation objects. Returns: List[Interpretation]: Updated list of Interpretation objects. \"\"\" updated_interpretations = copy ( list ( interpretations )) for updated_interpretation in updated_interpretations : for g in updated_interpretation . diagnosis . genomic_interpretations : updated_gene_identifier = self . find_identifier ( g . variant_interpretation . variation_descriptor . gene_context . symbol ) info_log . info ( f \"Updating gene identifier in { phenopacket_path } from \" f \" { g . variant_interpretation . variation_descriptor . gene_context . value_id } \" f \"to { updated_gene_identifier } \" ) g . variant_interpretation . variation_descriptor . gene_context . value_id = ( updated_gene_identifier ) del g . variant_interpretation . variation_descriptor . gene_context . alternate_ids [:] g . variant_interpretation . variation_descriptor . gene_context . alternate_ids . extend ( self . _find_alternate_ids ( g . variant_interpretation . variation_descriptor . gene_context . symbol ) ) return updated_interpretations GenomicVariant dataclass Represents a genomic variant. Parameters: Name Type Description Default chrom str The chromosome position of the variant recommended to be provided in the following format. required pos int Position of the variant following VCF convention. required ref str Reference allele following VCF convention. required alt str Alternate allele following VCF convention. 
required Source code in src/pheval/utils/phenopacket_utils.py 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 @dataclass class GenomicVariant : \"\"\" Represents a genomic variant. Args: chrom (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. pos (int): Position of the variant following VCF convention. ref (str): Reference allele following VCF convention. alt (str): Alternate allele following VCF convention. \"\"\" chrom : str pos : int ref : str alt : str IncompatibleGenomeAssemblyError Bases: Exception Exception raised for incompatible genome assembly. Source code in src/pheval/utils/phenopacket_utils.py 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 class IncompatibleGenomeAssemblyError ( Exception ): \"\"\"Exception raised for incompatible genome assembly.\"\"\" def __init__ ( self , assembly , phenopacket , message = \"Incompatible Genome Assembly\" ): \"\"\" Initialise IncompatibleGenomeAssemblyError. Attributes: assembly (str): Incompatible genome assembly encountered. phenopacket (Path): Path to the Phenopacket associated with the error. message (str, optional): Custom error message (default is \"Incompatible Genome Assembly\"). \"\"\" self . assembly : str = assembly self . phenopacket : Path = phenopacket self . message : str = message super () . __init__ ( self . message ) def __str__ ( self ): return f \" { self . message } -> { self . assembly } in { self . phenopacket } \" __init__ ( assembly , phenopacket , message = 'Incompatible Genome Assembly' ) Initialise IncompatibleGenomeAssemblyError. Attributes: Name Type Description assembly str Incompatible genome assembly encountered. phenopacket Path Path to the Phenopacket associated with the error. message str Custom error message (default is \"Incompatible Genome Assembly\"). 
Source code in src/pheval/utils/phenopacket_utils.py 30 31 32 33 34 35 36 37 38 39 40 41 42 def __init__ ( self , assembly , phenopacket , message = \"Incompatible Genome Assembly\" ): \"\"\" Initialise IncompatibleGenomeAssemblyError. Attributes: assembly (str): Incompatible genome assembly encountered. phenopacket (Path): Path to the Phenopacket associated with the error. message (str, optional): Custom error message (default is \"Incompatible Genome Assembly\"). \"\"\" self . assembly : str = assembly self . phenopacket : Path = phenopacket self . message : str = message super () . __init__ ( self . message ) PhenopacketRebuilder Class for rebuilding a Phenopacket Source code in src/pheval/utils/phenopacket_utils.py 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 class PhenopacketRebuilder : \"\"\"Class for rebuilding a Phenopacket\"\"\" def __init__ ( self , phenopacket : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Attributes: phenopacket (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . phenopacket = phenopacket def update_interpretations ( self , interpretations : [ Interpretation ] ) -> Union [ Phenopacket , Family ]: \"\"\" Add the updated interpretations to a Phenopacket or Family. Args: interpretations (List[Interpretation]): The updated interpretations to be added. Returns: Union[Phenopacket, Family]: The Phenopacket or Family object with updated interpretations. \"\"\" phenopacket = copy ( self . phenopacket ) if hasattr ( phenopacket , \"proband\" ): del phenopacket . proband . interpretations [:] phenopacket . proband . interpretations . extend ( interpretations ) else : del phenopacket . interpretations [:] phenopacket . interpretations . 
extend ( interpretations ) return phenopacket def add_randomised_hpo ( self , randomised_hpo : [ PhenotypicFeature ]) -> Union [ Phenopacket , Family ]: \"\"\" Add randomised phenotypic profiles to a Phenopacket or Family. Args: randomised_hpo: The randomised phenotypic profiles to be added. Returns: Union[Phenopacket, Family] The Phenopacket or Family object with added randomised profiles. \"\"\" phenopacket = copy ( self . phenopacket ) if hasattr ( phenopacket , \"proband\" ): del phenopacket . proband . phenotypic_features [:] phenopacket . proband . phenotypic_features . extend ( randomised_hpo ) else : del phenopacket . phenotypic_features [:] phenopacket . phenotypic_features . extend ( randomised_hpo ) return phenopacket def add_spiked_vcf_path ( self , spiked_vcf_file_data : File ) -> Union [ Phenopacket , Family ]: \"\"\" Add a spiked VCF path to a Phenopacket or Family. Args: - spiked_vcf_file_data (File): The VCF file data to be added. Returns: - Phenopacket or Family: The Phenopacket or Family object with the added spiked VCF path. \"\"\" phenopacket = copy ( self . phenopacket ) phenopacket_files = [ file for file in phenopacket . files if file . file_attributes [ \"fileFormat\" ] != \"vcf\" ] phenopacket_files . append ( spiked_vcf_file_data ) del phenopacket . files [:] phenopacket . files . extend ( phenopacket_files ) return phenopacket __init__ ( phenopacket ) Initialise PhenopacketUtil Attributes: Name Type Description phenopacket Union [ Phenopacket , Family ] Phenopacket or Family object Source code in src/pheval/utils/phenopacket_utils.py 553 554 555 556 557 558 559 def __init__ ( self , phenopacket : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Attributes: phenopacket (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . phenopacket = phenopacket add_randomised_hpo ( randomised_hpo ) Add randomised phenotypic profiles to a Phenopacket or Family. 
Parameters: Name Type Description Default randomised_hpo [ PhenotypicFeature ] The randomised phenotypic profiles to be added. required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family] The Phenopacket or Family object with added randomised profiles. Source code in src/pheval/utils/phenopacket_utils.py 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 def add_randomised_hpo ( self , randomised_hpo : [ PhenotypicFeature ]) -> Union [ Phenopacket , Family ]: \"\"\" Add randomised phenotypic profiles to a Phenopacket or Family. Args: randomised_hpo: The randomised phenotypic profiles to be added. Returns: Union[Phenopacket, Family] The Phenopacket or Family object with added randomised profiles. \"\"\" phenopacket = copy ( self . phenopacket ) if hasattr ( phenopacket , \"proband\" ): del phenopacket . proband . phenotypic_features [:] phenopacket . proband . phenotypic_features . extend ( randomised_hpo ) else : del phenopacket . phenotypic_features [:] phenopacket . phenotypic_features . extend ( randomised_hpo ) return phenopacket add_spiked_vcf_path ( spiked_vcf_file_data ) Add a spiked VCF path to a Phenopacket or Family. Args: - spiked_vcf_file_data (File): The VCF file data to be added. Returns: - Phenopacket or Family: The Phenopacket or Family object with the added spiked VCF path. Source code in src/pheval/utils/phenopacket_utils.py 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 def add_spiked_vcf_path ( self , spiked_vcf_file_data : File ) -> Union [ Phenopacket , Family ]: \"\"\" Add a spiked VCF path to a Phenopacket or Family. Args: - spiked_vcf_file_data (File): The VCF file data to be added. Returns: - Phenopacket or Family: The Phenopacket or Family object with the added spiked VCF path. \"\"\" phenopacket = copy ( self . phenopacket ) phenopacket_files = [ file for file in phenopacket . files if file . file_attributes [ \"fileFormat\" ] != \"vcf\" ] phenopacket_files . 
append ( spiked_vcf_file_data ) del phenopacket . files [:] phenopacket . files . extend ( phenopacket_files ) return phenopacket update_interpretations ( interpretations ) Add the updated interpretations to a Phenopacket or Family. Parameters: Name Type Description Default interpretations List [ Interpretation ] The updated interpretations to be added. required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family]: The Phenopacket or Family object with updated interpretations. Source code in src/pheval/utils/phenopacket_utils.py 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 def update_interpretations ( self , interpretations : [ Interpretation ] ) -> Union [ Phenopacket , Family ]: \"\"\" Add the updated interpretations to a Phenopacket or Family. Args: interpretations (List[Interpretation]): The updated interpretations to be added. Returns: Union[Phenopacket, Family]: The Phenopacket or Family object with updated interpretations. \"\"\" phenopacket = copy ( self . phenopacket ) if hasattr ( phenopacket , \"proband\" ): del phenopacket . proband . interpretations [:] phenopacket . proband . interpretations . extend ( interpretations ) else : del phenopacket . interpretations [:] phenopacket . interpretations . 
extend ( interpretations ) return phenopacket PhenopacketUtil Class for retrieving data from a Phenopacket or Family object Source code in src/pheval/utils/phenopacket_utils.py 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 class PhenopacketUtil : \"\"\"Class for retrieving data from a Phenopacket or Family object\"\"\" def __init__ ( self , phenopacket_contents : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Args: phenopacket_contents (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . 
phenopacket_contents = phenopacket_contents def sample_id ( self ) -> str : \"\"\" Retrieve the sample ID from a Phenopacket or proband of a Family Returns: str: Sample ID \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . subject . id else : return self . phenopacket_contents . subject . id def phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all HPO terms Returns: List[PhenotypicFeature]: List of HPO terms \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . phenotypic_features else : return self . phenopacket_contents . phenotypic_features def observed_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all observed HPO terms Returns: List[PhenotypicFeature]: List of observed HPO terms \"\"\" phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : continue phenotypic_features . append ( p ) return phenotypic_features def negated_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all negated HPO terms Returns: List[PhenotypicFeature]: List of negated HPO terms \"\"\" negated_phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : negated_phenotypic_features . append ( p ) return negated_phenotypic_features def diseases ( self ) -> List [ Disease ]: \"\"\" Retrieve a list of Diseases associated with the proband Returns: List[Disease]: List of diseases \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . diseases else : return self . phenopacket_contents . 
diseases def _diagnosis_from_interpretations ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a list of disease diagnoses associated with the proband from the interpretations object Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" diagnoses = [] interpretation = self . interpretations () for i in interpretation : ( diagnoses . append ( ProbandDisease ( disease_name = i . diagnosis . disease . label , disease_identifier = i . diagnosis . disease . id , ) ) if i . diagnosis . disease . label != \"\" and i . diagnosis . disease . id != \"\" else None ) return diagnoses def _diagnosis_from_disease ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a list of disease diagnoses associated with the proband from the diseases object Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" diagnoses = [] for disease in self . diseases (): diagnoses . append ( ProbandDisease ( disease_name = disease . term . label , disease_identifier = disease . term . id ) ) return diagnoses def diagnoses ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" return list ( set ( self . _diagnosis_from_interpretations () + self . _diagnosis_from_disease ())) def interpretations ( self ) -> List [ Interpretation ]: \"\"\" Retrieve a list of interpretations from a Phenopacket Returns: List[Interpretation]: List of interpretations \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . interpretations else : return self . phenopacket_contents . interpretations def causative_variants ( self ) -> List [ ProbandCausativeVariant ]: \"\"\" Retrieve a list of causative variants listed in a Phenopacket Returns: List[ProbandCausativeVariant]: List of proband causative variants \"\"\" all_variants = [] interpretation = self . 
interpretations () for i in interpretation : for g in i . diagnosis . genomic_interpretations : vcf_record = g . variant_interpretation . variation_descriptor . vcf_record genotype = g . variant_interpretation . variation_descriptor . allelic_state variant_data = ProbandCausativeVariant ( self . phenopacket_contents . subject . id , vcf_record . genome_assembly , GenomicVariant ( vcf_record . chrom , vcf_record . pos , vcf_record . ref , vcf_record . alt , ), genotype . label , vcf_record . info , ) all_variants . append ( variant_data ) return all_variants def files ( self ) -> List [ File ]: \"\"\" Retrieve a list of files associated with a phenopacket Returns: List[File]: List of files associated with a phenopacket \"\"\" return self . phenopacket_contents . files def vcf_file_data ( self , phenopacket_path : Path , vcf_dir : Path ) -> File : \"\"\" Retrieve the genome assembly and VCF file name from a phenopacket. Args: phenopacket_path (Path): The path to the phenopacket file. vcf_dir (Path): The directory path where the VCF file is stored. Returns: File: The VCF file with updated URI pointing to the specified directory. Raises: IncorrectFileFormatError: If the provided file is not in .vcf or .vcf.gz format. IncompatibleGenomeAssemblyError: If the genome assembly of the VCF file is not compatible. Note: This function searches for a VCF file within the provided list of files, validates its format, and checks if the genome assembly is compatible. If the conditions are met, it updates the URI of the VCF file to the specified directory and returns the modified file object. \"\"\" compatible_genome_assembly = [ \"GRCh37\" , \"hg19\" , \"GRCh38\" , \"hg38\" ] vcf_data = [ file for file in self . files () if file . file_attributes [ \"fileFormat\" ] == \"vcf\" ][ 0 ] if not Path ( vcf_data . uri ) . name . endswith ( \".vcf\" ) and not Path ( vcf_data . uri ) . name . endswith ( \".vcf.gz\" ): raise IncorrectFileFormatError ( Path ( vcf_data . 
uri ), \".vcf or .vcf.gz file\" ) if vcf_data . file_attributes [ \"genomeAssembly\" ] not in compatible_genome_assembly : raise IncompatibleGenomeAssemblyError ( vcf_data . file_attributes [ \"genomeAssembly\" ], phenopacket_path ) vcf_data . uri = str ( vcf_dir . joinpath ( Path ( vcf_data . uri ) . name )) return vcf_data @staticmethod def _extract_diagnosed_gene ( genomic_interpretation : GenomicInterpretation , ) -> ProbandCausativeGene : \"\"\" Retrieve the disease causing genes from the variant descriptor field if not empty, otherwise, retrieves from the gene descriptor from a phenopacket. Args: genomic_interpretation (GenomicInterpretation): A genomic interpretation from a Phenopacket Returns: ProbandCausativeGene: The disease causing gene \"\"\" if genomic_interpretation . variant_interpretation . ByteSize () != 0 : return ProbandCausativeGene ( genomic_interpretation . variant_interpretation . variation_descriptor . gene_context . symbol , genomic_interpretation . variant_interpretation . variation_descriptor . gene_context . value_id , ) else : return ProbandCausativeGene ( gene_symbol = genomic_interpretation . gene . symbol , gene_identifier = genomic_interpretation . gene . value_id , ) def diagnosed_genes ( self ) -> List [ ProbandCausativeGene ]: \"\"\" Retrieve the disease causing genes from a phenopacket. Returns: List[ProbandCausativeGene]: List of causative genes \"\"\" pheno_interpretation = self . interpretations () genes = [] for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : genes . append ( self . _extract_diagnosed_gene ( g )) genes = list ({ gene . gene_symbol : gene for gene in genes } . values ()) return genes def diagnosed_variants ( self ) -> List [ GenomicVariant ]: \"\"\" Retrieve a list of all known causative variants from a phenopacket. Returns: List[GenomicVariant]: List of causative variants \"\"\" variants = [] pheno_interpretation = self . 
interpretations () for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : variant = GenomicVariant ( chrom = str ( g . variant_interpretation . variation_descriptor . vcf_record . chrom . replace ( \"chr\" , \"\" ) ), pos = int ( g . variant_interpretation . variation_descriptor . vcf_record . pos ), ref = g . variant_interpretation . variation_descriptor . vcf_record . ref , alt = g . variant_interpretation . variation_descriptor . vcf_record . alt , ) variants . append ( variant ) return variants def check_incomplete_variant_record ( self ) -> bool : \"\"\" Check if any variant record in the phenopacket has incomplete information. This method iterates through the diagnosed variant records and checks if any of them have missing or incomplete information such as empty chromosome, position, reference, or alternate allele. Returns: bool: True if any variant record is incomplete, False otherwise. \"\"\" variants = self . diagnosed_variants () for variant in variants : if ( variant . chrom == \"\" or variant . pos == 0 or variant . pos == \"\" or variant . ref == \"\" or variant . alt == \"\" ): return True return False def check_variant_alleles ( self ) -> bool : \"\"\" Check if any variant record in the phenopacket has identical reference and alternate alleles. Returns: bool: True if the reference and alternate alleles are identical, False otherwise. \"\"\" variants = self . diagnosed_variants () for variant in variants : if variant . ref == variant . alt : return True return False def check_incomplete_gene_record ( self ) -> bool : \"\"\" Check if any gene record in the phenopacket has incomplete information. This method iterates through the diagnosed gene records and checks if any of them have missing or incomplete information such as gene name, or gene identifier. Returns: bool: True if any gene record is incomplete, False otherwise. \"\"\" genes = self . diagnosed_genes () for gene in genes : if gene . gene_symbol == \"\" or gene . 
gene_identifier == \"\" : return True return False def check_incomplete_disease_record ( self ) -> bool : \"\"\" Check if any disease record in the phenopacket has incomplete information. This method iterates through the diagnosed disease records and checks if any of them have missing or incomplete information such as empty disease name, or disease identifier. Returns: bool: True if any disease record is incomplete, False otherwise. \"\"\" if len ( self . diagnoses ()) == 0 : return True return False __init__ ( phenopacket_contents ) Initialise PhenopacketUtil Parameters: Name Type Description Default phenopacket_contents Union [ Phenopacket , Family ] Phenopacket or Family object required Source code in src/pheval/utils/phenopacket_utils.py 222 223 224 225 226 227 228 def __init__ ( self , phenopacket_contents : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Args: phenopacket_contents (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . phenopacket_contents = phenopacket_contents causative_variants () Retrieve a list of causative variants listed in a Phenopacket Returns: Type Description List [ ProbandCausativeVariant ] List[ProbandCausativeVariant]: List of proband causative variants Source code in src/pheval/utils/phenopacket_utils.py 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 def causative_variants ( self ) -> List [ ProbandCausativeVariant ]: \"\"\" Retrieve a list of causative variants listed in a Phenopacket Returns: List[ProbandCausativeVariant]: List of proband causative variants \"\"\" all_variants = [] interpretation = self . interpretations () for i in interpretation : for g in i . diagnosis . genomic_interpretations : vcf_record = g . variant_interpretation . variation_descriptor . vcf_record genotype = g . variant_interpretation . variation_descriptor . allelic_state variant_data = ProbandCausativeVariant ( self . phenopacket_contents . subject . 
id , vcf_record . genome_assembly , GenomicVariant ( vcf_record . chrom , vcf_record . pos , vcf_record . ref , vcf_record . alt , ), genotype . label , vcf_record . info , ) all_variants . append ( variant_data ) return all_variants check_incomplete_disease_record () Check if any disease record in the phenopacket has incomplete information. This method iterates through the diagnosed disease records and checks if any of them have missing or incomplete information such as empty disease name, or disease identifier. Returns: Name Type Description bool bool True if any disease record is incomplete, False otherwise. Source code in src/pheval/utils/phenopacket_utils.py 535 536 537 538 539 540 541 542 543 544 545 546 547 def check_incomplete_disease_record ( self ) -> bool : \"\"\" Check if any disease record in the phenopacket has incomplete information. This method iterates through the diagnosed disease records and checks if any of them have missing or incomplete information such as empty disease name, or disease identifier. Returns: bool: True if any disease record is incomplete, False otherwise. \"\"\" if len ( self . diagnoses ()) == 0 : return True return False check_incomplete_gene_record () Check if any gene record in the phenopacket has incomplete information. This method iterates through the diagnosed gene records and checks if any of them have missing or incomplete information such as gene name, or gene identifier. Returns: Name Type Description bool bool True if any gene record is incomplete, False otherwise. Source code in src/pheval/utils/phenopacket_utils.py 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 def check_incomplete_gene_record ( self ) -> bool : \"\"\" Check if any gene record in the phenopacket has incomplete information. This method iterates through the diagnosed gene records and checks if any of them have missing or incomplete information such as gene name, or gene identifier. 
Returns: bool: True if any gene record is incomplete, False otherwise. \"\"\" genes = self . diagnosed_genes () for gene in genes : if gene . gene_symbol == \"\" or gene . gene_identifier == \"\" : return True return False check_incomplete_variant_record () Check if any variant record in the phenopacket has incomplete information. This method iterates through the diagnosed variant records and checks if any of them have missing or incomplete information such as empty chromosome, position, reference, or alternate allele. Returns: Name Type Description bool bool True if any variant record is incomplete, False otherwise. Source code in src/pheval/utils/phenopacket_utils.py 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 def check_incomplete_variant_record ( self ) -> bool : \"\"\" Check if any variant record in the phenopacket has incomplete information. This method iterates through the diagnosed variant records and checks if any of them have missing or incomplete information such as empty chromosome, position, reference, or alternate allele. Returns: bool: True if any variant record is incomplete, False otherwise. \"\"\" variants = self . diagnosed_variants () for variant in variants : if ( variant . chrom == \"\" or variant . pos == 0 or variant . pos == \"\" or variant . ref == \"\" or variant . alt == \"\" ): return True return False check_variant_alleles () Check if any variant record in the phenopacket has identical reference and alternate alleles. Returns: Name Type Description bool bool True if the reference and alternate alleles are identical, False otherwise. Source code in src/pheval/utils/phenopacket_utils.py 506 507 508 509 510 511 512 513 514 515 516 517 def check_variant_alleles ( self ) -> bool : \"\"\" Check if any variant record in the phenopacket has identical reference and alternate alleles. Returns: bool: True if the reference and alternate alleles are identical, False otherwise. \"\"\" variants = self . 
diagnosed_variants () for variant in variants : if variant . ref == variant . alt : return True return False diagnosed_genes () Retrieve the disease causing genes from a phenopacket. Returns: List[ProbandCausativeGene]: List of causative genes Source code in src/pheval/utils/phenopacket_utils.py 446 447 448 449 450 451 452 453 454 455 456 457 458 def diagnosed_genes ( self ) -> List [ ProbandCausativeGene ]: \"\"\" Retrieve the disease causing genes from a phenopacket. Returns: List[ProbandCausativeGene]: List of causative genes \"\"\" pheno_interpretation = self . interpretations () genes = [] for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : genes . append ( self . _extract_diagnosed_gene ( g )) genes = list ({ gene . gene_symbol : gene for gene in genes } . values ()) return genes diagnosed_variants () Retrieve a list of all known causative variants from a phenopacket. Returns: List[GenomicVariant]: List of causative variants Source code in src/pheval/utils/phenopacket_utils.py 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 def diagnosed_variants ( self ) -> List [ GenomicVariant ]: \"\"\" Retrieve a list of all known causative variants from a phenopacket. Returns: List[GenomicVariant]: List of causative variants \"\"\" variants = [] pheno_interpretation = self . interpretations () for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : variant = GenomicVariant ( chrom = str ( g . variant_interpretation . variation_descriptor . vcf_record . chrom . replace ( \"chr\" , \"\" ) ), pos = int ( g . variant_interpretation . variation_descriptor . vcf_record . pos ), ref = g . variant_interpretation . variation_descriptor . vcf_record . ref , alt = g . variant_interpretation . variation_descriptor . vcf_record . alt , ) variants . 
append ( variant ) return variants diagnoses () Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket Returns: Type Description List [ ProbandDisease ] List[ProbandDisease]: List of diagnosed diseases Source code in src/pheval/utils/phenopacket_utils.py 331 332 333 334 335 336 337 338 def diagnoses ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" return list ( set ( self . _diagnosis_from_interpretations () + self . _diagnosis_from_disease ())) diseases () Retrieve a list of Diseases associated with the proband Returns: Type Description List [ Disease ] List[Disease]: List of diseases Source code in src/pheval/utils/phenopacket_utils.py 283 284 285 286 287 288 289 290 291 292 293 def diseases ( self ) -> List [ Disease ]: \"\"\" Retrieve a list of Diseases associated with the proband Returns: List[Disease]: List of diseases \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . diseases else : return self . phenopacket_contents . diseases files () Retrieve a list of files associated with a phenopacket Returns: Type Description List [ File ] List[File]: List of files associated with a phenopacket Source code in src/pheval/utils/phenopacket_utils.py 380 381 382 383 384 385 386 387 def files ( self ) -> List [ File ]: \"\"\" Retrieve a list of files associated with a phenopacket Returns: List[File]: List of files associated with a phenopacket \"\"\" return self . phenopacket_contents . 
files interpretations () Retrieve a list of interpretations from a Phenopacket Returns: Type Description List [ Interpretation ] List[Interpretation]: List of interpretations Source code in src/pheval/utils/phenopacket_utils.py 340 341 342 343 344 345 346 347 348 349 350 def interpretations ( self ) -> List [ Interpretation ]: \"\"\" Retrieve a list of interpretations from a Phenopacket Returns: List[Interpretation]: List of interpretations \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . interpretations else : return self . phenopacket_contents . interpretations negated_phenotypic_features () Retrieve a list of all negated HPO terms Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: List of negated HPO terms Source code in src/pheval/utils/phenopacket_utils.py 269 270 271 272 273 274 275 276 277 278 279 280 281 def negated_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all negated HPO terms Returns: List[PhenotypicFeature]: List of negated HPO terms \"\"\" negated_phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : negated_phenotypic_features . append ( p ) return negated_phenotypic_features observed_phenotypic_features () Retrieve a list of all observed HPO terms Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: List of observed HPO terms Source code in src/pheval/utils/phenopacket_utils.py 254 255 256 257 258 259 260 261 262 263 264 265 266 267 def observed_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all observed HPO terms Returns: List[PhenotypicFeature]: List of observed HPO terms \"\"\" phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : continue phenotypic_features . 
append ( p ) return phenotypic_features phenotypic_features () Retrieve a list of all HPO terms Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: List of HPO terms Source code in src/pheval/utils/phenopacket_utils.py 242 243 244 245 246 247 248 249 250 251 252 def phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all HPO terms Returns: List[PhenotypicFeature]: List of HPO terms \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . phenotypic_features else : return self . phenopacket_contents . phenotypic_features sample_id () Retrieve the sample ID from a Phenopacket or proband of a Family Returns: Name Type Description str str Sample ID Source code in src/pheval/utils/phenopacket_utils.py 230 231 232 233 234 235 236 237 238 239 240 def sample_id ( self ) -> str : \"\"\" Retrieve the sample ID from a Phenopacket or proband of a Family Returns: str: Sample ID \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . subject . id else : return self . phenopacket_contents . subject . id vcf_file_data ( phenopacket_path , vcf_dir ) Retrieve the genome assembly and VCF file name from a phenopacket. Parameters: Name Type Description Default phenopacket_path Path The path to the phenopacket file. required vcf_dir Path The directory path where the VCF file is stored. required Returns: Name Type Description File File The VCF file with updated URI pointing to the specified directory. Raises: Type Description IncorrectFileFormatError If the provided file is not in .vcf or .vcf.gz format. IncompatibleGenomeAssemblyError If the genome assembly of the VCF file is not compatible. Note This function searches for a VCF file within the provided list of files, validates its format, and checks if the genome assembly is compatible. 
If the conditions are met, it updates the URI of the VCF file to the specified directory and returns the modified file object. Source code in src/pheval/utils/phenopacket_utils.py 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 def vcf_file_data ( self , phenopacket_path : Path , vcf_dir : Path ) -> File : \"\"\" Retrieve the genome assembly and VCF file name from a phenopacket. Args: phenopacket_path (Path): The path to the phenopacket file. vcf_dir (Path): The directory path where the VCF file is stored. Returns: File: The VCF file with updated URI pointing to the specified directory. Raises: IncorrectFileFormatError: If the provided file is not in .vcf or .vcf.gz format. IncompatibleGenomeAssemblyError: If the genome assembly of the VCF file is not compatible. Note: This function searches for a VCF file within the provided list of files, validates its format, and checks if the genome assembly is compatible. If the conditions are met, it updates the URI of the VCF file to the specified directory and returns the modified file object. \"\"\" compatible_genome_assembly = [ \"GRCh37\" , \"hg19\" , \"GRCh38\" , \"hg38\" ] vcf_data = [ file for file in self . files () if file . file_attributes [ \"fileFormat\" ] == \"vcf\" ][ 0 ] if not Path ( vcf_data . uri ) . name . endswith ( \".vcf\" ) and not Path ( vcf_data . uri ) . name . endswith ( \".vcf.gz\" ): raise IncorrectFileFormatError ( Path ( vcf_data . uri ), \".vcf or .vcf.gz file\" ) if vcf_data . file_attributes [ \"genomeAssembly\" ] not in compatible_genome_assembly : raise IncompatibleGenomeAssemblyError ( vcf_data . file_attributes [ \"genomeAssembly\" ], phenopacket_path ) vcf_data . uri = str ( vcf_dir . joinpath ( Path ( vcf_data . uri ) . 
name )) return vcf_data ProbandCausativeGene dataclass Represents a causative gene associated with a proband Parameters: Name Type Description Default gene_symbol str Symbol representing the gene required gene_identifier str The ENSEMBL gene identifier for the result entry required Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations and result output is acceptable for result matching purposes in the analysis. Source code in src/pheval/utils/phenopacket_utils.py 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 @dataclass class ProbandCausativeGene : \"\"\" Represents a causative gene associated with a proband Args: gene_symbol (str): Symbol representing the gene gene_identifier (str): The ENSEMBL gene identifier for the result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations and result output is acceptable for result matching purposes in the analysis. 
\"\"\" gene_symbol : str gene_identifier : str ProbandCausativeVariant dataclass Represents a causative variant associated with a proband Parameters: Name Type Description Default proband_id str ID of the proband required assembly str Genome assembly required variant GenomicVariant Genomic variant associated with the proband required genotype str Genotype information for the variant required info str Additional information about the variant (default is an empty string) '' Source code in src/pheval/utils/phenopacket_utils.py 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 @dataclass class ProbandCausativeVariant : \"\"\" Represents a causative variant associated with a proband Args: proband_id (str): ID of the proband assembly (str): Genome assembly variant (GenomicVariant): Genomic variant associated with the proband genotype (str): Genotype information for the variant info (str, optional): Additional information about the variant (default is an empty string) \"\"\" proband_id : str assembly : str variant : GenomicVariant genotype : str info : str = \"\" ProbandDisease dataclass Represents a disease associated with a proband Parameters: Name Type Description Default disease_name str Name of the disease required disease_identifier str Identifier for the disease result entry in the OMIM namespace required Notes While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations and result output is acceptable for result matching purposes in the analysis. 
Source code in src/pheval/utils/phenopacket_utils.py 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 @dataclass ( frozen = True , eq = True ) class ProbandDisease : \"\"\" Represents a disease associated with a proband Args: disease_name (str): Name of the disease disease_identifier (str): Identifier for the disease result entry in the OMIM namespace Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations and result output is acceptable for result matching purposes in the analysis. \"\"\" disease_name : str disease_identifier : str create_gene_identifier_map () Create a mapping of gene identifiers to gene symbols using HGNC data. Returns: Name Type Description dict dict A mapping of gene identifiers to gene symbols. Notes The dictionary structure: { 'identifier': 'gene_symbol', ... } Source code in src/pheval/utils/phenopacket_utils.py 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 def create_gene_identifier_map () -> dict : \"\"\" Create a mapping of gene identifiers to gene symbols using HGNC data. Returns: dict: A mapping of gene identifiers to gene symbols. Notes: The dictionary structure: { 'identifier': 'gene_symbol', ... } \"\"\" hgnc_df = read_hgnc_data () identifier_map = {} for _index , row in hgnc_df . iterrows (): identifier_map [ row [ \"ensembl_gene_id\" ]] = row [ \"symbol\" ] identifier_map [ row [ \"hgnc_id\" ]] = row [ \"symbol\" ] identifier_map [ row [ \"entrez_id\" ]] = row [ \"symbol\" ] identifier_map [ row [ \"refseq_accession\" ]] = row [ \"symbol\" ] return identifier_map create_hgnc_dict () Create a dictionary as a reference for updating gene symbols and identifiers based on HGNC data. Returns: Name Type Description defaultdict defaultdict A dictionary containing gene symbols as keys and their associated gene information. 
Notes The dictionary structure: { 'gene_symbol': { 'ensembl_id': str, 'hgnc_id': str, 'entrez_id': str, 'refseq_accession': str, 'previous_symbol': [str, ...] }, ... } Source code in src/pheval/utils/phenopacket_utils.py 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 def create_hgnc_dict () -> defaultdict : \"\"\" Create a dictionary as a reference for updating gene symbols and identifiers based on HGNC data. Returns: defaultdict: A dictionary containing gene symbols as keys and their associated gene information. Notes: The dictionary structure: { 'gene_symbol': { 'ensembl_id': str, 'hgnc_id': str, 'entrez_id': str, 'refseq_accession': str, 'previous_symbol': [str, ...] }, ... } \"\"\" hgnc_df = read_hgnc_data () hgnc_data = defaultdict ( dict ) for _index , row in hgnc_df . iterrows (): previous_names = [] hgnc_data [ row [ \"symbol\" ]][ \"ensembl_id\" ] = row [ \"ensembl_gene_id\" ] hgnc_data [ row [ \"symbol\" ]][ \"hgnc_id\" ] = row [ \"hgnc_id\" ] hgnc_data [ row [ \"symbol\" ]][ \"entrez_id\" ] = row [ \"entrez_id\" ] hgnc_data [ row [ \"symbol\" ]][ \"refseq_accession\" ] = row [ \"refseq_accession\" ] previous = str ( row [ \"prev_symbol\" ]) . split ( \"|\" ) for p in previous : previous_names . append ( p . strip ( '\"' )) hgnc_data [ row [ \"symbol\" ]][ \"previous_symbol\" ] = previous_names return hgnc_data create_json_message ( phenopacket ) Create a JSON message for writing to a file. Args: - phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family object to convert to JSON. Returns: - str: A JSON-formatted string representation of the Phenopacket or Family object. Source code in src/pheval/utils/phenopacket_utils.py 621 622 623 624 625 626 627 628 629 630 631 def create_json_message ( phenopacket : Union [ Phenopacket , Family ]) -> str : \"\"\" Create a JSON message for writing to a file. 
Args: - phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family object to convert to JSON. Returns: - str: A JSON-formatted string representation of the Phenopacket or Family object. \"\"\" return MessageToJson ( phenopacket ) phenopacket_reader ( file ) Read a Phenopacket file and returns its contents as a Phenopacket or Family object Parameters: Name Type Description Default file Path Path to the Phenopacket file required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family]: Contents of the Phenopacket file as a Phenopacket or Family object Source code in src/pheval/utils/phenopacket_utils.py 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 def phenopacket_reader ( file : Path ) -> Union [ Phenopacket , Family ]: \"\"\" Read a Phenopacket file and returns its contents as a Phenopacket or Family object Args: file (Path): Path to the Phenopacket file Returns: Union[Phenopacket, Family]: Contents of the Phenopacket file as a Phenopacket or Family object \"\"\" file = open ( file , \"r\" ) phenopacket = json . load ( file ) file . close () if \"proband\" in phenopacket : return Parse ( json . dumps ( phenopacket ), Family ()) else : return Parse ( json . dumps ( phenopacket ), Phenopacket ()) read_hgnc_data () Read HGNC data from a file and return it as a Pandas DataFrame. Returns: Type Description DataFrame pd.DataFrame: DataFrame containing the HGNC data. Source code in src/pheval/utils/phenopacket_utils.py 125 126 127 128 129 130 131 132 133 134 135 136 def read_hgnc_data () -> pd . DataFrame : \"\"\" Read HGNC data from a file and return it as a Pandas DataFrame. Returns: pd.DataFrame: DataFrame containing the HGNC data. \"\"\" return pd . read_csv ( os . path . dirname ( __file__ ) . replace ( \"utils\" , \"resources/hgnc_complete_set.txt\" ), delimiter = \" \\t \" , dtype = str , ) write_phenopacket ( phenopacket , output_file ) Write a Phenopacket or Family object to a file in JSON format. 
Parameters: Name Type Description Default phenopacket Phenopacket or Family The Phenopacket or Family object to be written. required output_file Path The Path object representing the file to write the Phenopacket data. required Returns: Type Description None None Source code in src/pheval/utils/phenopacket_utils.py 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 def write_phenopacket ( phenopacket : Union [ Phenopacket , Family ], output_file : Path ) -> None : \"\"\" Write a Phenopacket or Family object to a file in JSON format. Args: phenopacket (Phenopacket or Family): The Phenopacket or Family object to be written. output_file (Path): The Path object representing the file to write the Phenopacket data. Returns: None \"\"\" phenopacket_json = create_json_message ( phenopacket ) with open ( output_file , \"w\" ) as outfile : outfile . write ( phenopacket_json ) outfile . close ()","title":"Phenopacket utils"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.GeneIdentifierUpdater","text":"Class for updating gene identifiers within genomic interpretations. Source code in src/pheval/utils/phenopacket_utils.py 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 class GeneIdentifierUpdater : \"\"\"Class for updating gene identifiers within genomic interpretations.\"\"\" def __init__ ( self , gene_identifier : str , hgnc_data : dict = None , identifier_map : dict = None ): \"\"\" Initialise the GeneIdentifierUpdater. Args: gene_identifier (str): The gene identifier to update to. hgnc_data (dict): A dictionary containing HGNC data (default: None). 
identifier_map (dict): A dictionary mapping gene identifiers (default: None). \"\"\" self . hgnc_data = hgnc_data self . gene_identifier = gene_identifier self . identifier_map = identifier_map def find_identifier ( self , gene_symbol : str ) -> str : \"\"\" Find the specified gene identifier for a gene symbol. Args: gene_symbol (str): The gene symbol to find the identifier for. Returns: str: The identified gene identifier. \"\"\" if gene_symbol in self . hgnc_data . keys (): return self . hgnc_data [ gene_symbol ][ self . gene_identifier ] else : for _symbol , data in self . hgnc_data . items (): for prev_symbol in data [ \"previous_symbol\" ]: if prev_symbol == gene_symbol : return data [ self . gene_identifier ] def obtain_gene_symbol_from_identifier ( self , query_gene_identifier : str ) -> str : \"\"\" Obtain gene symbol from a gene identifier. Args: query_gene_identifier (str): The gene identifier. Returns: str: The gene symbol corresponding to the identifier. \"\"\" return self . identifier_map [ query_gene_identifier ] def _find_alternate_ids ( self , gene_symbol : str ) -> List [ str ]: \"\"\" Find the alternate IDs for a gene symbol. Args: gene_symbol (str): The gene symbol to find alternate IDs for. Returns: List[str]: List of alternate IDs for the gene symbol. \"\"\" if gene_symbol in self . hgnc_data . keys (): return [ self . hgnc_data [ gene_symbol ][ \"hgnc_id\" ], \"ncbigene:\" + self . hgnc_data [ gene_symbol ][ \"entrez_id\" ], \"ensembl:\" + self . hgnc_data [ gene_symbol ][ \"ensembl_id\" ], \"symbol:\" + gene_symbol , ] else : for symbol , data in self . hgnc_data . 
items (): for prev_symbol in data [ \"previous_symbol\" ]: if prev_symbol == gene_symbol : return [ data [ \"hgnc_id\" ], \"ncbigene:\" + data [ \"entrez_id\" ], \"ensembl:\" + data [ \"ensembl_id\" ], \"symbol:\" + symbol , ] def update_genomic_interpretations_gene_identifier ( self , interpretations : List [ Interpretation ], phenopacket_path : Path ) -> List [ Interpretation ]: \"\"\" Update the genomic interpretations of a Phenopacket. Args: interpretations (List[Interpretation]): List of Interpretation objects. Returns: List[Interpretation]: Updated list of Interpretation objects. \"\"\" updated_interpretations = copy ( list ( interpretations )) for updated_interpretation in updated_interpretations : for g in updated_interpretation . diagnosis . genomic_interpretations : updated_gene_identifier = self . find_identifier ( g . variant_interpretation . variation_descriptor . gene_context . symbol ) info_log . info ( f \"Updating gene identifier in { phenopacket_path } from \" f \" { g . variant_interpretation . variation_descriptor . gene_context . value_id } \" f \"to { updated_gene_identifier } \" ) g . variant_interpretation . variation_descriptor . gene_context . value_id = ( updated_gene_identifier ) del g . variant_interpretation . variation_descriptor . gene_context . alternate_ids [:] g . variant_interpretation . variation_descriptor . gene_context . alternate_ids . extend ( self . _find_alternate_ids ( g . variant_interpretation . variation_descriptor . gene_context . symbol ) ) return updated_interpretations","title":"GeneIdentifierUpdater"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.GeneIdentifierUpdater.__init__","text":"Initialise the GeneIdentifierUpdater. Parameters: Name Type Description Default gene_identifier str The gene identifier to update to. required hgnc_data dict A dictionary containing HGNC data (default: None). None identifier_map dict A dictionary mapping gene identifiers (default: None). 
None Source code in src/pheval/utils/phenopacket_utils.py 654 655 656 657 658 659 660 661 662 663 664 665 666 def __init__ ( self , gene_identifier : str , hgnc_data : dict = None , identifier_map : dict = None ): \"\"\" Initialise the GeneIdentifierUpdater. Args: gene_identifier (str): The gene identifier to update to. hgnc_data (dict): A dictionary containing HGNC data (default: None). identifier_map (dict): A dictionary mapping gene identifiers (default: None). \"\"\" self . hgnc_data = hgnc_data self . gene_identifier = gene_identifier self . identifier_map = identifier_map","title":"__init__"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.GeneIdentifierUpdater.find_identifier","text":"Find the specified gene identifier for a gene symbol. Parameters: Name Type Description Default gene_symbol str The gene symbol to find the identifier for. required Returns: Name Type Description str str The identified gene identifier. Source code in src/pheval/utils/phenopacket_utils.py 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 def find_identifier ( self , gene_symbol : str ) -> str : \"\"\" Find the specified gene identifier for a gene symbol. Args: gene_symbol (str): The gene symbol to find the identifier for. Returns: str: The identified gene identifier. \"\"\" if gene_symbol in self . hgnc_data . keys (): return self . hgnc_data [ gene_symbol ][ self . gene_identifier ] else : for _symbol , data in self . hgnc_data . items (): for prev_symbol in data [ \"previous_symbol\" ]: if prev_symbol == gene_symbol : return data [ self . gene_identifier ]","title":"find_identifier"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.GeneIdentifierUpdater.obtain_gene_symbol_from_identifier","text":"Obtain gene symbol from a gene identifier. Parameters: Name Type Description Default query_gene_identifier str The gene identifier. 
required Returns: Name Type Description str str The gene symbol corresponding to the identifier. Source code in src/pheval/utils/phenopacket_utils.py 686 687 688 689 690 691 692 693 694 695 696 def obtain_gene_symbol_from_identifier ( self , query_gene_identifier : str ) -> str : \"\"\" Obtain gene symbol from a gene identifier. Args: query_gene_identifier (str): The gene identifier. Returns: str: The gene symbol corresponding to the identifier. \"\"\" return self . identifier_map [ query_gene_identifier ]","title":"obtain_gene_symbol_from_identifier"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.GeneIdentifierUpdater.update_genomic_interpretations_gene_identifier","text":"Update the genomic interpretations of a Phenopacket. Parameters: Name Type Description Default interpretations List [ Interpretation ] List of Interpretation objects. required Returns: Type Description List [ Interpretation ] List[Interpretation]: Updated list of Interpretation objects. Source code in src/pheval/utils/phenopacket_utils.py 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 def update_genomic_interpretations_gene_identifier ( self , interpretations : List [ Interpretation ], phenopacket_path : Path ) -> List [ Interpretation ]: \"\"\" Update the genomic interpretations of a Phenopacket. Args: interpretations (List[Interpretation]): List of Interpretation objects. Returns: List[Interpretation]: Updated list of Interpretation objects. \"\"\" updated_interpretations = copy ( list ( interpretations )) for updated_interpretation in updated_interpretations : for g in updated_interpretation . diagnosis . genomic_interpretations : updated_gene_identifier = self . find_identifier ( g . variant_interpretation . variation_descriptor . gene_context . symbol ) info_log . info ( f \"Updating gene identifier in { phenopacket_path } from \" f \" { g . variant_interpretation . 
variation_descriptor . gene_context . value_id } \" f \"to { updated_gene_identifier } \" ) g . variant_interpretation . variation_descriptor . gene_context . value_id = ( updated_gene_identifier ) del g . variant_interpretation . variation_descriptor . gene_context . alternate_ids [:] g . variant_interpretation . variation_descriptor . gene_context . alternate_ids . extend ( self . _find_alternate_ids ( g . variant_interpretation . variation_descriptor . gene_context . symbol ) ) return updated_interpretations","title":"update_genomic_interpretations_gene_identifier"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.GenomicVariant","text":"Represents a genomic variant. Parameters: Name Type Description Default chrom str The chromosome position of the variant recommended to be provided in the following format. required pos int Position of the variant following VCF convention. required ref str Reference allele following VCF convention. required alt str Alternate allele following VCF convention. required Source code in src/pheval/utils/phenopacket_utils.py 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 @dataclass class GenomicVariant : \"\"\" Represents a genomic variant. Args: chrom (str): The chromosome position of the variant recommended to be provided in the following format. This includes numerical designations from 1 to 22 representing autosomal chromosomes, as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT. pos (int): Position of the variant following VCF convention. ref (str): Reference allele following VCF convention. alt (str): Alternate allele following VCF convention. \"\"\" chrom : str pos : int ref : str alt : str","title":"GenomicVariant"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.IncompatibleGenomeAssemblyError","text":"Bases: Exception Exception raised for incompatible genome assembly. 
Source code in src/pheval/utils/phenopacket_utils.py 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 class IncompatibleGenomeAssemblyError ( Exception ): \"\"\"Exception raised for incompatible genome assembly.\"\"\" def __init__ ( self , assembly , phenopacket , message = \"Incompatible Genome Assembly\" ): \"\"\" Initialise IncompatibleGenomeAssemblyError. Attributes: assembly (str): Incompatible genome assembly encountered. phenopacket (Path): Path to the Phenopacket associated with the error. message (str, optional): Custom error message (default is \"Incompatible Genome Assembly\"). \"\"\" self . assembly : str = assembly self . phenopacket : Path = phenopacket self . message : str = message super () . __init__ ( self . message ) def __str__ ( self ): return f \" { self . message } -> { self . assembly } in { self . phenopacket } \"","title":"IncompatibleGenomeAssemblyError"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.IncompatibleGenomeAssemblyError.__init__","text":"Initialise IncompatibleGenomeAssemblyError. Attributes: Name Type Description assembly str Incompatible genome assembly encountered. phenopacket Path Path to the Phenopacket associated with the error. message str Custom error message (default is \"Incompatible Genome Assembly\"). Source code in src/pheval/utils/phenopacket_utils.py 30 31 32 33 34 35 36 37 38 39 40 41 42 def __init__ ( self , assembly , phenopacket , message = \"Incompatible Genome Assembly\" ): \"\"\" Initialise IncompatibleGenomeAssemblyError. Attributes: assembly (str): Incompatible genome assembly encountered. phenopacket (Path): Path to the Phenopacket associated with the error. message (str, optional): Custom error message (default is \"Incompatible Genome Assembly\"). \"\"\" self . assembly : str = assembly self . phenopacket : Path = phenopacket self . message : str = message super () . __init__ ( self . 
message )","title":"__init__"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketRebuilder","text":"Class for rebuilding a Phenopacket Source code in src/pheval/utils/phenopacket_utils.py 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 class PhenopacketRebuilder : \"\"\"Class for rebuilding a Phenopacket\"\"\" def __init__ ( self , phenopacket : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Attributes: phenopacket (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . phenopacket = phenopacket def update_interpretations ( self , interpretations : [ Interpretation ] ) -> Union [ Phenopacket , Family ]: \"\"\" Add the updated interpretations to a Phenopacket or Family. Args: interpretations (List[Interpretation]): The updated interpretations to be added. Returns: Union[Phenopacket, Family]: The Phenopacket or Family object with updated interpretations. \"\"\" phenopacket = copy ( self . phenopacket ) if hasattr ( phenopacket , \"proband\" ): del phenopacket . proband . interpretations [:] phenopacket . proband . interpretations . extend ( interpretations ) else : del phenopacket . interpretations [:] phenopacket . interpretations . extend ( interpretations ) return phenopacket def add_randomised_hpo ( self , randomised_hpo : [ PhenotypicFeature ]) -> Union [ Phenopacket , Family ]: \"\"\" Add randomised phenotypic profiles to a Phenopacket or Family. Args: randomised_hpo: The randomised phenotypic profiles to be added. Returns: Union[Phenopacket, Family] The Phenopacket or Family object with added randomised profiles. \"\"\" phenopacket = copy ( self . phenopacket ) if hasattr ( phenopacket , \"proband\" ): del phenopacket . proband . 
phenotypic_features [:] phenopacket . proband . phenotypic_features . extend ( randomised_hpo ) else : del phenopacket . phenotypic_features [:] phenopacket . phenotypic_features . extend ( randomised_hpo ) return phenopacket def add_spiked_vcf_path ( self , spiked_vcf_file_data : File ) -> Union [ Phenopacket , Family ]: \"\"\" Add a spiked VCF path to a Phenopacket or Family. Args: - spiked_vcf_file_data (File): The VCF file data to be added. Returns: - Phenopacket or Family: The Phenopacket or Family object with the added spiked VCF path. \"\"\" phenopacket = copy ( self . phenopacket ) phenopacket_files = [ file for file in phenopacket . files if file . file_attributes [ \"fileFormat\" ] != \"vcf\" ] phenopacket_files . append ( spiked_vcf_file_data ) del phenopacket . files [:] phenopacket . files . extend ( phenopacket_files ) return phenopacket","title":"PhenopacketRebuilder"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketRebuilder.__init__","text":"Initialise PhenopacketUtil Attributes: Name Type Description phenopacket Union [ Phenopacket , Family ] Phenopacket or Family object Source code in src/pheval/utils/phenopacket_utils.py 553 554 555 556 557 558 559 def __init__ ( self , phenopacket : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Attributes: phenopacket (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . phenopacket = phenopacket","title":"__init__"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketRebuilder.add_randomised_hpo","text":"Add randomised phenotypic profiles to a Phenopacket or Family. Parameters: Name Type Description Default randomised_hpo [ PhenotypicFeature ] The randomised phenotypic profiles to be added. required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family] The Phenopacket or Family object with added randomised profiles. 
Source code in src/pheval/utils/phenopacket_utils.py 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 def add_randomised_hpo ( self , randomised_hpo : [ PhenotypicFeature ]) -> Union [ Phenopacket , Family ]: \"\"\" Add randomised phenotypic profiles to a Phenopacket or Family. Args: randomised_hpo: The randomised phenotypic profiles to be added. Returns: Union[Phenopacket, Family] The Phenopacket or Family object with added randomised profiles. \"\"\" phenopacket = copy ( self . phenopacket ) if hasattr ( phenopacket , \"proband\" ): del phenopacket . proband . phenotypic_features [:] phenopacket . proband . phenotypic_features . extend ( randomised_hpo ) else : del phenopacket . phenotypic_features [:] phenopacket . phenotypic_features . extend ( randomised_hpo ) return phenopacket","title":"add_randomised_hpo"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketRebuilder.add_spiked_vcf_path","text":"Add a spiked VCF path to a Phenopacket or Family. Args: - spiked_vcf_file_data (File): The VCF file data to be added. Returns: - Phenopacket or Family: The Phenopacket or Family object with the added spiked VCF path. Source code in src/pheval/utils/phenopacket_utils.py 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 def add_spiked_vcf_path ( self , spiked_vcf_file_data : File ) -> Union [ Phenopacket , Family ]: \"\"\" Add a spiked VCF path to a Phenopacket or Family. Args: - spiked_vcf_file_data (File): The VCF file data to be added. Returns: - Phenopacket or Family: The Phenopacket or Family object with the added spiked VCF path. \"\"\" phenopacket = copy ( self . phenopacket ) phenopacket_files = [ file for file in phenopacket . files if file . file_attributes [ \"fileFormat\" ] != \"vcf\" ] phenopacket_files . append ( spiked_vcf_file_data ) del phenopacket . files [:] phenopacket . files . 
extend ( phenopacket_files ) return phenopacket","title":"add_spiked_vcf_path"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketRebuilder.update_interpretations","text":"Add the updated interpretations to a Phenopacket or Family. Parameters: Name Type Description Default interpretations List [ Interpretation ] The updated interpretations to be added. required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family]: The Phenopacket or Family object with updated interpretations. Source code in src/pheval/utils/phenopacket_utils.py 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 def update_interpretations ( self , interpretations : [ Interpretation ] ) -> Union [ Phenopacket , Family ]: \"\"\" Add the updated interpretations to a Phenopacket or Family. Args: interpretations (List[Interpretation]): The updated interpretations to be added. Returns: Union[Phenopacket, Family]: The Phenopacket or Family object with updated interpretations. \"\"\" phenopacket = copy ( self . phenopacket ) if hasattr ( phenopacket , \"proband\" ): del phenopacket . proband . interpretations [:] phenopacket . proband . interpretations . extend ( interpretations ) else : del phenopacket . interpretations [:] phenopacket . interpretations . 
extend ( interpretations ) return phenopacket","title":"update_interpretations"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil","text":"Class for retrieving data from a Phenopacket or Family object Source code in src/pheval/utils/phenopacket_utils.py 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 class PhenopacketUtil : \"\"\"Class for retrieving data from a Phenopacket or Family object\"\"\" def __init__ ( self , phenopacket_contents : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Args: phenopacket_contents (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . 
phenopacket_contents = phenopacket_contents def sample_id ( self ) -> str : \"\"\" Retrieve the sample ID from a Phenopacket or proband of a Family Returns: str: Sample ID \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . subject . id else : return self . phenopacket_contents . subject . id def phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all HPO terms Returns: List[PhenotypicFeature]: List of HPO terms \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . phenotypic_features else : return self . phenopacket_contents . phenotypic_features def observed_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all observed HPO terms Returns: List[PhenotypicFeature]: List of observed HPO terms \"\"\" phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : continue phenotypic_features . append ( p ) return phenotypic_features def negated_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all negated HPO terms Returns: List[PhenotypicFeature]: List of negated HPO terms \"\"\" negated_phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : negated_phenotypic_features . append ( p ) return negated_phenotypic_features def diseases ( self ) -> List [ Disease ]: \"\"\" Retrieve a list of Diseases associated with the proband Returns: List[Disease]: List of diseases \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . diseases else : return self . phenopacket_contents . 
diseases def _diagnosis_from_interpretations ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a list of disease diagnoses associated with the proband from the interpretations object Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" diagnoses = [] interpretation = self . interpretations () for i in interpretation : ( diagnoses . append ( ProbandDisease ( disease_name = i . diagnosis . disease . label , disease_identifier = i . diagnosis . disease . id , ) ) if i . diagnosis . disease . label != \"\" and i . diagnosis . disease . id != \"\" else None ) return diagnoses def _diagnosis_from_disease ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a list of disease diagnoses associated with the proband from the diseases object Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" diagnoses = [] for disease in self . diseases (): diagnoses . append ( ProbandDisease ( disease_name = disease . term . label , disease_identifier = disease . term . id ) ) return diagnoses def diagnoses ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" return list ( set ( self . _diagnosis_from_interpretations () + self . _diagnosis_from_disease ())) def interpretations ( self ) -> List [ Interpretation ]: \"\"\" Retrieve a list of interpretations from a Phenopacket Returns: List[Interpretation]: List of interpretations \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . interpretations else : return self . phenopacket_contents . interpretations def causative_variants ( self ) -> List [ ProbandCausativeVariant ]: \"\"\" Retrieve a list of causative variants listed in a Phenopacket Returns: List[ProbandCausativeVariant]: List of proband causative variants \"\"\" all_variants = [] interpretation = self . 
interpretations () for i in interpretation : for g in i . diagnosis . genomic_interpretations : vcf_record = g . variant_interpretation . variation_descriptor . vcf_record genotype = g . variant_interpretation . variation_descriptor . allelic_state variant_data = ProbandCausativeVariant ( self . phenopacket_contents . subject . id , vcf_record . genome_assembly , GenomicVariant ( vcf_record . chrom , vcf_record . pos , vcf_record . ref , vcf_record . alt , ), genotype . label , vcf_record . info , ) all_variants . append ( variant_data ) return all_variants def files ( self ) -> List [ File ]: \"\"\" Retrieve a list of files associated with a phenopacket Returns: List[File]: List of files associated with a phenopacket \"\"\" return self . phenopacket_contents . files def vcf_file_data ( self , phenopacket_path : Path , vcf_dir : Path ) -> File : \"\"\" Retrieve the genome assembly and VCF file name from a phenopacket. Args: phenopacket_path (Path): The path to the phenopacket file. vcf_dir (Path): The directory path where the VCF file is stored. Returns: File: The VCF file with updated URI pointing to the specified directory. Raises: IncorrectFileFormatError: If the provided file is not in .vcf or .vcf.gz format. IncompatibleGenomeAssemblyError: If the genome assembly of the VCF file is not compatible. Note: This function searches for a VCF file within the provided list of files, validates its format, and checks if the genome assembly is compatible. If the conditions are met, it updates the URI of the VCF file to the specified directory and returns the modified file object. \"\"\" compatible_genome_assembly = [ \"GRCh37\" , \"hg19\" , \"GRCh38\" , \"hg38\" ] vcf_data = [ file for file in self . files () if file . file_attributes [ \"fileFormat\" ] == \"vcf\" ][ 0 ] if not Path ( vcf_data . uri ) . name . endswith ( \".vcf\" ) and not Path ( vcf_data . uri ) . name . endswith ( \".vcf.gz\" ): raise IncorrectFileFormatError ( Path ( vcf_data . 
uri ), \".vcf or .vcf.gz file\" ) if vcf_data . file_attributes [ \"genomeAssembly\" ] not in compatible_genome_assembly : raise IncompatibleGenomeAssemblyError ( vcf_data . file_attributes [ \"genomeAssembly\" ], phenopacket_path ) vcf_data . uri = str ( vcf_dir . joinpath ( Path ( vcf_data . uri ) . name )) return vcf_data @staticmethod def _extract_diagnosed_gene ( genomic_interpretation : GenomicInterpretation , ) -> ProbandCausativeGene : \"\"\" Retrieve the disease causing genes from the variant descriptor field if not empty, otherwise, retrieves from the gene descriptor from a phenopacket. Args: genomic_interpretation (GenomicInterpretation): A genomic interpretation from a Phenopacket Returns: ProbandCausativeGene: The disease causing gene \"\"\" if genomic_interpretation . variant_interpretation . ByteSize () != 0 : return ProbandCausativeGene ( genomic_interpretation . variant_interpretation . variation_descriptor . gene_context . symbol , genomic_interpretation . variant_interpretation . variation_descriptor . gene_context . value_id , ) else : return ProbandCausativeGene ( gene_symbol = genomic_interpretation . gene . symbol , gene_identifier = genomic_interpretation . gene . value_id , ) def diagnosed_genes ( self ) -> List [ ProbandCausativeGene ]: \"\"\" Retrieve the disease causing genes from a phenopacket. Returns: List[ProbandCausativeGene]: List of causative genes \"\"\" pheno_interpretation = self . interpretations () genes = [] for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : genes . append ( self . _extract_diagnosed_gene ( g )) genes = list ({ gene . gene_symbol : gene for gene in genes } . values ()) return genes def diagnosed_variants ( self ) -> List [ GenomicVariant ]: \"\"\" Retrieve a list of all known causative variants from a phenopacket. Returns: List[GenomicVariant]: List of causative variants \"\"\" variants = [] pheno_interpretation = self . 
interpretations () for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : variant = GenomicVariant ( chrom = str ( g . variant_interpretation . variation_descriptor . vcf_record . chrom . replace ( \"chr\" , \"\" ) ), pos = int ( g . variant_interpretation . variation_descriptor . vcf_record . pos ), ref = g . variant_interpretation . variation_descriptor . vcf_record . ref , alt = g . variant_interpretation . variation_descriptor . vcf_record . alt , ) variants . append ( variant ) return variants def check_incomplete_variant_record ( self ) -> bool : \"\"\" Check if any variant record in the phenopacket has incomplete information. This method iterates through the diagnosed variant records and checks if any of them have missing or incomplete information such as empty chromosome, position, reference, or alternate allele. Returns: bool: True if any variant record is incomplete, False otherwise. \"\"\" variants = self . diagnosed_variants () for variant in variants : if ( variant . chrom == \"\" or variant . pos == 0 or variant . pos == \"\" or variant . ref == \"\" or variant . alt == \"\" ): return True return False def check_variant_alleles ( self ) -> bool : \"\"\" Check if any variant record in the phenopacket has identical reference and alternate alleles. Returns: bool: True if the reference and alternate alleles are identical, False otherwise. \"\"\" variants = self . diagnosed_variants () for variant in variants : if variant . ref == variant . alt : return True return False def check_incomplete_gene_record ( self ) -> bool : \"\"\" Check if any gene record in the phenopacket has incomplete information. This method iterates through the diagnosed gene records and checks if any of them have missing or incomplete information such as gene name, or gene identifier. Returns: bool: True if any gene record is incomplete, False otherwise. \"\"\" genes = self . diagnosed_genes () for gene in genes : if gene . gene_symbol == \"\" or gene . 
gene_identifier == \"\" : return True return False def check_incomplete_disease_record ( self ) -> bool : \"\"\" Check if any disease record in the phenopacket has incomplete information. This method iterates through the diagnosed disease records and checks if any of them have missing or incomplete information such as empty disease name, or disease identifier. Returns: bool: True if any disease record is incomplete, False otherwise. \"\"\" if len ( self . diagnoses ()) == 0 : return True return False","title":"PhenopacketUtil"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.__init__","text":"Initialise PhenopacketUtil Parameters: Name Type Description Default phenopacket_contents Union [ Phenopacket , Family ] Phenopacket or Family object required Source code in src/pheval/utils/phenopacket_utils.py 222 223 224 225 226 227 228 def __init__ ( self , phenopacket_contents : Union [ Phenopacket , Family ]): \"\"\"Initialise PhenopacketUtil Args: phenopacket_contents (Union[Phenopacket, Family]): Phenopacket or Family object \"\"\" self . phenopacket_contents = phenopacket_contents","title":"__init__"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.causative_variants","text":"Retrieve a list of causative variants listed in a Phenopacket Returns: Type Description List [ ProbandCausativeVariant ] List[ProbandCausativeVariant]: List of proband causative variants Source code in src/pheval/utils/phenopacket_utils.py 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 def causative_variants ( self ) -> List [ ProbandCausativeVariant ]: \"\"\" Retrieve a list of causative variants listed in a Phenopacket Returns: List[ProbandCausativeVariant]: List of proband causative variants \"\"\" all_variants = [] interpretation = self . interpretations () for i in interpretation : for g in i . diagnosis . 
genomic_interpretations : vcf_record = g . variant_interpretation . variation_descriptor . vcf_record genotype = g . variant_interpretation . variation_descriptor . allelic_state variant_data = ProbandCausativeVariant ( self . phenopacket_contents . subject . id , vcf_record . genome_assembly , GenomicVariant ( vcf_record . chrom , vcf_record . pos , vcf_record . ref , vcf_record . alt , ), genotype . label , vcf_record . info , ) all_variants . append ( variant_data ) return all_variants","title":"causative_variants"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.check_incomplete_disease_record","text":"Check if any disease record in the phenopacket has incomplete information. This method iterates through the diagnosed disease records and checks if any of them have missing or incomplete information such as empty disease name, or disease identifier. Returns: Name Type Description bool bool True if any disease record is incomplete, False otherwise. Source code in src/pheval/utils/phenopacket_utils.py 535 536 537 538 539 540 541 542 543 544 545 546 547 def check_incomplete_disease_record ( self ) -> bool : \"\"\" Check if any disease record in the phenopacket has incomplete information. This method iterates through the diagnosed disease records and checks if any of them have missing or incomplete information such as empty disease name, or disease identifier. Returns: bool: True if any disease record is incomplete, False otherwise. \"\"\" if len ( self . diagnoses ()) == 0 : return True return False","title":"check_incomplete_disease_record"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.check_incomplete_gene_record","text":"Check if any gene record in the phenopacket has incomplete information. This method iterates through the diagnosed gene records and checks if any of them have missing or incomplete information such as gene name, or gene identifier. 
Returns: Name Type Description bool bool True if any gene record is incomplete, False otherwise. Source code in src/pheval/utils/phenopacket_utils.py 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 def check_incomplete_gene_record ( self ) -> bool : \"\"\" Check if any gene record in the phenopacket has incomplete information. This method iterates through the diagnosed gene records and checks if any of them have missing or incomplete information such as gene name, or gene identifier. Returns: bool: True if any gene record is incomplete, False otherwise. \"\"\" genes = self . diagnosed_genes () for gene in genes : if gene . gene_symbol == \"\" or gene . gene_identifier == \"\" : return True return False","title":"check_incomplete_gene_record"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.check_incomplete_variant_record","text":"Check if any variant record in the phenopacket has incomplete information. This method iterates through the diagnosed variant records and checks if any of them have missing or incomplete information such as empty chromosome, position, reference, or alternate allele. Returns: Name Type Description bool bool True if any variant record is incomplete, False otherwise. Source code in src/pheval/utils/phenopacket_utils.py 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 def check_incomplete_variant_record ( self ) -> bool : \"\"\" Check if any variant record in the phenopacket has incomplete information. This method iterates through the diagnosed variant records and checks if any of them have missing or incomplete information such as empty chromosome, position, reference, or alternate allele. Returns: bool: True if any variant record is incomplete, False otherwise. \"\"\" variants = self . diagnosed_variants () for variant in variants : if ( variant . chrom == \"\" or variant . pos == 0 or variant . pos == \"\" or variant . 
ref == \"\" or variant . alt == \"\" ): return True return False","title":"check_incomplete_variant_record"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.check_variant_alleles","text":"Check if any variant record in the phenopacket has identical reference and alternate alleles. Returns: Name Type Description bool bool True if the reference and alternate alleles are identical, False otherwise. Source code in src/pheval/utils/phenopacket_utils.py 506 507 508 509 510 511 512 513 514 515 516 517 def check_variant_alleles ( self ) -> bool : \"\"\" Check if any variant record in the phenopacket has identical reference and alternate alleles. Returns: bool: True if the reference and alternate alleles are identical, False otherwise. \"\"\" variants = self . diagnosed_variants () for variant in variants : if variant . ref == variant . alt : return True return False","title":"check_variant_alleles"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.diagnosed_genes","text":"Retrieve the disease causing genes from a phenopacket. Returns: List[ProbandCausativeGene]: List of causative genes Source code in src/pheval/utils/phenopacket_utils.py 446 447 448 449 450 451 452 453 454 455 456 457 458 def diagnosed_genes ( self ) -> List [ ProbandCausativeGene ]: \"\"\" Retrieve the disease causing genes from a phenopacket. Returns: List[ProbandCausativeGene]: List of causative genes \"\"\" pheno_interpretation = self . interpretations () genes = [] for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : genes . append ( self . _extract_diagnosed_gene ( g )) genes = list ({ gene . gene_symbol : gene for gene in genes } . 
values ()) return genes","title":"diagnosed_genes"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.diagnosed_variants","text":"Retrieve a list of all known causative variants from a phenopacket. Returns: List[GenomicVariant]: List of causative variants Source code in src/pheval/utils/phenopacket_utils.py 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 def diagnosed_variants ( self ) -> List [ GenomicVariant ]: \"\"\" Retrieve a list of all known causative variants from a phenopacket. Returns: List[GenomicVariant]: List of causative variants \"\"\" variants = [] pheno_interpretation = self . interpretations () for i in pheno_interpretation : for g in i . diagnosis . genomic_interpretations : variant = GenomicVariant ( chrom = str ( g . variant_interpretation . variation_descriptor . vcf_record . chrom . replace ( \"chr\" , \"\" ) ), pos = int ( g . variant_interpretation . variation_descriptor . vcf_record . pos ), ref = g . variant_interpretation . variation_descriptor . vcf_record . ref , alt = g . variant_interpretation . variation_descriptor . vcf_record . alt , ) variants . append ( variant ) return variants","title":"diagnosed_variants"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.diagnoses","text":"Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket Returns: Type Description List [ ProbandDisease ] List[ProbandDisease]: List of diagnosed diseases Source code in src/pheval/utils/phenopacket_utils.py 331 332 333 334 335 336 337 338 def diagnoses ( self ) -> List [ ProbandDisease ]: \"\"\" Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket Returns: List[ProbandDisease]: List of diagnosed diseases \"\"\" return list ( set ( self . _diagnosis_from_interpretations () + self . 
_diagnosis_from_disease ()))","title":"diagnoses"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.diseases","text":"Retrieve a list of Diseases associated with the proband Returns: Type Description List [ Disease ] List[Disease]: List of diseases Source code in src/pheval/utils/phenopacket_utils.py 283 284 285 286 287 288 289 290 291 292 293 def diseases ( self ) -> List [ Disease ]: \"\"\" Retrieve a list of Diseases associated with the proband Returns: List[Disease]: List of diseases \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . diseases else : return self . phenopacket_contents . diseases","title":"diseases"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.files","text":"Retrieve a list of files associated with a phenopacket Returns: Type Description List [ File ] List[File]: List of files associated with a phenopacket Source code in src/pheval/utils/phenopacket_utils.py 380 381 382 383 384 385 386 387 def files ( self ) -> List [ File ]: \"\"\" Retrieve a list of files associated with a phenopacket Returns: List[File]: List of files associated with a phenopacket \"\"\" return self . phenopacket_contents . files","title":"files"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.interpretations","text":"Retrieve a list of interpretations from a Phenopacket Returns: Type Description List [ Interpretation ] List[Interpretation]: List of interpretations Source code in src/pheval/utils/phenopacket_utils.py 340 341 342 343 344 345 346 347 348 349 350 def interpretations ( self ) -> List [ Interpretation ]: \"\"\" Retrieve a list of interpretations from a Phenopacket Returns: List[Interpretation]: List of interpretations \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . 
interpretations else : return self . phenopacket_contents . interpretations","title":"interpretations"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.negated_phenotypic_features","text":"Retrieve a list of all negated HPO terms Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: List of negated HPO terms Source code in src/pheval/utils/phenopacket_utils.py 269 270 271 272 273 274 275 276 277 278 279 280 281 def negated_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all negated HPO terms Returns: List[PhenotypicFeature]: List of negated HPO terms \"\"\" negated_phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : negated_phenotypic_features . append ( p ) return negated_phenotypic_features","title":"negated_phenotypic_features"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.observed_phenotypic_features","text":"Retrieve a list of all observed HPO terms Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: List of observed HPO terms Source code in src/pheval/utils/phenopacket_utils.py 254 255 256 257 258 259 260 261 262 263 264 265 266 267 def observed_phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all observed HPO terms Returns: List[PhenotypicFeature]: List of observed HPO terms \"\"\" phenotypic_features = [] all_phenotypic_features = self . phenotypic_features () for p in all_phenotypic_features : if p . excluded : continue phenotypic_features . 
append ( p ) return phenotypic_features","title":"observed_phenotypic_features"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.phenotypic_features","text":"Retrieve a list of all HPO terms Returns: Type Description List [ PhenotypicFeature ] List[PhenotypicFeature]: List of HPO terms Source code in src/pheval/utils/phenopacket_utils.py 242 243 244 245 246 247 248 249 250 251 252 def phenotypic_features ( self ) -> List [ PhenotypicFeature ]: \"\"\" Retrieve a list of all HPO terms Returns: List[PhenotypicFeature]: List of HPO terms \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . phenotypic_features else : return self . phenopacket_contents . phenotypic_features","title":"phenotypic_features"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.sample_id","text":"Retrieve the sample ID from a Phenopacket or proband of a Family Returns: Name Type Description str str Sample ID Source code in src/pheval/utils/phenopacket_utils.py 230 231 232 233 234 235 236 237 238 239 240 def sample_id ( self ) -> str : \"\"\" Retrieve the sample ID from a Phenopacket or proband of a Family Returns: str: Sample ID \"\"\" if hasattr ( self . phenopacket_contents , \"proband\" ): return self . phenopacket_contents . proband . subject . id else : return self . phenopacket_contents . subject . id","title":"sample_id"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.PhenopacketUtil.vcf_file_data","text":"Retrieve the genome assembly and VCF file name from a phenopacket. Parameters: Name Type Description Default phenopacket_path Path The path to the phenopacket file. required vcf_dir Path The directory path where the VCF file is stored. required Returns: Name Type Description File File The VCF file with updated URI pointing to the specified directory. 
Raises: Type Description IncorrectFileFormatError If the provided file is not in .vcf or .vcf.gz format. IncompatibleGenomeAssemblyError If the genome assembly of the VCF file is not compatible. Note This function searches for a VCF file within the provided list of files, validates its format, and checks if the genome assembly is compatible. If the conditions are met, it updates the URI of the VCF file to the specified directory and returns the modified file object. Source code in src/pheval/utils/phenopacket_utils.py 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 def vcf_file_data ( self , phenopacket_path : Path , vcf_dir : Path ) -> File : \"\"\" Retrieve the genome assembly and VCF file name from a phenopacket. Args: phenopacket_path (Path): The path to the phenopacket file. vcf_dir (Path): The directory path where the VCF file is stored. Returns: File: The VCF file with updated URI pointing to the specified directory. Raises: IncorrectFileFormatError: If the provided file is not in .vcf or .vcf.gz format. IncompatibleGenomeAssemblyError: If the genome assembly of the VCF file is not compatible. Note: This function searches for a VCF file within the provided list of files, validates its format, and checks if the genome assembly is compatible. If the conditions are met, it updates the URI of the VCF file to the specified directory and returns the modified file object. \"\"\" compatible_genome_assembly = [ \"GRCh37\" , \"hg19\" , \"GRCh38\" , \"hg38\" ] vcf_data = [ file for file in self . files () if file . file_attributes [ \"fileFormat\" ] == \"vcf\" ][ 0 ] if not Path ( vcf_data . uri ) . name . endswith ( \".vcf\" ) and not Path ( vcf_data . uri ) . name . endswith ( \".vcf.gz\" ): raise IncorrectFileFormatError ( Path ( vcf_data . uri ), \".vcf or .vcf.gz file\" ) if vcf_data . 
file_attributes [ \"genomeAssembly\" ] not in compatible_genome_assembly : raise IncompatibleGenomeAssemblyError ( vcf_data . file_attributes [ \"genomeAssembly\" ], phenopacket_path ) vcf_data . uri = str ( vcf_dir . joinpath ( Path ( vcf_data . uri ) . name )) return vcf_data","title":"vcf_file_data"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.ProbandCausativeGene","text":"Represents a causative gene associated with a proband Parameters: Name Type Description Default gene_symbol str Symbol representing the gene required gene_identifier str The ENSEMBL gene identifier for the result entry required Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations and result output is acceptable for result matching purposes in the analysis. Source code in src/pheval/utils/phenopacket_utils.py 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 @dataclass class ProbandCausativeGene : \"\"\" Represents a causative gene associated with a proband Args: gene_symbol (str): Symbol representing the gene gene_identifier (str): The ENSEMBL gene identifier for the result entry Notes: While we recommend providing the gene identifier in the ENSEMBL namespace, any matching format used in Phenopacket interpretations and result output is acceptable for result matching purposes in the analysis. 
\"\"\" gene_symbol : str gene_identifier : str","title":"ProbandCausativeGene"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.ProbandCausativeVariant","text":"Represents a causative variant associated with a proband Parameters: Name Type Description Default proband_id str ID of the proband required assembly str Genome assembly required variant GenomicVariant Genomic variant associated with the proband required genotype str Genotype information for the variant required info str Additional information about the variant (default is an empty string) '' Source code in src/pheval/utils/phenopacket_utils.py 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 @dataclass class ProbandCausativeVariant : \"\"\" Represents a causative variant associated with a proband Args: proband_id (str): ID of the proband assembly (str): Genome assembly variant (GenomicVariant): Genomic variant associated with the proband genotype (str): Genotype information for the variant info (str, optional): Additional information about the variant (default is an empty string) \"\"\" proband_id : str assembly : str variant : GenomicVariant genotype : str info : str = \"\"","title":"ProbandCausativeVariant"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.ProbandDisease","text":"Represents a disease associated with a proband Parameters: Name Type Description Default disease_name str Name of the disease required disease_identifier str Identifier for the disease result entry in the OMIM namespace required Notes While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations and result output is acceptable for result matching purposes in the analysis. 
Source code in src/pheval/utils/phenopacket_utils.py 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 @dataclass ( frozen = True , eq = True ) class ProbandDisease : \"\"\" Represents a disease associated with a proband Args: disease_name (str): Name of the disease disease_identifier (str): Identifier for the disease result entry in the OMIM namespace Notes: While we recommend providing the disease identifier in the OMIM namespace, any matching format used in Phenopacket interpretations and result output is acceptable for result matching purposes in the analysis. \"\"\" disease_name : str disease_identifier : str","title":"ProbandDisease"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.create_gene_identifier_map","text":"Create a mapping of gene identifiers to gene symbols using HGNC data. Returns: Name Type Description dict dict A mapping of gene identifiers to gene symbols. Notes The dictionary structure: { 'identifier': 'gene_symbol', ... } Source code in src/pheval/utils/phenopacket_utils.py 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 def create_gene_identifier_map () -> dict : \"\"\" Create a mapping of gene identifiers to gene symbols using HGNC data. Returns: dict: A mapping of gene identifiers to gene symbols. Notes: The dictionary structure: { 'identifier': 'gene_symbol', ... } \"\"\" hgnc_df = read_hgnc_data () identifier_map = {} for _index , row in hgnc_df . 
iterrows (): identifier_map [ row [ \"ensembl_gene_id\" ]] = row [ \"symbol\" ] identifier_map [ row [ \"hgnc_id\" ]] = row [ \"symbol\" ] identifier_map [ row [ \"entrez_id\" ]] = row [ \"symbol\" ] identifier_map [ row [ \"refseq_accession\" ]] = row [ \"symbol\" ] return identifier_map","title":"create_gene_identifier_map"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.create_hgnc_dict","text":"Create a dictionary as a reference for updating gene symbols and identifiers based on HGNC data. Returns: Name Type Description defaultdict defaultdict A dictionary containing gene symbols as keys and their associated gene information. Notes The dictionary structure: { 'gene_symbol': { 'ensembl_id': str, 'hgnc_id': str, 'entrez_id': str, 'refseq_accession': str, 'previous_symbol': [str, ...] }, ... } Source code in src/pheval/utils/phenopacket_utils.py 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 def create_hgnc_dict () -> defaultdict : \"\"\" Create a dictionary as a reference for updating gene symbols and identifiers based on HGNC data. Returns: defaultdict: A dictionary containing gene symbols as keys and their associated gene information. Notes: The dictionary structure: { 'gene_symbol': { 'ensembl_id': str, 'hgnc_id': str, 'entrez_id': str, 'refseq_accession': str, 'previous_symbol': [str, ...] }, ... } \"\"\" hgnc_df = read_hgnc_data () hgnc_data = defaultdict ( dict ) for _index , row in hgnc_df . iterrows (): previous_names = [] hgnc_data [ row [ \"symbol\" ]][ \"ensembl_id\" ] = row [ \"ensembl_gene_id\" ] hgnc_data [ row [ \"symbol\" ]][ \"hgnc_id\" ] = row [ \"hgnc_id\" ] hgnc_data [ row [ \"symbol\" ]][ \"entrez_id\" ] = row [ \"entrez_id\" ] hgnc_data [ row [ \"symbol\" ]][ \"refseq_accession\" ] = row [ \"refseq_accession\" ] previous = str ( row [ \"prev_symbol\" ]) . split ( \"|\" ) for p in previous : previous_names . 
append ( p . strip ( '\"' )) hgnc_data [ row [ \"symbol\" ]][ \"previous_symbol\" ] = previous_names return hgnc_data","title":"create_hgnc_dict"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.create_json_message","text":"Create a JSON message for writing to a file. Args: - phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family object to convert to JSON. Returns: - str: A JSON-formatted string representation of the Phenopacket or Family object. Source code in src/pheval/utils/phenopacket_utils.py 621 622 623 624 625 626 627 628 629 630 631 def create_json_message ( phenopacket : Union [ Phenopacket , Family ]) -> str : \"\"\" Create a JSON message for writing to a file. Args: - phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family object to convert to JSON. Returns: - str: A JSON-formatted string representation of the Phenopacket or Family object. \"\"\" return MessageToJson ( phenopacket )","title":"create_json_message"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.phenopacket_reader","text":"Read a Phenopacket file and returns its contents as a Phenopacket or Family object Parameters: Name Type Description Default file Path Path to the Phenopacket file required Returns: Type Description Union [ Phenopacket , Family ] Union[Phenopacket, Family]: Contents of the Phenopacket file as a Phenopacket or Family object Source code in src/pheval/utils/phenopacket_utils.py 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 def phenopacket_reader ( file : Path ) -> Union [ Phenopacket , Family ]: \"\"\" Read a Phenopacket file and returns its contents as a Phenopacket or Family object Args: file (Path): Path to the Phenopacket file Returns: Union[Phenopacket, Family]: Contents of the Phenopacket file as a Phenopacket or Family object \"\"\" file = open ( file , \"r\" ) phenopacket = json . load ( file ) file . 
close () if \"proband\" in phenopacket : return Parse ( json . dumps ( phenopacket ), Family ()) else : return Parse ( json . dumps ( phenopacket ), Phenopacket ())","title":"phenopacket_reader"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.read_hgnc_data","text":"Read HGNC data from a file and return it as a Pandas DataFrame. Returns: Type Description DataFrame pd.DataFrame: DataFrame containing the HGNC data. Source code in src/pheval/utils/phenopacket_utils.py 125 126 127 128 129 130 131 132 133 134 135 136 def read_hgnc_data () -> pd . DataFrame : \"\"\" Read HGNC data from a file and return it as a Pandas DataFrame. Returns: pd.DataFrame: DataFrame containing the HGNC data. \"\"\" return pd . read_csv ( os . path . dirname ( __file__ ) . replace ( \"utils\" , \"resources/hgnc_complete_set.txt\" ), delimiter = \" \\t \" , dtype = str , )","title":"read_hgnc_data"},{"location":"api/pheval/utils/phenopacket_utils/#src.pheval.utils.phenopacket_utils.write_phenopacket","text":"Write a Phenopacket or Family object to a file in JSON format. Parameters: Name Type Description Default phenopacket Phenopacket or Family The Phenopacket or Family object to be written. required output_file Path The Path object representing the file to write the Phenopacket data. required Returns: Type Description None None Source code in src/pheval/utils/phenopacket_utils.py 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 def write_phenopacket ( phenopacket : Union [ Phenopacket , Family ], output_file : Path ) -> None : \"\"\" Write a Phenopacket or Family object to a file in JSON format. Args: phenopacket (Phenopacket or Family): The Phenopacket or Family object to be written. output_file (Path): The Path object representing the file to write the Phenopacket data. Returns: None \"\"\" phenopacket_json = create_json_message ( phenopacket ) with open ( output_file , \"w\" ) as outfile : outfile . write ( phenopacket_json ) outfile . 
close ()","title":"write_phenopacket"},{"location":"api/pheval/utils/semsim_utils/","text":"Contains all pheval utility methods diff_semsim ( semsim_left , semsim_right , score_column , absolute_diff ) Calculates score difference between two semantic similarity profiles Parameters: Name Type Description Default semsim_left DataFrame first semantic similarity dataframe required semsim_right DataFrame second semantic similarity dataframe required score_column str Score column that will be computed (e.g. jaccard_similarity) required absolute_diff bool Whether the difference is absolute (True) or percentage (False). required Returns: Type Description DataFrame pd.DataFrame: A dataframe with terms and its scores differences Source code in src/pheval/utils/semsim_utils.py 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 def diff_semsim ( semsim_left : pd . DataFrame , semsim_right : pd . DataFrame , score_column : str , absolute_diff : bool ) -> pd . DataFrame : \"\"\"Calculates score difference between two semantic similarity profiles Args: semsim_left (pd.DataFrame): first semantic similarity dataframe semsim_right (pd.DataFrame): second semantic similarity dataframe score_column (str): Score column that will be computed (e.g. jaccard_similarity) absolute_diff (bool, optional): Whether the difference is absolute (True) or percentage (False). Defaults to True. Returns: pd.DataFrame: A dataframe with terms and its scores differences \"\"\" df = pd . merge ( semsim_left , semsim_right , on = [ \"subject_id\" , \"object_id\" ], how = \"outer\" ) if absolute_diff : df [ \"diff\" ] = df [ f \" { score_column } _x\" ] - df [ f \" { score_column } _y\" ] return df [[ \"subject_id\" , \"object_id\" , \"diff\" ]] df [ \"diff\" ] = df . 
apply ( lambda row : get_percentage_diff ( row [ f \" { score_column } _x\" ], row [ f \" { score_column } _y\" ]), axis = 1 ) return df [[ \"subject_id\" , \"object_id\" , f \" { score_column } _x\" , f \" { score_column } _y\" , \"diff\" ]] filter_non_0_score ( data , col ) Removes rows that have value equal to 0 based on the given column passed by col parameter Parameters: Name Type Description Default data DataFrame Dirty dataframe required col str Column to be filtered required Returns: Type Description DataFrame pd.DataFrame: Filtered dataframe Source code in src/pheval/utils/semsim_utils.py 14 15 16 17 18 19 20 21 22 23 24 def filter_non_0_score ( data : pd . DataFrame , col : str ) -> pd . DataFrame : \"\"\"Removes rows that have value equal to 0 based on the given column passed by col parameter Args: data (pd.DataFrame): Dirty dataframe col (str): Column to be filtered Returns: pd.DataFrame: Filtered dataframe \"\"\" return data [ data [ col ] != 0 ] get_percentage_diff ( current_number , previous_number ) Gets the percentage difference between two numbers Parameters: Name Type Description Default current_number float second number in comparison required previous_number float first number in comparison required Returns: Name Type Description float float percentage difference between two numbers Source code in src/pheval/utils/semsim_utils.py 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 def get_percentage_diff ( current_number : float , previous_number : float ) -> float : \"\"\"Gets the percentage difference between two numbers Args: current_number (float): second number in comparison previous_number (float): first number in comparison Returns: float: percentage difference between two numbers \"\"\" try : if current_number == previous_number : return \" {:.2%} \" . 
format ( 0 ) if current_number > previous_number : number = ( 1 - (( current_number / previous_number ))) * 100 else : number = ( 100 - (( previous_number / current_number ) * 100 )) * - 1 return \" {:.2%} \" . format ( number / 100 ) except ZeroDivisionError : return None parse_semsim ( df , cols ) Parses semantic similarity profiles converting the score column as a numeric value and dropping the null ones Parameters: Name Type Description Default df DataFrame semantic similarity profile dataframe required cols list list of columns that will be selected on semsim data required Returns: Type Description DataFrame pd.Dataframe: parsed semantic similarity dataframe Source code in src/pheval/utils/semsim_utils.py 27 28 29 30 31 32 33 34 35 36 37 38 39 def parse_semsim ( df : pd . DataFrame , cols : list ) -> pd . DataFrame : \"\"\"Parses semantic similarity profiles converting the score column as a numeric value and dropping the null ones Args: df (pd.DataFrame): semantic similarity profile dataframe cols (list): list of columns that will be selected on semsim data Returns: pd.Dataframe: parsed semantic similarity dataframe \"\"\" df [ cols [ - 1 ]] = pd . to_numeric ( df [ cols [ - 1 ]], errors = \"coerce\" ) df . replace ( \"None\" , numpy . nan ) . dropna ( subset = cols [ - 1 ], inplace = True ) return df percentage_diff ( semsim_left , semsim_right , score_column , output ) Compares two semantic similarity profiles Parameters: Name Type Description Default semsim_left Path File path of the first semantic similarity profile required semsim_right Path File path of the second semantic similarity profile required score_column str Score column that will be computed (e.g. 
jaccard_similarity) required output Path Output path for the difference tsv file required Source code in src/pheval/utils/semsim_utils.py 67 68 69 70 71 72 73 74 75 76 77 def percentage_diff ( semsim_left : Path , semsim_right : Path , score_column : str , output : Path ): \"\"\"Compares two semantic similarity profiles Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile score_column (str): Score column that will be computed (e.g. jaccard_similarity) output (Path): Output path for the difference tsv file \"\"\" clean_df = semsim_analysis ( semsim_left , semsim_right , score_column , absolute_diff = False ) clean_df . sort_values ( by = \"diff\" , ascending = False ) . to_csv ( output , sep = \" \\t \" , index = False ) semsim_analysis ( semsim_left , semsim_right , score_column , absolute_diff = True ) semsim_analysis Parameters: Name Type Description Default semsim_left Path File path of the first semantic similarity profile required semsim_right Path File path of the second semantic similarity profile required score_column str Score column that will be computed (e.g. jaccard_similarity) required absolute_diff bool Whether the difference is absolute (True) or percentage (False). True Returns: Type Description DataFrame [pd.DataFrame]: DataFrame with the differences between two semantic similarity profiles Source code in src/pheval/utils/semsim_utils.py 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 def semsim_analysis ( semsim_left : Path , semsim_right : Path , score_column : str , absolute_diff = True ) -> pd . DataFrame : \"\"\"semsim_analysis Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile score_column (str): Score column that will be computed (e.g. 
jaccard_similarity) absolute_diff (bool, optional): Whether the difference is absolute (True) or percentage (False). Defaults to True. Returns: [pd.DataFrame]: DataFrame with the differences between two semantic similarity profiles \"\"\" validate_semsim_file_comparison ( semsim_left , semsim_right ) cols = [ \"subject_id\" , \"object_id\" , score_column ] semsim_left = pd . read_csv ( semsim_left , sep = \" \\t \" ) semsim_right = pd . read_csv ( semsim_right , sep = \" \\t \" ) file_utils . ensure_columns_exists ( cols = cols , err_message = \"must exist in semsim dataframes\" , dataframes = [ semsim_left , semsim_right ], ) semsim_left = parse_semsim ( semsim_left , cols ) semsim_right = parse_semsim ( semsim_right , cols ) diff_df = diff_semsim ( semsim_left , semsim_right , score_column , absolute_diff ) return filter_non_0_score ( diff_df , \"diff\" ) semsim_heatmap_plot ( semsim_left , semsim_right , score_column ) Plots semantic similarity profiles heatmap Parameters: Name Type Description Default semsim_left Path File path of the first semantic similarity profile required semsim_right Path File path of the second semantic similarity profile required score_column str Score column that will be computed (e.g. jaccard_similarity) required Source code in src/pheval/utils/semsim_utils.py 80 81 82 83 84 85 86 87 88 89 90 91 def semsim_heatmap_plot ( semsim_left : Path , semsim_right : Path , score_column : str ): \"\"\"Plots semantic similarity profiles heatmap Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile score_column (str): Score column that will be computed (e.g. jaccard_similarity) \"\"\" clean_df = semsim_analysis ( semsim_left , semsim_right , score_column ) df = clean_df . pivot ( index = \"subject_id\" , columns = \"object_id\" , values = \"diff\" ) fig = px . imshow ( df , text_auto = True ) fig . 
show () validate_semsim_file_comparison ( semsim_left , semsim_right ) Checks if files exist and whether they're different Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile Raises: Exception: FileNotFoundException Source code in src/pheval/utils/semsim_utils.py 124 125 126 127 128 129 130 131 132 133 134 135 def validate_semsim_file_comparison ( semsim_left : Path , semsim_right : Path ): \"\"\"Checks if files exist and whether they're different Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile Raises: Exception: FileNotFoundException \"\"\" if semsim_left == semsim_right : errmsg = \"Semantic similarity profiles are equal. Make sure you have selected different files to analyze\" raise Exception ( errmsg ) file_utils . ensure_file_exists ( semsim_left , semsim_right )","title":"Semsim utils"},{"location":"api/pheval/utils/semsim_utils/#src.pheval.utils.semsim_utils.diff_semsim","text":"Calculates score difference between two semantic similarity profiles Parameters: Name Type Description Default semsim_left DataFrame first semantic similarity dataframe required semsim_right DataFrame second semantic similarity dataframe required score_column str Score column that will be computed (e.g. jaccard_similarity) required absolute_diff bool Whether the difference is absolute (True) or percentage (False). required Returns: Type Description DataFrame pd.DataFrame: A dataframe with terms and its scores differences Source code in src/pheval/utils/semsim_utils.py 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 def diff_semsim ( semsim_left : pd . DataFrame , semsim_right : pd . DataFrame , score_column : str , absolute_diff : bool ) -> pd . 
DataFrame : \"\"\"Calculates score difference between two semantic similarity profiles Args: semsim_left (pd.DataFrame): first semantic similarity dataframe semsim_right (pd.DataFrame): second semantic similarity dataframe score_column (str): Score column that will be computed (e.g. jaccard_similarity) absolute_diff (bool, optional): Whether the difference is absolute (True) or percentage (False). Defaults to True. Returns: pd.DataFrame: A dataframe with terms and its scores differences \"\"\" df = pd . merge ( semsim_left , semsim_right , on = [ \"subject_id\" , \"object_id\" ], how = \"outer\" ) if absolute_diff : df [ \"diff\" ] = df [ f \" { score_column } _x\" ] - df [ f \" { score_column } _y\" ] return df [[ \"subject_id\" , \"object_id\" , \"diff\" ]] df [ \"diff\" ] = df . apply ( lambda row : get_percentage_diff ( row [ f \" { score_column } _x\" ], row [ f \" { score_column } _y\" ]), axis = 1 ) return df [[ \"subject_id\" , \"object_id\" , f \" { score_column } _x\" , f \" { score_column } _y\" , \"diff\" ]]","title":"diff_semsim"},{"location":"api/pheval/utils/semsim_utils/#src.pheval.utils.semsim_utils.filter_non_0_score","text":"Removes rows that have value equal to 0 based on the given column passed by col parameter Parameters: Name Type Description Default data DataFrame Dirty dataframe required col str Column to be filtered required Returns: Type Description DataFrame pd.DataFrame: Filtered dataframe Source code in src/pheval/utils/semsim_utils.py 14 15 16 17 18 19 20 21 22 23 24 def filter_non_0_score ( data : pd . DataFrame , col : str ) -> pd . 
DataFrame : \"\"\"Removes rows that have value equal to 0 based on the given column passed by col parameter Args: data (pd.DataFrame): Dirty dataframe col (str): Column to be filtered Returns: pd.DataFrame: Filtered dataframe \"\"\" return data [ data [ col ] != 0 ]","title":"filter_non_0_score"},{"location":"api/pheval/utils/semsim_utils/#src.pheval.utils.semsim_utils.get_percentage_diff","text":"Gets the percentage difference between two numbers Parameters: Name Type Description Default current_number float second number in comparison required previous_number float first number in comparison required Returns: Name Type Description float float percentage difference between two numbers Source code in src/pheval/utils/semsim_utils.py 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 def get_percentage_diff ( current_number : float , previous_number : float ) -> float : \"\"\"Gets the percentage difference between two numbers Args: current_number (float): second number in comparison previous_number (float): first number in comparison Returns: float: percentage difference between two numbers \"\"\" try : if current_number == previous_number : return \" {:.2%} \" . format ( 0 ) if current_number > previous_number : number = ( 1 - (( current_number / previous_number ))) * 100 else : number = ( 100 - (( previous_number / current_number ) * 100 )) * - 1 return \" {:.2%} \" . 
format ( number / 100 ) except ZeroDivisionError : return None","title":"get_percentage_diff"},{"location":"api/pheval/utils/semsim_utils/#src.pheval.utils.semsim_utils.parse_semsim","text":"Parses semantic similarity profiles converting the score column as a numeric value and dropping the null ones Parameters: Name Type Description Default df DataFrame semantic similarity profile dataframe required cols list list of columns that will be selected on semsim data required Returns: Type Description DataFrame pd.Dataframe: parsed semantic similarity dataframe Source code in src/pheval/utils/semsim_utils.py 27 28 29 30 31 32 33 34 35 36 37 38 39 def parse_semsim ( df : pd . DataFrame , cols : list ) -> pd . DataFrame : \"\"\"Parses semantic similarity profiles converting the score column as a numeric value and dropping the null ones Args: df (pd.DataFrame): semantic similarity profile dataframe cols (list): list of columns that will be selected on semsim data Returns: pd.Dataframe: parsed semantic similarity dataframe \"\"\" df [ cols [ - 1 ]] = pd . to_numeric ( df [ cols [ - 1 ]], errors = \"coerce\" ) df . replace ( \"None\" , numpy . nan ) . dropna ( subset = cols [ - 1 ], inplace = True ) return df","title":"parse_semsim"},{"location":"api/pheval/utils/semsim_utils/#src.pheval.utils.semsim_utils.percentage_diff","text":"Compares two semantic similarity profiles Parameters: Name Type Description Default semsim_left Path File path of the first semantic similarity profile required semsim_right Path File path of the second semantic similarity profile required score_column str Score column that will be computed (e.g. 
jaccard_similarity) required output Path Output path for the difference tsv file required Source code in src/pheval/utils/semsim_utils.py 67 68 69 70 71 72 73 74 75 76 77 def percentage_diff ( semsim_left : Path , semsim_right : Path , score_column : str , output : Path ): \"\"\"Compares two semantic similarity profiles Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile score_column (str): Score column that will be computed (e.g. jaccard_similarity) output (Path): Output path for the difference tsv file \"\"\" clean_df = semsim_analysis ( semsim_left , semsim_right , score_column , absolute_diff = False ) clean_df . sort_values ( by = \"diff\" , ascending = False ) . to_csv ( output , sep = \" \\t \" , index = False )","title":"percentage_diff"},{"location":"api/pheval/utils/semsim_utils/#src.pheval.utils.semsim_utils.semsim_analysis","text":"semsim_analysis Parameters: Name Type Description Default semsim_left Path File path of the first semantic similarity profile required semsim_right Path File path of the second semantic similarity profile required score_column str Score column that will be computed (e.g. jaccard_similarity) required absolute_diff bool Whether the difference is absolute (True) or percentage (False). True Returns: Type Description DataFrame [pd.DataFrame]: DataFrame with the differences between two semantic similarity profiles Source code in src/pheval/utils/semsim_utils.py 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 def semsim_analysis ( semsim_left : Path , semsim_right : Path , score_column : str , absolute_diff = True ) -> pd . DataFrame : \"\"\"semsim_analysis Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile score_column (str): Score column that will be computed (e.g. 
jaccard_similarity) absolute_diff (bool, optional): Whether the difference is absolute (True) or percentage (False). Defaults to True. Returns: [pd.DataFrame]: DataFrame with the differences between two semantic similarity profiles \"\"\" validate_semsim_file_comparison ( semsim_left , semsim_right ) cols = [ \"subject_id\" , \"object_id\" , score_column ] semsim_left = pd . read_csv ( semsim_left , sep = \" \\t \" ) semsim_right = pd . read_csv ( semsim_right , sep = \" \\t \" ) file_utils . ensure_columns_exists ( cols = cols , err_message = \"must exist in semsim dataframes\" , dataframes = [ semsim_left , semsim_right ], ) semsim_left = parse_semsim ( semsim_left , cols ) semsim_right = parse_semsim ( semsim_right , cols ) diff_df = diff_semsim ( semsim_left , semsim_right , score_column , absolute_diff ) return filter_non_0_score ( diff_df , \"diff\" )","title":"semsim_analysis"},{"location":"api/pheval/utils/semsim_utils/#src.pheval.utils.semsim_utils.semsim_heatmap_plot","text":"Plots semantic similarity profiles heatmap Parameters: Name Type Description Default semsim_left Path File path of the first semantic similarity profile required semsim_right Path File path of the second semantic similarity profile required score_column str Score column that will be computed (e.g. jaccard_similarity) required Source code in src/pheval/utils/semsim_utils.py 80 81 82 83 84 85 86 87 88 89 90 91 def semsim_heatmap_plot ( semsim_left : Path , semsim_right : Path , score_column : str ): \"\"\"Plots semantic similarity profiles heatmap Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile score_column (str): Score column that will be computed (e.g. jaccard_similarity) \"\"\" clean_df = semsim_analysis ( semsim_left , semsim_right , score_column ) df = clean_df . pivot ( index = \"subject_id\" , columns = \"object_id\" , values = \"diff\" ) fig = px . 
imshow ( df , text_auto = True ) fig . show ()","title":"semsim_heatmap_plot"},{"location":"api/pheval/utils/semsim_utils/#src.pheval.utils.semsim_utils.validate_semsim_file_comparison","text":"Checks if files exist and whether they're different Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile Raises: Exception: FileNotFoundException Source code in src/pheval/utils/semsim_utils.py 124 125 126 127 128 129 130 131 132 133 134 135 def validate_semsim_file_comparison ( semsim_left : Path , semsim_right : Path ): \"\"\"Checks if files exist and whether they're different Args: semsim_left (Path): File path of the first semantic similarity profile semsim_right (Path): File path of the second semantic similarity profile Raises: Exception: FileNotFoundException \"\"\" if semsim_left == semsim_right : errmsg = \"Semantic similarity profiles are equal. Make sure you have selected different files to analyze\" raise Exception ( errmsg ) file_utils . ensure_file_exists ( semsim_left , semsim_right )","title":"validate_semsim_file_comparison"},{"location":"api/pheval/utils/utils/","text":"Contains all pheval utility methods rand ( df , min_num , max_num , scramble_factor ) Numeric scrambling Args: df (pd.DataFrame): dataframe records min_num (int): min value from this records max_num (int): max value from this records scramble_factor (float): scramble factor scalar Returns: float: randomized number Source code in src/pheval/utils/utils.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 def rand ( df : pd . DataFrame , min_num : int , max_num : int , scramble_factor : float ) -> float : \"\"\" Numeric scrambling Args: df (pd.DataFrame): dataframe records min_num (int): min value from this records max_num (int): max value from this records scramble_factor (float): scramble factor scalar Returns: float: randomized number \"\"\" try : return df + ( random . 
uniform ( min_num , max_num ) * scramble_factor ) except TypeError as err : info_log . error ( df , exc_info = err ) return df semsim_scramble ( input , output , columns_to_be_scrambled , scramble_factor = 0.5 ) Scrambles semantic similarity profile with a magnitude between 0 and 1 (scramble_factor: 0 means no scrambling and 1 means complete randomisation). It then randomises the above scores with a degree of the scramble_factor and returns a scrambles pandas dataframe. Args: input (Path): scramble_factor (float) scalar scramble factor columns_to_be_scrambled (List[str]): columns that will be scrambled in semsim file (e.g. jaccard_similarity). output (Path) Returns: pd.Dataframe: scrambled dataframe Source code in src/pheval/utils/utils.py 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 def semsim_scramble ( input : Path , output : Path , columns_to_be_scrambled : List [ str ], scramble_factor : float = 0.5 , ) -> pd . DataFrame : \"\"\" Scrambles semantic similarity profile with a magnitude between 0 and 1 (scramble_factor: 0 means no scrambling and 1 means complete randomisation). It then randomises the above scores with a degree of the scramble_factor and returns a scrambles pandas dataframe. Args: input (Path): scramble_factor (float) scalar scramble factor columns_to_be_scrambled (List[str]): columns that will be scrambled in semsim file (e.g. jaccard_similarity). output (Path) Returns: pd.Dataframe: scrambled dataframe \"\"\" semsim = pd . read_csv ( input , sep = \" \\t \" ) dataframe = semsim_scramble_df ( semsim , columns_to_be_scrambled , scramble_factor ) dataframe . 
to_csv ( output , sep = \" \\t \" , index = False ) semsim_scramble_df ( dataframe , columns_to_be_scrambled , scramble_factor ) scramble_semsim_df Args: dataframe (pd.DataFrame): dataframe that contains semsim profile scramble_factor (float) scalar scramble factor columns_to_be_scrambled (List[str]): Returns: pd.Dataframe: scrambled dataframe Source code in src/pheval/utils/utils.py 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 def semsim_scramble_df ( dataframe : pd . DataFrame , columns_to_be_scrambled : List [ str ], scramble_factor : float , ) -> pd . DataFrame : \"\"\"scramble_semsim_df Args: dataframe (pd.DataFrame): dataframe that contains semsim profile scramble_factor (float) scalar scramble factor columns_to_be_scrambled (List[str]): Returns: pd.Dataframe: scrambled dataframe \"\"\" for col in columns_to_be_scrambled : min_num = dataframe [ col ] . min () max_num = dataframe [ col ] . max () dataframe [ col ] = dataframe [ col ] . apply ( rand , args = ( min_num , max_num , scramble_factor )) return dataframe","title":"Utils"},{"location":"api/pheval/utils/utils/#src.pheval.utils.utils.rand","text":"Numeric scrambling Args: df (pd.DataFrame): dataframe records min_num (int): min value from this records max_num (int): max value from this records scramble_factor (float): scramble factor scalar Returns: float: randomized number Source code in src/pheval/utils/utils.py 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 def rand ( df : pd . DataFrame , min_num : int , max_num : int , scramble_factor : float ) -> float : \"\"\" Numeric scrambling Args: df (pd.DataFrame): dataframe records min_num (int): min value from this records max_num (int): max value from this records scramble_factor (float): scramble factor scalar Returns: float: randomized number \"\"\" try : return df + ( random . uniform ( min_num , max_num ) * scramble_factor ) except TypeError as err : info_log . 
error ( df , exc_info = err ) return df","title":"rand"},{"location":"api/pheval/utils/utils/#src.pheval.utils.utils.semsim_scramble","text":"Scrambles semantic similarity profile with a magnitude between 0 and 1 (scramble_factor: 0 means no scrambling and 1 means complete randomisation). It then randomises the above scores with a degree of the scramble_factor and returns a scrambles pandas dataframe. Args: input (Path): scramble_factor (float) scalar scramble factor columns_to_be_scrambled (List[str]): columns that will be scrambled in semsim file (e.g. jaccard_similarity). output (Path) Returns: pd.Dataframe: scrambled dataframe Source code in src/pheval/utils/utils.py 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 def semsim_scramble ( input : Path , output : Path , columns_to_be_scrambled : List [ str ], scramble_factor : float = 0.5 , ) -> pd . DataFrame : \"\"\" Scrambles semantic similarity profile with a magnitude between 0 and 1 (scramble_factor: 0 means no scrambling and 1 means complete randomisation). It then randomises the above scores with a degree of the scramble_factor and returns a scrambles pandas dataframe. Args: input (Path): scramble_factor (float) scalar scramble factor columns_to_be_scrambled (List[str]): columns that will be scrambled in semsim file (e.g. jaccard_similarity). output (Path) Returns: pd.Dataframe: scrambled dataframe \"\"\" semsim = pd . read_csv ( input , sep = \" \\t \" ) dataframe = semsim_scramble_df ( semsim , columns_to_be_scrambled , scramble_factor ) dataframe . 
to_csv ( output , sep = \" \\t \" , index = False )","title":"semsim_scramble"},{"location":"api/pheval/utils/utils/#src.pheval.utils.utils.semsim_scramble_df","text":"scramble_semsim_df Args: dataframe (pd.DataFrame): dataframe that contains semsim profile scramble_factor (float) scalar scramble factor columns_to_be_scrambled (List[str]): Returns: pd.Dataframe: scrambled dataframe Source code in src/pheval/utils/utils.py 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 def semsim_scramble_df ( dataframe : pd . DataFrame , columns_to_be_scrambled : List [ str ], scramble_factor : float , ) -> pd . DataFrame : \"\"\"scramble_semsim_df Args: dataframe (pd.DataFrame): dataframe that contains semsim profile scramble_factor (float) scalar scramble factor columns_to_be_scrambled (List[str]): Returns: pd.Dataframe: scrambled dataframe \"\"\" for col in columns_to_be_scrambled : min_num = dataframe [ col ] . min () max_num = dataframe [ col ] . max () dataframe [ col ] = dataframe [ col ] . apply ( rand , args = ( min_num , max_num , scramble_factor )) return dataframe","title":"semsim_scramble_df"}]}
\ No newline at end of file
diff --git a/sitemap.xml b/sitemap.xml
index a153d4ee..e5dc08eb 100644
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -2,178 +2,178 @@
https://monarch-initiative.github.io/pheval/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/CODE_OF_CONDUCT/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/about/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/contact/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/contributing/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/developing_a_pheval_plugin/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/executing_a_benchmark/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/exomiser_pipeline/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/pipeline/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/plugins/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/roadmap/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/styleguide/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/cli/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/config_parser/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/run_metadata/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/analyse/analysis/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/analyse/assess_prioritisation_base/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/analyse/benchmark_db_manager/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/analyse/benchmark_generator/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/analyse/benchmarking_data/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/analyse/binary_classification_stats/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/analyse/disease_prioritisation_analysis/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/analyse/gene_prioritisation_analysis/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/analyse/generate_plots/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/analyse/generate_summary_outputs/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/analyse/parse_benchmark_summary/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/analyse/parse_corpus/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/analyse/prioritisation_result_types/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/analyse/rank_stats/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/analyse/run_data_parser/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/analyse/variant_prioritisation_analysis/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/infra/exomiserdb/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/post_processing/post_processing/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/prepare/create_noisy_phenopackets/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/prepare/create_spiked_vcf/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/prepare/custom_exceptions/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/prepare/prepare_corpus/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/prepare/update_phenopacket/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/runners/runner/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/utils/exomiser/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/utils/file_utils/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/utils/phenopacket_utils/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/utils/semsim_utils/
- 2024-10-23
+ 2024-12-04
https://monarch-initiative.github.io/pheval/api/pheval/utils/utils/
- 2024-10-23
+ 2024-12-04
\ No newline at end of file
diff --git a/sitemap.xml.gz b/sitemap.xml.gz
index 1a9d21ad..e4c98b23 100644
Binary files a/sitemap.xml.gz and b/sitemap.xml.gz differ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|