diff --git a/docs/404.html b/docs/404.html
index 35acd5b9..64e8ebc5 100644
--- a/docs/404.html
+++ b/docs/404.html
@@ -649,7 +649,7 @@
-
+
Phenio
diff --git a/docs/CLI/index.html b/docs/CLI/index.html
index 7a0019b0..dead4af2 100644
--- a/docs/CLI/index.html
+++ b/docs/CLI/index.html
@@ -755,7 +755,7 @@
-
+
Phenio
diff --git a/docs/Create-an-Ingest/1. Propose/index.html b/docs/Create-an-Ingest/1. Propose/index.html
index 2b4c61cc..9b66cd57 100644
--- a/docs/Create-an-Ingest/1. Propose/index.html
+++ b/docs/Create-an-Ingest/1. Propose/index.html
@@ -670,7 +670,7 @@
-
+
Phenio
diff --git a/docs/Create-an-Ingest/2. Configure/index.html b/docs/Create-an-Ingest/2. Configure/index.html
index be285eab..b8b5dcbe 100644
--- a/docs/Create-an-Ingest/2. Configure/index.html
+++ b/docs/Create-an-Ingest/2. Configure/index.html
@@ -670,7 +670,7 @@
-
+
Phenio
diff --git a/docs/Create-an-Ingest/3. Document/index.html b/docs/Create-an-Ingest/3. Document/index.html
index 9082642c..ac7a7549 100644
--- a/docs/Create-an-Ingest/3. Document/index.html
+++ b/docs/Create-an-Ingest/3. Document/index.html
@@ -670,7 +670,7 @@
-
+
Phenio
diff --git a/docs/Create-an-Ingest/4. Implement/index.html b/docs/Create-an-Ingest/4. Implement/index.html
index caa19ec3..b1e76342 100644
--- a/docs/Create-an-Ingest/4. Implement/index.html
+++ b/docs/Create-an-Ingest/4. Implement/index.html
@@ -736,7 +736,7 @@
-
+
Phenio
diff --git a/docs/Create-an-Ingest/5. Test/index.html b/docs/Create-an-Ingest/5. Test/index.html
index 0e3e472a..c84b8722 100644
--- a/docs/Create-an-Ingest/5. Test/index.html
+++ b/docs/Create-an-Ingest/5. Test/index.html
@@ -743,7 +743,7 @@
-
+
Phenio
diff --git a/docs/Create-an-Ingest/index.html b/docs/Create-an-Ingest/index.html
index 9a09c2cb..7de5be61 100644
--- a/docs/Create-an-Ingest/index.html
+++ b/docs/Create-an-Ingest/index.html
@@ -670,7 +670,7 @@
-
+
Phenio
diff --git a/docs/KG-Build-Process/kg-build-process/index.html b/docs/KG-Build-Process/kg-build-process/index.html
index bb1371b4..564d7352 100644
--- a/docs/KG-Build-Process/kg-build-process/index.html
+++ b/docs/KG-Build-Process/kg-build-process/index.html
@@ -788,7 +788,7 @@
-
+
Phenio
diff --git a/docs/Principles/modeling-principles/index.html b/docs/Principles/modeling-principles/index.html
index 058af706..00877428 100644
--- a/docs/Principles/modeling-principles/index.html
+++ b/docs/Principles/modeling-principles/index.html
@@ -734,7 +734,7 @@
-
+
Phenio
diff --git a/docs/Sources/alliance/index.html b/docs/Sources/alliance/index.html
index f634e200..81ad4644 100644
--- a/docs/Sources/alliance/index.html
+++ b/docs/Sources/alliance/index.html
@@ -729,7 +729,7 @@
-
+
Phenio
diff --git a/docs/Sources/bgee/index.html b/docs/Sources/bgee/index.html
index 8cbf0bc8..8120e374 100644
--- a/docs/Sources/bgee/index.html
+++ b/docs/Sources/bgee/index.html
@@ -708,7 +708,7 @@
-
+
Phenio
diff --git a/docs/Sources/ctd/index.html b/docs/Sources/ctd/index.html
index d19137ed..4e5075bf 100644
--- a/docs/Sources/ctd/index.html
+++ b/docs/Sources/ctd/index.html
@@ -701,7 +701,7 @@
-
+
Phenio
diff --git a/docs/Sources/dictybase/index.html b/docs/Sources/dictybase/index.html
index 5357f436..38610998 100644
--- a/docs/Sources/dictybase/index.html
+++ b/docs/Sources/dictybase/index.html
@@ -656,7 +656,7 @@
-
+
Phenio
diff --git a/docs/Sources/flybase/index.html b/docs/Sources/flybase/index.html
index ae8cd9d9..74ce8c44 100644
--- a/docs/Sources/flybase/index.html
+++ b/docs/Sources/flybase/index.html
@@ -656,7 +656,7 @@
-
+
Phenio
diff --git a/docs/Sources/goa/index.html b/docs/Sources/goa/index.html
index 42d7a9d3..83b98547 100644
--- a/docs/Sources/goa/index.html
+++ b/docs/Sources/goa/index.html
@@ -735,7 +735,7 @@
-
+
Phenio
diff --git a/docs/Sources/hgnc/index.html b/docs/Sources/hgnc/index.html
index a6c8cb8d..7646395a 100644
--- a/docs/Sources/hgnc/index.html
+++ b/docs/Sources/hgnc/index.html
@@ -708,7 +708,7 @@
-
+
Phenio
diff --git a/docs/Sources/hpoa/index.html b/docs/Sources/hpoa/index.html
index 639249c1..48100ef1 100644
--- a/docs/Sources/hpoa/index.html
+++ b/docs/Sources/hpoa/index.html
@@ -729,7 +729,7 @@
-
+
Phenio
diff --git a/docs/Sources/index.html b/docs/Sources/index.html
index 76436666..08c0025f 100644
--- a/docs/Sources/index.html
+++ b/docs/Sources/index.html
@@ -670,7 +670,7 @@
-
+
Phenio
diff --git a/docs/Sources/mgi/index.html b/docs/Sources/mgi/index.html
index aba1e016..19276cd0 100644
--- a/docs/Sources/mgi/index.html
+++ b/docs/Sources/mgi/index.html
@@ -656,7 +656,7 @@
-
+
Phenio
diff --git a/docs/Sources/ncbi/index.html b/docs/Sources/ncbi/index.html
index 430f9475..8c7fce19 100644
--- a/docs/Sources/ncbi/index.html
+++ b/docs/Sources/ncbi/index.html
@@ -708,7 +708,7 @@
-
+
Phenio
diff --git a/docs/Sources/panther/index.html b/docs/Sources/panther/index.html
index 777f6fd0..b658ee84 100644
--- a/docs/Sources/panther/index.html
+++ b/docs/Sources/panther/index.html
@@ -769,7 +769,7 @@
-
+
Phenio
@@ -1305,13 +1305,13 @@ Citation
-
diff --git a/docs/Sources/reactome/index.html b/docs/Sources/reactome/index.html
index 8916b163..8d798ce6 100644
--- a/docs/Sources/reactome/index.html
+++ b/docs/Sources/reactome/index.html
@@ -660,7 +660,7 @@
-
+
Phenio
diff --git a/docs/Sources/rgd/index.html b/docs/Sources/rgd/index.html
index 1651e4e3..440602d8 100644
--- a/docs/Sources/rgd/index.html
+++ b/docs/Sources/rgd/index.html
@@ -656,7 +656,7 @@
-
+
Phenio
diff --git a/docs/Sources/sgd/index.html b/docs/Sources/sgd/index.html
index 018ff8c5..06530dc3 100644
--- a/docs/Sources/sgd/index.html
+++ b/docs/Sources/sgd/index.html
@@ -656,7 +656,7 @@
-
+
Phenio
diff --git a/docs/Sources/string/index.html b/docs/Sources/string/index.html
index b41daf96..181fe0f9 100644
--- a/docs/Sources/string/index.html
+++ b/docs/Sources/string/index.html
@@ -660,7 +660,7 @@
-
+
Phenio
diff --git a/docs/Sources/xenbase/index.html b/docs/Sources/xenbase/index.html
index 90349022..903650c6 100644
--- a/docs/Sources/xenbase/index.html
+++ b/docs/Sources/xenbase/index.html
@@ -656,7 +656,7 @@
-
+
Phenio
diff --git a/docs/Sources/zfin/index.html b/docs/Sources/zfin/index.html
index 198a0597..c5b6b654 100644
--- a/docs/Sources/zfin/index.html
+++ b/docs/Sources/zfin/index.html
@@ -660,7 +660,7 @@
-
+
Phenio
diff --git a/docs/index.html b/docs/index.html
index 988a5dfd..8802507f 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -713,7 +713,7 @@
-
+
Phenio
diff --git a/docs/search/search_index.json b/docs/search/search_index.json
index 836adba1..da579d90 100644
--- a/docs/search/search_index.json
+++ b/docs/search/search_index.json
@@ -1 +1 @@
-{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Monarch Ingest Overview The Monarch Ingest generates KGX formatted files conforming to the BioLink Model from a wide variety of biomedical data sources. The eventual output of the Monarch Ingest process is the Monarch KG . The latest version of this can be found at data.monarchinitiative.org See also the folder monarch-kg-dev/latest Monarch Ingest is built using Poetry , which will create its own virtual environment. Installation monarch-ingest is a Python 3.8+ package, installable via Poetry . Install Poetry , if you don't already have it: curl -sSL https://install.python-poetry.org | python3 - # Optional: Have poetry create its venvs in your project directories poetry config virtualenvs.in-project true Clone the repo and build the code: git clone git@github.com/monarch-initiative/monarch-ingest Install monarch-ingest: cd monarch-ingest poetry install (Optional) Activate the virtual environment: # This step removes the need to prefix all commands with `poetry run` poetry shell Usage For a detailed tutorial on ingests and how to make one, see the Create an Ingest tab . CLI usage is available in the CLI tab , gcor by running ingest --help . Run the whole pipeline! Download the source data: ingest download --all Run all transforms: ingest transform --all Merge all transformed output into a tar.gz containing one node and one edge file ingest merge Upload the results to the Monarch Ingest Google bucket ingest release","title":"Welcome"},{"location":"#monarch-ingest","text":"","title":"Monarch Ingest"},{"location":"#overview","text":"The Monarch Ingest generates KGX formatted files conforming to the BioLink Model from a wide variety of biomedical data sources. The eventual output of the Monarch Ingest process is the Monarch KG . 
The latest version of this can be found at data.monarchinitiative.org See also the folder monarch-kg-dev/latest Monarch Ingest is built using Poetry , which will create its own virtual environment.","title":"Overview"},{"location":"#installation","text":"monarch-ingest is a Python 3.8+ package, installable via Poetry . Install Poetry , if you don't already have it: curl -sSL https://install.python-poetry.org | python3 - # Optional: Have poetry create its venvs in your project directories poetry config virtualenvs.in-project true Clone the repo and build the code: git clone git@github.com/monarch-initiative/monarch-ingest Install monarch-ingest: cd monarch-ingest poetry install (Optional) Activate the virtual environment: # This step removes the need to prefix all commands with `poetry run` poetry shell","title":"Installation"},{"location":"#usage","text":"For a detailed tutorial on ingests and how to make one, see the Create an Ingest tab . CLI usage is available in the CLI tab , gcor by running ingest --help . Run the whole pipeline! Download the source data: ingest download --all Run all transforms: ingest transform --all Merge all transformed output into a tar.gz containing one node and one edge file ingest merge Upload the results to the Monarch Ingest Google bucket ingest release","title":"Usage"},{"location":"CLI/","text":"ingest Usage : $ ingest [ OPTIONS ] COMMAND [ ARGS ] ... Options : --version --install-completion : Install completion for the current shell. --show-completion : Show completion for the current shell, to copy it or customize the installation. --help : Show this message and exit. Commands : closure download : Downloads data defined in download.yaml export jsonl merge : Merge nodes and edges into kg release : Copy data to Monarch GCP data buckets solr sqlite transform : Run Koza transformation on specified... ingest closure Usage : $ ingest closure [ OPTIONS ] Options : --help : Show this message and exit. 
ingest download Downloads data defined in download.yaml Usage : $ ingest download [ OPTIONS ] Options : --ingests TEXT : Which ingests to download data for --all / --no-all : Download all ingest datasets [default: no-all] --help : Show this message and exit. ingest export Usage : $ ingest export [ OPTIONS ] Options : --help : Show this message and exit. ingest jsonl Usage : $ ingest jsonl [ OPTIONS ] Options : --help : Show this message and exit. ingest merge Merge nodes and edges into kg Usage : $ ingest merge [ OPTIONS ] Options : --input-dir TEXT : Directory with nodes and edges to be merged [default: output/transform_output] --output-dir TEXT : Directory to output data [default: output] -d, --debug / -q, --quiet : Use --quiet to suppress log output, --debug for verbose --help : Show this message and exit. ingest release Copy data to Monarch GCP data buckets Usage : $ ingest release [ OPTIONS ] Options : --dir TEXT : Directory with kg to be released [default: output] --kghub / --no-kghub : Also release to kghub S3 bucket [default: no-kghub] --help : Show this message and exit. ingest solr Usage : $ ingest solr [ OPTIONS ] Options : --help : Show this message and exit. ingest sqlite Usage : $ ingest sqlite [ OPTIONS ] Options : --help : Show this message and exit. 
ingest transform Run Koza transformation on specified Monarch ingests Usage : $ ingest transform [ OPTIONS ] Options : -o, --output-dir TEXT : Directory to output data [default: output] -i, --ingest TEXT : Run a single ingest (see ingests.yaml for a list) --phenio / --no-phenio : Run the phenio transform [default: no-phenio] -a, --all : Ingest all sources -f, --force : Force ingest, even if output exists (on by default for single ingests) --rdf / --no-rdf : Output rdf files along with tsv [default: no-rdf] -d, --debug / -q, --quiet : Use --quiet to suppress log output, --debug for verbose, including Koza logs -l, --log : Write DEBUG level logs to ./logs/ for each ingest -n, --row-limit INTEGER : Number of rows to process --help : Show this message and exit.","title":"CLI"},{"location":"CLI/#ingest","text":"Usage : $ ingest [ OPTIONS ] COMMAND [ ARGS ] ... Options : --version --install-completion : Install completion for the current shell. --show-completion : Show completion for the current shell, to copy it or customize the installation. --help : Show this message and exit. 
Commands : closure download : Downloads data defined in download.yaml export jsonl merge : Merge nodes and edges into kg release : Copy data to Monarch GCP data buckets solr sqlite transform : Run Koza transformation on specified...","title":"ingest"},{"location":"CLI/#ingest-closure","text":"Usage : $ ingest closure [ OPTIONS ] Options : --help : Show this message and exit.","title":"ingest closure"},{"location":"CLI/#ingest-download","text":"Downloads data defined in download.yaml Usage : $ ingest download [ OPTIONS ] Options : --ingests TEXT : Which ingests to download data for --all / --no-all : Download all ingest datasets [default: no-all] --help : Show this message and exit.","title":"ingest download"},{"location":"CLI/#ingest-export","text":"Usage : $ ingest export [ OPTIONS ] Options : --help : Show this message and exit.","title":"ingest export"},{"location":"CLI/#ingest-jsonl","text":"Usage : $ ingest jsonl [ OPTIONS ] Options : --help : Show this message and exit.","title":"ingest jsonl"},{"location":"CLI/#ingest-merge","text":"Merge nodes and edges into kg Usage : $ ingest merge [ OPTIONS ] Options : --input-dir TEXT : Directory with nodes and edges to be merged [default: output/transform_output] --output-dir TEXT : Directory to output data [default: output] -d, --debug / -q, --quiet : Use --quiet to suppress log output, --debug for verbose --help : Show this message and exit.","title":"ingest merge"},{"location":"CLI/#ingest-release","text":"Copy data to Monarch GCP data buckets Usage : $ ingest release [ OPTIONS ] Options : --dir TEXT : Directory with kg to be released [default: output] --kghub / --no-kghub : Also release to kghub S3 bucket [default: no-kghub] --help : Show this message and exit.","title":"ingest release"},{"location":"CLI/#ingest-solr","text":"Usage : $ ingest solr [ OPTIONS ] Options : --help : Show this message and exit.","title":"ingest solr"},{"location":"CLI/#ingest-sqlite","text":"Usage : $ ingest sqlite [ OPTIONS ] Options : 
--help : Show this message and exit.","title":"ingest sqlite"},{"location":"CLI/#ingest-transform","text":"Run Koza transformation on specified Monarch ingests Usage : $ ingest transform [ OPTIONS ] Options : -o, --output-dir TEXT : Directory to output data [default: output] -i, --ingest TEXT : Run a single ingest (see ingests.yaml for a list) --phenio / --no-phenio : Run the phenio transform [default: no-phenio] -a, --all : Ingest all sources -f, --force : Force ingest, even if output exists (on by default for single ingests) --rdf / --no-rdf : Output rdf files along with tsv [default: no-rdf] -d, --debug / -q, --quiet : Use --quiet to suppress log output, --debug for verbose, including Koza logs -l, --log : Write DEBUG level logs to ./logs/ for each ingest -n, --row-limit INTEGER : Number of rows to process --help : Show this message and exit.","title":"ingest transform"},{"location":"Create-an-Ingest/","text":"What is an Ingest? Ingest Overview An ingest consists of 2 main steps: Downloading the data Transforming the data With 2 post-processing steps: Merging the output into a KGX knowledge graph Releasing the result to the Monarch Initiative Google Cloud bucket Let's go through the process for running an existing monarch ingest! Step 1. Download Download the dataset for your ingest, for example: ingest download --tags ncbi_gene or to download all source data: ingest download --all Step 2. Transform Transform the data, for example: ingest transform --tag ncbi_gene --row-limit 20 --log or ingest transform --all Step 3. Merge This step is typically performed after ingest transform --all , and merges all output node and edge files into a tar.gz containing one node and one edge file: ingest merge Step 4. Release Once you've transformed all the data and merged the output, you can create and upload a release: ingest release -- Now let's look at how to create and add a new ingest! 
First step: Propose a new Ingest","title":"What is an Ingest?"},{"location":"Create-an-Ingest/#what-is-an-ingest","text":"Ingest Overview An ingest consists of 2 main steps: Downloading the data Transforming the data With 2 post-processing steps: Merging the output into a KGX knowledge graph Releasing the result to the Monarch Initiative Google Cloud bucket Let's go through the process for running an existing monarch ingest! Step 1. Download Download the dataset for your ingest, for example: ingest download --tags ncbi_gene or to download all source data: ingest download --all Step 2. Transform Transform the data, for example: ingest transform --tag ncbi_gene --row-limit 20 --log or ingest transform --all Step 3. Merge This step is typically performed after ingest transform --all , and merges all output node and edge files into a tar.gz containing one node and one edge file: ingest merge Step 4. Release Once you've transformed all the data and merged the output, you can create and upload a release: ingest release -- Now let's look at how to create and add a new ingest! First step: Propose a new Ingest","title":"What is an Ingest?"},{"location":"Create-an-Ingest/1.%20Propose/","text":"Propose Propose an Ingest : create a ticket on GitHub Ingest that includes the name of the source and justification for it's inclusion in Monarch. Assign the ticket to @putmantime & @sagehrke (Monarch PM). Who : Anyone can submit a proposal. Estimate Workload : Utilize planning poker to identify the amount of work the proposed ingest will be. Who : Monarch Technical Team. The Monarch PM will initialize the planning poker vote. Put up for vote & discussion : Use voting on github (thumb up for approve / thumb down for reject). Voting & discussion are open for two weeks. Who : Anyone can vote on a proposal. The Monarch PM will initialize the vote. If voted positive : assign the ingest to a team member, start working on ingest, then create a PR. Skip to Step 6. 
Who : Monarch Technical Team If voted negative , note why it was downvoted and close the issue. Who : Monarch Technical Team Disseminate the proposed model and gather feedback : Send an email to the Monarch Leads Google Group and the slack ingest channel (kg-monarch) requesting input. All discussions are done in GitHub on the PR. Tag those that need to respond in a comment on the PR. The feedback stage is open for two weeks. Who : Monarch Technical Team member assigned to the PR in Step 4 will disseminate the proposal. Anyone can comment or suggest input. Deploy the new ingest Who : Monarch Technical Team Now let's look at how to create and add a new ingest! First step: Configure","title":"Propose"},{"location":"Create-an-Ingest/1.%20Propose/#propose","text":"Propose an Ingest : create a ticket on GitHub Ingest that includes the name of the source and justification for it's inclusion in Monarch. Assign the ticket to @putmantime & @sagehrke (Monarch PM). Who : Anyone can submit a proposal. Estimate Workload : Utilize planning poker to identify the amount of work the proposed ingest will be. Who : Monarch Technical Team. The Monarch PM will initialize the planning poker vote. Put up for vote & discussion : Use voting on github (thumb up for approve / thumb down for reject). Voting & discussion are open for two weeks. Who : Anyone can vote on a proposal. The Monarch PM will initialize the vote. If voted positive : assign the ingest to a team member, start working on ingest, then create a PR. Skip to Step 6. Who : Monarch Technical Team If voted negative , note why it was downvoted and close the issue. Who : Monarch Technical Team Disseminate the proposed model and gather feedback : Send an email to the Monarch Leads Google Group and the slack ingest channel (kg-monarch) requesting input. All discussions are done in GitHub on the PR. Tag those that need to respond in a comment on the PR. The feedback stage is open for two weeks. 
Who : Monarch Technical Team member assigned to the PR in Step 4 will disseminate the proposal. Anyone can comment or suggest input. Deploy the new ingest Who : Monarch Technical Team Now let's look at how to create and add a new ingest! First step: Configure","title":"Propose"},{"location":"Create-an-Ingest/2.%20Configure/","text":"Configure Make a directory for your ingest, using the source of the data as the name: mkdir src/monarch_ingest/ingests/ For example: mkdir src/monarch_ingest/ingests/ncbi Add data sources to src/monarch_ingest/download.yaml : # - url : https://.com/downloads/somedata.txt local_name : data//somedata.txt tag : _ For example: # mgi - url : http://www.informatics.jax.org/downloads/reports/MRK_Reference.rpt local_name : data/mgi/MRK_Reference.rpt tag : mgi_publication_to_gene Note: You can now use ingest download --tags or ingest download --all , and your data will be downloaded to the appropriate subdir in data/ Add your ingest to src/monarch_ingest/ingests.yaml : : config : 'ingests//.yaml For example: ncbi_gene : config : 'ingests/ncbi/gene.yaml' Copy the template: cp ingest_template/* src/monarch_ingest/ingests/ Edit metadata.yaml : Update the description, rights link, url, etc and then add your source_file Edit the source file yaml Match the columns or required fields with what's available in the file to be ingested If it's an ingest that exists in Dipper , check out what Dipper does. 
Check the Biolink Model documentation to look at what you can capture If what we need from an ingest can't be captured in the model yet, make a new Biolink issue Set the header properties If there is no header at all, set header: False If there are comment lines before the header, count them and set skip_lines: {n} -- Next step: Adding documentation","title":"Configure"},{"location":"Create-an-Ingest/2.%20Configure/#configure","text":"Make a directory for your ingest, using the source of the data as the name: mkdir src/monarch_ingest/ingests/ For example: mkdir src/monarch_ingest/ingests/ncbi Add data sources to src/monarch_ingest/download.yaml : # - url : https://.com/downloads/somedata.txt local_name : data//somedata.txt tag : _ For example: # mgi - url : http://www.informatics.jax.org/downloads/reports/MRK_Reference.rpt local_name : data/mgi/MRK_Reference.rpt tag : mgi_publication_to_gene Note: You can now use ingest download --tags or ingest download --all , and your data will be downloaded to the appropriate subdir in data/ Add your ingest to src/monarch_ingest/ingests.yaml : : config : 'ingests//.yaml For example: ncbi_gene : config : 'ingests/ncbi/gene.yaml' Copy the template: cp ingest_template/* src/monarch_ingest/ingests/ Edit metadata.yaml : Update the description, rights link, url, etc and then add your source_file Edit the source file yaml Match the columns or required fields with what's available in the file to be ingested If it's an ingest that exists in Dipper , check out what Dipper does. 
Check the Biolink Model documentation to look at what you can capture If what we need from an ingest can't be captured in the model yet, make a new Biolink issue Set the header properties If there is no header at all, set header: False If there are comment lines before the header, count them and set skip_lines: {n} -- Next step: Adding documentation","title":"Configure"},{"location":"Create-an-Ingest/3.%20Document/","text":"Document The documentation for an ingest should reflect both the decision-making process that led to the output, and the output itself. Begin by copying the source.md file to the docs/Sources/ folder, renaming it to match the ingest name. Tip This is a great time to look over the columns in the ingest file. Consider what biolink classes are appropriate to represent them, and what fields are available to populate on each. Some helpful resources: Biolink Documentation List of Biolink Associations Use a Jupyter Notebook with Biolink Model Toolkit to do things like get_element_by_mapping('RO:0002410') For ingests migrating from Dipper, check out the documentation and source code -- Next step: Begin implementation","title":"Document"},{"location":"Create-an-Ingest/3.%20Document/#document","text":"The documentation for an ingest should reflect both the decision-making process that led to the output, and the output itself. Begin by copying the source.md file to the docs/Sources/ folder, renaming it to match the ingest name. Tip This is a great time to look over the columns in the ingest file. Consider what biolink classes are appropriate to represent them, and what fields are available to populate on each. 
Some helpful resources: Biolink Documentation List of Biolink Associations Use a Jupyter Notebook with Biolink Model Toolkit to do things like get_element_by_mapping('RO:0002410') For ingests migrating from Dipper, check out the documentation and source code -- Next step: Begin implementation","title":"Document"},{"location":"Create-an-Ingest/4.%20Implement/","text":"Implement Most Koza scripts can run in flat mode, which means that the transform code itself doesn't need to handle the looping mechanism, and instead the transform code will have a row injected at the top and call the write command at the bottom. In between fields from the incoming row should be mapped to Biolink instances. Imports and setup Start with the imports, and make sure to set the source_name, which will be used for communicating with the reader and writer. from koza.cli_runner import koza_app from biolink.pydanticmodel import Gene # The source name is used for reading and writing source_name = \"gene-information\" Inject the row # inject a single row from the source row = koza_app . get_row ( source_name ) Extras Next up handle any additional set up for the ingest, such as including a map or bringing in the CURIE cleaning service curie_cleaner = koza_app . curie_cleaner eqe2zp = koza_app . get_map ( \"eqe2zp\" ) translation_table = koza_app . translation_table Creating entities At this step, hopefully your documentation is so good that you're just letting your fingers take on the last step of converting what you've already planned into Python syntax. Ideally not much logic will be needed here, and if there's a lot, it might be worth considering whether an ingest (even on the same file) can be split across multiple transforms so that each is as easy to read as possible. Aim to add all properties when creating the instance, but in some cases adding optional lists might need to happen below. 
from biolink.pydanticmodel import Gene gene = Gene ( id = 'somethingbase:' + row [ 'ID' ], name = row [ 'Name' ] ) # populate any additional optional properties if row [ 'xrefs' ]: gene . xrefs = [ curie_cleaner . clean ( xref ) for xref in row [ 'xrefs' ]] Writing At the end of the script, call the writer. The first argument must be the source_name (so that it will know where to write), entities should be passed in as additional arguments. koza_app . write ( gene , phenotypicFeature , association ) Running your ingest To execute your ingest, you can now run: ingest transform --tag -- Next step: Testing!","title":"Implement"},{"location":"Create-an-Ingest/4.%20Implement/#implement","text":"Most Koza scripts can run in flat mode, which means that the transform code itself doesn't need to handle the looping mechanism, and instead the transform code will have a row injected at the top and call the write command at the bottom. In between fields from the incoming row should be mapped to Biolink instances.","title":"Implement"},{"location":"Create-an-Ingest/4.%20Implement/#imports-and-setup","text":"Start with the imports, and make sure to set the source_name, which will be used for communicating with the reader and writer. from koza.cli_runner import koza_app from biolink.pydanticmodel import Gene # The source name is used for reading and writing source_name = \"gene-information\"","title":"Imports and setup"},{"location":"Create-an-Ingest/4.%20Implement/#inject-the-row","text":"# inject a single row from the source row = koza_app . get_row ( source_name )","title":"Inject the row"},{"location":"Create-an-Ingest/4.%20Implement/#extras","text":"Next up handle any additional set up for the ingest, such as including a map or bringing in the CURIE cleaning service curie_cleaner = koza_app . curie_cleaner eqe2zp = koza_app . get_map ( \"eqe2zp\" ) translation_table = koza_app . 
translation_table","title":"Extras"},{"location":"Create-an-Ingest/4.%20Implement/#creating-entities","text":"At this step, hopefully your documentation is so good that you're just letting your fingers take on the last step of converting what you've already planned into Python syntax. Ideally not much logic will be needed here, and if there's a lot, it might be worth considering whether an ingest (even on the same file) can be split across multiple transforms so that each is as easy to read as possible. Aim to add all properties when creating the instance, but in some cases adding optional lists might need to happen below. from biolink.pydanticmodel import Gene gene = Gene ( id = 'somethingbase:' + row [ 'ID' ], name = row [ 'Name' ] ) # populate any additional optional properties if row [ 'xrefs' ]: gene . xrefs = [ curie_cleaner . clean ( xref ) for xref in row [ 'xrefs' ]]","title":"Creating entities"},{"location":"Create-an-Ingest/4.%20Implement/#writing","text":"At the end of the script, call the writer. The first argument must be the source_name (so that it will know where to write), entities should be passed in as additional arguments. koza_app . write ( gene , phenotypicFeature , association )","title":"Writing"},{"location":"Create-an-Ingest/4.%20Implement/#running-your-ingest","text":"To execute your ingest, you can now run: ingest transform --tag -- Next step: Testing!","title":"Running your ingest"},{"location":"Create-an-Ingest/5.%20Test/","text":"Testing You may want to start with the test template within ingest_template Basic fixtures First, set up your basic fixtures, taking care to set the correct source name and location for the transform code. import pytest from koza.cli_runner import get_translation_table @pytest . fixture def tt (): return get_translation_table ( \"src/monarch_ingest/translation_table.yaml\" , None ) # This name must match the ingest name in the transform code @pytest . 
fixture def source_name (): return \"something-to-somethingelse\" # This is the location of the transform code @pytest . fixture def script (): return \"./src/monarch_ingest/ingests/somethingbase/something2somethingelse.py\" A map, if necessary Some ingests will depend on one or more maps, that fixture can be set up here. Note that this fixture must return a map of maps, and that the inner maps will map from an ID to a dictionary representing column headers and values. In the example below, a map is created that maps from a big concatenated natural key (as the ID) for ZP to a single column (called iri ) that contains the ZP ID. This map is then placed into the map cache under the name eqe2zp @pytest . fixture def map_cache (): eqe2zp = { \"0-0-ZFA:0000042-PATO:0000638-0-0-0\" : { \"iri\" : \"ZP:0004225\" }, \"BSPO:0000112-BFO:0000050-ZFA:0000042-PATO:0000638-0-0-0\" : { \"iri\" : \"ZP:0011243\" }, \"BSPO:0000000-BFO:0000050-ZFA:0000823-PATO:0000642-BSPO:0000007-BFO:0000050-ZFA:0000823\" : { \"iri\" : \"ZP:0000157\" }, } return { \"eqe2zp\" : eqe2zp } Fixtures for test data Create a fixture that returns a dictionary to represent a single row. As a matter of strategy, this row should probably represent a fairly basic row being ingested. One trick so that you don't have to manually convert from the imput format to a python dictionary format is to run your ingest with a debugger and set a breakpoint just after a row has been injected. If you want a more specific piece of data, check out conditional breakpoints. @pytest . fixture def basic_row (): return { \"ID\" : \"341492416\" , \"Gene Symbol\" : \"pax2a\" , \"Gene ID\" : \"ZDB-GENE-990415-8\" , #... \"Fish Environment ID\" : \"ZDB-GENOX-041102-1385\" , \"Publication ID\" : \"ZDB-PUB-970210-19\" , \"Figure ID\" : \"ZDB-FIG-120307-8\" , } Fixture for transforming a single row This sets up a fixture you can call more than once to independently test different attributes @pytest . 
fixture def basic_g2p ( mock_koza , source_name , basic_row , script , map_cache , tt ): return mock_koza ( source_name , iter ([ basic_row ]), script , map_cache = map_cache , translation_table = tt , ) Test the basics of the ingest Confirm that entities are created matching the expectations on the row # A simple end-to-end test is to confirm that the IDs are set on def test_gene ( basic_g2p ): gene = basic_g2p [ 0 ] assert gene assert gene . id == \"ZFIN:ZDB-GENE-990415-8\" def test_phenotypic_feature ( basic_g2p ): phenotypic_feature = basic_g2p [ 1 ] assert phenotypic_feature assert phenotypic_feature . id == \"ZP:0004225\" def test_association ( basic_g2p ): association = basic_g2p [ 2 ] assert association assert association . subject == \"ZFIN:ZDB-GENE-990415-8\" assert association . object == \"ZP:0004225\" assert association . publications assert association . publications [ 0 ] == \"ZFIN:ZDB-PUB-970210-19\" Test against an alternate row For any branching within the transform code, it's a good idea to test against all of the paths through the code. It's possible to set conditional breakpoints to find real examples in the code that will hit each code path, but it may be more practical to modify the basic row as a new fixture The example below creates a row with additional columns filled in. @pytest . fixture def postcomposed ( mock_koza , source_name , basic_row , script , map_cache , tt ): basic_row [ \"Affected Structure or Process 1 subterm ID\" ] = \"BSPO:0000112\" basic_row [ \"Post-composed Relationship ID\" ] = \"BFO:0000050\" basic_row [ \"Affected Structure or Process 1 superterm ID\" ] = \"ZFA:0000042\" return mock_koza ( source_name , iter ([ basic_row ]), script , map_cache = map_cache , translation_table = tt , ) Parameterized tests Mixing parameterization and fixtures changes the approach a little. In this case it makes more sense to alter the row using a parameter and then create the entities within the same method. 
The test below is intended to confirm that when the tag column has any of the specified values, the row will be ignored (confirmed because no entities are created). @pytest . mark . parametrize ( \"tag\" , [ \"normal\" , \"exacerbated\" , \"ameliorated\" ]) def test_excluded_tags ( mock_koza , source_name , basic_row , script , map_cache , tt , tag ): basic_row [ \"Phenotype Tag\" ] = tag entities = mock_koza ( source_name , iter ([ basic_row ]), script , map_cache = map_cache , translation_table = tt , ) assert len ( entities ) == 0","title":"Testing"},{"location":"Create-an-Ingest/5.%20Test/#testing","text":"You may want to start with the test template within ingest_template","title":"Testing"},{"location":"Create-an-Ingest/5.%20Test/#basic-fixtures","text":"First, set up your basic fixtures, taking care to set the correct source name and location for the transform code. import pytest from koza.cli_runner import get_translation_table @pytest . fixture def tt (): return get_translation_table ( \"src/monarch_ingest/translation_table.yaml\" , None ) # This name must match the ingest name in the transform code @pytest . fixture def source_name (): return \"something-to-somethingelse\" # This is the location of the transform code @pytest . fixture def script (): return \"./src/monarch_ingest/ingests/somethingbase/something2somethingelse.py\"","title":"Basic fixtures"},{"location":"Create-an-Ingest/5.%20Test/#a-map-if-necessary","text":"Some ingests will depend on one or more maps, that fixture can be set up here. Note that this fixture must return a map of maps, and that the inner maps will map from an ID to a dictionary representing column headers and values. In the example below, a map is created that maps from a big concatenated natural key (as the ID) for ZP to a single column (called iri ) that contains the ZP ID. This map is then placed into the map cache under the name eqe2zp @pytest . 
fixture def map_cache (): eqe2zp = { \"0-0-ZFA:0000042-PATO:0000638-0-0-0\" : { \"iri\" : \"ZP:0004225\" }, \"BSPO:0000112-BFO:0000050-ZFA:0000042-PATO:0000638-0-0-0\" : { \"iri\" : \"ZP:0011243\" }, \"BSPO:0000000-BFO:0000050-ZFA:0000823-PATO:0000642-BSPO:0000007-BFO:0000050-ZFA:0000823\" : { \"iri\" : \"ZP:0000157\" }, } return { \"eqe2zp\" : eqe2zp }","title":"A map, if necessary"},{"location":"Create-an-Ingest/5.%20Test/#fixtures-for-test-data","text":"Create a fixture that returns a dictionary to represent a single row. As a matter of strategy, this row should probably represent a fairly basic row being ingested. One trick so that you don't have to manually convert from the imput format to a python dictionary format is to run your ingest with a debugger and set a breakpoint just after a row has been injected. If you want a more specific piece of data, check out conditional breakpoints. @pytest . fixture def basic_row (): return { \"ID\" : \"341492416\" , \"Gene Symbol\" : \"pax2a\" , \"Gene ID\" : \"ZDB-GENE-990415-8\" , #... \"Fish Environment ID\" : \"ZDB-GENOX-041102-1385\" , \"Publication ID\" : \"ZDB-PUB-970210-19\" , \"Figure ID\" : \"ZDB-FIG-120307-8\" , }","title":"Fixtures for test data"},{"location":"Create-an-Ingest/5.%20Test/#fixture-for-transforming-a-single-row","text":"This sets up a fixture you can call more than once to independently test different attributes @pytest . fixture def basic_g2p ( mock_koza , source_name , basic_row , script , map_cache , tt ): return mock_koza ( source_name , iter ([ basic_row ]), script , map_cache = map_cache , translation_table = tt , )","title":"Fixture for transforming a single row"},{"location":"Create-an-Ingest/5.%20Test/#test-the-basics-of-the-ingest","text":"Confirm that entities are created matching the expectations on the row # A simple end-to-end test is to confirm that the IDs are set on def test_gene ( basic_g2p ): gene = basic_g2p [ 0 ] assert gene assert gene . 
id == \"ZFIN:ZDB-GENE-990415-8\" def test_phenotypic_feature ( basic_g2p ): phenotypic_feature = basic_g2p [ 1 ] assert phenotypic_feature assert phenotypic_feature . id == \"ZP:0004225\" def test_association ( basic_g2p ): association = basic_g2p [ 2 ] assert association assert association . subject == \"ZFIN:ZDB-GENE-990415-8\" assert association . object == \"ZP:0004225\" assert association . publications assert association . publications [ 0 ] == \"ZFIN:ZDB-PUB-970210-19\"","title":"Test the basics of the ingest"},{"location":"Create-an-Ingest/5.%20Test/#test-against-an-alternate-row","text":"For any branching within the transform code, it's a good idea to test against all of the paths through the code. It's possible to set conditional breakpoints to find real examples in the code that will hit each code path, but it may be more practical to modify the basic row as a new fixture The example below creates a row with additional columns filled in. @pytest . fixture def postcomposed ( mock_koza , source_name , basic_row , script , map_cache , tt ): basic_row [ \"Affected Structure or Process 1 subterm ID\" ] = \"BSPO:0000112\" basic_row [ \"Post-composed Relationship ID\" ] = \"BFO:0000050\" basic_row [ \"Affected Structure or Process 1 superterm ID\" ] = \"ZFA:0000042\" return mock_koza ( source_name , iter ([ basic_row ]), script , map_cache = map_cache , translation_table = tt , )","title":"Test against an alternate row"},{"location":"Create-an-Ingest/5.%20Test/#parameterized-tests","text":"Mixing parameterization and fixtures changes the approach a little. In this case it makes more sense to alter the row using a parameter and then create the entities within the same method. The test below is intended to confirm that when the tag column has any of the specified values, the row will be ignored (confirmed because no entities are created). @pytest . mark . 
parametrize ( \"tag\" , [ \"normal\" , \"exacerbated\" , \"ameliorated\" ]) def test_excluded_tags ( mock_koza , source_name , basic_row , script , map_cache , tt , tag ): basic_row [ \"Phenotype Tag\" ] = tag entities = mock_koza ( source_name , iter ([ basic_row ]), script , map_cache = map_cache , translation_table = tt , ) assert len ( entities ) == 0","title":"Parameterized tests"},{"location":"KG-Build-Process/kg-build-process/","text":"Monarch KG Build Process Download A weekly job independent from the KG build process runs to download data sources and store them on a cloud bucket. This replaces DipperCache from the old pipeline. KGHub Downloader reads from downloads.yaml to download each file. Some post-processing is done in a shell script before the files are uploaded to the cloud bucket. At the start of the main ingest build, data files are copied from the cloud bucket. Transform A call to the ingest command line tool runs each source ingest defined in ingest.yaml , producing both KGX tsv and RDF nt output. Source Ingests Ingests are documented individually in the Sources section of this documentation. Ingests are either node or edge specific, and use IDs as defined in the source data files without additional re-mapping of identifiers. The primary role they have is to represent sources in biolink model and KGX format, and secondarily they may also subset from the source files. The output of individual ingests can be found in the transform_output directory in each release. Phenio-KG Ontologies in Monarch are built first as Phenio , then converted into the biolink model and represented as KGX in kg-phenio . The ingest CLI's transform_phenio method then performs some further filtering on the kg-phenio node and edge files. Limiting to nodes and edges that match a subset of curie namespaces, and limiting node property columns to a relevant subset. 
Merge With all transforms complete, the individual kgx node and edge files in output/transform_output can be combined into a merged graph. This is done by the merge command in the ingest CLI. At this point, the individual node and edge KGX files from the transforms may not have matching IDs, and in fact, we may have edges that point to nodes that are not present in our canonical node sources (e.g. a STRING edge that points to an ENSEMBL gene that can't be mapped to HGNC). The merge process is broken down into concatenation, mapping, and finally a QC filter step. We developed a tool called cat merge Concatenate The first step just loads all node kgx files into one dataframe, and all edge kgx files into another. Map The mapping step replaces subject and object IDs in edge files using SSSOM mapping files, with the IDs from the initial ingests stored in original_subject and original_object fields. Mappings for genes are generated in our monarch-gene-mapping process, and are available at data.monarchinitiative.org . Diseases are mapped using the MONDO SSSOM. This step requires that the subject of the SSSOM file be our canonical ID, and the object be the non-canonical ID. There is room for improvement here. QC Filter After edges have been mapped, it's important to cull edges that point to nodes that don't exist in the graph. The QC filtering step performs joins against the node table/dataframe to split out these edges into their own kgx file ( monarch-kg-dangling-edges.tsv ) that can be used for QC purposes. A group of edges that wind up in this file could be due to a number of reasons: * We're missing an ontology or other node source that is required for an ingest/source: this is something we want to fix \ud83d\udc4e * We're missing mappings necessary to translate between an edge ingest and our canonical node sources: this is something we want to fix \ud83d\udc4e * The edge ingest includes edges which can't be mapped to our canonical node sources: this is a feature! 
\ud83d\udc4d We have a visualization of this split between connected and dangling edges for each ingest on our QC Dashboard that we can use to problem-solve our mappings and node sources. Neo4j A neo4j dump is created using the merged tar.gz file using KGX's neo4j loader and a docker container. This process is defined directly in the Jenkinsfile . Denormalize For Solr (and secondarily SQLite) we produce a denormalized edge file, which includes additional details for the subjects and objects of each edge, including the category, namespace/prefix, and ontology ancestor closures following the GOLR pattern (ID and label closure lists). The closure file is generated by relation-graph and is included in the kg-phenio download. The after_download script makes a filtered version that only includes rdfs:subClassOf , BFO:0000050 , and UPHENO:0000001 . SQLite A SQLite database file is produced by loading node and edge files into a SQLite database using a simple shell script , along with the primary node and edge tables, edge tables for dangling and denormalized edges are included as well. Solr Our solr index is loaded directly from the node kgx tsv file and the denormalized edge tsv file using LinkML-Solr . The LinkML schema for the Solr index lives in the monarch-py data access library (see documentation for Entity and Association classes). LinkML-Solr starts Solr in docker via the lsolr command, defines the Solr schema based on the LinkML Schema and then bulk loads the data. Currently, a small amount of additional Solr configuration ( defining new field types , and copy-fields declarations to fill them) is done via curl commands in shell scripts. 
Our solr load process is defined in scripts/load_solr.sh","title":"KG Build Process"},{"location":"KG-Build-Process/kg-build-process/#monarch-kg-build-process","text":"","title":"Monarch KG Build Process"},{"location":"KG-Build-Process/kg-build-process/#download","text":"A weekly job independent from the KG build process runs to download data sources and store them on a cloud bucket. This replaces DipperCache from the old pipeline. KGHub Downloader reads from downloads.yaml to download each file. Some post-processing is done in a shell script before the files are uploaded to the cloud bucket. At the start of the main ingest build, data files are copied from the cloud bucket.","title":"Download"},{"location":"KG-Build-Process/kg-build-process/#transform","text":"A call to the ingest command line tool runs each source ingest defined in ingest.yaml , producing both KGX tsv and RDF nt output.","title":"Transform"},{"location":"KG-Build-Process/kg-build-process/#source-ingests","text":"Ingests are documented individually in the Sources section of this documentation. Ingests are either node or edge specific, and use IDs as defined in the source data files without additional re-mapping of identifiers. The primary role they have is to represent sources in biolink model and KGX format, and secondarily they may also subset from the source files. The output of individual ingests can be found in the transform_output directory in each release.","title":"Source Ingests"},{"location":"KG-Build-Process/kg-build-process/#phenio-kg","text":"Ontologies in Monarch are built first as Phenio , then converted into the biolink model and represented as KGX in kg-phenio . The ingest CLI's transform_phenio method then performs some further filtering on the kg-phenio node and edge files. 
Limiting to nodes and edges that match a subset of curie namespaces, and limiting node property columns to a relevant subset.","title":"Phenio-KG"},{"location":"KG-Build-Process/kg-build-process/#merge","text":"With all transforms complete, the individual kgx node and edge files in output/transform_output can be combined into a merged graph. This is done by the merge command in the ingest CLI. At this point, the individual node and edge KGX files from the transforms may not have matching IDs, and in fact, we may have edges that point to nodes that are not present in our canonical node sources (e.g. a STRING edge that points to an ENSEMBL gene that can't be mapped to HGNC). The merge process is broken down into concatenation, mapping, and finally a QC filter step. We developed a tool called cat merge","title":"Merge"},{"location":"KG-Build-Process/kg-build-process/#concatenate","text":"The first step just loads all node kgx files into one dataframe, and all edge kgx files into another.","title":"Concatenate"},{"location":"KG-Build-Process/kg-build-process/#map","text":"The mapping step replaces subject and object IDs in edge files using SSSOM mapping files, with the IDs from the initial ingests stored in original_subject and original_object fields. Mappings for genes are generated in our monarch-gene-mapping process, and are available at data.monarchinitiative.org . Diseases are mapped using the MONDO SSSOM. This step requires that the subject of the SSSOM file be our canonical ID, and the object be the non-canonical ID. There is room for improvement here.","title":"Map"},{"location":"KG-Build-Process/kg-build-process/#qc-filter","text":"After edges have been mapped, it's important to cull edges that point to nodes that don't exist in the graph. The QC filtering step performs joins against the node table/dataframe to split out these edges into their own kgx file ( monarch-kg-dangling-edges.tsv ) that can be used for QC purposes. 
A group of edges that wind up in this file could be due to a number of reasons: * We're missing an ontology or other node source that is required for an ingest/source: this is something we want to fix \ud83d\udc4e * We're missing mappings necessary to translate between an edge ingest and our canonical node sources: this is something we want to fix \ud83d\udc4e * The edge ingest includes edges which can't be mapped to our canonical node sources: this is a feature! \ud83d\udc4d We have a visualization of this split between connected and dangling edges for each ingest on our QC Dashboard that we can use to problem-solve our mappings and node sources.","title":"QC Filter"},{"location":"KG-Build-Process/kg-build-process/#neo4j","text":"A neo4j dump is created using the merged tar.gz file using KGX's neo4j loader and a docker container. This process is defined directly in the Jenkinsfile .","title":"Neo4j"},{"location":"KG-Build-Process/kg-build-process/#denormalize","text":"For Solr (and secondarily SQLite) we produce a denormalized edge file, which includes additional details for the subjects and objects of each edge, including the category, namespace/prefix, and ontology ancestor closures following the GOLR pattern (ID and label closure lists). The closure file is generated by relation-graph and is included in the kg-phenio download. The after_download script makes a filtered version that only includes rdfs:subClassOf , BFO:0000050 , and UPHENO:0000001 .","title":"Denormalize"},{"location":"KG-Build-Process/kg-build-process/#sqlite","text":"A SQLite database file is produced by loading node and edge files into a SQLite database using a simple shell script , along with the primary node and edge tables, edge tables for dangling and denormalized edges are included as well.","title":"SQLite"},{"location":"KG-Build-Process/kg-build-process/#solr","text":"Our solr index is loaded directly from the node kgx tsv file and the denormalized edge tsv file using LinkML-Solr . 
The LinkML schema for the Solr index lives in the monarch-py data access library (see documentation for Entity and Association classes). LinkML-Solr starts Solr in docker via the lsolr command, defines the Solr schema based on the LinkML Schema and then bulk loads the data. Currently, a small amount of additional Solr configuration ( defining new field types , and copy-fields declarations to fill them) is done via curl commands in shell scripts. Our solr load process is defined in scripts/load_solr.sh","title":"Solr"},{"location":"Principles/modeling-principles/","text":"Modeling Principles Conforms to Schema The Monarch Biolink Specification is an implementation of the Biolink Model. The KG must be conformant with The Monarch Biolink Specification. Node Normalization The final KG must have Nodes normalized to the canonical prefix for any given node type. The canonical prefix should be determined by The Monarch Biolink Model Specification. Authoritative Source Providers of Associations are not the authoritative sources for the Nodes in general. Nodes should be ingested from their own authoritative source, separate from edge ingests. Genes and Proteins Genes and reference Proteins shall be treated as equivalent. When collapsing nodes give the Gene Id the priority, original_subject = UniProt Id. If in future there is a need to represent Isoforms, then UniProt Isoform Ids should be used. Variants Variant to Disease/Phenotype Associations may be rolled up to the Gene level. If they are rolled up, then a subject_modifier = Variant Id. Gene to Disease Associations Gene to Disease Associations should come from high quality sources that have been vetted by domain experts within Monarch. Gene to Disease Associations must not confuse single Gene causal Mendelian Associations with otherwise associated Genes. (e.g. 
contributing or associated Genes)","title":"Principles"},{"location":"Principles/modeling-principles/#modeling-principles","text":"","title":"Modeling Principles"},{"location":"Principles/modeling-principles/#conforms-to-schema","text":"The Monarch Biolink Specification is an implementation of the Biolink Model. The KG must be conformant with The Monarch Biolink Specification.","title":"Conforms to Schema"},{"location":"Principles/modeling-principles/#node-normalization","text":"The final KG must have Nodes normalized to the canonical prefix for any given node type. The canonical prefix should be determined by The Monarch Biolink Model Specification.","title":"Node Normalization"},{"location":"Principles/modeling-principles/#authoratative-source","text":"Providers of Associations are not the authoritative sources for the Nodes in general. Nodes should be ingested from their own authoritative source, separate from edge ingests.","title":"Authoritative Source"},{"location":"Principles/modeling-principles/#genes-and-proteins","text":"Genes and reference Proteins shall be treated as equivalent. When collapsing nodes give the Gene Id the priority, original_subject = UniProt Id. If in future there is a need to represent Isoforms, then UniProt Isoform Ids should be used.","title":"Genes and Proteins"},{"location":"Principles/modeling-principles/#variants","text":"Variant to Disease/Phenotype Associations may be rolled up to the Gene level. If they are rolled up, then a subject_modifier = Variant Id.","title":"Variants"},{"location":"Principles/modeling-principles/#gene-to-disease-associations","text":"Gene to Disease Associations should come from high quality sources that have been vetted by domain experts within Monarch. Gene to Disease Associations must not confuse single Gene causal Mendelian Associations with otherwise associated Genes. (e.g. 
contributing or associated Genes)","title":"Gene to Disease Associations"},{"location":"Sources/","text":"Data Sources This section contains detailed information on all datasets and ontologies ingested to create the Monarch knowledge graph. To learn more about a specific dataset/ontology, click on the source name in the list to the left.","title":"Overview"},{"location":"Sources/#data-sources","text":"This section contains detailed information on all datasets and ontologies ingested to create the Monarch knowledge graph. To learn more about a specific dataset/ontology, click on the source name in the list to the left.","title":"Data Sources"},{"location":"Sources/alliance/","text":"Alliance The Alliance of Genome Resources contains a subset of model organism data from member databases that is harmonized to the same model. Over time, as the alliance adds additional data types, individual MOD ingests can be replaced by collective Alliance ingest. The Alliance has bulk data downloads, ingest data formats, and an API. The preference should be bulk downloads first, followed by ingest formats, finally by API calls. In some cases it may continue to be more practical to load from individual MODs when data is not yet fully harmonized in the Alliance. Alliance Bulk Downloads Alliance schemas Gene Information Genes for all Alliance species (Human, Rat, Mouse, Fish, Fly, Worm, Yeast, Frog) are loaded using the BGI formatted ingest files, as there are no Gene export files. Biolink captured biolink:Gene id symbol name in_taxon source synonyms xref Gene to Phenotype Phenotype for the subset of Alliance species which use phenotype ontologies (Human, Rat, Mouse, Worm) are loaded using the phenotype ingest format , since there is not yet a phenotype export file from the Alliance. This file contains both Gene and Allele phenotypes, so a single column TSV is produced from BGI files listing Gene IDs to check the category and only genes are included. 
Environmental conditions are present for some species and are captured using the qualifier. Biolink captured biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) publications qualifiers (condition terms) aggregating_knowledge_source ([\"infores:monarchinitiative\", \"infores:alliancegenome\"]) primary_knowledge_source ( infores mapped from row['Source']) Gene Expression This is the full data model of the Alliance file ingested; however, not all fields are currently used in the current ingest (in most cases, these fields are not yet set in the input data sets; see the gene_to_expression.yaml file) Species SpeciesID GeneID GeneSymbol Location StageTerm AssayID AssayTermName CellularComponentID CellularComponentTerm CellularComponentQualifierIDs CellularComponentQualifierTermNames SubStructureID SubStructureName SubStructureQualifierIDs SubStructureQualifierTermNames AnatomyTermID AnatomyTermName AnatomyTermQualifierIDs AnatomyTermQualifierTermNames SourceURL Source Reference Discussion Group : https://www.alliancegenome.org/working-groups#expression Download : https://www.alliancegenome.org/downloads#expression Biolink captured biolink:Gene id (row['GeneID']) name (row['GeneSymbol']) in taxon (row['SpeciesID']) source ( infores mapped from row['Source']) biolink:AnatomicalEntity id (row['AnatomyTermID']) name (row['AnatomyTermName']) source ( infores mapped from row['Source']) biolink:CellularComponent # is_a: anatomical entity... 
id (row['CellularComponentID']) name (row['CellularComponentTerm']) source ( infores mapped from row['Source']) biolink:LifeStage id (CURIE heuristically inferred from row['SpeciesID'] and row['StageTerm']) name (row['StageTerm']) in taxon (row['SpeciesID']) source ( infores mapped from row['Source']) biolink:GeneToExpressionSiteAssociation id (random uuid) subject (Gene.id) predicates (biolink:expressed_in) object (AnatomicalEntity.id or CellularComponent.id) stage qualifier (LifeStage.id) # if specified; None otherwise has evidence (row['AssayID']) # e.g. taken from MMO - \"measurement method ontology\" publications (row['Reference']) aggregating_knowledge_source ([\"infores:monarchinitiative\", \"infores:alliancegenome\"]) primary_knowledge_source ( infores mapped from row['Source']) Literature The Alliance has a well defined literature ingest format that aligns publications from MOD members. Mapping of Alliance publication category to biolink category Alliance category Biolink publication type Research Article IAO:0000013 Review Article IAO:0000013 Thesis IAO:0000311 Book IAO:0000311 Other IAO:0000311 Preprint IAO:0000013 Conference Publication IAO:0000311 Personal Communication IAO:0000311 Direct Data Submission IAO:0000311 Internal Process Reference IAO:0000311 Unknown IAO:0000311 Retraction IAO:0000311 This ingest doesn't make an effort to sort these publication categories into more specific classes than biolink:Publication, but does set the type. Biolink captured biolink:Publication id (primaryId) name (title) summary (abstract) authors (authors.name flattened as a comma separated string) xref (crossReferences.id) mesh terms (meshTerms.meshHeadingTerm , meshTerms.meshQualifierTerm) type (IAO:0000311 for publication, IAO:0000013 for article) creation date (datePublished) keywords (keywords) Citation Harmonizing model organism data in the Alliance of Genome Resources. 2022. Alliance of Genome Resources Consortium. Genetics, Volume 220, Issue 4, April 2022. 
Published Online: 25 February 2022. doi: doi.org/10.1093/genetics/iyac022. PMID: 35380658; PMCID: PMC8982023.","title":"Alliance"},{"location":"Sources/alliance/#alliance","text":"The Alliance of Genome Resources contains a subset of model organism data from member databases that is harmonized to the same model. Over time, as the alliance adds additional data types, individual MOD ingests can be replaced by collective Alliance ingest. The Alliance has bulk data downloads, ingest data formats, and an API. The preference should be bulk downloads first, followed by ingest formats, finally by API calls. In some cases it may continue to be more practical to load from individual MODs when data is not yet fully harmonized in the Alliance. Alliance Bulk Downloads Alliance schemas","title":"Alliance"},{"location":"Sources/alliance/#gene-information","text":"Genes for all Alliance species (Human, Rat, Mouse, Fish, Fly, Worm, Yeast, Frog) are loaded using the BGI formatted ingest files, as there are no Gene export files. Biolink captured biolink:Gene id symbol name in_taxon source synonyms xref","title":"Gene Information"},{"location":"Sources/alliance/#gene-to-phenotype","text":"Phenotype for the subset of Alliance species which use phenotype ontologies (Human, Rat, Mouse, Worm) are loaded using the phenotype ingest format , since there is not yet a phenotype export file from the Alliance. This file contains both Gene and Allele phenotypes, so a single column TSV is produced from BGI files listing Gene IDs to check the category and only genes are included. Environmental conditions are present for some species and are captured using the qualifier. 
Biolink captured biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) publications qualifiers (condition terms) aggregating_knowledge_source ([\"infores:monarchinitiative\", \"infores:alliancegenome\"]) primary_knowledge_source ( infores mapped from row['Source'])","title":"Gene to Phenotype"},{"location":"Sources/alliance/#gene-expression","text":"This is the full data model of the Alliance file ingested; however, not all fields are currently used in the current ingest (in most cases, these fields are not yet set in the input data sets; see the gene_to_expression.yaml file) Species SpeciesID GeneID GeneSymbol Location StageTerm AssayID AssayTermName CellularComponentID CellularComponentTerm CellularComponentQualifierIDs CellularComponentQualifierTermNames SubStructureID SubStructureName SubStructureQualifierIDs SubStructureQualifierTermNames AnatomyTermID AnatomyTermName AnatomyTermQualifierIDs AnatomyTermQualifierTermNames SourceURL Source Reference Discussion Group : https://www.alliancegenome.org/working-groups#expression Download : https://www.alliancegenome.org/downloads#expression Biolink captured biolink:Gene id (row['GeneID']) name (row['GeneSymbol']) in taxon (row['SpeciesID']) source ( infores mapped from row['Source']) biolink:AnatomicalEntity id (row['AnatomyTermID']) name (row['AnatomyTermName']) source ( infores mapped from row['Source']) biolink:CellularComponent # is_a: anatomical entity... 
id (row['CellularComponentID']) name (row['CellularComponentTerm']) source ( infores mapped from row['Source']) biolink:LifeStage id (CURIE heuristically inferred from row['SpeciesID'] and row['StageTerm']) name (row['StageTerm']) in taxon (row['SpeciesID']) source ( infores mapped from row['Source']) biolink:GeneToExpressionSiteAssociation id (random uuid) subject (Gene.id) predicates (biolink:expressed_in) object (AnatomicalEntity.id or CellularComponent.id) stage qualifier (LifeStage.id) # if specified; None otherwise has evidence (row['AssayID']) # e.g. taken from MMO - \"measurement method ontology\" publications (row['Reference']) aggregating_knowledge_source ([\"infores:monarchinitiative\", \"infores:alliancegenome\"]) primary_knowledge_source ( infores mapped from row['Source'])","title":"Gene Expression"},{"location":"Sources/alliance/#literature","text":"The Alliance has a well defined literature ingest format that aligns publications from MOD members. Mapping of Alliance publication category to biolink category Alliance category Biolink publication type Research Article IAO:0000013 Review Article IAO:0000013 Thesis IAO:0000311 Book IAO:0000311 Other IAO:0000311 Preprint IAO:0000013 Conference Publication IAO:0000311 Personal Communication IAO:0000311 Direct Data Submission IAO:0000311 Internal Process Reference IAO:0000311 Unknown IAO:0000311 Retraction IAO:0000311 This ingest doesn't make an effort to sort these publication categories into more specific classes than biolink:Publication, but does set the type. 
Biolink captured biolink:Publication id (primaryId) name (title) summary (abstract) authors (authors.name flattened as a comma separated string) xref (crossReferences.id) mesh terms (meshTerms.meshHeadingTerm , meshTerms.meshQualifierTerm) type (IAO:0000311 for publication, IAO:0000013 for article) creation date (datePublished) keywords (keywords)","title":"Literature"},{"location":"Sources/alliance/#citation","text":"Harmonizing model organism data in the Alliance of Genome Resources. 2022. Alliance of Genome Resources Consortium. Genetics, Volume 220, Issue 4, April 2022. Published Online: 25 February 2022. doi: doi.org/10.1093/genetics/iyac022. PMID: 35380658; PMCID: PMC8982023.","title":"Citation"},{"location":"Sources/bgee/","text":"BGee Bgee is a database for retrieval and comparison of gene expression patterns across multiple animal species, produced from multiple data types (bulk RNA-Seq, single-cell RNA-Seq, Affymetrix, in situ hybridization, and EST data) and from multiple data sets (including GTEx data). Gene Expression This is the full data model of the Bgee simple gene expression file; however, not all fields are currently used in the current ingest. Files are named by Species ID. \"Gene name\" Anatomical entity ID \"Anatomical entity name\" Expression Call quality FDR Expression score Expression rank Biolink Captured biolink:GeneToExpressionSiteAssociation id (random uuid, generated) subject ( Gene ID ) predicates (biolink:expressed_in, constant) object ( Anatomical entity ID ) aggregating_knowledge_source ([\"infores:monarchinitiative\", \"infores:bgee\"]) Decisions and Discussion We elected to use the simple gene expression file for ease of use and because the advanced doesn't contain much more data we are likely to use. We could potentially import has evidence from the advanced file comparing Affimetrix expression and RNA-Seq expression but this doesn't seem valuable at this time. 
Stage and Strain information is also available in all_conditions file. We have elected to not import the stage information due to multiple duplicate edges based on strain. Citation \"Bastian FB, Roux J, Niknejad A, Comte A, Fonseca Costa SS, Mendes de Farias T, Moretti S, Parmentier G, Rech de Laval V, Rosikiewicz M, Wollbrett J, Echchiki A, Escoriza A, Gharib W, Gonzales-Porta M, Jarosz Y, Laurenczy B, Moret P, Person E, Roelli P, Sanjeev K, Seppey M, Robinson-Rechavi M. The Bgee suite: integrated curated expression atlas and comparative transcriptomics in animals in Nucleic Acids Research, Volume 49, Issue D1, 8 January 2021, Pages D831-D847\"","title":"BGee"},{"location":"Sources/bgee/#bgee","text":"Bgee is a database for retrieval and comparison of gene expression patterns across multiple animal species, produced from multiple data types (bulk RNA-Seq, single-cell RNA-Seq, Affymetrix, in situ hybridization, and EST data) and from multiple data sets (including GTEx data).","title":"BGee"},{"location":"Sources/bgee/#gene-expression","text":"This is the full data model of the Bgee simple gene expression file; however, not all fields are currently used in the current ingest. Files are named by Species ID. \"Gene name\" Anatomical entity ID \"Anatomical entity name\" Expression Call quality FDR Expression score Expression rank Biolink Captured biolink:GeneToExpressionSiteAssociation id (random uuid, generated) subject ( Gene ID ) predicates (biolink:expressed_in, constant) object ( Anatomical entity ID ) aggregating_knowledge_source ([\"infores:monarchinitiative\", \"infores:bgee\"]) Decisions and Discussion We elected to use the simple gene expression file for ease of use and because the advanced doesn't contain much more data we are likely to use. We could potentially import has evidence from the advanced file comparing Affimetrix expression and RNA-Seq expression but this doesn't seem valuable at this time. 
Stage and Strain information is also available in all_conditions file. We have elected to not import the stage information due to multiple duplicate edges based on strain.","title":"Gene Expression"},{"location":"Sources/bgee/#citation","text":"\"Bastian FB, Roux J, Niknejad A, Comte A, Fonseca Costa SS, Mendes de Farias T, Moretti S, Parmentier G, Rech de Laval V, Rosikiewicz M, Wollbrett J, Echchiki A, Escoriza A, Gharib W, Gonzales-Porta M, Jarosz Y, Laurenczy B, Moret P, Person E, Roelli P, Sanjeev K, Seppey M, Robinson-Rechavi M. The Bgee suite: integrated curated expression atlas and comparative transcriptomics in animals in Nucleic Acids Research, Volume 49, Issue D1, 8 January 2021, Pages D831-D847\"","title":"Citation"},{"location":"Sources/ctd/","text":"Comparative Toxicogenomics Database (CTD) CTD is a robust, publicly available database that aims to advance understanding about how environmental exposures affect human health. It provides manually curated information about chemical\u2013gene/protein interactions, chemical\u2013disease and gene\u2013disease relationships. These data are integrated with functional and pathway data to aid in development of hypotheses about the mechanisms underlying environmentally influenced diseases. CTD Bulk Downloads Chemical to Disease This ingest takes only the chemical to disease rows where a direct evidence label is applied, and creates ChemicalEntity and Disease nodes connected by a ChemicalToDiseaseOrPhenotypicFeatureAssociation. The the chemical ID row is expected to need a 'MESH:' prefix added, the disease id is used as-is. Rows are included only if the direct evidence field is 'therapeutic' and the biolink:affects predicate is used to avoid making too strong a claim. 
Biolink Captured ChemicalToDiseaseOrPhenotypicFeatureAssociation id (random uuid) subject (chemical id) predicate ( biolink:affects ) object (disease id) publication (pubmed ids provided by file) aggregating_knowledge_source ( [\"infores:monarchinitiative\"] ) primary_knowledge_source ( infores:ctd ) Citation Davis AP, Wiegers TC, Johnson RJ, Sciaky D, Wiegers J, Mattingly CJ Comparative Toxicogenomics Database (CTD): update 2023. Nucleic Acids Res. 2022 Sep 28.","title":"CTD"},{"location":"Sources/ctd/#comparative-toxicogenomics-database-ctd","text":"CTD is a robust, publicly available database that aims to advance understanding about how environmental exposures affect human health. It provides manually curated information about chemical\u2013gene/protein interactions, chemical\u2013disease and gene\u2013disease relationships. These data are integrated with functional and pathway data to aid in development of hypotheses about the mechanisms underlying environmentally influenced diseases. CTD Bulk Downloads Chemical to Disease This ingest takes only the chemical to disease rows where a direct evidence label is applied, and creates ChemicalEntity and Disease nodes connected by a ChemicalToDiseaseOrPhenotypicFeatureAssociation. The the chemical ID row is expected to need a 'MESH:' prefix added, the disease id is used as-is. Rows are included only if the direct evidence field is 'therapeutic' and the biolink:affects predicate is used to avoid making too strong a claim. 
Biolink Captured ChemicalToDiseaseOrPhenotypicFeatureAssociation id (random uuid) subject (chemical id) predicate ( biolink:affects ) object (disease id) publication (pubmed ids provided by file) aggregating_knowledge_source ( [\"infores:monarchinitiative\"] ) primary_knowledge_source ( infores:ctd )","title":"Comparative Toxicogenomics Database (CTD)"},{"location":"Sources/ctd/#citation","text":"Davis AP, Wiegers TC, Johnson RJ, Sciaky D, Wiegers J, Mattingly CJ Comparative Toxicogenomics Database (CTD): update 2023. Nucleic Acids Res. 2022 Sep 28.","title":"Citation"},{"location":"Sources/dictybase/","text":"Dictybase Dictybase is a comprehensive database for the ameboid protozoan Dictyostelium discoideum , which is a powerful model system for genetic and functional analysis of gene function. Dictybase Bulk Downloads Gene Information Dictybase genes in the Gene to Phenotype ingest (below) are either directly identified from their gene identifier, mapped directly to NCBI Dictyostelium discoideum gene identifier mappings or mapped indirectly from the Dictybase identifier, names and synonyms mappings , with synonyms being populated as available (Note: full gene product information is not captured at this time). Gene to Phenotype Data is available in a well-documented easy-to-parse GAF-like format with associations to an UPHENO-compliant ontology. Phenotypes are linked to Strains, and the Strains are linked to Genes. Biolink Captured biolink:Gene 'id' (NCBI or Dictybase) 'category' 'name' 'symbol' 'in_taxon' 'source' biolink:PhenotypicFeature id biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) category (GeneToPhenotypicFeatureAssociation) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:dictybase) Citation Fey, P., Dodson, R., Basu, S., Chisholm, R. L. (2013). 
'One Stop Shop for Everything Dictyostelium: dictyBase and the Dicty Stock Center'. Dictyostelium discoideum Protocols. Methods Mol. Biol. 983:59-92, edited by Ludwig Eichinger and Francisco Rivero.","title":"Dictybase"},{"location":"Sources/dictybase/#dictybase","text":"Dictybase is a comprehensive database for the ameboid protozoan Dictyostelium discoideum , which is a powerful model system for genetic and functional analysis of gene function. Dictybase Bulk Downloads","title":"Dictybase"},{"location":"Sources/dictybase/#gene-information","text":"Dictybase genes in the Gene to Phenotype ingest (below) are either directly identified from their gene identifier, mapped directly to NCBI Dictyostelium discoideum gene identifier mappings or mapped indirectly from the Dictybase identifier, names and synonyms mappings , with synonyms being populated as available (Note: full gene product information is not captured at this time).","title":"Gene Information"},{"location":"Sources/dictybase/#gene-to-phenotype","text":"Data is available in a well-documented easy-to-parse GAF-like format with associations to an UPHENO-compliant ontology. Phenotypes are linked to Strains, and the Strains are linked to Genes. Biolink Captured biolink:Gene 'id' (NCBI or Dictybase) 'category' 'name' 'symbol' 'in_taxon' 'source' biolink:PhenotypicFeature id biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) category (GeneToPhenotypicFeatureAssociation) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:dictybase)","title":"Gene to Phenotype"},{"location":"Sources/dictybase/#citation","text":"Fey, P., Dodson, R., Basu, S., Chisholm, R. L. (2013). 'One Stop Shop for Everything Dictyostelium: dictyBase and the Dicty Stock Center'. Dictyostelium discoideum Protocols. Methods Mol. Biol. 
983:59-92, edited by Ludwig Eichinger and Francisco Rivero.","title":"Citation"},{"location":"Sources/flybase/","text":"FlyBase is the model organism database providing integrated genetic, genomic, phenomic, and biological data for Drosophila melanogaster. FlyBase bulk downloads Gene Literature This ingest uses FlyBase's publication-to-gene download file, which contains all entities and only assocations between publications and genes that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. We have also opted to use the FlyBase_publication_id for the publication node if PubMed_id is not available, on the assumption that kgx will clique merge them later. Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:flybase) Citation Gramates LS, Agapite J, Attrill H, Calvi BR, Crosby M, dos Santos G Goodman JL, Goutte-Gattat D, Jenkins V, Kaufman T, Larkin A, Matthews B, Millburn G, Strelets VB, and the FlyBase Consortium (2022) FlyBase: a guided tour of highlighted features. Genetics, Volume 220, Issue 4, April 2022, iyac035","title":"Flybase"},{"location":"Sources/flybase/#gene-literature","text":"This ingest uses FlyBase's publication-to-gene download file, which contains all entities and only assocations between publications and genes that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. We have also opted to use the FlyBase_publication_id for the publication node if PubMed_id is not available, on the assumption that kgx will clique merge them later. 
Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:flybase)","title":"Gene Literature"},{"location":"Sources/flybase/#citation","text":"Gramates LS, Agapite J, Attrill H, Calvi BR, Crosby M, dos Santos G Goodman JL, Goutte-Gattat D, Jenkins V, Kaufman T, Larkin A, Matthews B, Millburn G, Strelets VB, and the FlyBase Consortium (2022) FlyBase: a guided tour of highlighted features. Genetics, Volume 220, Issue 4, April 2022, iyac035","title":"Citation"},{"location":"Sources/goa/","text":"Gene Ontology Annotation (GOA) Database The Gene Ontology Annotation Database compiles high-quality Gene Ontology (GO) annotations to proteins in the UniProt Knowledgebase (UniProtKB) , RNA molecules from RNACentral and protein complexes from the Complex Portal . Manual annotation is the direct assignment of GO terms to proteins, ncRNA and protein complexes by curators from evidence extracted during the review of published scientific literature, with an appropriate evidence code assigned to give an assessment of the strength of the evidence. GOA files contain a mixture of manual annotation supplied by members of the Gene Ontology Consortium and computationally assigned GO terms describing gene products. Annotation type is clearly indicated by associated evidence codes and there are links to the source data. GO Annotations There is a ReadMe.txt file that explains the different annotation files available. The ingested Gene Annotation File (GAF) is a 17 column tab-delimited file. The file format conforms to the specifications demanded by the GO Consortium and therefore GO IDs and not GO term names are shown. 
Biolink captured Subject Concept Node (Gene) biolink:Gene id (NCBIGene Entrez ID) Object Concept Node (Gene Ontology Terms) biolink:MolecularActivity id (GO ID) biolink:BiologicalProcess id (GO ID) biolink:CellularComponent id (GO ID) Additional Gene Ontology Term Concept Nodes for possible use? biolink:Pathway id (GO ID) biolink:PhysiologicalProcess id (GO ID) Associations biolink:FunctionalAssociation id (random uuid) subject (gene.id) predicate (related_to) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa) OR biolink:MacromolecularMachineToMolecularActivityAssociation : id (random uuid) subject (gene.id) predicate (related_to) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa) biolink:MacromolecularMachineToBiologicalProcessAssociation : id (random uuid) subject (gene.id) predicate (participates_in) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa) biolink:MacromolecularMachineToCellularComponentAssociation : id (random uuid) subject (gene.id) predicate (located_in) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa) Possible Additional Gene to Gene Ontology Term Association? biolink:GeneToGoTermAssociation : id (random uuid) subject (gene.id) predicate (related_to) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa) Citation Ashburner et al. Gene ontology: tool for the unification of biology. Nat Genet. 2000 May;25(1):25-9. The Gene Ontology Consortium. The Gene Ontology knowledgebase in 2023. Genetics. 
2023 May 4;224(1):iyad031","title":"GOA"},{"location":"Sources/goa/#gene-ontology-annotation-goa-database","text":"The Gene Ontology Annotation Database compiles high-quality Gene Ontology (GO) annotations to proteins in the UniProt Knowledgebase (UniProtKB) , RNA molecules from RNACentral and protein complexes from the Complex Portal . Manual annotation is the direct assignment of GO terms to proteins, ncRNA and protein complexes by curators from evidence extracted during the review of published scientific literature, with an appropriate evidence code assigned to give an assessment of the strength of the evidence. GOA files contain a mixture of manual annotation supplied by members of the Gene Ontology Consortium and computationally assigned GO terms describing gene products. Annotation type is clearly indicated by associated evidence codes and there are links to the source data.","title":"Gene Ontology Annotation (GOA) Database"},{"location":"Sources/goa/#go-annotations","text":"There is a ReadMe.txt file that explains the different annotation files available. The ingested Gene Annotation File (GAF) is a 17 column tab-delimited file. The file format conforms to the specifications demanded by the GO Consortium and therefore GO IDs and not GO term names are shown. 
Biolink captured","title":"GO Annotations"},{"location":"Sources/goa/#subject-concept-node-gene","text":"biolink:Gene id (NCBIGene Entrez ID)","title":"Subject Concept Node (Gene)"},{"location":"Sources/goa/#object-concept-node-gene-ontology-terms","text":"biolink:MolecularActivity id (GO ID) biolink:BiologicalProcess id (GO ID) biolink:CellularComponent id (GO ID)","title":"Object Concept Node (Gene Ontology Terms)"},{"location":"Sources/goa/#additional-gene-ontology-term-concept-nodes-for-possible-use","text":"biolink:Pathway id (GO ID) biolink:PhysiologicalProcess id (GO ID) Associations biolink:FunctionalAssociation id (random uuid) subject (gene.id) predicate (related_to) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa) OR biolink:MacromolecularMachineToMolecularActivityAssociation : id (random uuid) subject (gene.id) predicate (related_to) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa) biolink:MacromolecularMachineToBiologicalProcessAssociation : id (random uuid) subject (gene.id) predicate (participates_in) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa) biolink:MacromolecularMachineToCellularComponentAssociation : id (random uuid) subject (gene.id) predicate (located_in) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa) Possible Additional Gene to Gene Ontology Term Association? 
biolink:GeneToGoTermAssociation : id (random uuid) subject (gene.id) predicate (related_to) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa)","title":"Additional Gene Ontology Term Concept Nodes for possible use?"},{"location":"Sources/goa/#citation","text":"Ashburner et al. Gene ontology: tool for the unification of biology. Nat Genet. 2000 May;25(1):25-9. The Gene Ontology Consortium. The Gene Ontology knowledgebase in 2023. Genetics. 2023 May 4;224(1):iyad031","title":"Citation"},{"location":"Sources/hgnc/","text":"HGNC (HUGO Gene Nomenclature Committee) The HGNC is responsible for approving unique symbols and names for human loci, including protein coding genes, ncRNA genes and pseudogenes, to allow unambiguous scientific communication. HGNC bulk downloads Gene Information This ingest uses HGNC's \"complete set\" download file, which only contains associations between publications and genes that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. Biolink Captured biolink:Gene id (HGNC identifier) symbol name synonym alias symbol alias name prev symbol prev name xref ensembl gene id omim id in_taxon ([\"NCBITaxon:9606\"]) provided_by ([\"infores:hgnc\"]) Citation HGNC Database, HUGO Gene Nomenclature Committee (HGNC), European Molecular Biology Laboratory, European Bioinformatics Institute (EMBL-EBI), Wellcome Genome Campus, Hinxton, Cambridge CB10 1SD, United Kingdom www.genenames.org .","title":"HGNC"},{"location":"Sources/hgnc/#hgnc-hugo-gene-nomenclature-committee","text":"The HGNC is responsible for approving unique symbols and names for human loci, including protein coding genes, ncRNA genes and pseudogenes, to allow unambiguous scientific communication. 
HGNC bulk downloads","title":"HGNC (HUGO Gene Nomenclature Committee)"},{"location":"Sources/hgnc/#gene-information","text":"This ingest uses HGNC's \"complete set\" download file, which only contains associations between publications and genes that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. Biolink Captured biolink:Gene id (HGNC identifier) symbol name synonym alias symbol alias name prev symbol prev name xref ensembl gene id omim id in_taxon ([\"NCBITaxon:9606\"]) provided_by ([\"infores:hgnc\"])","title":"Gene Information"},{"location":"Sources/hgnc/#citation","text":"HGNC Database, HUGO Gene Nomenclature Committee (HGNC), European Molecular Biology Laboratory, European Bioinformatics Institute (EMBL-EBI), Wellcome Genome Campus, Hinxton, Cambridge CB10 1SD, United Kingdom www.genenames.org .","title":"Citation"},{"location":"Sources/hpoa/","text":"Human Phenotype Ontology Annotations (HPOA) The Human Phenotype Ontology group curates and assembles over 115,000 annotations to hereditary diseases using the HPO ontology. Here we create Biolink associations between diseases and phenotypic features, together with their evidence, and age of onset and frequency (if known). There are four HPOA ingests - 'disease-to-phenotype', 'disease-to-mode-of-inheritance', 'gene-to-disease' and 'disease-to-mode-of-inheritance' - that parse out records from the HPO Annotation File . The 'disease-to-phenotype', 'disease-to-mode-of-inheritance' and 'gene-to-disease' parsers currently only process the \"abnormal\" annotations. Association to \"remarkable normality\" may be added in the near future. The 'disease-to-mode-of-inheritance' ingest script parses 'inheritance' record information out from the annotation file. Gene to Disease This ingest replaces the direct OMIM ingest so that we share g2d associations 1:1 with HPO. 
The mapping between association_type and biolink predicates shown below is the one way in which this ingest is opinionated, but attempts to be a direct translation into the biolink model. genes_to_disease.txt with the following fields: 'ncbi_gene_id' 'gene_symbol' 'association_type' 'disease_id' 'source' Biolink Captured biolink:CorrelatedGeneToDiseaseAssociation or biolink:CausalGeneToDiseaseAssociation (depending on predicate) id (random uuid) subject (ncbi_gene_id) predicate (association_type) MENDELIAN: biolink:causes POLYGENIC: biolink:contributes_to UNKNOWN: biolink:gene_associated_with_condition object (disease_id) primary_knowledge_source (source) medgen: infores:omim orphanet: infores:orphanet aggregator_knowledge_source ([\"infores:monarchinitiative\"]) also for medgen: infores:medgen Disease to Phenotype phenotype.hpoa: A description of this file is found here , has the following fields: 'database_id' 'disease_name' 'qualifier' 'hpo_id' 'reference' 'evidence' 'onset' 'frequency' 'sex' 'modifier' 'aspect' 'biocuration' Note that we're calling this the disease to phenotype file because - using the YAML file filter configuration for the ingest - we are only parsing rows with Aspect == 'P' (phenotypic anomalies) , but ignoring all other Aspects. Frequencies The 'Frequency' field of the aforementioned phenotypes.hpoa file has the following definition, excerpted from its Annotation Format page: 8. Frequency: There are three allowed options for this field. (A) A term-id from the HPO-sub-ontology below the term \u201cFrequency\u201d (HP:0040279). (since December 2016 ; before was a mixture of values). The terms for frequency are in alignment with Orphanet. * (B) A count of patients affected within a cohort. For instance, 7/13 would indicate that 7 of the 13 patients with the specified disease were found to have the phenotypic abnormality referred to by the HPO term in question in the study referred to by the DB_Reference; (C) A percentage value such as 17%. 
The Disease to Phenotype ingest attempts to remap these raw frequency values onto a suitable HPO term. A simplistic (perhaps erroneous?) assumption is that all such frequencies are conceptually comparable; however, researchers may wish to review the original publications to confirm fitness of purpose of the specific data points to their interpretation - specific values could designate phenotypic frequency at the population level; phenotypic frequency at the cohort level; or simply, be a measure of penetrance of a specific allele within carriers, etc.. Biolink captured biolink:DiseaseToPhenotypicFeatureAssociation id (random uuid) subject (disease.id) predicate (has_phenotype) negated (True if 'qualifier' == \"NOT\") object (phenotypicFeature.id) publications (List[publication.id]) has_evidence (List[Note [1]]), sex_qualifier (Note [2]) onset_qualifier (Onset.id) frequency_qualifier (Note [3]) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (\"infores:hpo-annotations\") Notes: 1. CURIE of [Evidence and Conclusion Ontology( https://bioportal.bioontology.org/ontologies/ECO )] term 2. female -> PATO:0000383, male -> PATO:0000384 or None 3. See the Frequencies section above. Disease to Modes of Inheritance Same as above, we again parse the phenotype.hpoa file . However, we're calling this the 'disease to modes of inheritance' file because - using the YAML file filter configuration for the ingest - we are only parsing rows with Aspect == 'I' (inheritance) , but ignoring all other Aspects. 
Biolink captured biolink:DiseaseOrPhenotypicFeatureToGeneticInheritanceAssociation id (random uuid) subject (disease.id) predicate (has_mode_of_inheritance) object (geneticInheritance.id) publications (List[publication.id]) has_evidence (List[Note [1]]), aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (\"infores:hpo-annotations\") Gene to Phenotype The gene-to-phenotype ingest processes the tab-delimited HPOA gene_to_phenotype.txt file, which has the following fields: 'ncbi_gene_id' 'gene_symbol' 'hpo_id' 'hpo_name' Biolink captured biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:hpo-annotations) Citation Sebastian K\u00f6hler, Michael Gargano, Nicolas Matentzoglu, Leigh C Carmody, David Lewis-Smith, Nicole A Vasilevsky, Daniel Danis, Ganna Balagura, Gareth Baynam, Amy M Brower, Tiffany J Callahan, Christopher G Chute, Johanna L Est, Peter D Galer, Shiva Ganesan, Matthias Griese, Matthias Haimel, Julia Pazmandi, Marc Hanauer, Nomi L Harris, Michael J Hartnett, Maximilian Hastreiter, Fabian Hauck, Yongqun He, Tim Jeske, Hugh Kearney, Gerhard Kindle, Christoph Klein, Katrin Knoflach, Roland Krause, David Lagorce, Julie A McMurry, Jillian A Miller, Monica C Munoz-Torres, Rebecca L Peters, Christina K Rapp, Ana M Rath, Shahmir A Rind, Avi Z Rosenberg, Michael M Segal, Markus G Seidel, Damian Smedley, Tomer Talmy, Yarlalu Thomas, Samuel A Wiafe, Julie Xian, Zafer Y\u00fcksel, Ingo Helbig, Christopher J Mungall, Melissa A Haendel, Peter N Robinson, The Human Phenotype Ontology in 2021, Nucleic Acids Research, Volume 49, Issue D1, 8 January 2021, Pages D1207\u2013D1217, https://doi.org/10.1093/nar/gkaa1043","title":"HPOA"},{"location":"Sources/hpoa/#human-phenotype-ontology-annotations-hpoa","text":"The Human Phenotype Ontology group curates and 
assembles over 115,000 annotations to hereditary diseases using the HPO ontology. Here we create Biolink associations between diseases and phenotypic features, together with their evidence, and age of onset and frequency (if known). There are four HPOA ingests - 'disease-to-phenotype', 'disease-to-mode-of-inheritance', 'gene-to-disease' and 'gene-to-phenotype' - that parse out records from the HPO Annotation File . The 'disease-to-phenotype', 'disease-to-mode-of-inheritance' and 'gene-to-disease' parsers currently only process the \"abnormal\" annotations. Association to \"remarkable normality\" may be added in the near future. The 'disease-to-mode-of-inheritance' ingest script parses 'inheritance' record information out from the annotation file.","title":"Human Phenotype Ontology Annotations (HPOA)"},{"location":"Sources/hpoa/#gene-to-disease","text":"This ingest replaces the direct OMIM ingest so that we share g2d associations 1:1 with HPO. The mapping between association_type and biolink predicates shown below is the one way in which this ingest is opinionated, but attempts to be a direct translation into the biolink model. 
genes_to_disease.txt with the following fields: 'ncbi_gene_id' 'gene_symbol' 'association_type' 'disease_id' 'source' Biolink Captured biolink:CorrelatedGeneToDiseaseAssociation or biolink:CausalGeneToDiseaseAssociation (depending on predicate) id (random uuid) subject (ncbi_gene_id) predicate (association_type) MENDELIAN: biolink:causes POLYGENIC: biolink:contributes_to UNKNOWN: biolink:gene_associated_with_condition object (disease_id) primary_knowledge_source (source) medgen: infores:omim orphanet: infores:orphanet aggregator_knowledge_source ([\"infores:monarchinitiative\"]) also for medgen: infores:medgen","title":"Gene to Disease"},{"location":"Sources/hpoa/#disease-to-phenotype","text":"phenotype.hpoa: A description of this file is found here , has the following fields: 'database_id' 'disease_name' 'qualifier' 'hpo_id' 'reference' 'evidence' 'onset' 'frequency' 'sex' 'modifier' 'aspect' 'biocuration' Note that we're calling this the disease to phenotype file because - using the YAML file filter configuration for the ingest - we are only parsing rows with Aspect == 'P' (phenotypic anomalies) , but ignoring all other Aspects. Frequencies The 'Frequency' field of the aforementioned phenotypes.hpoa file has the following definition, excerpted from its Annotation Format page: 8. Frequency: There are three allowed options for this field. (A) A term-id from the HPO-sub-ontology below the term \u201cFrequency\u201d (HP:0040279). (since December 2016 ; before was a mixture of values). The terms for frequency are in alignment with Orphanet. * (B) A count of patients affected within a cohort. For instance, 7/13 would indicate that 7 of the 13 patients with the specified disease were found to have the phenotypic abnormality referred to by the HPO term in question in the study referred to by the DB_Reference; (C) A percentage value such as 17%. The Disease to Phenotype ingest attempts to remap these raw frequency values onto a suitable HPO term. 
A simplistic (perhaps erroneous?) assumption is that all such frequencies are conceptually comparable; however, researchers may wish to review the original publications to confirm fitness of purpose of the specific data points to their interpretation - specific values could designate phenotypic frequency at the population level; phenotypic frequency at the cohort level; or simply, be a measure of penetrance of a specific allele within carriers, etc.. Biolink captured biolink:DiseaseToPhenotypicFeatureAssociation id (random uuid) subject (disease.id) predicate (has_phenotype) negated (True if 'qualifier' == \"NOT\") object (phenotypicFeature.id) publications (List[publication.id]) has_evidence (List[Note [1]]), sex_qualifier (Note [2]) onset_qualifier (Onset.id) frequency_qualifier (Note [3]) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (\"infores:hpo-annotations\") Notes: 1. CURIE of [Evidence and Conclusion Ontology( https://bioportal.bioontology.org/ontologies/ECO )] term 2. female -> PATO:0000383, male -> PATO:0000384 or None 3. See the Frequencies section above.","title":"Disease to Phenotype"},{"location":"Sources/hpoa/#disease-to-modes-of-inheritance","text":"Same as above, we again parse the phenotype.hpoa file . However, we're calling this the 'disease to modes of inheritance' file because - using the YAML file filter configuration for the ingest - we are only parsing rows with Aspect == 'I' (inheritance) , but ignoring all other Aspects. 
Biolink captured biolink:DiseaseOrPhenotypicFeatureToGeneticInheritanceAssociation id (random uuid) subject (disease.id) predicate (has_mode_of_inheritance) object (geneticInheritance.id) publications (List[publication.id]) has_evidence (List[Note [1]]), aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (\"infores:hpo-annotations\")","title":"Disease to Modes of Inheritance"},{"location":"Sources/hpoa/#gene-to-phenotype","text":"The gene-to-phenotype ingest processes the tab-delimited HPOA gene_to_phenotype.txt file, which has the following fields: 'ncbi_gene_id' 'gene_symbol' 'hpo_id' 'hpo_name' Biolink captured biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:hpo-annotations)","title":"Gene to Phenotype"},{"location":"Sources/hpoa/#citation","text":"Sebastian K\u00f6hler, Michael Gargano, Nicolas Matentzoglu, Leigh C Carmody, David Lewis-Smith, Nicole A Vasilevsky, Daniel Danis, Ganna Balagura, Gareth Baynam, Amy M Brower, Tiffany J Callahan, Christopher G Chute, Johanna L Est, Peter D Galer, Shiva Ganesan, Matthias Griese, Matthias Haimel, Julia Pazmandi, Marc Hanauer, Nomi L Harris, Michael J Hartnett, Maximilian Hastreiter, Fabian Hauck, Yongqun He, Tim Jeske, Hugh Kearney, Gerhard Kindle, Christoph Klein, Katrin Knoflach, Roland Krause, David Lagorce, Julie A McMurry, Jillian A Miller, Monica C Munoz-Torres, Rebecca L Peters, Christina K Rapp, Ana M Rath, Shahmir A Rind, Avi Z Rosenberg, Michael M Segal, Markus G Seidel, Damian Smedley, Tomer Talmy, Yarlalu Thomas, Samuel A Wiafe, Julie Xian, Zafer Y\u00fcksel, Ingo Helbig, Christopher J Mungall, Melissa A Haendel, Peter N Robinson, The Human Phenotype Ontology in 2021, Nucleic Acids Research, Volume 49, Issue D1, 8 January 2021, Pages D1207\u2013D1217, 
https://doi.org/10.1093/nar/gkaa1043","title":"Citation"},{"location":"Sources/mgi/","text":"Mouse Genome Informatics (MGI) Mouse Genome Informatics (MGI) is the international database resource for the laboratory mouse, providing integrated genetic, genomic, and biological data to facilitate the study of human health and disease. MGI bulk downloads Gene Literature This ingest uses MGI's Reference download file, which contains genes and a tab-delimited list of PubMed IDs in which they are mentioned. Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:mgi) Citation Blake JA, Baldarelli R, Kadin JA, Richardson JE, Smith CL, Bult CJ; Mouse Genome Database Group. 2021. Mouse Genome Database (MGD): Knowledgebase for mouse-human comparative biology. Nucleic Acids Res. 2021 Jan 8;49(D1):D981-D987.","title":"Mouse Genome Informatics (MGI)"},{"location":"Sources/mgi/#mouse-genome-informatics-mgi","text":"Mouse Genome Informatics (MGI) is the international database resource for the laboratory mouse, providing integrated genetic, genomic, and biological data to facilitate the study of human health and disease. MGI bulk downloads","title":"Mouse Genome Informatics (MGI)"},{"location":"Sources/mgi/#gene-literature","text":"This ingest uses MGI's Reference download file, which contains genes and a tab-delimited list of PubMed IDs in which they are mentioned. 
Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:mgi)","title":"Gene Literature"},{"location":"Sources/mgi/#citation","text":"Blake JA, Baldarelli R, Kadin JA, Richardson JE, Smith CL, Bult CJ; Mouse Genome Database Group. 2021. Mouse Genome Database (MGD): Knowledgebase for mouse-human comparative biology. Nucleic Acids Res. 2021 Jan 8;49(D1):D981-D987.","title":"Citation"},{"location":"Sources/ncbi/","text":"National Center for Biotechnology Information (NCBI) The NCBI Gene integrates information from a wide range of species. A record may include nomenclature, Reference Sequences (RefSeqs), maps, pathways, variations, phenotypes, and links to genome-, phenotype-, and locus-specific resources worldwide. NCBI bulk downloads Gene Information Genes for all NCBI species (Dog, Cow, Pig, Chicken) are loaded using the ingest file (filtered to only NCBI taxon ID). Biolink Captured biolink:Gene id symbol description in_taxon provided_by ([\"infores:ncbi-gene\"]) Citation National Center for Biotechnology Information (NCBI)[Internet]. Bethesda (MD): National Library of Medicine (US), National Center for Biotechnology Information; [1988] \u2013 [cited 2024 Dec]. Available from: https://www.ncbi.nlm.nih.gov/","title":"NCBI"},{"location":"Sources/ncbi/#national-center-for-biotechnology-information-ncbi","text":"The NCBI Gene integrates information from a wide range of species. A record may include nomenclature, Reference Sequences (RefSeqs), maps, pathways, variations, phenotypes, and links to genome-, phenotype-, and locus-specific resources worldwide. 
NCBI bulk downloads","title":"National Center for Biotechnology Information (NCBI)"},{"location":"Sources/ncbi/#gene-information","text":"Genes for all NCBI species (Dog, Cow, Pig, Chicken) are loaded using the ingest file (filtered to only NCBI taxon ID). Biolink Captured biolink:Gene id symbol description in_taxon provided_by ([\"infores:ncbi-gene\"])","title":"Gene Information"},{"location":"Sources/ncbi/#citation","text":"National Center for Biotechnology Information (NCBI)[Internet]. Bethesda (MD): National Library of Medicine (US), National Center for Biotechnology Information; [1988] \u2013 [cited 2024 Dec]. Available from: https://www.ncbi.nlm.nih.gov/","title":"Citation"},{"location":"Sources/panther/","text":"PANTHER (Protein ANalysis THrough Evolutionary Relationships) Classification System Panther Gene Orthology Gene orthology analyses generate testable hypothesis about gene function and biological processes using experimental results from other (especially highly studied so-called 'model' species) using protein (and sometimes, simply nucleic acid level) alignments of genomic sequences. The source of gene orthology data for this ingest is from the PANTHER (Protein ANalysis THrough Evolutionary Relationships) Classification System . Panther was designed to classify proteins (and their genes) in order to facilitate high-throughput analysis. Proteins have been classified according to: - Family and subfamily: families are groups of evolutionarily related proteins; subfamilies are related proteins that also have the same function - Molecular function: the function of the protein by itself or with directly interacting proteins at a biochemical level, e.g. a protein kinase - Biological process: the function of the protein in the context of a larger network of proteins that interact to accomplish a process at the level of the cell or organism, e.g. mitosis. 
- Pathway: similar to biological process, but a pathway also explicitly specifies the relationships between the interacting molecules. The PANTHER Classifications are the result of human curation as well as sophisticated bioinformatics algorithms. Details of the methods can be found in Mi et al. NAR 2013; Thomas et al., Genome Research 2003 . This ingest uses data derived from the current version (release 16.0) of the Panther Hidden Markov Model (HMM). Panther Gene Orthology bulk data downloads There are various cross-sections of the Panther database which remain to be covered by this ingest (Note: T.B.D means \"To Be Done\") Status of Panther Ingest The first iteration of this dataset (committed March 2022) focuses on Reference Genome Gene-to-Gene Orthology Relationships . Additional Panther associations (protein (sub)family pathways, sequences, etc , as generally described below) may be added at a later date. Reference Genome Gene-to-Gene Orthology Relationships Contains the Reference Genomes' Gene-to-Gene Ortholog mappings from Panther analyses. Source File: AllOrthologs.tar.gz . The source file is huge, containing data from all species, many of which are not currently of direct interest to Monarch. For this reason, a Python function filter_panther_orthologs_file was coded within orthology_utils . ALL_ORTHOLOGS_FILE = \"AllOrthologs\" TARGET_SPECIES_ORTHOLOGS = \"TargetOrthologs\" def filter_panther_orthologs_file ( directory : str = '.' , source_filename : str = ALL_ORTHOLOGS_FILE , target_filename : str = TARGET_SPECIES_ORTHOLOGS , number_of_lines : int = 0 ) -> bool : \"\"\" Filters a tar.gz Panther input file against the target list of species. 
:param directory: str, location of source data file :param source_filename: str, source data file name :param target_filename: str, target data file name :param number_of_lines: int, number of lines parsed; 'all' lines parsed if omitted or set to zero :return: bool, True if filtering was successful; False if unsuccessful \"\"\" ... which could be called with default parameter values in the following manner (if invoked from within the Panther data directory): filter_file () to generate a pruned down TargetOrthologs.tar.gz file with target species (as hardcoded in the catalog of species in the ortholog_utils module). Panther Data Model of Panther Orthologs Data Field Content Gene species1 | DB=id1 | protdb=pdbid1 Ortholog species2 | DB=id2 | protdb=pdbid2 Type of ortholog [LDO, O, P, X ,LDX] see README . Common ancestor for the orthologs taxon name of common ancestor Panther Ortholog ID Panther (sub)family identifier The DB=id# fields - where DB == database namespace and id# is the object identifier - are directly translated, by internal namespace mapping, into gene CURIEs. The species# are abridged labels currently filtered and mapped onto NCBI Taxon identifiers, using an hard-coded dictionary. Biolink classes and properties captured biolink:Gene id (NCBIGene Entrez ID) Note that the Gene source is currently given as Panther, although the real source of a Gene identifier is given by its CURIE namespace. biolink:GeneToGeneHomologyAssociation id (random uuid) subject (gene.id) predicate (orthologous to) object (gene.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) Protein Family and Subfamily Classifications - T.B.D. Contains the PANTHER 16.0 family/subfamily name, with molecular function, biological process, and pathway classifications for every PANTHER protein family and subfamily in the current PANTHER HMM library. 
Source File: http://data.pantherdb.org/ftp/hmm_classifications/current_release/PANTHER16.0_HMM_classifications Biolink classes and properties captured: biolink:GeneFamily id (PANTHER.FAMILY ID) source (infores:panther) biolink:MolecularActivity id (GO ID) source (go) biolink:BiologicalProcess id (GO ID) source (go) biolink:Pathway id (PANTHER.PATHWAY) source (infores:panther) biolink:GeneFamilyToMolecularFunctionAssociation id (random uuid) subject (gene_family.id) predicate (enables) object (go_term.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:GeneFamilyToBiologicalProcessAssociation id (random uuid) subject (gene_family.id) predicate (involved_in) object (go_term.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:GeneFamilyToPathwayAssociation id (random uuid) subject (gene_family.id) predicate (involved_in) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) Pathways - T.B.D. Contains regulatory and metabolic pathways, each with subfamilies and protein sequences mapped to individual pathway components. 
Source File: http://data.pantherdb.org/ftp/pathway/current_release/SequenceAssociationPathway3.6.5.txt local_name: data/orthology/pathways.tsv Biolink classes and properties captured: biolink:GeneFamily id (PANTHER.FAMILY ID) source (infores:panther) biolink:Gene id (NCBIGene Entrez ID) in taxon (NCBITaxon ID) source (infores:entrez) biolink:Pathway id (PANTHER.PATHWAY) source (infores:panther) biolink:GeneToPathwayAssociation id (random uuid) subject (gene.id) predicate (involved_in) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:GeneFamilyToPathwayAssociation id (random uuid) subject (gene_family.id) predicate (involved_in) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) Sequence Classifications - T.B.D. Sequence Classifications files contain the PANTHER family, subfamily, molecular function, biological process, and pathway classifications for the complete proteomes derived from the various genomes, indexed by species (one source file per species). Refer to the Sequence Classification README for details. Only a subset of the available species will be ingested into Monarch at this time, currently: human, mouse, rat, zebrafish, fruit fly, nematode, fission yeast and budding (\"baker's\") yeast. 
Source File Directory: http://data.pantherdb.org/ftp/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/ Biolink classes and properties captured: biolink:Gene id (PANTHER.FAMILY ID) source (infores:panther) biolink:GeneFamily id (PANTHER.FAMILY ID) source (infores:panther) biolink:MolecularActivity id (GO ID) source (go) biolink:BiologicalProcess id (GO ID) source (go) biolink:Pathway id (PANTHER.PATHWAY) source (infores:panther) biolink:GeneToGeneFamilyAssociation : id (random uuid) subject (gene.id) predicate (member_of) object (gene_family.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:MacromolecularMachineToMolecularActivityAssociation : id (random uuid) subject (gene.id) predicate (enables) object (go_term.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:MacromolecularMachineToBiologicalProcessAssociation : id (random uuid) subject (gene.id) predicate (involved_in) object (go_term.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:GeneToPathwayAssociation id (random uuid) subject (gene.id) predicate (involved_in) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) Citation Paul D. Thomas, Dustin Ebert, Anushya Muruganujan, Tremayne Mushayahama, Laurent-Philippe Albou and Huaiyu Mi Protein Society. 2022;31(1):8-22. 
doi:10.1002/pro.4218","title":"Panther"},{"location":"Sources/panther/#panther-protein-analysis-through-evolutionary-relationships-classification-system","text":"","title":"PANTHER (Protein ANalysis THrough Evolutionary Relationships) Classification System"},{"location":"Sources/panther/#panther-gene-orthology","text":"Gene orthology analyses generate testable hypothesis about gene function and biological processes using experimental results from other (especially highly studied so-called 'model' species) using protein (and sometimes, simply nucleic acid level) alignments of genomic sequences. The source of gene orthology data for this ingest is from the PANTHER (Protein ANalysis THrough Evolutionary Relationships) Classification System . Panther was designed to classify proteins (and their genes) in order to facilitate high-throughput analysis. Proteins have been classified according to: - Family and subfamily: families are groups of evolutionarily related proteins; subfamilies are related proteins that also have the same function - Molecular function: the function of the protein by itself or with directly interacting proteins at a biochemical level, e.g. a protein kinase - Biological process: the function of the protein in the context of a larger network of proteins that interact to accomplish a process at the level of the cell or organism, e.g. mitosis. - Pathway: similar to biological process, but a pathway also explicitly specifies the relationships between the interacting molecules. The PANTHER Classifications are the result of human curation as well as sophisticated bioinformatics algorithms. Details of the methods can be found in Mi et al. NAR 2013; Thomas et al., Genome Research 2003 . This ingest uses data derived form the current version (release 16.0) of the Panther Hidden Markov Model (HMM). 
Panther Gene Orthology bulk data downloads There are various cross-sections of the Panther database which remain be covered by this ingest (Note: T.B.D means \"To Be Done\")","title":"Panther Gene Orthology"},{"location":"Sources/panther/#status-of-panther-ingest","text":"The first iteration of this dataset (committed March 2022) focuses on Reference Genome Gene-to-Gene Orthology Relationships . Additional Panther associations (protein (sub)family pathways, sequences, etc , as generally described below) may be added at a later date.","title":"Status of Panther Ingest"},{"location":"Sources/panther/#reference-genome-gene-to-gene-orthology-relationships","text":"Contains the Reference Genomes' Gene-to-Gene Ortholog mappings from Panther analyses. Source File: AllOrthologs.tar.gz . The source file is huge, containing data from all species, many of which are not currently of direct interest to Monarch. For this reason, a Python function filter_panther_orthologs_file was coded within orthology_utils . ALL_ORTHOLOGS_FILE = \"AllOrthologs\" TARGET_SPECIES_ORTHOLOGS = \"TargetOrthologs\" def filter_panther_orthologs_file ( directory : str = '.' , source_filename : str = ALL_ORTHOLOGS_FILE , target_filename : str = TARGET_SPECIES_ORTHOLOGS , number_of_lines : int = 0 ) -> bool : \"\"\" Filters a tar.gz Panther input file against the target list of species. :param directory: str, location of source data file :param source_filename: str, source data file name :param target_filename: str, target data file name :param number_of_lines: int, number of lines parsed; 'all' lines parsed if omitted or set to zero :return: bool, True if filtering was successful; False if unsuccessful \"\"\" ... 
which could be called with default parameter values in the following manner (if invoked from within the Panther data directory): filter_file () to generate a pruned down TargetOrthologs.tar.gz file with target species (as hardcoded in the catalog of species in the ortholog_utils module).","title":"Reference Genome Gene-to-Gene Orthology Relationships"},{"location":"Sources/panther/#panther-data-model-of-panther-orthologs","text":"Data Field Content Gene species1 | DB=id1 | protdb=pdbid1 Ortholog species2 | DB=id2 | protdb=pdbid2 Type of ortholog [LDO, O, P, X ,LDX] see README . Common ancestor for the orthologs taxon name of common ancestor Panther Ortholog ID Panther (sub)family identifier The DB=id# fields - where DB == database namespace and id# is the object identifier - are directly translated, by internal namespace mapping, into gene CURIEs. The species# are abridged labels currently filtered and mapped onto NCBI Taxon identifiers, using an hard-coded dictionary.","title":"Panther Data Model of Panther Orthologs"},{"location":"Sources/panther/#biolink-classes-and-properties-captured","text":"biolink:Gene id (NCBIGene Entrez ID) Note that the Gene source is currently given as Panther, although the real source of a Gene identifier is given by its CURIE namespace. biolink:GeneToGeneHomologyAssociation id (random uuid) subject (gene.id) predicate (orthologous to) object (gene.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther)","title":"Biolink classes and properties captured"},{"location":"Sources/panther/#protein-family-and-subfamily-classifications-tbd","text":"Contains the PANTHER 16.0 family/subfamily name, with molecular function, biological process, and pathway classifications for every PANTHER protein family and subfamily in the current PANTHER HMM library. 
Source File: http://data.pantherdb.org/ftp/hmm_classifications/current_release/PANTHER16.0_HMM_classifications Biolink classes and properties captured: biolink:GeneFamily id (PANTHER.FAMILY ID) source (infores:panther) biolink:MolecularActivity id (GO ID) source (go) biolink:BiologicalProcess id (GO ID) source (go) biolink:Pathway id (PANTHER.PATHWAY) source (infores:panther) biolink:GeneFamilyToMolecularFunctionAssociation id (random uuid) subject (gene_family.id) predicate (enables) object (go_term.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:GeneFamilyToBiologicalProcessAssociation id (random uuid) subject (gene_family.id) predicate (involved_in) object (go_term.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:GeneFamilyToPathwayAssociation id (random uuid) subject (gene_family.id) predicate (involved_in) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther)","title":"Protein Family and Subfamily Classifications - T.B.D."},{"location":"Sources/panther/#pathways-tbd","text":"Contains regulatory and metabolic pathways, each with subfamilies and protein sequences mapped to individual pathway components. 
Source File: http://data.pantherdb.org/ftp/pathway/current_release/SequenceAssociationPathway3.6.5.txt local_name: data/orthology/pathways.tsv Biolink classes and properties captured: biolink:GeneFamily id (PANTHER.FAMILY ID) source (infores:panther) biolink:Gene id (NCBIGene Entrez ID) in taxon (NCBITaxon ID) source (infores:entrez) biolink:Pathway id (PANTHER.PATHWAY) source (infores:panther) biolink:GeneToPathwayAssociation id (random uuid) subject (gene.id) predicate (involved_in) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:GeneFamilyToPathwayAssociation id (random uuid) subject (gene_family.id) predicate (involved_in) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther)","title":"Pathways - T.B.D."},{"location":"Sources/panther/#sequence-classifications-tbd","text":"Sequence Classifications files contain the PANTHER family, subfamily, molecular function, biological process, and pathway classifications for the complete proteomes derived from the various genomes, indexed by species (one source file per species). Refer to the Sequence Classification README for details. Only a subset of the available species will be ingested into Monarch at this time, currently: human, mouse, rat, zebrafish, fruit fly, nematode, fission yeast and budding (\"baker's\") yeast. 
Source File Directory: http://data.pantherdb.org/ftp/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/ Biolink classes and properties captured: biolink:Gene id (PANTHER.FAMILY ID) source (infores:panther) biolink:GeneFamily id (PANTHER.FAMILY ID) source (infores:panther) biolink:MolecularActivity id (GO ID) source (go) biolink:BiologicalProcess id (GO ID) source (go) biolink:Pathway id (PANTHER.PATHWAY) source (infores:panther) biolink:GeneToGeneFamilyAssociation : id (random uuid) subject (gene.id) predicate (member_of) object (gene_family.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:MacromolecularMachineToMolecularActivityAssociation : id (random uuid) subject (gene.id) predicate (enables) object (go_term.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:MacromolecularMachineToBiologicalProcessAssociation : id (random uuid) subject (gene.id) predicate (involved_in) object (go_term.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:GeneToPathwayAssociation id (random uuid) subject (gene.id) predicate (involved_in) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther)","title":"Sequence Classifications - T.B.D."},{"location":"Sources/panther/#citation","text":"Paul D. Thomas, Dustin Ebert, Anushya Muruganujan, Tremayne Mushayahama, Laurent-Philippe Albou and Huaiyu Mi Protein Society. 2022;31(1):8-22. doi:10.1002/pro.4218","title":"Citation"},{"location":"Sources/pombase/","text":"PomBase PomBase is a comprehensive database for the fission yeast Schizosaccharomyces pombe, providing structural and functional annotation, literature curation and access to large-scale data sets. 
Within this ingest there will be a transformation of gene to phenotypic feature associations, gene entities aren't yet loaded as a part of this ingest, and FYPO ontology terms will be brought in directly from the ontology without transformation. PomBase Bulk Downloads Phaf Format Description Phaf Format LinkML Gene Information PomBase genes are captured directly from the PomBase (names and identifiers)[ https://www.pombase.org/downloads/names-and-identifiers ] set, with synonyms being populated as available and UniProtKB accessions captured as xrefs if available. Biolink Captured biolink:Gene id symbol xref (UniProfKB curie if provided) synonym provided_by([\"infores:pombase\"]) Gene to Phenotype The PHAF download file is extremely well documented. Alleles provided, but not captured, with the assumption that even with an allele specified the gene to phenotype is accurate with a some-some interpretation. Genotype/strain information looks uniform throughout the file, and is not captured. It might be sensible to make presence of genotype information an error condition to be sure that we only get 'clean' gene to phenotype associations. Penetrance and Severity columns are available, but not captured as a part of this ingest. Penetrance values can be either FYPO_EXT terms (FYPO_EXT:0000001, FYPO_EXT:0000002, FYPO_EXT:0000003, FYPO_EXT:0000004), int/float numbers (percentages), or strings (\">98\", \"~10\", \"10-20\"). Severity is represented using one or more FYPO_EXT terms. 
Biolink Captured biolink:Gene id biolink:PhenotypicFeature id biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) publications qualifiers (optionally included from condition row) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:pombase) Citation \"Harris MA, Rutherford KM, Hayles J, Lock A, B\u00e4hler J, Oliver S, Mata J, Wood V Fission stories: Using PomBase to understand Schizosaccharomyces pombe biology Genetics, 2021; iyab222\"","title":"Pombase"},{"location":"Sources/pombase/#pombase","text":"PomBase is a comprehensive database for the fission yeast Schizosaccharomyces pombe, providing structural and functional annotation, literature curation and access to large-scale data sets. Within this ingest there will be a transformation of gene to phenotypic feature associations, gene entities aren't yet loaded as a part of this ingest, and FYPO ontology terms will be brought in directly from the ontology without transformation. PomBase Bulk Downloads Phaf Format Description Phaf Format LinkML","title":"PomBase"},{"location":"Sources/pombase/#gene-information","text":"PomBase genes are captured directly from the PomBase (names and identifiers)[ https://www.pombase.org/downloads/names-and-identifiers ] set, with synonyms being populated as available and UniProtKB accessions captured as xrefs if available. Biolink Captured biolink:Gene id symbol xref (UniProfKB curie if provided) synonym provided_by([\"infores:pombase\"])","title":"Gene Information"},{"location":"Sources/pombase/#gene-to-phenotype","text":"The PHAF download file is extremely well documented. Alleles provided, but not captured, with the assumption that even with an allele specified the gene to phenotype is accurate with a some-some interpretation. Genotype/strain information looks uniform throughout the file, and is not captured. 
It might be sensible to make presence of genotype information an error condition to be sure that we only get 'clean' gene to phenotype associations. Penetrance and Severity columns are available, but not captured as a part of this ingest. Penetrance values can be either FYPO_EXT terms (FYPO_EXT:0000001, FYPO_EXT:0000002, FYPO_EXT:0000003, FYPO_EXT:0000004), int/float numbers (percentages), or strings (\">98\", \"~10\", \"10-20\"). Severity is represented using one or more FYPO_EXT terms. Biolink Captured biolink:Gene id biolink:PhenotypicFeature id biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) publications qualifiers (optionally included from condition row) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:pombase)","title":"Gene to Phenotype"},{"location":"Sources/pombase/#citation","text":"\"Harris MA, Rutherford KM, Hayles J, Lock A, B\u00e4hler J, Oliver S, Mata J, Wood V Fission stories: Using PomBase to understand Schizosaccharomyces pombe biology Genetics, 2021; iyab222\"","title":"Citation"},{"location":"Sources/reactome/","text":"Reactome Reactome is a free, open-source, curated and peer reviewed pathway database. Our goal is to provide intuitive bioinformatics tools for the visualization, interpretation and analysis of pathway knowledge to support basic research, genome analysis, modeling, systems biology and education. Reactome bulk downloads Pathway This ingest uses Reactome's pathway download file. Biolink captured biolink:Pathway id name in_taxon provided_by ([\"infores:reactome\"]) Gene to Pathway This ingest uses Reactome's gene to pathway download file, which contains all entities and only assocations between pathways and genes that are denoted in some way in the pathyways. 
Biolink captured biolink:Gene id biolink:Pathway id biolink:ChemicalToPathwayAssociation id (random uuid) subject (gene.id) predicate (mentions) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:reactome) Chemical to Pathway This ingest uses Reactome's chemical to pathway download file, which contains all entities and only assocations between pathways and chemicals that are denoted in some way in the pathyways. Biolink captured biolink:ChemicalEntity id biolink:Pathway id biolink:ChemicalToPathwayAssociation id (random uuid) subject (chemical.id) predicate (mentions) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:reactome) Citation Marc Gillespie, Bijay Jassal, Ralf Stephan, Marija Milacic, Karen Rothfels, Andrea Senff-Ribeiro, Johannes Griss, Cristoffer Sevilla, Lisa Matthews, Chuqiao Gong, Chuan Deng, Thawfeek Varusai, Eliot Ragueneau, Yusra Haider, Bruce May, Veronica Shamovsky, Joel Weiser, Timothy Brunson, Nasim Sanati, Liam Beckman, Xiang Shao, Antonio Fabregat, Konstantinos Sidiropoulos, Julieth Murillo, Guilherme Viteri, Justin Cook, Solomon Shorser, Gary Bader, Emek Demir, Chris Sander, Robin Haw, Guanming Wu, Lincoln Stein, Henning Hermjakob, Peter D\u2019Eustachio, The reactome pathway knowledgebase 2022, Nucleic Acids Research, Volume 50, Issue D1, 7 January 2022, Pages D687\u2013D692, https://doi.org/10.1093/nar/gkab1028","title":"Reactome"},{"location":"Sources/reactome/#reactome","text":"Reactome is a free, open-source, curated and peer reviewed pathway database. Our goal is to provide intuitive bioinformatics tools for the visualization, interpretation and analysis of pathway knowledge to support basic research, genome analysis, modeling, systems biology and education. Reactome bulk downloads","title":"Reactome"},{"location":"Sources/reactome/#pathway","text":"This ingest uses Reactome's pathway download file. 
Biolink captured biolink:Pathway id name in_taxon provided_by ([\"infores:reactome\"])","title":"Pathway"},{"location":"Sources/reactome/#gene-to-pathway","text":"This ingest uses Reactome's gene to pathway download file, which contains all entities and only assocations between pathways and genes that are denoted in some way in the pathyways. Biolink captured biolink:Gene id biolink:Pathway id biolink:ChemicalToPathwayAssociation id (random uuid) subject (gene.id) predicate (mentions) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:reactome)","title":"Gene to Pathway"},{"location":"Sources/reactome/#chemical-to-pathway","text":"This ingest uses Reactome's chemical to pathway download file, which contains all entities and only assocations between pathways and chemicals that are denoted in some way in the pathyways. Biolink captured biolink:ChemicalEntity id biolink:Pathway id biolink:ChemicalToPathwayAssociation id (random uuid) subject (chemical.id) predicate (mentions) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:reactome)","title":"Chemical to Pathway"},{"location":"Sources/reactome/#citation","text":"Marc Gillespie, Bijay Jassal, Ralf Stephan, Marija Milacic, Karen Rothfels, Andrea Senff-Ribeiro, Johannes Griss, Cristoffer Sevilla, Lisa Matthews, Chuqiao Gong, Chuan Deng, Thawfeek Varusai, Eliot Ragueneau, Yusra Haider, Bruce May, Veronica Shamovsky, Joel Weiser, Timothy Brunson, Nasim Sanati, Liam Beckman, Xiang Shao, Antonio Fabregat, Konstantinos Sidiropoulos, Julieth Murillo, Guilherme Viteri, Justin Cook, Solomon Shorser, Gary Bader, Emek Demir, Chris Sander, Robin Haw, Guanming Wu, Lincoln Stein, Henning Hermjakob, Peter D\u2019Eustachio, The reactome pathway knowledgebase 2022, Nucleic Acids Research, Volume 50, Issue D1, 7 January 2022, Pages D687\u2013D692, 
https://doi.org/10.1093/nar/gkab1028","title":"Citation"},{"location":"Sources/rgd/","text":"Rat Genome Database (RGD) The Rat Genome Database (RGD) was established in 1999 and is the premier site for genetic, genomic, phenotype, and disease data generated from rat research. In addition, it provides easy access to corresponding human and mouse data for cross-species comparisons. RGD bulk downloads Gene Literature This ingest uses RGD's gene file which contains publication assocations that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. Even though it is a gene file, and we have fully populated the gene nodes in the alliance gene information ingest, the RGD file has some information that is not in alliance. Note, there will be a column mismatch warning on this transform because there are two (UNUSED) columns. Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:rgd) Citation Vedi M, Smith JR, Thomas Hayman G, Tutaj M, Brodie KC, De Pons JL, Demos WM, Gibson AC, Kaldunski ML, Lamers L, Laulederkind SJF, Thota J, Thorat K, Tutaj MA, Wang SJ, Zacher S, Dwinell MR, Kwitek AE. 2022 updates to the Rat Genome Database: a Findable, Accessible, Interoperable, and Reusable (FAIR) resource. Genetics. 2023 May 4;224(1):iyad042. doi: 10.1093/genetics/iyad042. PMID: 36930729; PMCID: PMC10474928.","title":"Rat Genome Database (RGD)"},{"location":"Sources/rgd/#rat-genome-database-rgd","text":"The Rat Genome Database (RGD) was established in 1999 and is the premier site for genetic, genomic, phenotype, and disease data generated from rat research. 
In addition, it provides easy access to corresponding human and mouse data for cross-species comparisons. RGD bulk downloads","title":"Rat Genome Database (RGD)"},{"location":"Sources/rgd/#gene-literature","text":"This ingest uses RGD's gene file which contains publication assocations that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. Even though it is a gene file, and we have fully populated the gene nodes in the alliance gene information ingest, the RGD file has some information that is not in alliance. Note, there will be a column mismatch warning on this transform because there are two (UNUSED) columns. Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:rgd)","title":"Gene Literature"},{"location":"Sources/rgd/#citation","text":"Vedi M, Smith JR, Thomas Hayman G, Tutaj M, Brodie KC, De Pons JL, Demos WM, Gibson AC, Kaldunski ML, Lamers L, Laulederkind SJF, Thota J, Thorat K, Tutaj MA, Wang SJ, Zacher S, Dwinell MR, Kwitek AE. 2022 updates to the Rat Genome Database: a Findable, Accessible, Interoperable, and Reusable (FAIR) resource. Genetics. 2023 May 4;224(1):iyad042. doi: 10.1093/genetics/iyad042. PMID: 36930729; PMCID: PMC10474928.","title":"Citation"},{"location":"Sources/sgd/","text":"Saccharomyces Genome Database (SGD) The Saccharomyces Genome Database (SGD) provides comprehensive integrated biological information for the budding yeast Saccharomyces cerevisiae along with search and analysis tools to explore these data, enabling the discovery of functional relationships between sequence and gene products in fungi and higher organisms. 
SGD bulk downloads Gene Literature This ingest uses RGD's gene to publication download file, which only contains assocations between publications and genes that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:sgd) Citation Cherry JM, Hong EL, Amundsen C, Balakrishnan R, Binkley G, Chan ET, Christie KR, Costanzo MC, Dwight SS, Engel SR, Fisk DG, Hirschman JE, Hitz BC, Karra K, Krieger CJ, Miyasato SR, Nash RS, Park J, Skrzypek MS, Simison M, Weng S, Wong ED (2012) Saccharomyces Genome Database: the genomics resource of budding yeast. Nucleic Acids Res. Jan;40(Database issue):D700-5. [PMID: 22110037]","title":"Saccharomyces Genome Database (SGD)"},{"location":"Sources/sgd/#saccharomyces-genome-database-sgd","text":"The Saccharomyces Genome Database (SGD) provides comprehensive integrated biological information for the budding yeast Saccharomyces cerevisiae along with search and analysis tools to explore these data, enabling the discovery of functional relationships between sequence and gene products in fungi and higher organisms. SGD bulk downloads","title":"Saccharomyces Genome Database (SGD)"},{"location":"Sources/sgd/#gene-literature","text":"This ingest uses RGD's gene to publication download file, which only contains assocations between publications and genes that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. 
Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:sgd)","title":"Gene Literature"},{"location":"Sources/sgd/#citation","text":"Cherry JM, Hong EL, Amundsen C, Balakrishnan R, Binkley G, Chan ET, Christie KR, Costanzo MC, Dwight SS, Engel SR, Fisk DG, Hirschman JE, Hitz BC, Karra K, Krieger CJ, Miyasato SR, Nash RS, Park J, Skrzypek MS, Simison M, Weng S, Wong ED (2012) Saccharomyces Genome Database: the genomics resource of budding yeast. Nucleic Acids Res. Jan;40(Database issue):D700-5. [PMID: 22110037]","title":"Citation"},{"location":"Sources/string/","text":"STRING: functional protein association networks STRING is a database of known and predicted protein-protein interactions . The interactions include direct (physical) and indirect (functional) associations; they stem from computational prediction, from knowledge transfer between organisms, and from interactions aggregated from other (primary) databases. STRING bulk downloads Protein Links This ingest uses a given version (currently, 11.5 ) of the STRING's .protein.links.detailed. .txt.gz files, for a subset of NCBI ID designated species. We filter the input data on the combined_score field (currently with the threshhold recorded in the protein_links.yaml file). The various taxon specific entrez_2_string mapping files are used to map protein subject and concept nodes onto Entrez gene id's. Special note about Entrez mapping files A separate Entrez to String identifier mapping file is not available for Rattus norvegicus (Norway rat, NCBI taxon ID 10116) but the mappings are (less conveniently) available inside the aggregated 'all_organisms' entrez_2_string file . 
See notes in the STRING section of the download.yaml configuration file for (self explanatory) guidance on how to prepare the required mapping file for use in a local running of the digest. Source File protein1 protein2 neighborhood fusion cooccurence coexpression experimental database textmining combined_score Biolink classes and properties captured Concept Nodes biolink:Gene id (NCBIGene Entrez ID) Associations biolink:PairwiseGeneToGeneInteraction : id (random uuid) subject (gene.id) predicate (interacts_with) object (gene.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:string) Citation Damian Szklarczyk, Andrea Franceschini, Stefan Wyder, Kristoffer Forslund, Davide Heller, Jaime Huerta-Cepas, Milan Simonovic, Alexander Roth, Alberto Santos, Kalliopi P. Tsafou, Michael Kuhn, Peer Bork, Lars J. Jensen, Christian von Mering, STRING v10: protein\u2013protein interaction networks, integrated over the tree of life, Nucleic Acids Research, Volume 43, Issue D1, 28 January 2015, Pages D447\u2013D452, https://doi.org/10.1093/nar/gku1003","title":"String"},{"location":"Sources/string/#string-functional-protein-association-networks","text":"STRING is a database of known and predicted protein-protein interactions . The interactions include direct (physical) and indirect (functional) associations; they stem from computational prediction, from knowledge transfer between organisms, and from interactions aggregated from other (primary) databases. STRING bulk downloads","title":"STRING: functional protein association networks"},{"location":"Sources/string/#protein-links","text":"This ingest uses a given version (currently, 11.5 ) of the STRING's .protein.links.detailed. .txt.gz files, for a subset of NCBI ID designated species. We filter the input data on the combined_score field (currently with the threshhold recorded in the protein_links.yaml file). 
The various taxon specific entrez_2_string mapping files are used to map protein subject and concept nodes onto Entrez gene id's.","title":"Protein Links"},{"location":"Sources/string/#special-note-about-entrez-mapping-files","text":"A separate Entrez to String identifier mapping file is not available for Rattus norvegicus (Norway rat, NCBI taxon ID 10116) but the mappings are (less conveniently) available inside the aggregated 'all_organisms' entrez_2_string file . See notes in the STRING section of the download.yaml configuration file for (self explanatory) guidance on how to prepare the required mapping file for use in a local running of the digest.","title":"Special note about Entrez mapping files"},{"location":"Sources/string/#source-file","text":"protein1 protein2 neighborhood fusion cooccurence coexpression experimental database textmining combined_score","title":"Source File"},{"location":"Sources/string/#biolink-classes-and-properties-captured","text":"","title":"Biolink classes and properties captured"},{"location":"Sources/string/#concept-nodes","text":"biolink:Gene id (NCBIGene Entrez ID)","title":"Concept Nodes"},{"location":"Sources/string/#associations","text":"biolink:PairwiseGeneToGeneInteraction : id (random uuid) subject (gene.id) predicate (interacts_with) object (gene.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:string)","title":"Associations"},{"location":"Sources/string/#citation","text":"Damian Szklarczyk, Andrea Franceschini, Stefan Wyder, Kristoffer Forslund, Davide Heller, Jaime Huerta-Cepas, Milan Simonovic, Alexander Roth, Alberto Santos, Kalliopi P. Tsafou, Michael Kuhn, Peer Bork, Lars J. 
Jensen, Christian von Mering, STRING v10: protein\u2013protein interaction networks, integrated over the tree of life, Nucleic Acids Research, Volume 43, Issue D1, 28 January 2015, Pages D447\u2013D452, https://doi.org/10.1093/nar/gku1003","title":"Citation"},{"location":"Sources/xenbase/","text":"Xenbase Xenbase is a web-accessible resource that integrates all the diverse biological, genomic, genotype and phenotype data available from Xenopus research. Xenbase Bulk Data Xenbase FTP Gene to Phenotype This ingest is built against a one-off OBAN formatted file, which makes for a transformation which only requries adding a curie prefix and connecting column names to biolink attributes. Evidence codes are provided as ECO terms but not yet captured in the output. Biolink captured biolink:Gene id biolink:PhenotypicFeature id biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) publications aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:xenbase) Gene Literature This ingest reads from Xenbase's Genes Associated with Literature file to capture associations between Xenbase's XB-GENEPAGE ids and PMIDs, then relies on a map built from Xenbase's GenepageToGeneId file to create associations from XB-GENE records to PMID records. Biolink captured Gene id Publication id InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:xenbase) Citation Fisher et al. 2023, Genetics, 2023;, iyad018, doi:10.1093/genetics/iyad018 (Xenbase / PubMed / Genetics)","title":"Xenbase"},{"location":"Sources/xenbase/#xenbase","text":"Xenbase is a web-accessible resource that integrates all the diverse biological, genomic, genotype and phenotype data available from Xenopus research. 
Xenbase Bulk Data Xenbase FTP","title":"Xenbase"},{"location":"Sources/xenbase/#gene-to-phenotype","text":"This ingest is built against a one-off OBAN formatted file, which makes for a transformation which only requries adding a curie prefix and connecting column names to biolink attributes. Evidence codes are provided as ECO terms but not yet captured in the output. Biolink captured biolink:Gene id biolink:PhenotypicFeature id biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) publications aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:xenbase)","title":"Gene to Phenotype"},{"location":"Sources/xenbase/#gene-literature","text":"This ingest reads from Xenbase's Genes Associated with Literature file to capture associations between Xenbase's XB-GENEPAGE ids and PMIDs, then relies on a map built from Xenbase's GenepageToGeneId file to create associations from XB-GENE records to PMID records. Biolink captured Gene id Publication id InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:xenbase)","title":"Gene Literature"},{"location":"Sources/xenbase/#citation","text":"Fisher et al. 2023, Genetics, 2023;, iyad018, doi:10.1093/genetics/iyad018 (Xenbase / PubMed / Genetics)","title":"Citation"},{"location":"Sources/zfin/","text":"ZFIN ZFIN is the Zebrafish Model Organism Database. ZFIN bulk downloads Gene to Phenotype This ingest uses ZFIN's clean gene phenotype download file, which only contains phenotypes which can safely be associated to a single affected gene. 
This ingest is distinct from the Alliance phenotype index because ZFIN builds Entity-Quality-Entity phenotype statements that can be built from post-composed terms (E1a+E1b+Q+E2a+E2b), Biolink captured biolink:Gene id biolink:PhenotypicFeature id biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) publications aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:zfin) Gene Literature This ingest uses ZFIN's gene to publication download file, which only contains assocations between publications and genes that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. We have also opted to use the ZDB-ID for the publication node rather than a pubmed ID, on the assumption that kgx will clique merge them later. Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:zfin) Citation Bradford, Y.M., Van Slyke, C.E., Ruzicka, L., Singer, A., Eagle, A., Fashena, D., Howe, D.G., Frazer, K., Martin, R., Paddock, H., Pich, C., Ramachandran, S., Westerfield, M. (2022) Zebrafish Information Network, the knowledgebase for Danio rerio research. Genetics. 220(4).","title":"ZFIN"},{"location":"Sources/zfin/#zfin","text":"ZFIN is the Zebrafish Model Organism Database. ZFIN bulk downloads","title":"ZFIN"},{"location":"Sources/zfin/#gene-to-phenotype","text":"This ingest uses ZFIN's clean gene phenotype download file, which only contains phenotypes which can safely be associated to a single affected gene. 
This ingest is distinct from the Alliance phenotype index because ZFIN builds Entity-Quality-Entity phenotype statements that can be built from post-composed terms (E1a+E1b+Q+E2a+E2b), Biolink captured biolink:Gene id biolink:PhenotypicFeature id biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) publications aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:zfin)","title":"Gene to Phenotype"},{"location":"Sources/zfin/#gene-literature","text":"This ingest uses ZFIN's gene to publication download file, which only contains assocations between publications and genes that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. We have also opted to use the ZDB-ID for the publication node rather than a pubmed ID, on the assumption that kgx will clique merge them later. Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:zfin)","title":"Gene Literature"},{"location":"Sources/zfin/#citation","text":"Bradford, Y.M., Van Slyke, C.E., Ruzicka, L., Singer, A., Eagle, A., Fashena, D., Howe, D.G., Frazer, K., Martin, R., Paddock, H., Pich, C., Ramachandran, S., Westerfield, M. (2022) Zebrafish Information Network, the knowledgebase for Danio rerio research. Genetics. 220(4).","title":"Citation"}]}
\ No newline at end of file
+{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Monarch Ingest Overview The Monarch Ingest generates KGX formatted files conforming to the BioLink Model from a wide variety of biomedical data sources. The eventual output of the Monarch Ingest process is the Monarch KG . The latest version of this can be found at data.monarchinitiative.org See also the folder monarch-kg-dev/latest Monarch Ingest is built using Poetry , which will create its own virtual environment. Installation monarch-ingest is a Python 3.8+ package, installable via Poetry . Install Poetry , if you don't already have it: curl -sSL https://install.python-poetry.org | python3 - # Optional: Have poetry create its venvs in your project directories poetry config virtualenvs.in-project true Clone the repo and build the code: git clone git@github.com/monarch-initiative/monarch-ingest Install monarch-ingest: cd monarch-ingest poetry install (Optional) Activate the virtual environment: # This step removes the need to prefix all commands with `poetry run` poetry shell Usage For a detailed tutorial on ingests and how to make one, see the Create an Ingest tab . CLI usage is available in the CLI tab , gcor by running ingest --help . Run the whole pipeline! Download the source data: ingest download --all Run all transforms: ingest transform --all Merge all transformed output into a tar.gz containing one node and one edge file ingest merge Upload the results to the Monarch Ingest Google bucket ingest release","title":"Welcome"},{"location":"#monarch-ingest","text":"","title":"Monarch Ingest"},{"location":"#overview","text":"The Monarch Ingest generates KGX formatted files conforming to the BioLink Model from a wide variety of biomedical data sources. The eventual output of the Monarch Ingest process is the Monarch KG . 
The latest version of this can be found at data.monarchinitiative.org See also the folder monarch-kg-dev/latest Monarch Ingest is built using Poetry , which will create its own virtual environment.","title":"Overview"},{"location":"#installation","text":"monarch-ingest is a Python 3.8+ package, installable via Poetry . Install Poetry , if you don't already have it: curl -sSL https://install.python-poetry.org | python3 - # Optional: Have poetry create its venvs in your project directories poetry config virtualenvs.in-project true Clone the repo and build the code: git clone git@github.com/monarch-initiative/monarch-ingest Install monarch-ingest: cd monarch-ingest poetry install (Optional) Activate the virtual environment: # This step removes the need to prefix all commands with `poetry run` poetry shell","title":"Installation"},{"location":"#usage","text":"For a detailed tutorial on ingests and how to make one, see the Create an Ingest tab . CLI usage is available in the CLI tab , gcor by running ingest --help . Run the whole pipeline! Download the source data: ingest download --all Run all transforms: ingest transform --all Merge all transformed output into a tar.gz containing one node and one edge file ingest merge Upload the results to the Monarch Ingest Google bucket ingest release","title":"Usage"},{"location":"CLI/","text":"ingest Usage : $ ingest [ OPTIONS ] COMMAND [ ARGS ] ... Options : --version --install-completion : Install completion for the current shell. --show-completion : Show completion for the current shell, to copy it or customize the installation. --help : Show this message and exit. Commands : closure download : Downloads data defined in download.yaml export jsonl merge : Merge nodes and edges into kg release : Copy data to Monarch GCP data buckets solr sqlite transform : Run Koza transformation on specified... ingest closure Usage : $ ingest closure [ OPTIONS ] Options : --help : Show this message and exit. 
ingest download Downloads data defined in download.yaml Usage : $ ingest download [ OPTIONS ] Options : --ingests TEXT : Which ingests to download data for --all / --no-all : Download all ingest datasets [default: no-all] --help : Show this message and exit. ingest export Usage : $ ingest export [ OPTIONS ] Options : --help : Show this message and exit. ingest jsonl Usage : $ ingest jsonl [ OPTIONS ] Options : --help : Show this message and exit. ingest merge Merge nodes and edges into kg Usage : $ ingest merge [ OPTIONS ] Options : --input-dir TEXT : Directory with nodes and edges to be merged [default: output/transform_output] --output-dir TEXT : Directory to output data [default: output] -d, --debug / -q, --quiet : Use --quiet to suppress log output, --debug for verbose --help : Show this message and exit. ingest release Copy data to Monarch GCP data buckets Usage : $ ingest release [ OPTIONS ] Options : --dir TEXT : Directory with kg to be released [default: output] --kghub / --no-kghub : Also release to kghub S3 bucket [default: no-kghub] --help : Show this message and exit. ingest solr Usage : $ ingest solr [ OPTIONS ] Options : --help : Show this message and exit. ingest sqlite Usage : $ ingest sqlite [ OPTIONS ] Options : --help : Show this message and exit. 
ingest transform Run Koza transformation on specified Monarch ingests Usage : $ ingest transform [ OPTIONS ] Options : -o, --output-dir TEXT : Directory to output data [default: output] -i, --ingest TEXT : Run a single ingest (see ingests.yaml for a list) --phenio / --no-phenio : Run the phenio transform [default: no-phenio] -a, --all : Ingest all sources -f, --force : Force ingest, even if output exists (on by default for single ingests) --rdf / --no-rdf : Output rdf files along with tsv [default: no-rdf] -d, --debug / -q, --quiet : Use --quiet to suppress log output, --debug for verbose, including Koza logs -l, --log : Write DEBUG level logs to ./logs/ for each ingest -n, --row-limit INTEGER : Number of rows to process --help : Show this message and exit.","title":"CLI"},{"location":"CLI/#ingest","text":"Usage : $ ingest [ OPTIONS ] COMMAND [ ARGS ] ... Options : --version --install-completion : Install completion for the current shell. --show-completion : Show completion for the current shell, to copy it or customize the installation. --help : Show this message and exit. 
Commands : closure download : Downloads data defined in download.yaml export jsonl merge : Merge nodes and edges into kg release : Copy data to Monarch GCP data buckets solr sqlite transform : Run Koza transformation on specified...","title":"ingest"},{"location":"CLI/#ingest-closure","text":"Usage : $ ingest closure [ OPTIONS ] Options : --help : Show this message and exit.","title":"ingest closure"},{"location":"CLI/#ingest-download","text":"Downloads data defined in download.yaml Usage : $ ingest download [ OPTIONS ] Options : --ingests TEXT : Which ingests to download data for --all / --no-all : Download all ingest datasets [default: no-all] --help : Show this message and exit.","title":"ingest download"},{"location":"CLI/#ingest-export","text":"Usage : $ ingest export [ OPTIONS ] Options : --help : Show this message and exit.","title":"ingest export"},{"location":"CLI/#ingest-jsonl","text":"Usage : $ ingest jsonl [ OPTIONS ] Options : --help : Show this message and exit.","title":"ingest jsonl"},{"location":"CLI/#ingest-merge","text":"Merge nodes and edges into kg Usage : $ ingest merge [ OPTIONS ] Options : --input-dir TEXT : Directory with nodes and edges to be merged [default: output/transform_output] --output-dir TEXT : Directory to output data [default: output] -d, --debug / -q, --quiet : Use --quiet to suppress log output, --debug for verbose --help : Show this message and exit.","title":"ingest merge"},{"location":"CLI/#ingest-release","text":"Copy data to Monarch GCP data buckets Usage : $ ingest release [ OPTIONS ] Options : --dir TEXT : Directory with kg to be released [default: output] --kghub / --no-kghub : Also release to kghub S3 bucket [default: no-kghub] --help : Show this message and exit.","title":"ingest release"},{"location":"CLI/#ingest-solr","text":"Usage : $ ingest solr [ OPTIONS ] Options : --help : Show this message and exit.","title":"ingest solr"},{"location":"CLI/#ingest-sqlite","text":"Usage : $ ingest sqlite [ OPTIONS ] Options : 
--help : Show this message and exit.","title":"ingest sqlite"},{"location":"CLI/#ingest-transform","text":"Run Koza transformation on specified Monarch ingests Usage : $ ingest transform [ OPTIONS ] Options : -o, --output-dir TEXT : Directory to output data [default: output] -i, --ingest TEXT : Run a single ingest (see ingests.yaml for a list) --phenio / --no-phenio : Run the phenio transform [default: no-phenio] -a, --all : Ingest all sources -f, --force : Force ingest, even if output exists (on by default for single ingests) --rdf / --no-rdf : Output rdf files along with tsv [default: no-rdf] -d, --debug / -q, --quiet : Use --quiet to suppress log output, --debug for verbose, including Koza logs -l, --log : Write DEBUG level logs to ./logs/ for each ingest -n, --row-limit INTEGER : Number of rows to process --help : Show this message and exit.","title":"ingest transform"},{"location":"Create-an-Ingest/","text":"What is an Ingest? Ingest Overview An ingest consists of 2 main steps: Downloading the data Transforming the data With 2 post-processing steps: Merging the output into a KGX knowledge graph Releasing the result to the Monarch Initiative Google Cloud bucket Let's go through the process for running an existing monarch ingest! Step 1. Download Download the dataset for your ingest, for example: ingest download --tags ncbi_gene or to download all source data: ingest download --all Step 2. Transform Transform the data, for example: ingest transform --tag ncbi_gene --row-limit 20 --log or ingest transform --all Step 3. Merge This step is typically performed after ingest transform --all , and merges all output node and edge files into a tar.gz containing one node and one edge file: ingest merge Step 4. Release Once you've transformed all the data and merged the output, you can create and upload a release: ingest release -- Now let's look at how to create and add a new ingest! 
First step: Propose a new Ingest","title":"What is an Ingest?"},{"location":"Create-an-Ingest/#what-is-an-ingest","text":"Ingest Overview An ingest consists of 2 main steps: Downloading the data Transforming the data With 2 post-processing steps: Merging the output into a KGX knowledge graph Releasing the result to the Monarch Initiative Google Cloud bucket Let's go through the process for running an existing monarch ingest! Step 1. Download Download the dataset for your ingest, for example: ingest download --tags ncbi_gene or to download all source data: ingest download --all Step 2. Transform Transform the data, for example: ingest transform --tag ncbi_gene --row-limit 20 --log or ingest transform --all Step 3. Merge This step is typically performed after ingest transform --all , and merges all output node and edge files into a tar.gz containing one node and one edge file: ingest merge Step 4. Release Once you've transformed all the data and merged the output, you can create and upload a release: ingest release -- Now let's look at how to create and add a new ingest! First step: Propose a new Ingest","title":"What is an Ingest?"},{"location":"Create-an-Ingest/1.%20Propose/","text":"Propose Propose an Ingest : create a ticket on GitHub Ingest that includes the name of the source and justification for it's inclusion in Monarch. Assign the ticket to @putmantime & @sagehrke (Monarch PM). Who : Anyone can submit a proposal. Estimate Workload : Utilize planning poker to identify the amount of work the proposed ingest will be. Who : Monarch Technical Team. The Monarch PM will initialize the planning poker vote. Put up for vote & discussion : Use voting on github (thumb up for approve / thumb down for reject). Voting & discussion are open for two weeks. Who : Anyone can vote on a proposal. The Monarch PM will initialize the vote. If voted positive : assign the ingest to a team member, start working on ingest, then create a PR. Skip to Step 6. 
Who : Monarch Technical Team If voted negative , note why it was downvoted and close the issue. Who : Monarch Technical Team Disseminate the proposed model and gather feedback : Send an email to the Monarch Leads Google Group and the slack ingest channel (kg-monarch) requesting input. All discussions are done in GitHub on the PR. Tag those that need to respond in a comment on the PR. The feedback stage is open for two weeks. Who : Monarch Technical Team member assigned to the PR in Step 4 will disseminate the proposal. Anyone can comment or suggest input. Deploy the new ingest Who : Monarch Technical Team Now let's look at how to create and add a new ingest! First step: Configure","title":"Propose"},{"location":"Create-an-Ingest/1.%20Propose/#propose","text":"Propose an Ingest : create a ticket on GitHub Ingest that includes the name of the source and justification for it's inclusion in Monarch. Assign the ticket to @putmantime & @sagehrke (Monarch PM). Who : Anyone can submit a proposal. Estimate Workload : Utilize planning poker to identify the amount of work the proposed ingest will be. Who : Monarch Technical Team. The Monarch PM will initialize the planning poker vote. Put up for vote & discussion : Use voting on github (thumb up for approve / thumb down for reject). Voting & discussion are open for two weeks. Who : Anyone can vote on a proposal. The Monarch PM will initialize the vote. If voted positive : assign the ingest to a team member, start working on ingest, then create a PR. Skip to Step 6. Who : Monarch Technical Team If voted negative , note why it was downvoted and close the issue. Who : Monarch Technical Team Disseminate the proposed model and gather feedback : Send an email to the Monarch Leads Google Group and the slack ingest channel (kg-monarch) requesting input. All discussions are done in GitHub on the PR. Tag those that need to respond in a comment on the PR. The feedback stage is open for two weeks. 
Who : Monarch Technical Team member assigned to the PR in Step 4 will disseminate the proposal. Anyone can comment or suggest input. Deploy the new ingest Who : Monarch Technical Team Now let's look at how to create and add a new ingest! First step: Configure","title":"Propose"},{"location":"Create-an-Ingest/2.%20Configure/","text":"Configure Make a directory for your ingest, using the source of the data as the name: mkdir src/monarch_ingest/ingests/ For example: mkdir src/monarch_ingest/ingests/ncbi Add data sources to src/monarch_ingest/download.yaml : # - url : https://.com/downloads/somedata.txt local_name : data//somedata.txt tag : _ For example: # mgi - url : http://www.informatics.jax.org/downloads/reports/MRK_Reference.rpt local_name : data/mgi/MRK_Reference.rpt tag : mgi_publication_to_gene Note: You can now use ingest download --tags or ingest download --all , and your data will be downloaded to the appropriate subdir in data/ Add your ingest to src/monarch_ingest/ingests.yaml : : config : 'ingests//.yaml For example: ncbi_gene : config : 'ingests/ncbi/gene.yaml' Copy the template: cp ingest_template/* src/monarch_ingest/ingests/ Edit metadata.yaml : Update the description, rights link, url, etc and then add your source_file Edit the source file yaml Match the columns or required fields with what's available in the file to be ingested If it's an ingest that exists in Dipper , check out what Dipper does. 
Check the Biolink Model documentation to look at what you can capture If what we need from an ingest can't be captured in the model yet, make a new Biolink issue Set the header properties If there is no header at all, set header: False If there are comment lines before the header, count them and set skip_lines: {n} -- Next step: Adding documentation","title":"Configure"},{"location":"Create-an-Ingest/2.%20Configure/#configure","text":"Make a directory for your ingest, using the source of the data as the name: mkdir src/monarch_ingest/ingests/ For example: mkdir src/monarch_ingest/ingests/ncbi Add data sources to src/monarch_ingest/download.yaml : # - url : https://.com/downloads/somedata.txt local_name : data//somedata.txt tag : _ For example: # mgi - url : http://www.informatics.jax.org/downloads/reports/MRK_Reference.rpt local_name : data/mgi/MRK_Reference.rpt tag : mgi_publication_to_gene Note: You can now use ingest download --tags or ingest download --all , and your data will be downloaded to the appropriate subdir in data/ Add your ingest to src/monarch_ingest/ingests.yaml : : config : 'ingests//.yaml For example: ncbi_gene : config : 'ingests/ncbi/gene.yaml' Copy the template: cp ingest_template/* src/monarch_ingest/ingests/ Edit metadata.yaml : Update the description, rights link, url, etc and then add your source_file Edit the source file yaml Match the columns or required fields with what's available in the file to be ingested If it's an ingest that exists in Dipper , check out what Dipper does. 
Check the Biolink Model documentation to look at what you can capture If what we need from an ingest can't be captured in the model yet, make a new Biolink issue Set the header properties If there is no header at all, set header: False If there are comment lines before the header, count them and set skip_lines: {n} -- Next step: Adding documentation","title":"Configure"},{"location":"Create-an-Ingest/3.%20Document/","text":"Document The documentation for an ingest should reflect both the decision-making process that led to the output, and the output itself. Begin by copying the source.md file to the docs/Sources/ folder, renaming it to match the ingest name. Tip This is a great time to look over the columns in the ingest file. Consider what biolink classes are appropriate to represent them, and what fields are available to populate on each. Some helpful resources: Biolink Documentation List of Biolink Associations Use a Jupyter Notebook with Biolink Model Toolkit to do things like get_element_by_mapping('RO:0002410') For ingests migrating from Dipper, check out the documentation and source code -- Next step: Begin implementation","title":"Document"},{"location":"Create-an-Ingest/3.%20Document/#document","text":"The documentation for an ingest should reflect both the decision-making process that led to the output, and the output itself. Begin by copying the source.md file to the docs/Sources/ folder, renaming it to match the ingest name. Tip This is a great time to look over the columns in the ingest file. Consider what biolink classes are appropriate to represent them, and what fields are available to populate on each. 
Some helpful resources: Biolink Documentation List of Biolink Associations Use a Jupyter Notebook with Biolink Model Toolkit to do things like get_element_by_mapping('RO:0002410') For ingests migrating from Dipper, check out the documentation and source code -- Next step: Begin implementation","title":"Document"},{"location":"Create-an-Ingest/4.%20Implement/","text":"Implement Most Koza scripts can run in flat mode, which means that the transform code itself doesn't need to handle the looping mechanism, and instead the transform code will have a row injected at the top and call the write command at the bottom. In between fields from the incoming row should be mapped to Biolink instances. Imports and setup Start with the imports, and make sure to set the source_name, which will be used for communicating with the reader and writer. from koza.cli_runner import koza_app from biolink.pydanticmodel import Gene # The source name is used for reading and writing source_name = \"gene-information\" Inject the row # inject a single row from the source row = koza_app . get_row ( source_name ) Extras Next up handle any additional set up for the ingest, such as including a map or bringing in the CURIE cleaning service curie_cleaner = koza_app . curie_cleaner eqe2zp = koza_app . get_map ( \"eqe2zp\" ) translation_table = koza_app . translation_table Creating entities At this step, hopefully your documentation is so good that you're just letting your fingers take on the last step of converting what you've already planned into Python syntax. Ideally not much logic will be needed here, and if there's a lot, it might be worth considering whether an ingest (even on the same file) can be split across multiple transforms so that each is as easy to read as possible. Aim to add all properties when creating the instance, but in some cases adding optional lists might need to happen below. 
from biolink.pydanticmodel import Gene gene = Gene ( id = 'somethingbase:' + row [ 'ID' ], name = row [ 'Name' ] ) # populate any additional optional properties if row [ 'xrefs' ]: gene . xrefs = [ curie_cleaner . clean ( xref ) for xref in row [ 'xrefs' ]] Writing At the end of the script, call the writer. The first argument must be the source_name (so that it will know where to write), entities should be passed in as additional arguments. koza_app . write ( gene , phenotypicFeature , association ) Running your ingest To execute your ingest, you can now run: ingest transform --tag -- Next step: Testing!","title":"Implement"},{"location":"Create-an-Ingest/4.%20Implement/#implement","text":"Most Koza scripts can run in flat mode, which means that the transform code itself doesn't need to handle the looping mechanism, and instead the transform code will have a row injected at the top and call the write command at the bottom. In between fields from the incoming row should be mapped to Biolink instances.","title":"Implement"},{"location":"Create-an-Ingest/4.%20Implement/#imports-and-setup","text":"Start with the imports, and make sure to set the source_name, which will be used for communicating with the reader and writer. from koza.cli_runner import koza_app from biolink.pydanticmodel import Gene # The source name is used for reading and writing source_name = \"gene-information\"","title":"Imports and setup"},{"location":"Create-an-Ingest/4.%20Implement/#inject-the-row","text":"# inject a single row from the source row = koza_app . get_row ( source_name )","title":"Inject the row"},{"location":"Create-an-Ingest/4.%20Implement/#extras","text":"Next up handle any additional set up for the ingest, such as including a map or bringing in the CURIE cleaning service curie_cleaner = koza_app . curie_cleaner eqe2zp = koza_app . get_map ( \"eqe2zp\" ) translation_table = koza_app . 
translation_table","title":"Extras"},{"location":"Create-an-Ingest/4.%20Implement/#creating-entities","text":"At this step, hopefully your documentation is so good that you're just letting your fingers take on the last step of converting what you've already planned into Python syntax. Ideally not much logic will be needed here, and if there's a lot, it might be worth considering whether an ingest (even on the same file) can be split across multiple transforms so that each is as easy to read as possible. Aim to add all properties when creating the instance, but in some cases adding optional lists might need to happen below. from biolink.pydanticmodel import Gene gene = Gene ( id = 'somethingbase:' + row [ 'ID' ], name = row [ 'Name' ] ) # populate any additional optional properties if row [ 'xrefs' ]: gene . xrefs = [ curie_cleaner . clean ( xref ) for xref in row [ 'xrefs' ]]","title":"Creating entities"},{"location":"Create-an-Ingest/4.%20Implement/#writing","text":"At the end of the script, call the writer. The first argument must be the source_name (so that it will know where to write), entities should be passed in as additional arguments. koza_app . write ( gene , phenotypicFeature , association )","title":"Writing"},{"location":"Create-an-Ingest/4.%20Implement/#running-your-ingest","text":"To execute your ingest, you can now run: ingest transform --tag -- Next step: Testing!","title":"Running your ingest"},{"location":"Create-an-Ingest/5.%20Test/","text":"Testing You may want to start with the test template within ingest_template Basic fixtures First, set up your basic fixtures, taking care to set the correct source name and location for the transform code. import pytest from koza.cli_runner import get_translation_table @pytest . fixture def tt (): return get_translation_table ( \"src/monarch_ingest/translation_table.yaml\" , None ) # This name must match the ingest name in the transform code @pytest . 
fixture def source_name (): return \"something-to-somethingelse\" # This is the location of the transform code @pytest . fixture def script (): return \"./src/monarch_ingest/ingests/somethingbase/something2somethingelse.py\" A map, if necessary Some ingests will depend on one or more maps, that fixture can be set up here. Note that this fixture must return a map of maps, and that the inner maps will map from an ID to a dictionary representing column headers and values. In the example below, a map is created that maps from a big concatenated natural key (as the ID) for ZP to a single column (called iri ) that contains the ZP ID. This map is then placed into the map cache under the name eqe2zp @pytest . fixture def map_cache (): eqe2zp = { \"0-0-ZFA:0000042-PATO:0000638-0-0-0\" : { \"iri\" : \"ZP:0004225\" }, \"BSPO:0000112-BFO:0000050-ZFA:0000042-PATO:0000638-0-0-0\" : { \"iri\" : \"ZP:0011243\" }, \"BSPO:0000000-BFO:0000050-ZFA:0000823-PATO:0000642-BSPO:0000007-BFO:0000050-ZFA:0000823\" : { \"iri\" : \"ZP:0000157\" }, } return { \"eqe2zp\" : eqe2zp } Fixtures for test data Create a fixture that returns a dictionary to represent a single row. As a matter of strategy, this row should probably represent a fairly basic row being ingested. One trick so that you don't have to manually convert from the input format to a python dictionary format is to run your ingest with a debugger and set a breakpoint just after a row has been injected. If you want a more specific piece of data, check out conditional breakpoints. @pytest . fixture def basic_row (): return { \"ID\" : \"341492416\" , \"Gene Symbol\" : \"pax2a\" , \"Gene ID\" : \"ZDB-GENE-990415-8\" , #... \"Fish Environment ID\" : \"ZDB-GENOX-041102-1385\" , \"Publication ID\" : \"ZDB-PUB-970210-19\" , \"Figure ID\" : \"ZDB-FIG-120307-8\" , } Fixture for transforming a single row This sets up a fixture you can call more than once to independently test different attributes @pytest . 
fixture def basic_g2p ( mock_koza , source_name , basic_row , script , map_cache , tt ): return mock_koza ( source_name , iter ([ basic_row ]), script , map_cache = map_cache , translation_table = tt , ) Test the basics of the ingest Confirm that entities are created matching the expectations on the row # A simple end-to-end test is to confirm that the IDs are set on def test_gene ( basic_g2p ): gene = basic_g2p [ 0 ] assert gene assert gene . id == \"ZFIN:ZDB-GENE-990415-8\" def test_phenotypic_feature ( basic_g2p ): phenotypic_feature = basic_g2p [ 1 ] assert phenotypic_feature assert phenotypic_feature . id == \"ZP:0004225\" def test_association ( basic_g2p ): association = basic_g2p [ 2 ] assert association assert association . subject == \"ZFIN:ZDB-GENE-990415-8\" assert association . object == \"ZP:0004225\" assert association . publications assert association . publications [ 0 ] == \"ZFIN:ZDB-PUB-970210-19\" Test against an alternate row For any branching within the transform code, it's a good idea to test against all of the paths through the code. It's possible to set conditional breakpoints to find real examples in the code that will hit each code path, but it may be more practical to modify the basic row as a new fixture The example below creates a row with additional columns filled in. @pytest . fixture def postcomposed ( mock_koza , source_name , basic_row , script , map_cache , tt ): basic_row [ \"Affected Structure or Process 1 subterm ID\" ] = \"BSPO:0000112\" basic_row [ \"Post-composed Relationship ID\" ] = \"BFO:0000050\" basic_row [ \"Affected Structure or Process 1 superterm ID\" ] = \"ZFA:0000042\" return mock_koza ( source_name , iter ([ basic_row ]), script , map_cache = map_cache , translation_table = tt , ) Parameterized tests Mixing parameterization and fixtures changes the approach a little. In this case it makes more sense to alter the row using a parameter and then create the entities within the same method. 
The test below is intended to confirm that when the tag column has any of the specified values, the row will be ignored (confirmed because no entities are created). @pytest . mark . parametrize ( \"tag\" , [ \"normal\" , \"exacerbated\" , \"ameliorated\" ]) def test_excluded_tags ( mock_koza , source_name , basic_row , script , map_cache , tt , tag ): basic_row [ \"Phenotype Tag\" ] = tag entities = mock_koza ( source_name , iter ([ basic_row ]), script , map_cache = map_cache , translation_table = tt , ) assert len ( entities ) == 0","title":"Testing"},{"location":"Create-an-Ingest/5.%20Test/#testing","text":"You may want to start with the test template within ingest_template","title":"Testing"},{"location":"Create-an-Ingest/5.%20Test/#basic-fixtures","text":"First, set up your basic fixtures, taking care to set the correct source name and location for the transform code. import pytest from koza.cli_runner import get_translation_table @pytest . fixture def tt (): return get_translation_table ( \"src/monarch_ingest/translation_table.yaml\" , None ) # This name must match the ingest name in the transform code @pytest . fixture def source_name (): return \"something-to-somethingelse\" # This is the location of the transform code @pytest . fixture def script (): return \"./src/monarch_ingest/ingests/somethingbase/something2somethingelse.py\"","title":"Basic fixtures"},{"location":"Create-an-Ingest/5.%20Test/#a-map-if-necessary","text":"Some ingests will depend on one or more maps, that fixture can be set up here. Note that this fixture must return a map of maps, and that the inner maps will map from an ID to a dictionary representing column headers and values. In the example below, a map is created that maps from a big concatenated natural key (as the ID) for ZP to a single column (called iri ) that contains the ZP ID. This map is then placed into the map cache under the name eqe2zp @pytest . 
fixture def map_cache (): eqe2zp = { \"0-0-ZFA:0000042-PATO:0000638-0-0-0\" : { \"iri\" : \"ZP:0004225\" }, \"BSPO:0000112-BFO:0000050-ZFA:0000042-PATO:0000638-0-0-0\" : { \"iri\" : \"ZP:0011243\" }, \"BSPO:0000000-BFO:0000050-ZFA:0000823-PATO:0000642-BSPO:0000007-BFO:0000050-ZFA:0000823\" : { \"iri\" : \"ZP:0000157\" }, } return { \"eqe2zp\" : eqe2zp }","title":"A map, if necessary"},{"location":"Create-an-Ingest/5.%20Test/#fixtures-for-test-data","text":"Create a fixture that returns a dictionary to represent a single row. As a matter of strategy, this row should probably represent a fairly basic row being ingested. One trick so that you don't have to manually convert from the imput format to a python dictionary format is to run your ingest with a debugger and set a breakpoint just after a row has been injected. If you want a more specific piece of data, check out conditional breakpoints. @pytest . fixture def basic_row (): return { \"ID\" : \"341492416\" , \"Gene Symbol\" : \"pax2a\" , \"Gene ID\" : \"ZDB-GENE-990415-8\" , #... \"Fish Environment ID\" : \"ZDB-GENOX-041102-1385\" , \"Publication ID\" : \"ZDB-PUB-970210-19\" , \"Figure ID\" : \"ZDB-FIG-120307-8\" , }","title":"Fixtures for test data"},{"location":"Create-an-Ingest/5.%20Test/#fixture-for-transforming-a-single-row","text":"This sets up a fixture you can call more than once to independently test different attributes @pytest . fixture def basic_g2p ( mock_koza , source_name , basic_row , script , map_cache , tt ): return mock_koza ( source_name , iter ([ basic_row ]), script , map_cache = map_cache , translation_table = tt , )","title":"Fixture for transforming a single row"},{"location":"Create-an-Ingest/5.%20Test/#test-the-basics-of-the-ingest","text":"Confirm that entities are created matching the expectations on the row # A simple end-to-end test is to confirm that the IDs are set on def test_gene ( basic_g2p ): gene = basic_g2p [ 0 ] assert gene assert gene . 
id == \"ZFIN:ZDB-GENE-990415-8\" def test_phenotypic_feature ( basic_g2p ): phenotypic_feature = basic_g2p [ 1 ] assert phenotypic_feature assert phenotypic_feature . id == \"ZP:0004225\" def test_association ( basic_g2p ): association = basic_g2p [ 2 ] assert association assert association . subject == \"ZFIN:ZDB-GENE-990415-8\" assert association . object == \"ZP:0004225\" assert association . publications assert association . publications [ 0 ] == \"ZFIN:ZDB-PUB-970210-19\"","title":"Test the basics of the ingest"},{"location":"Create-an-Ingest/5.%20Test/#test-against-an-alternate-row","text":"For any branching within the transform code, it's a good idea to test against all of the paths through the code. It's possible to set conditional breakpoints to find real examples in the code that will hit each code path, but it may be more practical to modify the basic row as a new fixture The example below creates a row with additional columns filled in. @pytest . fixture def postcomposed ( mock_koza , source_name , basic_row , script , map_cache , tt ): basic_row [ \"Affected Structure or Process 1 subterm ID\" ] = \"BSPO:0000112\" basic_row [ \"Post-composed Relationship ID\" ] = \"BFO:0000050\" basic_row [ \"Affected Structure or Process 1 superterm ID\" ] = \"ZFA:0000042\" return mock_koza ( source_name , iter ([ basic_row ]), script , map_cache = map_cache , translation_table = tt , )","title":"Test against an alternate row"},{"location":"Create-an-Ingest/5.%20Test/#parameterized-tests","text":"Mixing parameterization and fixtures changes the approach a little. In this case it makes more sense to alter the row using a parameter and then create the entities within the same method. The test below is intended to confirm that when the tag column has any of the specified values, the row will be ignored (confirmed because no entities are created). @pytest . mark . 
parametrize ( \"tag\" , [ \"normal\" , \"exacerbated\" , \"ameliorated\" ]) def test_excluded_tags ( mock_koza , source_name , basic_row , script , map_cache , tt , tag ): basic_row [ \"Phenotype Tag\" ] = tag entities = mock_koza ( source_name , iter ([ basic_row ]), script , map_cache = map_cache , translation_table = tt , ) assert len ( entities ) == 0","title":"Parameterized tests"},{"location":"KG-Build-Process/kg-build-process/","text":"Monarch KG Build Process Download A weekly job independent from the KG build process runs to download data sources and store them on a cloud bucket. This replaces DipperCache from the old pipeline. KGHub Downloader reads from downloads.yaml to download each file. Some post-processing is done in a shell script before the files are uploaded to the cloud bucket. At the start of the main ingest build, data files are copied from the cloud bucket. Transform A call to the ingest command line tool runs each source ingest defined in ingest.yaml , producing both KGX tsv and RDF nt output. Source Ingests Ingests are documented individually in the Sources section of this documentation. Ingests are either node or edge specific, and use IDs as defined in the source data files without additional re-mapping of identifiers. The primary role they have is to represent sources in biolink model and KGX format, and secondarily they may also subset from the source files. The output of individual ingests can be found in the transform_output directory in each release. Phenio-KG Ontologies in Monarch are built first as Phenio , then converted into the biolink model and represented as KGX in kg-phenio . The ingest CLI has a transform_phenio method that performs some further filtering on the kg-phenio node and edge files. Limiting to nodes and edges that match a subset of curie namespaces, and limiting node property columns to a relevant subset. 
Merge With all transforms complete, the individual kgx node and edge files in output/transform_output can be combined into a merged graph. This is done by the merge command in the ingest CLI. At this point, the individual node and edge KGX files from the transforms may not have matching IDs, and in fact, we may have edges that point to nodes that are not present in our canonical node sources (e.g. a STRING edge that points to an ENSEMBL gene that can't be mapped to HGNC). The merge process is broken down into concatenation, mapping, and finally a QC filter step. We developed a tool called cat merge Concatenate The first step just loads all node kgx files into one dataframe, and all edge kgx files into another. Map The mapping step replaces subject and object IDs in edge files using SSSOM mapping files, with the IDs from the initial ingests stored in original_subject and original_object fields. Mappings for genes are generated in our monarch-gene-mapping process, and are available at data.monarchinitiative.org . Diseases are mapped using the MONDO SSSOM. This step requires that the subject of the SSSOM file be our canonical ID, and the object be the non-canonical ID. There is room for improvement here. QC Filter After edges have been mapped, it's important to cull edges that point to nodes that don't exist in the graph. The QC filtering step performs joins against the node table/dataframe to split out these edges into their own kgx file ( monarch-kg-dangling-edges.tsv ) that can be used for QC purposes. A group of edges that wind up in this file could be due to a number of reasons: * We're missing an ontology or other node source that is required for an ingest/source: this is something we want to fix \ud83d\udc4e * We're missing mappings necessary to translate between an edge ingest and our canonical node sources: this is something we want to fix \ud83d\udc4e * The edge ingest includes edges which can't be mapped to our canonical node sources: this is a feature! 
\ud83d\udc4d We have a visualization of this split between connected and dangling edges for each ingest on our QC Dashboard that we can use to problem-solve our mappings and node sources. Neo4j A neo4j dump is created using the merged tar.gz file using KGX's neo4j loader and a docker container. This process is defined directly in the Jenkinsfile . Denormalize For Solr (and secondarily SQLite) we produce a denormalized edge file, which includes additional details for the subjects and objects of each edge, including the category, namespace/prefix, and ontology ancestor closures following the GOLR pattern (ID and label closure lists). The closure file is generated by relation-graph and is included in the kg-phenio download. The after_download script makes a filtered version that only includes rdfs:subClassOf , BFO:0000050 , and UPHENO:0000001 . SQLite A SQLite database file is produced by loading node and edge files into a SQLite database using a simple shell script , along with the primary node and edge tables, edge tables for dangling and denormalized edges are included as well. Solr Our solr index is loaded directly from the node kgx tsv file and the denormalized edge tsv file using LinkML-Solr . The LinkML schema for the Solr index lives in the monarch-py data access library (see documentation for Entity and Association classes). LinkML-Solr starts Solr in docker via the lsolr command, defines the Solr schema based on the LinkML Schema and then bulk loads the data. Currently, a small amount of additional Solr configuration ( defining new field types , and copy-fields declarations to fill them) is done via curl commands in shell scripts. 
Our solr load process is defined in scripts/load_solr.sh","title":"KG Build Process"},{"location":"KG-Build-Process/kg-build-process/#monarch-kg-build-process","text":"","title":"Monarch KG Build Process"},{"location":"KG-Build-Process/kg-build-process/#download","text":"A weekly job independent from the KG build process runs to download data sources and store them on a cloud bucket. This replaces DipperCache from the old pipeline. KGHub Downloader reads from downloads.yaml to download each file. Some post-processing is done in a shell script before the files are uploaded to the cloud bucket. At the start of the main ingest build, data files are copied from the cloud bucket.","title":"Download"},{"location":"KG-Build-Process/kg-build-process/#transform","text":"A call to the ingest command line tool runs each source ingest defined in ingest.yaml , producing both KGX tsv and RDF nt output.","title":"Transform"},{"location":"KG-Build-Process/kg-build-process/#source-ingests","text":"Ingests are documented individually in the Sources section of this documentation. Ingests are either node or edge specific, and use IDs as defined in the source data files without additional re-mapping of identifiers. The primary role they have is to represent sources in biolink model and KGX format, and secondarily they may also subset from the source files. The output of individual ingests can be found in the transform_output directory in each release.","title":"Source Ingests"},{"location":"KG-Build-Process/kg-build-process/#phenio-kg","text":"Ontologies in Monarch are built first as Phenio , then converted into the biolink model and represented as KGX in kg-phenio . The ingest CLI has a transform_phenio method that performs some further filtering on the kg-phenio node and edge files. 
Limiting to nodes and edges that match a subset of curie namespaces, and limiting node property columns to a relevant subset.","title":"Phenio-KG"},{"location":"KG-Build-Process/kg-build-process/#merge","text":"With all transforms complete, the individual kgx node and edge files in output/transform_output can be combined into a merged graph. This is done by the merge command in the ingest CLI. At this point, the individual node and edge KGX files from the transforms may not have matching IDs, and in fact, we may have edges that point to nodes that are not present in our canonical node sources (e.g. a STRING edge that points to an ENSEMBL gene that can't be mapped to HGNC). The merge process is broken down into concatenation, mapping, and finally a QC filter step. We developed a tool called cat merge","title":"Merge"},{"location":"KG-Build-Process/kg-build-process/#concatenate","text":"The first step just loads all node kgx files into one dataframe, and all edge kgx files into another.","title":"Concatenate"},{"location":"KG-Build-Process/kg-build-process/#map","text":"The mapping step replaces subject and object IDs in edge files using SSSOM mapping files, with the IDs from the initial ingests stored in original_subject and original_object fields. Mappings for genes are generated in our monarch-gene-mapping process, and are available at data.monarchinitiative.org . Diseases are mapped using the MONDO SSSOM. This step requires that the subject of the SSSOM file be our canonical ID, and the object be the non-canonical ID. There is room for improvement here.","title":"Map"},{"location":"KG-Build-Process/kg-build-process/#qc-filter","text":"After edges have been mapped, it's important to cull edges that point to nodes that don't exist in the graph. The QC filtering step performs joins against the node table/dataframe to split out these edges into their own kgx file ( monarch-kg-dangling-edges.tsv ) that can be used for QC purposes. 
A group of edges that wind up in this file could be due to a number of reasons: * We're missing an ontology or other node source that is required for an ingest/source: this is something we want to fix \ud83d\udc4e * We're missing mapping necessary to translate between an edge ingest and our canonical node sources: this is something we want to fix \ud83d\udc4e * The edge ingest includes edges which can't be mapped to our canonical node sources: this is a feature! \ud83d\udc4d We have a visualization of this split between connected and dangling edges for each ingest on our QC Dashboard that we can use to problem-solve our mappings and node sources.","title":"QC Filter"},{"location":"KG-Build-Process/kg-build-process/#neo4j","text":"A neo4j dump is created using the merged tar.gz file using KGX's neo4j loader and a docker container. This process is defined directly in the Jenkinsfile .","title":"Neo4j"},{"location":"KG-Build-Process/kg-build-process/#denormalize","text":"For Solr (and secondarily SQLite) we produce a denormalized edge file, which includes additional details for the subjects and objects of each edge, including the category, namespace/prefix, and ontology ancestor closures following the GOLR pattern (ID and label closure lists). The closure file is generated by relation-graph and are included in the kg-phenio download. The after_download script makes a filtered version that only includes rdfs:subClassOf , BFO:0000050 , and UPHENO:0000001 .","title":"Denormalize"},{"location":"KG-Build-Process/kg-build-process/#sqlite","text":"A SQLite database file is produced by loading node and edge files into a SQLite database using a simple shell script , along with the primary node and edge tables, edge tables for danging and denormalized edges are included as well.","title":"SQLite"},{"location":"KG-Build-Process/kg-build-process/#solr","text":"Our solr index is loaded directly from the node kgx tsv file and the denormalized edge tsv file using LinkML-Solr . 
The LinkML schema for the Solr index lives in the monarch-py data access library (see documentation for Entity and Association classes). LinkML-Solr starts Solr in docker via the lsolr command, defines the Solr schema based on the LinkML Schema and then bulk loads the data. Currently, a small amount of additional Solr configuration ( defining new field types , and copy-fields declarations to fill them) is done via curl commands in shell scripts. Our solr load process is defined in scripts/load_solr.sh","title":"Solr"},{"location":"Principles/modeling-principles/","text":"Modeling Principles Conforms to Schema The Monarch Biolink Specification is an implementation of the Biolink Model. The KG must be conformant with The Monarch Biolink Specification. Node Normalization The final KG must have Nodes normalized to the canonical prefix for any given node type. The canonical prefix should be determined by The Monarch Biolink Model Specification. Authoratative Source Providers of Associations are not the authoratative sources for the Nodes in general. Nodes should be ingested from their own authoratative source, separate from edge ingests. Genes and Proteins Genes and reference Proteins shall be treated as equivalent. When collapsing nodes give the Gene Id the priority, original_subject = UniProt Id. If in future there is a need to represent Isoforms, then UniProt Isoform Ids should be used. Variants Variant to Disease/Phenotype Associations may be rolled up to the Gene level. If they are rolled up, then a subject_modifier = Variant Id. Gene to Disease Associations Gene to Disease Associations should come from high quality sources that have been vetted by domain experts within Monarch. Gene to Disease Associations must not confuse single Gene causal Mendelian Associations with otherwise associated Genes. (e.g. 
contributing or associated Genes)","title":"Principles"},{"location":"Principles/modeling-principles/#modeling-principles","text":"","title":"Modeling Principles"},{"location":"Principles/modeling-principles/#conforms-to-schema","text":"The Monarch Biolink Specification is an implementation of the Biolink Model. The KG must be conformant with The Monarch Biolink Specification.","title":"Conforms to Schema"},{"location":"Principles/modeling-principles/#node-normalization","text":"The final KG must have Nodes normalized to the canonical prefix for any given node type. The canonical prefix should be determined by The Monarch Biolink Model Specification.","title":"Node Normalization"},{"location":"Principles/modeling-principles/#authoratative-source","text":"Providers of Associations are not the authoratative sources for the Nodes in general. Nodes should be ingested from their own authoratative source, seperate from edge ingests.","title":"Authoratative Source"},{"location":"Principles/modeling-principles/#genes-and-proteins","text":"Genes and reference Proteins shall be treated as equivalent. When collapsing nodes give the Gene Id the priority, original_subject = UniProt Id. If in future there is a need to represent Isoforms, then UniProt Isoform Ids should be used.","title":"Genes and Proteins"},{"location":"Principles/modeling-principles/#variants","text":"Variant to Disease/Phenotype Associations may be rolled up to the Gene level. If they are rolled up, then a subject_modifier = Variant Id.","title":"Variants"},{"location":"Principles/modeling-principles/#gene-to-disease-associations","text":"Gene to Disease Associations should come from high quality sources that have been vetted by domain experts within Monarch. Gene to Disease Associations must not confuse single Gene causal Mendelian Associations with otherwise associated Genes. (e.g. 
contributing or associated Genes)","title":"Gene to Disease Associations"},{"location":"Sources/","text":"Data Sources This section contains detailed information on all datasets and ontologies ingested to create the Monarch knowledge graph. To learn more about a specific dataset/ontology, click on the source name in the list to the left.","title":"Overview"},{"location":"Sources/#data-sources","text":"This section contains detailed information on all datasets and ontologies ingested to create the Monarch knowledge graph. To learn more about a specific dataset/ontology, click on the source name in the list to the left.","title":"Data Sources"},{"location":"Sources/alliance/","text":"Alliance The Alliance of Genome Resources contains a subset of model organism data from member databases that is harmonized to the same model. Over time, as the alliance adds additional data types, individual MOD ingests can be replaced by collective Alliance ingest. The Alliance has bulk data downloads, ingest data formats, and an API. The preference should be bulk downloads first, followed by ingest formats, finally by API calls. In some cases it may continue to be more practical to load from individual MODs when data is not yet fully harmonized in the Alliance. Alliance Bulk Downloads Alliance schemas Gene Information Genes for all Alliance species (Human, Rat, Mouse, Fish, Fly, Worm, Yeast, Frog) are loaded using the BGI formatted ingest files, as there are no Gene export files. Biolink captured biolink:Gene id symbol name in_taxon source synonyms xref Gene to Phenotype Phenotype for the subset of Alliance species which use phenotype ontologies (Human, Rat, Mouse, Worm) are loaded using the phenotype ingest format , since there is not yet a phenotype export file from the Alliance. This file contains both Gene and Allele phenotypes, so a single column TSV is produced from BGI files listing Gene IDs to check the category and only genes are included. 
Environmental conditions are present for some species and are captured using the qualifier. Biolink captured biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) publications qualifiers (condition terms) aggregating_knowledge_source ([\"infores:monarchinitiative\", \"infores:alliancegenome\"]) primary_knowledge_source ( infores mapped from row['Source']) Gene Expression This is the full data model of the Alliance file ingested; however, not all fields are currently used in the current ingest (in most cases, these fields are not yet set in the input data sets; see the gene_to_expression.yaml file) Species SpeciesID GeneID GeneSymbol Location StageTerm AssayID AssayTermName CellularComponentID CellularComponentTerm CellularComponentQualifierIDs CellularComponentQualifierTermNames SubStructureID SubStructureName SubStructureQualifierIDs SubStructureQualifierTermNames AnatomyTermID AnatomyTermName AnatomyTermQualifierIDs AnatomyTermQualifierTermNames SourceURL Source Reference Discussion Group : https://www.alliancegenome.org/working-groups#expression Download : https://www.alliancegenome.org/downloads#expression Biolink captured biolink:Gene id (row['GeneID']) name (row['GeneSymbol']) in taxon (row['SpeciesID']) source ( infores mapped from row['Source']) biolink:AnatomicalEntity id (row['AnatomyTermID']) name (row['AnatomyTermName']) source ( infores mapped from row['Source']) biolink:CellularComponent # is_a: anatomical entity... 
id (row['CellularComponentID']) name (row['CellularComponentTerm']) source ( infores mapped from row['Source']) biolink:LifeStage id (CURIE heuristically inferred from row['SpeciesID'] and row['StageTerm']) name (row['StageTerm']) in taxon (row['SpeciesID']) source ( infores mapped from row['Source']) biolink:GeneToExpressionSiteAssociation id (random uuid) subject (Gene.id) predicates (biolink:expressed_in) object (AnatomicalEntity.id or CellularComponent.id) stage qualifier (LifeStage.id) # if specified; None otherwise has evidence (row['AssayID']) # e.g. taken from MMO - \"measurement method ontology\" publications (row['Reference']) aggregating_knowledge_source ([\"infores:monarchinitiative\", \"infores:alliancegenome\"]) primary_knowledge_source ( infores mapped from row['Source']) Literature The Alliance has a well defined literature ingest format that aligns publications from MOD members. Mapping of Alliance publication category to biolink category Alliance category Biolink publication type Research Article IAO:0000013 Review Article IAO:0000013 Thesis IAO:0000311 Book IAO:0000311 Other IAO:0000311 Preprint IAO:0000013 Conference Publication IAO:0000311 Personal Communication IAO:0000311 Direct Data Submission IAO:0000311 Internal Process Reference IAO:0000311 Unknown IAO:0000311 Retraction IAO:0000311 This ingest doesn't make an effort to sort these publication categories into more specific classes than biolink:Publication, but does set the type. Biolink captured biolink:Publication id (primaryId) name (title) summary (abstract) authors (authors.name flattened as a comma separated string) xref (crossReferences.id) mesh terms (meshTerms.meshHeadingTerm , meshTerms.meshQualifierTerm) type (IAO:0000311 for publication, IAO:0000013 for article) creation date (datePublished) keywords (keywords) Citation Harmonizing model organism data in the Alliance of Genome Resources. 2022. Alliance of Genome Resources Consortium. Genetics, Volume 220, Issue 4, April 2022. 
Published Online: 25 February 2022. doi: doi.org/10.1093/genetics/iyac022. PMID: 35380658; PMCID: PMC8982023.","title":"Alliance"},{"location":"Sources/alliance/#alliance","text":"The Alliance of Genome Resources contains a subset of model organism data from member databases that is harmonized to the same model. Over time, as the alliance adds additional data types, individual MOD ingests can be replaced by collective Alliance ingest. The Alliance has bulk data downloads, ingest data formats, and an API. The preference should be bulk downloads first, followed by ingest formats, finally by API calls. In some cases it may continue to be more practical to load from individual MODs when data is not yet fully harmonized in the Alliance. Alliance Bulk Downloads Alliance schemas","title":"Alliance"},{"location":"Sources/alliance/#gene-information","text":"Genes for all Alliance species (Human, Rat, Mouse, Fish, Fly, Worm, Yeast, Frog) are loaded using the BGI formatted ingest files, as there are no Gene export files. Biolink captured biolink:Gene id symbol name in_taxon source synonyms xref","title":"Gene Information"},{"location":"Sources/alliance/#gene-to-phenotype","text":"Phenotype for the subset of Alliance species which use phenotype ontologies (Human, Rat, Mouse, Worm) are loaded using the phenotype ingest format , since there is not yet a phenotype export file from the Alliance. This file contains both Gene and Allele phenotypes, so a single column TSV is produced from BGI files listing Gene IDs to check the category and only genes are included. Environmental conditions are present for some species and are captured using the qualifier. 
Biolink captured biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) publications qualifiers (condition terms) aggregating_knowledge_source ([\"infores:monarchinitiative\", \"infores:alliancegenome\"]) primary_knowledge_source ( infores mapped from row['Source'])","title":"Gene to Phenotype"},{"location":"Sources/alliance/#gene-expression","text":"This is the full data model of the Alliance file ingested; however, not all fields are currently used in the current ingest (in most cases, these fields are not yet set in the input data sets; see the gene_to_expression.yaml file) Species SpeciesID GeneID GeneSymbol Location StageTerm AssayID AssayTermName CellularComponentID CellularComponentTerm CellularComponentQualifierIDs CellularComponentQualifierTermNames SubStructureID SubStructureName SubStructureQualifierIDs SubStructureQualifierTermNames AnatomyTermID AnatomyTermName AnatomyTermQualifierIDs AnatomyTermQualifierTermNames SourceURL Source Reference Discussion Group : https://www.alliancegenome.org/working-groups#expression Download : https://www.alliancegenome.org/downloads#expression Biolink captured biolink:Gene id (row['GeneID']) name (row['GeneSymbol']) in taxon (row['SpeciesID']) source ( infores mapped from row['Source']) biolink:AnatomicalEntity id (row['AnatomyTermID']) name (row['AnatomyTermName']) source ( infores mapped from row['Source']) biolink:CellularComponent # is_a: anatomical entity... 
id (row['CellularComponentID']) name (row['CellularComponentTerm']) source ( infores mapped from row['Source']) biolink:LifeStage id (CURIE heuristically inferred from row['SpeciesID'] and row['StageTerm']) name (row['StageTerm']) in taxon (row['SpeciesID']) source ( infores mapped from row['Source']) biolink:GeneToExpressionSiteAssociation id (random uuid) subject (Gene.id) predicates (biolink:expressed_in) object (AnatomicalEntity.id or CellularComponent.id) stage qualifier (LifeStage.id) # if specified; None otherwise has evidence (row['AssayID']) # e.g. taken from MMO - \"measurement method ontology\" publications (row['Reference']) aggregating_knowledge_source ([\"infores:monarchinitiative\", \"infores:alliancegenome\"]) primary_knowledge_source ( infores mapped from row['Source'])","title":"Gene Expression"},{"location":"Sources/alliance/#literature","text":"The Alliance has a well defined literature ingest format that aligns publications from MOD members. Mapping of Alliance publication category to biolink category Alliance category Biolink publication type Research Article IAO:0000013 Review Article IAO:0000013 Thesis IAO:0000311 Book IAO:0000311 Other IAO:0000311 Preprint IAO:0000013 Conference Publication IAO:0000311 Personal Communication IAO:0000311 Direct Data Submission IAO:0000311 Internal Process Reference IAO:0000311 Unknown IAO:0000311 Retraction IAO:0000311 This ingest doesn't make an effort to sort these publication categories into more specific classes than biolink:Publication, but does set the type. 
Biolink captured biolink:Publication id (primaryId) name (title) summary (abstract) authors (authors.name flattened as a comma separated string) xref (crossReferences.id) mesh terms (meshTerms.meshHeadingTerm , meshTerms.meshQualifierTerm) type (IAO:0000311 for publication, IAO:0000013 for article) creation date (datePublished) keywords (keywords)","title":"Literature"},{"location":"Sources/alliance/#citation","text":"Harmonizing model organism data in the Alliance of Genome Resources. 2022. Alliance of Genome Resources Consortium. Genetics, Volume 220, Issue 4, April 2022. Published Online: 25 February 2022. doi: doi.org/10.1093/genetics/iyac022. PMID: 35380658; PMCID: PMC8982023.","title":"Citation"},{"location":"Sources/bgee/","text":"BGee Bgee is a database for retrieval and comparison of gene expression patterns across multiple animal species, produced from multiple data types (bulk RNA-Seq, single-cell RNA-Seq, Affymetrix, in situ hybridization, and EST data) and from multiple data sets (including GTEx data). Gene Expression This is the full data model of the Bgee simple gene expression file; however, not all fields are currently used in the current ingest. Files are named by Species ID. \"Gene name\" Anatomical entity ID \"Anatomical entity name\" Expression Call quality FDR Expression score Expression rank Biolink Captured biolink:GeneToExpressionSiteAssociation id (random uuid, generated) subject ( Gene ID ) predicates (biolink:expressed_in, constant) object ( Anatomical entity ID ) aggregating_knowledge_source ([\"infores:monarchinitiative\", \"infores:bgee\"]) Decisions and Discussion We elected to use the simple gene expression file for ease of use and because the advanced doesn't contain much more data we are likely to use. We could potentially import has evidence from the advanced file comparing Affimetrix expression and RNA-Seq expression but this doesn't seem valuable at this time. 
Stage and Strain information is also available in all_conditions file. We have elected to not import the stage information due to multiple duplicate edges based on strain. Citation \"Bastian FB, Roux J, Niknejad A, Comte A, Fonseca Costa SS, Mendes de Farias T, Moretti S, Parmentier G, Rech de Laval V, Rosikiewicz M, Wollbrett J, Echchiki A, Escoriza A, Gharib W, Gonzales-Porta M, Jarosz Y, Laurenczy B, Moret P, Person E, Roelli P, Sanjeev K, Seppey M, Robinson-Rechavi M. The Bgee suite: integrated curated expression atlas and comparative transcriptomics in animals in Nucleic Acids Research, Volume 49, Issue D1, 8 January 2021, Pages D831-D847\"","title":"BGee"},{"location":"Sources/bgee/#bgee","text":"Bgee is a database for retrieval and comparison of gene expression patterns across multiple animal species, produced from multiple data types (bulk RNA-Seq, single-cell RNA-Seq, Affymetrix, in situ hybridization, and EST data) and from multiple data sets (including GTEx data).","title":"BGee"},{"location":"Sources/bgee/#gene-expression","text":"This is the full data model of the Bgee simple gene expression file; however, not all fields are currently used in the current ingest. Files are named by Species ID. \"Gene name\" Anatomical entity ID \"Anatomical entity name\" Expression Call quality FDR Expression score Expression rank Biolink Captured biolink:GeneToExpressionSiteAssociation id (random uuid, generated) subject ( Gene ID ) predicates (biolink:expressed_in, constant) object ( Anatomical entity ID ) aggregating_knowledge_source ([\"infores:monarchinitiative\", \"infores:bgee\"]) Decisions and Discussion We elected to use the simple gene expression file for ease of use and because the advanced doesn't contain much more data we are likely to use. We could potentially import has evidence from the advanced file comparing Affimetrix expression and RNA-Seq expression but this doesn't seem valuable at this time. 
Stage and Strain information is also available in all_conditions file. We have elected to not import the stage information due to multiple duplicate edges based on strain.","title":"Gene Expression"},{"location":"Sources/bgee/#citation","text":"\"Bastian FB, Roux J, Niknejad A, Comte A, Fonseca Costa SS, Mendes de Farias T, Moretti S, Parmentier G, Rech de Laval V, Rosikiewicz M, Wollbrett J, Echchiki A, Escoriza A, Gharib W, Gonzales-Porta M, Jarosz Y, Laurenczy B, Moret P, Person E, Roelli P, Sanjeev K, Seppey M, Robinson-Rechavi M. The Bgee suite: integrated curated expression atlas and comparative transcriptomics in animals in Nucleic Acids Research, Volume 49, Issue D1, 8 January 2021, Pages D831-D847\"","title":"Citation"},{"location":"Sources/ctd/","text":"Comparative Toxicogenomics Database (CTD) CTD is a robust, publicly available database that aims to advance understanding about how environmental exposures affect human health. It provides manually curated information about chemical\u2013gene/protein interactions, chemical\u2013disease and gene\u2013disease relationships. These data are integrated with functional and pathway data to aid in development of hypotheses about the mechanisms underlying environmentally influenced diseases. CTD Bulk Downloads Chemical to Disease This ingest takes only the chemical to disease rows where a direct evidence label is applied, and creates ChemicalEntity and Disease nodes connected by a ChemicalToDiseaseOrPhenotypicFeatureAssociation. The the chemical ID row is expected to need a 'MESH:' prefix added, the disease id is used as-is. Rows are included only if the direct evidence field is 'therapeutic' and the biolink:affects predicate is used to avoid making too strong a claim. 
Biolink Captured ChemicalToDiseaseOrPhenotypicFeatureAssociation id (random uuid) subject (chemical id) predicate ( biolink:affects ) object (disease id) publication (pubmed ids provided by file) aggregating_knowledge_source ( [\"infores:monarchinitiative\"] ) primary_knowledge_source ( infores:ctd ) Citation Davis AP, Wiegers TC, Johnson RJ, Sciaky D, Wiegers J, Mattingly CJ Comparative Toxicogenomics Database (CTD): update 2023. Nucleic Acids Res. 2022 Sep 28.","title":"CTD"},{"location":"Sources/ctd/#comparative-toxicogenomics-database-ctd","text":"CTD is a robust, publicly available database that aims to advance understanding about how environmental exposures affect human health. It provides manually curated information about chemical\u2013gene/protein interactions, chemical\u2013disease and gene\u2013disease relationships. These data are integrated with functional and pathway data to aid in development of hypotheses about the mechanisms underlying environmentally influenced diseases. CTD Bulk Downloads Chemical to Disease This ingest takes only the chemical to disease rows where a direct evidence label is applied, and creates ChemicalEntity and Disease nodes connected by a ChemicalToDiseaseOrPhenotypicFeatureAssociation. The the chemical ID row is expected to need a 'MESH:' prefix added, the disease id is used as-is. Rows are included only if the direct evidence field is 'therapeutic' and the biolink:affects predicate is used to avoid making too strong a claim. 
Biolink Captured ChemicalToDiseaseOrPhenotypicFeatureAssociation id (random uuid) subject (chemical id) predicate ( biolink:affects ) object (disease id) publication (pubmed ids provided by file) aggregating_knowledge_source ( [\"infores:monarchinitiative\"] ) primary_knowledge_source ( infores:ctd )","title":"Comparative Toxicogenomics Database (CTD)"},{"location":"Sources/ctd/#citation","text":"Davis AP, Wiegers TC, Johnson RJ, Sciaky D, Wiegers J, Mattingly CJ Comparative Toxicogenomics Database (CTD): update 2023. Nucleic Acids Res. 2022 Sep 28.","title":"Citation"},{"location":"Sources/dictybase/","text":"Dictybase Dictybase is a comprehensive database for the ameboid protozoan Dictyostelium discoideum , which is a powerful model system for genetic and functional analysis of gene function. Dictybase Bulk Downloads Gene Information Dictybase genes in the Gene to Phenotype ingest (below) are either directly identified from their gene identifier, mapped directly to NCBI Dictyostelium discoideum gene identifier mappings or mapped indirectly from the Dictybase identifier, names and synonyms mappings , with synonyms being populated as available (Note: full gene product information is not captured at this time). Gene to Phenotype Data is available in a well-documented easy-to-parse GAF-like format with associations to an UPHENO-compliant ontology. Phenotypes are linked to Strains, and the Strains are linked to Genes. Biolink Captured biolink:Gene 'id' (NCBI or Dictybase) 'category' 'name' 'symbol' 'in_taxon' 'source' biolink:PhenotypicFeature id biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) category (GeneToPhenotypicFeatureAssociation) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:dictybase) Citation Fey, P., Dodson, R., Basu, S., Chisholm, R. L. (2013). 
'One Stop Shop for Everything Dictyostelium: dictyBase and the Dicty Stock Center'. Dictyostelium discoideum Protocols. Methods Mol. Biol. 983:59-92, edited by Ludwig Eichinger and Francisco Rivero.","title":"Dictybase"},{"location":"Sources/dictybase/#dictybase","text":"Dictybase is a comprehensive database for the ameboid protozoan Dictyostelium discoideum , which is a powerful model system for genetic and functional analysis of gene function. Dictybase Bulk Downloads","title":"Dictybase"},{"location":"Sources/dictybase/#gene-information","text":"Dictybase genes in the Gene to Phenotype ingest (below) are either directly identified from their gene identifier, mapped directly to NCBI Dictyostelium discoideum gene identifier mappings or mapped indirectly from the Dictybase identifier, names and synonyms mappings , with synonyms being populated as available (Note: full gene product information is not captured at this time).","title":"Gene Information"},{"location":"Sources/dictybase/#gene-to-phenotype","text":"Data is available in a well-documented easy-to-parse GAF-like format with associations to an UPHENO-compliant ontology. Phenotypes are linked to Strains, and the Strains are linked to Genes. Biolink Captured biolink:Gene 'id' (NCBI or Dictybase) 'category' 'name' 'symbol' 'in_taxon' 'source' biolink:PhenotypicFeature id biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) category (GeneToPhenotypicFeatureAssociation) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:dictybase)","title":"Gene to Phenotype"},{"location":"Sources/dictybase/#citation","text":"Fey, P., Dodson, R., Basu, S., Chisholm, R. L. (2013). 'One Stop Shop for Everything Dictyostelium: dictyBase and the Dicty Stock Center'. Dictyostelium discoideum Protocols. Methods Mol. Biol. 
983:59-92, edited by Ludwig Eichinger and Francisco Rivero.","title":"Citation"},{"location":"Sources/flybase/","text":"FlyBase is the model organism database providing integrated genetic, genomic, phenomic, and biological data for Drosophila melanogaster. FlyBase bulk downloads Gene Literature This ingest uses FlyBase's publication-to-gene download file, which contains all entities and only assocations between publications and genes that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. We have also opted to use the FlyBase_publication_id for the publication node if PubMed_id is not available, on the assumption that kgx will clique merge them later. Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:flybase) Citation Gramates LS, Agapite J, Attrill H, Calvi BR, Crosby M, dos Santos G Goodman JL, Goutte-Gattat D, Jenkins V, Kaufman T, Larkin A, Matthews B, Millburn G, Strelets VB, and the FlyBase Consortium (2022) FlyBase: a guided tour of highlighted features. Genetics, Volume 220, Issue 4, April 2022, iyac035","title":"Flybase"},{"location":"Sources/flybase/#gene-literature","text":"This ingest uses FlyBase's publication-to-gene download file, which contains all entities and only assocations between publications and genes that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. We have also opted to use the FlyBase_publication_id for the publication node if PubMed_id is not available, on the assumption that kgx will clique merge them later. 
Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:flybase)","title":"Gene Literature"},{"location":"Sources/flybase/#citation","text":"Gramates LS, Agapite J, Attrill H, Calvi BR, Crosby M, dos Santos G Goodman JL, Goutte-Gattat D, Jenkins V, Kaufman T, Larkin A, Matthews B, Millburn G, Strelets VB, and the FlyBase Consortium (2022) FlyBase: a guided tour of highlighted features. Genetics, Volume 220, Issue 4, April 2022, iyac035","title":"Citation"},{"location":"Sources/goa/","text":"Gene Ontology Annotation (GOA) Database The Gene Ontology Annotation Database compiles high-quality Gene Ontology (GO) annotations to proteins in the UniProt Knowledgebase (UniProtKB) , RNA molecules from RNACentral and protein complexes from the Complex Portal . Manual annotation is the direct assignment of GO terms to proteins, ncRNA and protein complexes by curators from evidence extracted during the review of published scientific literature, with an appropriate evidence code assigned to give an assessment of the strength of the evidence. GOA files contain a mixture of manual annotation supplied by members of the Gene Ontology Consortium and computationally assigned GO terms describing gene products. Annotation type is clearly indicated by associated evidence codes and there are links to the source data. GO Annotations There is a ReadMe.txt file that explains the different annotation files available. The ingested Gene Annotation File (GAF) is a 17 column tab-delimited file. The file format conforms to the specifications demanded by the GO Consortium and therefore GO IDs and not GO term names are shown. 
Biolink captured Subject Concept Node (Gene) biolink:Gene id (NCBIGene Entrez ID) Object Concept Node (Gene Ontology Terms) biolink:MolecularActivity id (GO ID) biolink:BiologicalProcess id (GO ID) biolink:CellularComponent id (GO ID) Additional Gene Ontology Term Concept Nodes for possible use? biolink:Pathway id (GO ID) biolink:PhysiologicalProcess id (GO ID) Associations biolink:FunctionalAssociation id (random uuid) subject (gene.id) predicate (related_to) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa) OR biolink:MacromolecularMachineToMolecularActivityAssociation : id (random uuid) subject (gene.id) predicate (related_to) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa) biolink:MacromolecularMachineToBiologicalProcessAssociation : id (random uuid) subject (gene.id) predicate (participates_in) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa) biolink:MacromolecularMachineToCellularComponentAssociation : id (random uuid) subject (gene.id) predicate (located_in) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa) Possible Additional Gene to Gene Ontology Term Association? biolink:GeneToGoTermAssociation : id (random uuid) subject (gene.id) predicate (related_to) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa) Citation Ashburner et al. Gene ontology: tool for the unification of biology. Nat Genet. 2000 May;25(1):25-9. The Gene Ontology Consortium. The Gene Ontology knowledgebase in 2023. Genetics. 
2023 May 4;224(1):iyad031","title":"GOA"},{"location":"Sources/goa/#gene-ontology-annotation-goa-database","text":"The Gene Ontology Annotation Database compiles high-quality Gene Ontology (GO) annotations to proteins in the UniProt Knowledgebase (UniProtKB) , RNA molecules from RNACentral and protein complexes from the Complex Portal . Manual annotation is the direct assignment of GO terms to proteins, ncRNA and protein complexes by curators from evidence extracted during the review of published scientific literature, with an appropriate evidence code assigned to give an assessment of the strength of the evidence. GOA files contain a mixture of manual annotation supplied by members of the Gene Ontology Consortium and computationally assigned GO terms describing gene products. Annotation type is clearly indicated by associated evidence codes and there are links to the source data.","title":"Gene Ontology Annotation (GOA) Database"},{"location":"Sources/goa/#go-annotations","text":"There is a ReadMe.txt file that explains the different annotation files available. The ingested Gene Annotation File (GAF) is a 17 column tab-delimited file. The file format conforms to the specifications demanded by the GO Consortium and therefore GO IDs and not GO term names are shown. 
Biolink captured","title":"GO Annotations"},{"location":"Sources/goa/#subject-concept-node-gene","text":"biolink:Gene id (NCBIGene Entrez ID)","title":"Subject Concept Node (Gene)"},{"location":"Sources/goa/#object-concept-node-gene-ontology-terms","text":"biolink:MolecularActivity id (GO ID) biolink:BiologicalProcess id (GO ID) biolink:CellularComponent id (GO ID)","title":"Object Concept Node (Gene Ontology Terms)"},{"location":"Sources/goa/#additional-gene-ontology-term-concept-nodes-for-possible-use","text":"biolink:Pathway id (GO ID) biolink:PhysiologicalProcess id (GO ID) Associations biolink:FunctionalAssociation id (random uuid) subject (gene.id) predicate (related_to) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa) OR biolink:MacromolecularMachineToMolecularActivityAssociation : id (random uuid) subject (gene.id) predicate (related_to) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa) biolink:MacromolecularMachineToBiologicalProcessAssociation : id (random uuid) subject (gene.id) predicate (participates_in) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa) biolink:MacromolecularMachineToCellularComponentAssociation : id (random uuid) subject (gene.id) predicate (located_in) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa) Possible Additional Gene to Gene Ontology Term Association? 
biolink:GeneToGoTermAssociation : id (random uuid) subject (gene.id) predicate (related_to) object (go_term.id) negated has_evidence aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:goa)","title":"Additional Gene Ontology Term Concept Nodes for possible use?"},{"location":"Sources/goa/#citation","text":"Ashburner et al. Gene ontology: tool for the unification of biology. Nat Genet. 2000 May;25(1):25-9. The Gene Ontology Consortium. The Gene Ontology knowledgebase in 2023. Genetics. 2023 May 4;224(1):iyad031","title":"Citation"},{"location":"Sources/hgnc/","text":"HGNC (HUGO Gene Nomenclature Committee) The HGNC is responsible for approving unique symbols and names for human loci, including protein coding genes, ncRNA genes and pseudogenes, to allow unambiguous scientific communication. HGNC bulk downloads Gene Information This ingest uses HGNC's \"complete set\" download file, which only contains associations between publications and genes that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. Biolink Captured biolink:Gene id (HGNC identifier) symbol name synonym alias symbol alias name prev symbol prev name xref ensembl gene id omim id in_taxon ([\"NCBITaxon:9606\"]) provided_by ([\"infores:hgnc\"]) Citation HGNC Database, HUGO Gene Nomenclature Committee (HGNC), European Molecular Biology Laboratory, European Bioinformatics Institute (EMBL-EBI), Wellcome Genome Campus, Hinxton, Cambridge CB10 1SD, United Kingdom www.genenames.org .","title":"HGNC"},{"location":"Sources/hgnc/#hgnc-hugo-gene-nomenclature-committee","text":"The HGNC is responsible for approving unique symbols and names for human loci, including protein coding genes, ncRNA genes and pseudogenes, to allow unambiguous scientific communication. 
HGNC bulk downloads","title":"HGNC (HUGO Gene Nomenclature Committee)"},{"location":"Sources/hgnc/#gene-information","text":"This ingest uses HGNC's \"complete set\" download file, which only contains associations between publications and genes that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. Biolink Captured biolink:Gene id (HGNC identifier) symbol name synonym alias symbol alias name prev symbol prev name xref ensembl gene id omim id in_taxon ([\"NCBITaxon:9606\"]) provided_by ([\"infores:hgnc\"])","title":"Gene Information"},{"location":"Sources/hgnc/#citation","text":"HGNC Database, HUGO Gene Nomenclature Committee (HGNC), European Molecular Biology Laboratory, European Bioinformatics Institute (EMBL-EBI), Wellcome Genome Campus, Hinxton, Cambridge CB10 1SD, United Kingdom www.genenames.org .","title":"Citation"},{"location":"Sources/hpoa/","text":"Human Phenotype Ontology Annotations (HPOA) The Human Phenotype Ontology group curates and assembles over 115,000 annotations to hereditary diseases using the HPO ontology. Here we create Biolink associations between diseases and phenotypic features, together with their evidence, and age of onset and frequency (if known). There are four HPOA ingests - 'disease-to-phenotype', 'disease-to-mode-of-inheritance', 'gene-to-disease' and 'disease-to-mode-of-inheritance' - that parse out records from the HPO Annotation File . The 'disease-to-phenotype', 'disease-to-mode-of-inheritance' and 'gene-to-disease' parsers currently only process the \"abnormal\" annotations. Association to \"remarkable normality\" may be added in the near future. The 'disease-to-mode-of-inheritance' ingest script parses 'inheritance' record information out from the annotation file. Gene to Disease This ingest replaces the direct OMIM ingest so that we share g2d associations 1:1 with HPO. 
The mapping between association_type and biolink predicates shown below is the one way in which this ingest is opinionated, but attempts to be a direct translation into the biolink model. genes_to_disease.txt with the following fields: 'ncbi_gene_id' 'gene_symbol' 'association_type' 'disease_id' 'source' Biolink Captured biolink:CorrelatedGeneToDiseaseAssociation or biolink:CausalGeneToDiseaseAssociation (depending on predicate) id (random uuid) subject (ncbi_gene_id) predicate (association_type) MENDELIAN: biolink:causes POLYGENIC: biolink:contributes_to UNKNOWN: biolink:gene_associated_with_condition object (disease_id) primary_knowledge_source (source) medgen: infores:omim orphanet: infores:orphanet aggregator_knowledge_source ([\"infores:monarchinitiative\"]) also for medgen: infores:medgen Disease to Phenotype phenotype.hpoa: A description of this file is found here , has the following fields: 'database_id' 'disease_name' 'qualifier' 'hpo_id' 'reference' 'evidence' 'onset' 'frequency' 'sex' 'modifier' 'aspect' 'biocuration' Note that we're calling this the disease to phenotype file because - using the YAML file filter configuration for the ingest - we are only parsing rows with Aspect == 'P' (phenotypic anomalies) , but ignoring all other Aspects. Frequencies The 'Frequency' field of the aforementioned phenotypes.hpoa file has the following definition, excerpted from its Annotation Format page: 8. Frequency: There are three allowed options for this field. (A) A term-id from the HPO-sub-ontology below the term \u201cFrequency\u201d (HP:0040279). (since December 2016 ; before was a mixture of values). The terms for frequency are in alignment with Orphanet. * (B) A count of patients affected within a cohort. For instance, 7/13 would indicate that 7 of the 13 patients with the specified disease were found to have the phenotypic abnormality referred to by the HPO term in question in the study referred to by the DB_Reference; (C) A percentage value such as 17%. 
The Disease to Phenotype ingest attempts to remap these raw frequency values onto a suitable HPO term. A simplistic (perhaps erroneous?) assumption is that all such frequencies are conceptually comparable; however, researchers may wish to review the original publications to confirm fitness of purpose of the specific data points to their interpretation - specific values could designate phenotypic frequency at the population level; phenotypic frequency at the cohort level; or simply, be a measure of penetrance of a specific allele within carriers, etc.. Biolink captured biolink:DiseaseToPhenotypicFeatureAssociation id (random uuid) subject (disease.id) predicate (has_phenotype) negated (True if 'qualifier' == \"NOT\") object (phenotypicFeature.id) publications (List[publication.id]) has_evidence (List[Note [1]]), sex_qualifier (Note [2]) onset_qualifier (Onset.id) frequency_qualifier (Note [3]) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (\"infores:hpo-annotations\") Notes: 1. CURIE of [Evidence and Conclusion Ontology( https://bioportal.bioontology.org/ontologies/ECO )] term 2. female -> PATO:0000383, male -> PATO:0000384 or None 3. See the Frequencies section above. Disease to Modes of Inheritance Same as above, we again parse the phenotype.hpoa file . However, we're calling this the 'disease to modes of inheritance' file because - using the YAML file filter configuration for the ingest - we are only parsing rows with Aspect == 'I' (inheritance) , but ignoring all other Aspects. 
Biolink captured biolink:DiseaseOrPhenotypicFeatureToGeneticInheritanceAssociation id (random uuid) subject (disease.id) predicate (has_mode_of_inheritance) object (geneticInheritance.id) publications (List[publication.id]) has_evidence (List[Note [1]]), aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (\"infores:hpo-annotations\") Gene to Phenotype The gene-to-phenotype ingest processes the tab-delimited HPOA gene_to_phenotype.txt file, which has the following fields: 'ncbi_gene_id' 'gene_symbol' 'hpo_id' 'hpo_name' Biolink captured biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:hpo-annotations) Citation Sebastian K\u00f6hler, Michael Gargano, Nicolas Matentzoglu, Leigh C Carmody, David Lewis-Smith, Nicole A Vasilevsky, Daniel Danis, Ganna Balagura, Gareth Baynam, Amy M Brower, Tiffany J Callahan, Christopher G Chute, Johanna L Est, Peter D Galer, Shiva Ganesan, Matthias Griese, Matthias Haimel, Julia Pazmandi, Marc Hanauer, Nomi L Harris, Michael J Hartnett, Maximilian Hastreiter, Fabian Hauck, Yongqun He, Tim Jeske, Hugh Kearney, Gerhard Kindle, Christoph Klein, Katrin Knoflach, Roland Krause, David Lagorce, Julie A McMurry, Jillian A Miller, Monica C Munoz-Torres, Rebecca L Peters, Christina K Rapp, Ana M Rath, Shahmir A Rind, Avi Z Rosenberg, Michael M Segal, Markus G Seidel, Damian Smedley, Tomer Talmy, Yarlalu Thomas, Samuel A Wiafe, Julie Xian, Zafer Y\u00fcksel, Ingo Helbig, Christopher J Mungall, Melissa A Haendel, Peter N Robinson, The Human Phenotype Ontology in 2021, Nucleic Acids Research, Volume 49, Issue D1, 8 January 2021, Pages D1207\u2013D1217, https://doi.org/10.1093/nar/gkaa1043","title":"HPOA"},{"location":"Sources/hpoa/#human-phenotype-ontology-annotations-hpoa","text":"The Human Phenotype Ontology group curates and 
assembles over 115,000 annotations to hereditary diseases using the HPO ontology. Here we create Biolink associations between diseases and phenotypic features, together with their evidence, and age of onset and frequency (if known). There are four HPOA ingests - 'disease-to-phenotype', 'disease-to-mode-of-inheritance', 'gene-to-disease' and 'disease-to-mode-of-inheritance' - that parse out records from the HPO Annotation File . The 'disease-to-phenotype', 'disease-to-mode-of-inheritance' and 'gene-to-disease' parsers currently only process the \"abnormal\" annotations. Association to \"remarkable normality\" may be added in the near future. The 'disease-to-mode-of-inheritance' ingest script parses 'inheritance' record information out from the annotation file.","title":"Human Phenotype Ontology Annotations (HPOA)"},{"location":"Sources/hpoa/#gene-to-disease","text":"This ingest replaces the direct OMIM ingest so that we share g2d associations 1:1 with HPO. The mapping between association_type and biolink predicates shown below is the one way in which this ingest is opinionated, but attempts to be a direct translation into the biolink model. 
genes_to_disease.txt with the following fields: 'ncbi_gene_id' 'gene_symbol' 'association_type' 'disease_id' 'source' Biolink Captured biolink:CorrelatedGeneToDiseaseAssociation or biolink:CausalGeneToDiseaseAssociation (depending on predicate) id (random uuid) subject (ncbi_gene_id) predicate (association_type) MENDELIAN: biolink:causes POLYGENIC: biolink:contributes_to UNKNOWN: biolink:gene_associated_with_condition object (disease_id) primary_knowledge_source (source) medgen: infores:omim orphanet: infores:orphanet aggregator_knowledge_source ([\"infores:monarchinitiative\"]) also for medgen: infores:medgen","title":"Gene to Disease"},{"location":"Sources/hpoa/#disease-to-phenotype","text":"phenotype.hpoa: A description of this file is found here , has the following fields: 'database_id' 'disease_name' 'qualifier' 'hpo_id' 'reference' 'evidence' 'onset' 'frequency' 'sex' 'modifier' 'aspect' 'biocuration' Note that we're calling this the disease to phenotype file because - using the YAML file filter configuration for the ingest - we are only parsing rows with Aspect == 'P' (phenotypic anomalies) , but ignoring all other Aspects. Frequencies The 'Frequency' field of the aforementioned phenotypes.hpoa file has the following definition, excerpted from its Annotation Format page: 8. Frequency: There are three allowed options for this field. (A) A term-id from the HPO-sub-ontology below the term \u201cFrequency\u201d (HP:0040279). (since December 2016 ; before was a mixture of values). The terms for frequency are in alignment with Orphanet. * (B) A count of patients affected within a cohort. For instance, 7/13 would indicate that 7 of the 13 patients with the specified disease were found to have the phenotypic abnormality referred to by the HPO term in question in the study referred to by the DB_Reference; (C) A percentage value such as 17%. The Disease to Phenotype ingest attempts to remap these raw frequency values onto a suitable HPO term. 
A simplistic (perhaps erroneous?) assumption is that all such frequencies are conceptually comparable; however, researchers may wish to review the original publications to confirm fitness of purpose of the specific data points to their interpretation - specific values could designate phenotypic frequency at the population level; phenotypic frequency at the cohort level; or simply, be a measure of penetrance of a specific allele within carriers, etc.. Biolink captured biolink:DiseaseToPhenotypicFeatureAssociation id (random uuid) subject (disease.id) predicate (has_phenotype) negated (True if 'qualifier' == \"NOT\") object (phenotypicFeature.id) publications (List[publication.id]) has_evidence (List[Note [1]]), sex_qualifier (Note [2]) onset_qualifier (Onset.id) frequency_qualifier (Note [3]) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (\"infores:hpo-annotations\") Notes: 1. CURIE of [Evidence and Conclusion Ontology( https://bioportal.bioontology.org/ontologies/ECO )] term 2. female -> PATO:0000383, male -> PATO:0000384 or None 3. See the Frequencies section above.","title":"Disease to Phenotype"},{"location":"Sources/hpoa/#disease-to-modes-of-inheritance","text":"Same as above, we again parse the phenotype.hpoa file . However, we're calling this the 'disease to modes of inheritance' file because - using the YAML file filter configuration for the ingest - we are only parsing rows with Aspect == 'I' (inheritance) , but ignoring all other Aspects. 
Biolink captured biolink:DiseaseOrPhenotypicFeatureToGeneticInheritanceAssociation id (random uuid) subject (disease.id) predicate (has_mode_of_inheritance) object (geneticInheritance.id) publications (List[publication.id]) has_evidence (List[Note [1]]), aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (\"infores:hpo-annotations\")","title":"Disease to Modes of Inheritance"},{"location":"Sources/hpoa/#gene-to-phenotype","text":"The gene-to-phenotype ingest processes the tab-delimited HPOA gene_to_phenotype.txt file, which has the following fields: 'ncbi_gene_id' 'gene_symbol' 'hpo_id' 'hpo_name' Biolink captured biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:hpo-annotations)","title":"Gene to Phenotype"},{"location":"Sources/hpoa/#citation","text":"Sebastian K\u00f6hler, Michael Gargano, Nicolas Matentzoglu, Leigh C Carmody, David Lewis-Smith, Nicole A Vasilevsky, Daniel Danis, Ganna Balagura, Gareth Baynam, Amy M Brower, Tiffany J Callahan, Christopher G Chute, Johanna L Est, Peter D Galer, Shiva Ganesan, Matthias Griese, Matthias Haimel, Julia Pazmandi, Marc Hanauer, Nomi L Harris, Michael J Hartnett, Maximilian Hastreiter, Fabian Hauck, Yongqun He, Tim Jeske, Hugh Kearney, Gerhard Kindle, Christoph Klein, Katrin Knoflach, Roland Krause, David Lagorce, Julie A McMurry, Jillian A Miller, Monica C Munoz-Torres, Rebecca L Peters, Christina K Rapp, Ana M Rath, Shahmir A Rind, Avi Z Rosenberg, Michael M Segal, Markus G Seidel, Damian Smedley, Tomer Talmy, Yarlalu Thomas, Samuel A Wiafe, Julie Xian, Zafer Y\u00fcksel, Ingo Helbig, Christopher J Mungall, Melissa A Haendel, Peter N Robinson, The Human Phenotype Ontology in 2021, Nucleic Acids Research, Volume 49, Issue D1, 8 January 2021, Pages D1207\u2013D1217, 
https://doi.org/10.1093/nar/gkaa1043","title":"Citation"},{"location":"Sources/mgi/","text":"Mouse Genome Informatics (MGI) Mouse Genome Informatics (MGI) is the international database resource for the laboratory mouse, providing integrated genetic, genomic, and biological data to facilitate the study of human health and disease. MGI bulk downloads Gene Literature This ingest uses MGI's Reference download file, which contains genes and a tab-delimited list of PubMed IDs in which they are mentioned. Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:mgi) Citation Blake JA, Baldarelli R, Kadin JA, Richardson JE, Smith CL, Bult CJ; Mouse Genome Database Group. 2021. Mouse Genome Database (MGD): Knowledgebase for mouse-human comparative biology. Nucleic Acids Res. 2021 Jan 8;49(D1):D981-D987.","title":"Mouse Genome Informatics (MGI)"},{"location":"Sources/mgi/#mouse-genome-informatics-mgi","text":"Mouse Genome Informatics (MGI) is the international database resource for the laboratory mouse, providing integrated genetic, genomic, and biological data to facilitate the study of human health and disease. MGI bulk downloads","title":"Mouse Genome Informatics (MGI)"},{"location":"Sources/mgi/#gene-literature","text":"This ingest uses MGI's Reference download file, which contains genes and a tab-delimited list of PubMed IDs in which they are mentioned. 
Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:mgi)","title":"Gene Literature"},{"location":"Sources/mgi/#citation","text":"Blake JA, Baldarelli R, Kadin JA, Richardson JE, Smith CL, Bult CJ; Mouse Genome Database Group. 2021. Mouse Genome Database (MGD): Knowledgebase for mouse-human comparative biology. Nucleic Acids Res. 2021 Jan 8;49(D1):D981-D987.","title":"Citation"},{"location":"Sources/ncbi/","text":"National Center for Biotechnology Information (NCBI) The NCBI Gene integrates information from a wide range of species. A record may include nomenclature, Reference Sequences (RefSeqs), maps, pathways, variations, phenotypes, and links to genome-, phenotype-, and locus-specific resources worldwide. NCBI bulk downloads Gene Information Genes for all NCBI species (Dog, Cow, Pig, Chicken) are loaded using the ingest file (filtered to only NCBI taxon ID). Biolink Captured biolink:Gene id symbol description in_taxon provided_by ([\"infores:ncbi-gene\"]) Citation National Center for Biotechnology Information (NCBI)[Internet]. Bethesda (MD): National Library of Medicine (US), National Center for Biotechnology Information; [1988] \u2013 [cited 2024 Dec]. Available from: https://www.ncbi.nlm.nih.gov/","title":"NCBI"},{"location":"Sources/ncbi/#national-center-for-biotechnology-information-ncbi","text":"The NCBI Gene integrates information from a wide range of species. A record may include nomenclature, Reference Sequences (RefSeqs), maps, pathways, variations, phenotypes, and links to genome-, phenotype-, and locus-specific resources worldwide. 
NCBI bulk downloads","title":"National Center for Biotechnology Information (NCBI)"},{"location":"Sources/ncbi/#gene-information","text":"Genes for all NCBI species (Dog, Cow, Pig, Chicken) are loaded using the ingest file (filtered to only NCBI taxon ID). Biolink Captured biolink:Gene id symbol description in_taxon provided_by ([\"infores:ncbi-gene\"])","title":"Gene Information"},{"location":"Sources/ncbi/#citation","text":"National Center for Biotechnology Information (NCBI)[Internet]. Bethesda (MD): National Library of Medicine (US), National Center for Biotechnology Information; [1988] \u2013 [cited 2024 Dec]. Available from: https://www.ncbi.nlm.nih.gov/","title":"Citation"},{"location":"Sources/panther/","text":"PANTHER (Protein ANalysis THrough Evolutionary Relationships) Classification System Panther Gene Orthology Gene orthology analyses generate testable hypothesis about gene function and biological processes using experimental results from other (especially highly studied so-called 'model' species) using protein (and sometimes, simply nucleic acid level) alignments of genomic sequences. The source of gene orthology data for this ingest is from the PANTHER (Protein ANalysis THrough Evolutionary Relationships) Classification System . Panther was designed to classify proteins (and their genes) in order to facilitate high-throughput analysis. Proteins have been classified according to: - Family and subfamily: families are groups of evolutionarily related proteins; subfamilies are related proteins that also have the same function - Molecular function: the function of the protein by itself or with directly interacting proteins at a biochemical level, e.g. a protein kinase - Biological process: the function of the protein in the context of a larger network of proteins that interact to accomplish a process at the level of the cell or organism, e.g. mitosis. 
- Pathway: similar to biological process, but a pathway also explicitly specifies the relationships between the interacting molecules. The PANTHER Classifications are the result of human curation as well as sophisticated bioinformatics algorithms. Details of the methods can be found in Mi et al. NAR 2013; Thomas et al., Genome Research 2003 . This ingest uses data derived form the current version (release 16.0) of the Panther Hidden Markov Model (HMM). Panther Gene Orthology bulk data downloads There are various cross-sections of the Panther database which remain be covered by this ingest (Note: T.B.D means \"To Be Done\") Status of Panther Ingest The first iteration of this dataset (committed March 2022) focuses on Reference Genome Gene-to-Gene Orthology Relationships . Additional Panther associations (protein (sub)family pathways, sequences, etc , as generally described below) may be added at a later date. Reference Genome Gene-to-Gene Orthology Relationships Contains the Reference Genomes' Gene-to-Gene Ortholog mappings from Panther analyses. Source File: AllOrthologs.tar.gz . The source file is huge, containing data from all species, many of which are not currently of direct interest to Monarch. For this reason, a Python function filter_panther_orthologs_file was coded within orthology_utils . ALL_ORTHOLOGS_FILE = \"AllOrthologs\" TARGET_SPECIES_ORTHOLOGS = \"TargetOrthologs\" def filter_panther_orthologs_file ( directory : str = '.' , source_filename : str = ALL_ORTHOLOGS_FILE , target_filename : str = TARGET_SPECIES_ORTHOLOGS , number_of_lines : int = 0 ) -> bool : \"\"\" Filters a tar.gz Panther input file against the target list of species. 
:param directory: str, location of source data file :param source_filename: str, source data file name :param target_filename: str, target data file name :param number_of_lines: int, number of lines parsed; 'all' lines parsed if omitted or set to zero :return: bool, True if filtering was successful; False if unsuccessful \"\"\" ... which could be called with default parameter values in the following manner (if invoked from within the Panther data directory): filter_file () to generate a pruned down TargetOrthologs.tar.gz file with target species (as hardcoded in the catalog of species in the ortholog_utils module). Panther Data Model of Panther Orthologs Data Field Content Gene species1 | DB=id1 | protdb=pdbid1 Ortholog species2 | DB=id2 | protdb=pdbid2 Type of ortholog [LDO, O, P, X ,LDX] see README . Common ancestor for the orthologs taxon name of common ancestor Panther Ortholog ID Panther (sub)family identifier The DB=id# fields - where DB == database namespace and id# is the object identifier - are directly translated, by internal namespace mapping, into gene CURIEs. The species# are abridged labels currently filtered and mapped onto NCBI Taxon identifiers, using an hard-coded dictionary. Biolink classes and properties captured biolink:Gene id (NCBIGene Entrez ID) Note that the Gene source is currently given as Panther, although the real source of a Gene identifier is given by its CURIE namespace. biolink:GeneToGeneHomologyAssociation id (random uuid) subject (gene.id) predicate (orthologous to) object (gene.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) Protein Family and Subfamily Classifications - T.B.D. Contains the PANTHER 16.0 family/subfamily name, with molecular function, biological process, and pathway classifications for every PANTHER protein family and subfamily in the current PANTHER HMM library. 
Source File: http://data.pantherdb.org/ftp/hmm_classifications/current_release/PANTHER16.0_HMM_classifications Biolink classes and properties captured: biolink:GeneFamily id (PANTHER.FAMILY ID) source (infores:panther) biolink:MolecularActivity id (GO ID) source (go) biolink:BiologicalProcess id (GO ID) source (go) biolink:Pathway id (PANTHER.PATHWAY) source (infores:panther) biolink:GeneFamilyToMolecularFunctionAssociation id (random uuid) subject (gene_family.id) predicate (enables) object (go_term.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:GeneFamilyToBiologicalProcessAssociation id (random uuid) subject (gene_family.id) predicate (involved_in) object (go_term.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:GeneFamilyToPathwayAssociation id (random uuid) subject (gene_family.id) predicate (involved_in) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) Pathways - T.B.D. Contains regulatory and metabolic pathways, each with subfamilies and protein sequences mapped to individual pathway components. 
Source File: http://data.pantherdb.org/ftp/pathway/current_release/SequenceAssociationPathway3.6.5.txt local_name: data/orthology/pathways.tsv Biolink classes and properties captured: biolink:GeneFamily id (PANTHER.FAMILY ID) source (infores:panther) biolink:Gene id (NCBIGene Entrez ID) in taxon (NCBITaxon ID) source (infores:entrez) biolink:Pathway id (PANTHER.PATHWAY) source (infores:panther) biolink:GeneToPathwayAssociation id (random uuid) subject (gene.id) predicate (involved_in) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:GeneFamilyToPathwayAssociation id (random uuid) subject (gene_family.id) predicate (involved_in) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) Sequence Classifications - T.B.D. Sequence Classifications files contain the PANTHER family, subfamily, molecular function, biological process, and pathway classifications for the complete proteomes derived from the various genomes, indexed by species (one source file per species). Refer to the Sequence Classification README for details. Only a subset of the available species will be ingested into Monarch at this time, currently: human, mouse, rat, zebrafish, fruit fly, nematode, fission yeast and budding (\"baker's\") yeast. 
Source File Directory: http://data.pantherdb.org/ftp/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/ Biolink classes and properties captured: biolink:Gene id (PANTHER.FAMILY ID) source (infores:panther) biolink:GeneFamily id (PANTHER.FAMILY ID) source (infores:panther) biolink:MolecularActivity id (GO ID) source (go) biolink:BiologicalProcess id (GO ID) source (go) biolink:Pathway id (PANTHER.PATHWAY) source (infores:panther) biolink:GeneToGeneFamilyAssociation : id (random uuid) subject (gene.id) predicate (member_of) object (gene_family.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:MacromolecularMachineToMolecularActivityAssociation : id (random uuid) subject (gene.id) predicate (enables) object (go_term.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:MacromolecularMachineToBiologicalProcessAssociation : id (random uuid) subject (gene.id) predicate (involved_in) object (go_term.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:GeneToPathwayAssociation id (random uuid) subject (gene.id) predicate (involved_in) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) Citation Paul D. Thomas, Dustin Ebert, Anushya Muruganujan, Tremayne Mushayahama, Laurent-Philippe Albou and Huaiyu Mi Protein Society. 2022;31(1):8-22. 
doi:10.1002/pro.4218","title":"Panther"},{"location":"Sources/panther/#panther-protein-analysis-through-evolutionary-relationships-classification-system","text":"","title":"PANTHER (Protein ANalysis THrough Evolutionary Relationships) Classification System"},{"location":"Sources/panther/#panther-gene-orthology","text":"Gene orthology analyses generate testable hypothesis about gene function and biological processes using experimental results from other (especially highly studied so-called 'model' species) using protein (and sometimes, simply nucleic acid level) alignments of genomic sequences. The source of gene orthology data for this ingest is from the PANTHER (Protein ANalysis THrough Evolutionary Relationships) Classification System . Panther was designed to classify proteins (and their genes) in order to facilitate high-throughput analysis. Proteins have been classified according to: - Family and subfamily: families are groups of evolutionarily related proteins; subfamilies are related proteins that also have the same function - Molecular function: the function of the protein by itself or with directly interacting proteins at a biochemical level, e.g. a protein kinase - Biological process: the function of the protein in the context of a larger network of proteins that interact to accomplish a process at the level of the cell or organism, e.g. mitosis. - Pathway: similar to biological process, but a pathway also explicitly specifies the relationships between the interacting molecules. The PANTHER Classifications are the result of human curation as well as sophisticated bioinformatics algorithms. Details of the methods can be found in Mi et al. NAR 2013; Thomas et al., Genome Research 2003 . This ingest uses data derived form the current version (release 16.0) of the Panther Hidden Markov Model (HMM). 
Panther Gene Orthology bulk data downloads There are various cross-sections of the Panther database which remain be covered by this ingest (Note: T.B.D means \"To Be Done\")","title":"Panther Gene Orthology"},{"location":"Sources/panther/#status-of-panther-ingest","text":"The first iteration of this dataset (committed March 2022) focuses on Reference Genome Gene-to-Gene Orthology Relationships . Additional Panther associations (protein (sub)family pathways, sequences, etc , as generally described below) may be added at a later date.","title":"Status of Panther Ingest"},{"location":"Sources/panther/#reference-genome-gene-to-gene-orthology-relationships","text":"Contains the Reference Genomes' Gene-to-Gene Ortholog mappings from Panther analyses. Source File: AllOrthologs.tar.gz . The source file is huge, containing data from all species, many of which are not currently of direct interest to Monarch. For this reason, a Python function filter_panther_orthologs_file was coded within orthology_utils . ALL_ORTHOLOGS_FILE = \"AllOrthologs\" TARGET_SPECIES_ORTHOLOGS = \"TargetOrthologs\" def filter_panther_orthologs_file ( directory : str = '.' , source_filename : str = ALL_ORTHOLOGS_FILE , target_filename : str = TARGET_SPECIES_ORTHOLOGS , number_of_lines : int = 0 ) -> bool : \"\"\" Filters a tar.gz Panther input file against the target list of species. :param directory: str, location of source data file :param source_filename: str, source data file name :param target_filename: str, target data file name :param number_of_lines: int, number of lines parsed; 'all' lines parsed if omitted or set to zero :return: bool, True if filtering was successful; False if unsuccessful \"\"\" ... 
which could be called with default parameter values in the following manner (if invoked from within the Panther data directory): filter_file () to generate a pruned down TargetOrthologs.tar.gz file with target species (as hardcoded in the catalog of species in the ortholog_utils module).","title":"Reference Genome Gene-to-Gene Orthology Relationships"},{"location":"Sources/panther/#panther-data-model-of-panther-orthologs","text":"Data Field Content Gene species1 | DB=id1 | protdb=pdbid1 Ortholog species2 | DB=id2 | protdb=pdbid2 Type of ortholog [LDO, O, P, X ,LDX] see README . Common ancestor for the orthologs taxon name of common ancestor Panther Ortholog ID Panther (sub)family identifier The DB=id# fields - where DB == database namespace and id# is the object identifier - are directly translated, by internal namespace mapping, into gene CURIEs. The species# are abridged labels currently filtered and mapped onto NCBI Taxon identifiers, using an hard-coded dictionary.","title":"Panther Data Model of Panther Orthologs"},{"location":"Sources/panther/#biolink-classes-and-properties-captured","text":"biolink:Gene id (NCBIGene Entrez ID) Note that the Gene source is currently given as Panther, although the real source of a Gene identifier is given by its CURIE namespace. biolink:GeneToGeneHomologyAssociation id (random uuid) subject (gene.id) predicate (orthologous to) object (gene.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther)","title":"Biolink classes and properties captured"},{"location":"Sources/panther/#protein-family-and-subfamily-classifications-tbd","text":"Contains the PANTHER 16.0 family/subfamily name, with molecular function, biological process, and pathway classifications for every PANTHER protein family and subfamily in the current PANTHER HMM library. 
Source File: http://data.pantherdb.org/ftp/hmm_classifications/current_release/PANTHER16.0_HMM_classifications Biolink classes and properties captured: biolink:GeneFamily id (PANTHER.FAMILY ID) source (infores:panther) biolink:MolecularActivity id (GO ID) source (go) biolink:BiologicalProcess id (GO ID) source (go) biolink:Pathway id (PANTHER.PATHWAY) source (infores:panther) biolink:GeneFamilyToMolecularFunctionAssociation id (random uuid) subject (gene_family.id) predicate (enables) object (go_term.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:GeneFamilyToBiologicalProcessAssociation id (random uuid) subject (gene_family.id) predicate (involved_in) object (go_term.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:GeneFamilyToPathwayAssociation id (random uuid) subject (gene_family.id) predicate (involved_in) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther)","title":"Protein Family and Subfamily Classifications - T.B.D."},{"location":"Sources/panther/#pathways-tbd","text":"Contains regulatory and metabolic pathways, each with subfamilies and protein sequences mapped to individual pathway components. 
Source File: http://data.pantherdb.org/ftp/pathway/current_release/SequenceAssociationPathway3.6.5.txt local_name: data/orthology/pathways.tsv Biolink classes and properties captured: biolink:GeneFamily id (PANTHER.FAMILY ID) source (infores:panther) biolink:Gene id (NCBIGene Entrez ID) in taxon (NCBITaxon ID) source (infores:entrez) biolink:Pathway id (PANTHER.PATHWAY) source (infores:panther) biolink:GeneToPathwayAssociation id (random uuid) subject (gene.id) predicate (involved_in) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:GeneFamilyToPathwayAssociation id (random uuid) subject (gene_family.id) predicate (involved_in) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther)","title":"Pathways - T.B.D."},{"location":"Sources/panther/#sequence-classifications-tbd","text":"Sequence Classifications files contain the PANTHER family, subfamily, molecular function, biological process, and pathway classifications for the complete proteomes derived from the various genomes, indexed by species (one source file per species). Refer to the Sequence Classification README for details. Only a subset of the available species will be ingested into Monarch at this time, currently: human, mouse, rat, zebrafish, fruit fly, nematode, fission yeast and budding (\"baker's\") yeast. 
Source File Directory: http://data.pantherdb.org/ftp/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/ Biolink classes and properties captured: biolink:Gene id (PANTHER.FAMILY ID) source (infores:panther) biolink:GeneFamily id (PANTHER.FAMILY ID) source (infores:panther) biolink:MolecularActivity id (GO ID) source (go) biolink:BiologicalProcess id (GO ID) source (go) biolink:Pathway id (PANTHER.PATHWAY) source (infores:panther) biolink:GeneToGeneFamilyAssociation : id (random uuid) subject (gene.id) predicate (member_of) object (gene_family.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:MacromolecularMachineToMolecularActivityAssociation : id (random uuid) subject (gene.id) predicate (enables) object (go_term.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:MacromolecularMachineToBiologicalProcessAssociation : id (random uuid) subject (gene.id) predicate (involved_in) object (go_term.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther) biolink:GeneToPathwayAssociation id (random uuid) subject (gene.id) predicate (involved_in) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:panther)","title":"Sequence Classifications - T.B.D."},{"location":"Sources/panther/#citation","text":"Paul D. Thomas, Dustin Ebert, Anushya Muruganujan, Tremayne Mushayahama, Laurent-Philippe Albou and Huaiyu Mi Protein Society. 2022;31(1):8-22. doi:10.1002/pro.4218","title":"Citation"},{"location":"Sources/phenio/","text":"Phenio An ontology for accessing and comparing knowledge concerning phenotypes across species and genetic backgrounds. 
For more information, see: NCATS Translater Phenio Overview KGHub Phenio Monarch Phenio Documentation Source Code https://github.com/monarch-initiative/phenio","title":"Phenio"},{"location":"Sources/phenio/#phenio","text":"An ontology for accessing and comparing knowledge concerning phenotypes across species and genetic backgrounds. For more information, see: NCATS Translater Phenio Overview KGHub Phenio Monarch Phenio Documentation","title":"Phenio"},{"location":"Sources/phenio/#source-code","text":"https://github.com/monarch-initiative/phenio","title":"Source Code"},{"location":"Sources/pombase/","text":"PomBase PomBase is a comprehensive database for the fission yeast Schizosaccharomyces pombe, providing structural and functional annotation, literature curation and access to large-scale data sets. Within this ingest there will be a transformation of gene to phenotypic feature associations, gene entities aren't yet loaded as a part of this ingest, and FYPO ontology terms will be brought in directly from the ontology without transformation. PomBase Bulk Downloads Phaf Format Description Phaf Format LinkML Gene Information PomBase genes are captured directly from the PomBase (names and identifiers)[ https://www.pombase.org/downloads/names-and-identifiers ] set, with synonyms being populated as available and UniProtKB accessions captured as xrefs if available. Biolink Captured biolink:Gene id symbol xref (UniProfKB curie if provided) synonym provided_by([\"infores:pombase\"]) Gene to Phenotype The PHAF download file is extremely well documented. Alleles provided, but not captured, with the assumption that even with an allele specified the gene to phenotype is accurate with a some-some interpretation. Genotype/strain information looks uniform throughout the file, and is not captured. It might be sensible to make presence of genotype information an error condition to be sure that we only get 'clean' gene to phenotype associations. 
Penetrance and Severity columns are available, but not captured as a part of this ingest. Penetrance values can be either FYPO_EXT terms (FYPO_EXT:0000001, FYPO_EXT:0000002, FYPO_EXT:0000003, FYPO_EXT:0000004), int/float numbers (percentages), or strings (\">98\", \"~10\", \"10-20\"). Severity is represented using one or more FYPO_EXT terms. Biolink Captured biolink:Gene id biolink:PhenotypicFeature id biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) publications qualifiers (optionally included from condition row) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:pombase) Citation \"Harris MA, Rutherford KM, Hayles J, Lock A, B\u00e4hler J, Oliver S, Mata J, Wood V Fission stories: Using PomBase to understand Schizosaccharomyces pombe biology Genetics, 2021; iyab222\"","title":"Pombase"},{"location":"Sources/pombase/#pombase","text":"PomBase is a comprehensive database for the fission yeast Schizosaccharomyces pombe, providing structural and functional annotation, literature curation and access to large-scale data sets. Within this ingest there will be a transformation of gene to phenotypic feature associations, gene entities aren't yet loaded as a part of this ingest, and FYPO ontology terms will be brought in directly from the ontology without transformation. PomBase Bulk Downloads Phaf Format Description Phaf Format LinkML","title":"PomBase"},{"location":"Sources/pombase/#gene-information","text":"PomBase genes are captured directly from the PomBase (names and identifiers)[ https://www.pombase.org/downloads/names-and-identifiers ] set, with synonyms being populated as available and UniProtKB accessions captured as xrefs if available. 
Biolink Captured biolink:Gene id symbol xref (UniProfKB curie if provided) synonym provided_by([\"infores:pombase\"])","title":"Gene Information"},{"location":"Sources/pombase/#gene-to-phenotype","text":"The PHAF download file is extremely well documented. Alleles provided, but not captured, with the assumption that even with an allele specified the gene to phenotype is accurate with a some-some interpretation. Genotype/strain information looks uniform throughout the file, and is not captured. It might be sensible to make presence of genotype information an error condition to be sure that we only get 'clean' gene to phenotype associations. Penetrance and Severity columns are available, but not captured as a part of this ingest. Penetrance values can be either FYPO_EXT terms (FYPO_EXT:0000001, FYPO_EXT:0000002, FYPO_EXT:0000003, FYPO_EXT:0000004), int/float numbers (percentages), or strings (\">98\", \"~10\", \"10-20\"). Severity is represented using one or more FYPO_EXT terms. Biolink Captured biolink:Gene id biolink:PhenotypicFeature id biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) publications qualifiers (optionally included from condition row) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:pombase)","title":"Gene to Phenotype"},{"location":"Sources/pombase/#citation","text":"\"Harris MA, Rutherford KM, Hayles J, Lock A, B\u00e4hler J, Oliver S, Mata J, Wood V Fission stories: Using PomBase to understand Schizosaccharomyces pombe biology Genetics, 2021; iyab222\"","title":"Citation"},{"location":"Sources/reactome/","text":"Reactome Reactome is a free, open-source, curated and peer reviewed pathway database. Our goal is to provide intuitive bioinformatics tools for the visualization, interpretation and analysis of pathway knowledge to support basic research, genome analysis, modeling, systems biology and education. 
Reactome bulk downloads Pathway This ingest uses Reactome's pathway download file. Biolink captured biolink:Pathway id name in_taxon provided_by ([\"infores:reactome\"]) Gene to Pathway This ingest uses Reactome's gene to pathway download file, which contains all entities and only assocations between pathways and genes that are denoted in some way in the pathyways. Biolink captured biolink:Gene id biolink:Pathway id biolink:ChemicalToPathwayAssociation id (random uuid) subject (gene.id) predicate (mentions) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:reactome) Chemical to Pathway This ingest uses Reactome's chemical to pathway download file, which contains all entities and only assocations between pathways and chemicals that are denoted in some way in the pathyways. Biolink captured biolink:ChemicalEntity id biolink:Pathway id biolink:ChemicalToPathwayAssociation id (random uuid) subject (chemical.id) predicate (mentions) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:reactome) Citation Marc Gillespie, Bijay Jassal, Ralf Stephan, Marija Milacic, Karen Rothfels, Andrea Senff-Ribeiro, Johannes Griss, Cristoffer Sevilla, Lisa Matthews, Chuqiao Gong, Chuan Deng, Thawfeek Varusai, Eliot Ragueneau, Yusra Haider, Bruce May, Veronica Shamovsky, Joel Weiser, Timothy Brunson, Nasim Sanati, Liam Beckman, Xiang Shao, Antonio Fabregat, Konstantinos Sidiropoulos, Julieth Murillo, Guilherme Viteri, Justin Cook, Solomon Shorser, Gary Bader, Emek Demir, Chris Sander, Robin Haw, Guanming Wu, Lincoln Stein, Henning Hermjakob, Peter D\u2019Eustachio, The reactome pathway knowledgebase 2022, Nucleic Acids Research, Volume 50, Issue D1, 7 January 2022, Pages D687\u2013D692, https://doi.org/10.1093/nar/gkab1028","title":"Reactome"},{"location":"Sources/reactome/#reactome","text":"Reactome is a free, open-source, curated and peer reviewed pathway 
database. Our goal is to provide intuitive bioinformatics tools for the visualization, interpretation and analysis of pathway knowledge to support basic research, genome analysis, modeling, systems biology and education. Reactome bulk downloads","title":"Reactome"},{"location":"Sources/reactome/#pathway","text":"This ingest uses Reactome's pathway download file. Biolink captured biolink:Pathway id name in_taxon provided_by ([\"infores:reactome\"])","title":"Pathway"},{"location":"Sources/reactome/#gene-to-pathway","text":"This ingest uses Reactome's gene to pathway download file, which contains all entities and only assocations between pathways and genes that are denoted in some way in the pathyways. Biolink captured biolink:Gene id biolink:Pathway id biolink:ChemicalToPathwayAssociation id (random uuid) subject (gene.id) predicate (mentions) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:reactome)","title":"Gene to Pathway"},{"location":"Sources/reactome/#chemical-to-pathway","text":"This ingest uses Reactome's chemical to pathway download file, which contains all entities and only assocations between pathways and chemicals that are denoted in some way in the pathyways. 
Biolink captured biolink:ChemicalEntity id biolink:Pathway id biolink:ChemicalToPathwayAssociation id (random uuid) subject (chemical.id) predicate (mentions) object (pathway.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:reactome)","title":"Chemical to Pathway"},{"location":"Sources/reactome/#citation","text":"Marc Gillespie, Bijay Jassal, Ralf Stephan, Marija Milacic, Karen Rothfels, Andrea Senff-Ribeiro, Johannes Griss, Cristoffer Sevilla, Lisa Matthews, Chuqiao Gong, Chuan Deng, Thawfeek Varusai, Eliot Ragueneau, Yusra Haider, Bruce May, Veronica Shamovsky, Joel Weiser, Timothy Brunson, Nasim Sanati, Liam Beckman, Xiang Shao, Antonio Fabregat, Konstantinos Sidiropoulos, Julieth Murillo, Guilherme Viteri, Justin Cook, Solomon Shorser, Gary Bader, Emek Demir, Chris Sander, Robin Haw, Guanming Wu, Lincoln Stein, Henning Hermjakob, Peter D\u2019Eustachio, The reactome pathway knowledgebase 2022, Nucleic Acids Research, Volume 50, Issue D1, 7 January 2022, Pages D687\u2013D692, https://doi.org/10.1093/nar/gkab1028","title":"Citation"},{"location":"Sources/rgd/","text":"Rat Genome Database (RGD) The Rat Genome Database (RGD) was established in 1999 and is the premier site for genetic, genomic, phenotype, and disease data generated from rat research. In addition, it provides easy access to corresponding human and mouse data for cross-species comparisons. RGD bulk downloads Gene Literature This ingest uses RGD's gene file which contains publication assocations that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. Even though it is a gene file, and we have fully populated the gene nodes in the alliance gene information ingest, the RGD file has some information that is not in alliance. 
Note, there will be a column mismatch warning on this transform because there are two (UNUSED) columns. Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:rgd) Citation Vedi M, Smith JR, Thomas Hayman G, Tutaj M, Brodie KC, De Pons JL, Demos WM, Gibson AC, Kaldunski ML, Lamers L, Laulederkind SJF, Thota J, Thorat K, Tutaj MA, Wang SJ, Zacher S, Dwinell MR, Kwitek AE. 2022 updates to the Rat Genome Database: a Findable, Accessible, Interoperable, and Reusable (FAIR) resource. Genetics. 2023 May 4;224(1):iyad042. doi: 10.1093/genetics/iyad042. PMID: 36930729; PMCID: PMC10474928.","title":"Rat Genome Database (RGD)"},{"location":"Sources/rgd/#rat-genome-database-rgd","text":"The Rat Genome Database (RGD) was established in 1999 and is the premier site for genetic, genomic, phenotype, and disease data generated from rat research. In addition, it provides easy access to corresponding human and mouse data for cross-species comparisons. RGD bulk downloads","title":"Rat Genome Database (RGD)"},{"location":"Sources/rgd/#gene-literature","text":"This ingest uses RGD's gene file which contains publication assocations that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. Even though it is a gene file, and we have fully populated the gene nodes in the alliance gene information ingest, the RGD file has some information that is not in alliance. Note, there will be a column mismatch warning on this transform because there are two (UNUSED) columns. 
Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:rgd)","title":"Gene Literature"},{"location":"Sources/rgd/#citation","text":"Vedi M, Smith JR, Thomas Hayman G, Tutaj M, Brodie KC, De Pons JL, Demos WM, Gibson AC, Kaldunski ML, Lamers L, Laulederkind SJF, Thota J, Thorat K, Tutaj MA, Wang SJ, Zacher S, Dwinell MR, Kwitek AE. 2022 updates to the Rat Genome Database: a Findable, Accessible, Interoperable, and Reusable (FAIR) resource. Genetics. 2023 May 4;224(1):iyad042. doi: 10.1093/genetics/iyad042. PMID: 36930729; PMCID: PMC10474928.","title":"Citation"},{"location":"Sources/sgd/","text":"Saccharomyces Genome Database (SGD) The Saccharomyces Genome Database (SGD) provides comprehensive integrated biological information for the budding yeast Saccharomyces cerevisiae along with search and analysis tools to explore these data, enabling the discovery of functional relationships between sequence and gene products in fungi and higher organisms. SGD bulk downloads Gene Literature This ingest uses RGD's gene to publication download file, which only contains assocations between publications and genes that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. 
Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:sgd) Citation Cherry JM, Hong EL, Amundsen C, Balakrishnan R, Binkley G, Chan ET, Christie KR, Costanzo MC, Dwight SS, Engel SR, Fisk DG, Hirschman JE, Hitz BC, Karra K, Krieger CJ, Miyasato SR, Nash RS, Park J, Skrzypek MS, Simison M, Weng S, Wong ED (2012) Saccharomyces Genome Database: the genomics resource of budding yeast. Nucleic Acids Res. Jan;40(Database issue):D700-5. [PMID: 22110037]","title":"Saccharomyces Genome Database (SGD)"},{"location":"Sources/sgd/#saccharomyces-genome-database-sgd","text":"The Saccharomyces Genome Database (SGD) provides comprehensive integrated biological information for the budding yeast Saccharomyces cerevisiae along with search and analysis tools to explore these data, enabling the discovery of functional relationships between sequence and gene products in fungi and higher organisms. SGD bulk downloads","title":"Saccharomyces Genome Database (SGD)"},{"location":"Sources/sgd/#gene-literature","text":"This ingest uses RGD's gene to publication download file, which only contains assocations between publications and genes that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. 
Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:sgd)","title":"Gene Literature"},{"location":"Sources/sgd/#citation","text":"Cherry JM, Hong EL, Amundsen C, Balakrishnan R, Binkley G, Chan ET, Christie KR, Costanzo MC, Dwight SS, Engel SR, Fisk DG, Hirschman JE, Hitz BC, Karra K, Krieger CJ, Miyasato SR, Nash RS, Park J, Skrzypek MS, Simison M, Weng S, Wong ED (2012) Saccharomyces Genome Database: the genomics resource of budding yeast. Nucleic Acids Res. Jan;40(Database issue):D700-5. [PMID: 22110037]","title":"Citation"},{"location":"Sources/string/","text":"STRING: functional protein association networks STRING is a database of known and predicted protein-protein interactions . The interactions include direct (physical) and indirect (functional) associations; they stem from computational prediction, from knowledge transfer between organisms, and from interactions aggregated from other (primary) databases. STRING bulk downloads Protein Links This ingest uses a given version (currently, 11.5 ) of the STRING's .protein.links.detailed. .txt.gz files, for a subset of NCBI ID designated species. We filter the input data on the combined_score field (currently with the threshhold recorded in the protein_links.yaml file). The various taxon specific entrez_2_string mapping files are used to map protein subject and concept nodes onto Entrez gene id's. Special note about Entrez mapping files A separate Entrez to String identifier mapping file is not available for Rattus norvegicus (Norway rat, NCBI taxon ID 10116) but the mappings are (less conveniently) available inside the aggregated 'all_organisms' entrez_2_string file . 
See notes in the STRING section of the download.yaml configuration file for (self explanatory) guidance on how to prepare the required mapping file for use in a local running of the digest. Source File protein1 protein2 neighborhood fusion cooccurence coexpression experimental database textmining combined_score Biolink classes and properties captured Concept Nodes biolink:Gene id (NCBIGene Entrez ID) Associations biolink:PairwiseGeneToGeneInteraction : id (random uuid) subject (gene.id) predicate (interacts_with) object (gene.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:string) Citation Damian Szklarczyk, Andrea Franceschini, Stefan Wyder, Kristoffer Forslund, Davide Heller, Jaime Huerta-Cepas, Milan Simonovic, Alexander Roth, Alberto Santos, Kalliopi P. Tsafou, Michael Kuhn, Peer Bork, Lars J. Jensen, Christian von Mering, STRING v10: protein\u2013protein interaction networks, integrated over the tree of life, Nucleic Acids Research, Volume 43, Issue D1, 28 January 2015, Pages D447\u2013D452, https://doi.org/10.1093/nar/gku1003","title":"String"},{"location":"Sources/string/#string-functional-protein-association-networks","text":"STRING is a database of known and predicted protein-protein interactions . The interactions include direct (physical) and indirect (functional) associations; they stem from computational prediction, from knowledge transfer between organisms, and from interactions aggregated from other (primary) databases. STRING bulk downloads","title":"STRING: functional protein association networks"},{"location":"Sources/string/#protein-links","text":"This ingest uses a given version (currently, 11.5 ) of the STRING's .protein.links.detailed. .txt.gz files, for a subset of NCBI ID designated species. We filter the input data on the combined_score field (currently with the threshhold recorded in the protein_links.yaml file). 
The various taxon specific entrez_2_string mapping files are used to map protein subject and concept nodes onto Entrez gene id's.","title":"Protein Links"},{"location":"Sources/string/#special-note-about-entrez-mapping-files","text":"A separate Entrez to String identifier mapping file is not available for Rattus norvegicus (Norway rat, NCBI taxon ID 10116) but the mappings are (less conveniently) available inside the aggregated 'all_organisms' entrez_2_string file . See notes in the STRING section of the download.yaml configuration file for (self explanatory) guidance on how to prepare the required mapping file for use in a local running of the digest.","title":"Special note about Entrez mapping files"},{"location":"Sources/string/#source-file","text":"protein1 protein2 neighborhood fusion cooccurence coexpression experimental database textmining combined_score","title":"Source File"},{"location":"Sources/string/#biolink-classes-and-properties-captured","text":"","title":"Biolink classes and properties captured"},{"location":"Sources/string/#concept-nodes","text":"biolink:Gene id (NCBIGene Entrez ID)","title":"Concept Nodes"},{"location":"Sources/string/#associations","text":"biolink:PairwiseGeneToGeneInteraction : id (random uuid) subject (gene.id) predicate (interacts_with) object (gene.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:string)","title":"Associations"},{"location":"Sources/string/#citation","text":"Damian Szklarczyk, Andrea Franceschini, Stefan Wyder, Kristoffer Forslund, Davide Heller, Jaime Huerta-Cepas, Milan Simonovic, Alexander Roth, Alberto Santos, Kalliopi P. Tsafou, Michael Kuhn, Peer Bork, Lars J. 
Jensen, Christian von Mering, STRING v10: protein\u2013protein interaction networks, integrated over the tree of life, Nucleic Acids Research, Volume 43, Issue D1, 28 January 2015, Pages D447\u2013D452, https://doi.org/10.1093/nar/gku1003","title":"Citation"},{"location":"Sources/xenbase/","text":"Xenbase Xenbase is a web-accessible resource that integrates all the diverse biological, genomic, genotype and phenotype data available from Xenopus research. Xenbase Bulk Data Xenbase FTP Gene to Phenotype This ingest is built against a one-off OBAN formatted file, which makes for a transformation which only requries adding a curie prefix and connecting column names to biolink attributes. Evidence codes are provided as ECO terms but not yet captured in the output. Biolink captured biolink:Gene id biolink:PhenotypicFeature id biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) publications aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:xenbase) Gene Literature This ingest reads from Xenbase's Genes Associated with Literature file to capture associations between Xenbase's XB-GENEPAGE ids and PMIDs, then relies on a map built from Xenbase's GenepageToGeneId file to create associations from XB-GENE records to PMID records. Biolink captured Gene id Publication id InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:xenbase) Citation Fisher et al. 2023, Genetics, 2023;, iyad018, doi:10.1093/genetics/iyad018 (Xenbase / PubMed / Genetics)","title":"Xenbase"},{"location":"Sources/xenbase/#xenbase","text":"Xenbase is a web-accessible resource that integrates all the diverse biological, genomic, genotype and phenotype data available from Xenopus research. 
Xenbase Bulk Data Xenbase FTP","title":"Xenbase"},{"location":"Sources/xenbase/#gene-to-phenotype","text":"This ingest is built against a one-off OBAN formatted file, which makes for a transformation which only requries adding a curie prefix and connecting column names to biolink attributes. Evidence codes are provided as ECO terms but not yet captured in the output. Biolink captured biolink:Gene id biolink:PhenotypicFeature id biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) publications aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:xenbase)","title":"Gene to Phenotype"},{"location":"Sources/xenbase/#gene-literature","text":"This ingest reads from Xenbase's Genes Associated with Literature file to capture associations between Xenbase's XB-GENEPAGE ids and PMIDs, then relies on a map built from Xenbase's GenepageToGeneId file to create associations from XB-GENE records to PMID records. Biolink captured Gene id Publication id InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:xenbase)","title":"Gene Literature"},{"location":"Sources/xenbase/#citation","text":"Fisher et al. 2023, Genetics, 2023;, iyad018, doi:10.1093/genetics/iyad018 (Xenbase / PubMed / Genetics)","title":"Citation"},{"location":"Sources/zfin/","text":"ZFIN ZFIN is the Zebrafish Model Organism Database. ZFIN bulk downloads Gene to Phenotype This ingest uses ZFIN's clean gene phenotype download file, which only contains phenotypes which can safely be associated to a single affected gene. 
This ingest is distinct from the Alliance phenotype index because ZFIN builds Entity-Quality-Entity phenotype statements that can be built from post-composed terms (E1a+E1b+Q+E2a+E2b), Biolink captured biolink:Gene id biolink:PhenotypicFeature id biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) publications aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:zfin) Gene Literature This ingest uses ZFIN's gene to publication download file, which only contains assocations between publications and genes that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. We have also opted to use the ZDB-ID for the publication node rather than a pubmed ID, on the assumption that kgx will clique merge them later. Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:zfin) Citation Bradford, Y.M., Van Slyke, C.E., Ruzicka, L., Singer, A., Eagle, A., Fashena, D., Howe, D.G., Frazer, K., Martin, R., Paddock, H., Pich, C., Ramachandran, S., Westerfield, M. (2022) Zebrafish Information Network, the knowledgebase for Danio rerio research. Genetics. 220(4).","title":"ZFIN"},{"location":"Sources/zfin/#zfin","text":"ZFIN is the Zebrafish Model Organism Database. ZFIN bulk downloads","title":"ZFIN"},{"location":"Sources/zfin/#gene-to-phenotype","text":"This ingest uses ZFIN's clean gene phenotype download file, which only contains phenotypes which can safely be associated to a single affected gene. 
This ingest is distinct from the Alliance phenotype index because ZFIN builds Entity-Quality-Entity phenotype statements that can be built from post-composed terms (E1a+E1b+Q+E2a+E2b), Biolink captured biolink:Gene id biolink:PhenotypicFeature id biolink:GeneToPhenotypicFeatureAssociation id (random uuid) subject (gene.id) predicate (has_phenotype) object (phenotypicFeature.id) publications aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:zfin)","title":"Gene to Phenotype"},{"location":"Sources/zfin/#gene-literature","text":"This ingest uses ZFIN's gene to publication download file, which only contains assocations between publications and genes that are denoted in some way in the publication. We have selected to use a consistent high level term for 'publication' (IAO:0000311) as it is heterogeneous mix of publication types being referenced. We have also opted to use the ZDB-ID for the publication node rather than a pubmed ID, on the assumption that kgx will clique merge them later. Biolink captured biolink:Gene id biolink:Publication id biolink:InformationContentEntityToNamedThingAssociation id (random uuid) subject (gene.id) predicate (mentions) object (publication.id) aggregating_knowledge_source ([\"infores:monarchinitiative\"]) primary_knowledge_source (infores:zfin)","title":"Gene Literature"},{"location":"Sources/zfin/#citation","text":"Bradford, Y.M., Van Slyke, C.E., Ruzicka, L., Singer, A., Eagle, A., Fashena, D., Howe, D.G., Frazer, K., Martin, R., Paddock, H., Pich, C., Ramachandran, S., Westerfield, M. (2022) Zebrafish Information Network, the knowledgebase for Danio rerio research. Genetics. 220(4).","title":"Citation"}]}
\ No newline at end of file
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index 3d7a26cf..2cddf0ea 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -2,147 +2,152 @@
https://monarch-initiative.github.io/monarch-ingest/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/CLI/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Create-an-Ingest/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Create-an-Ingest/1.%20Propose/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Create-an-Ingest/2.%20Configure/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Create-an-Ingest/3.%20Document/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Create-an-Ingest/4.%20Implement/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Create-an-Ingest/5.%20Test/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/KG-Build-Process/kg-build-process/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Principles/modeling-principles/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/alliance/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/bgee/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/ctd/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/dictybase/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/flybase/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/goa/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/hgnc/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/hpoa/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/mgi/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/ncbi/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/panther/
- 2023-11-16
+ 2023-11-22
+ daily
+
+
+ https://monarch-initiative.github.io/monarch-ingest/Sources/phenio/
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/pombase/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/reactome/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/rgd/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/sgd/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/string/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/xenbase/
- 2023-11-16
+ 2023-11-22
daily
https://monarch-initiative.github.io/monarch-ingest/Sources/zfin/
- 2023-11-16
+ 2023-11-22
daily
\ No newline at end of file
diff --git a/docs/sitemap.xml.gz b/docs/sitemap.xml.gz
index ecd35cd1..089c7bd1 100644
Binary files a/docs/sitemap.xml.gz and b/docs/sitemap.xml.gz differ