From e5d62d472defd9ebf62cbdca1c081175411dd14f Mon Sep 17 00:00:00 2001
From: glass-ships
Date: Wed, 24 Jan 2024 09:52:47 -0700
Subject: [PATCH 1/2] Download mappings from data.mi.org

---
 src/monarch_ingest/cli_utils.py  | 32 ++++++-----
 src/monarch_ingest/download.yaml | 15 ++++--
 src/monarch_ingest/main.py       | 93 ++++++++++++++++++--------------
 3 files changed, 81 insertions(+), 59 deletions(-)

diff --git a/src/monarch_ingest/cli_utils.py b/src/monarch_ingest/cli_utils.py
index 7bbe8ad5..8986e60d 100644
--- a/src/monarch_ingest/cli_utils.py
+++ b/src/monarch_ingest/cli_utils.py
@@ -303,7 +303,7 @@ def merge_files(
     mappings = []
     mappings.append("data/monarch/mondo.sssom.tsv")
     mappings.append("data/monarch/gene_mappings.sssom.tsv")
-    mappings.append("data/monarch/chebi-mesh.biomappings.sssom.tsv")
+    mappings.append("data/monarch/mesh_chebi_biomappings.sssom.tsv")

     logger.info("Merging knowledge graph...")

@@ -316,18 +316,22 @@ def apply_closure(
     output_dir: str = OUTPUT_DIR,
 ):
     output_file = f"{output_dir}/{name}-denormalized-edges.tsv"
-    add_closure(kg_archive=f"{output_dir}/{name}.tar.gz",
-                closure_file=closure_file,
-                output_file=output_file,
-                fields=['subject',
-                        'object',
-                        'qualifiers',
-                        'frequency_qualifier',
-                        'onset_qualifier',
-                        'sex_qualifier',
-                        'stage_qualifier'],
-                evidence_fields=['has_evidence', 'publications'],
-                grouping_fields=['subject', 'negated', 'predicate', 'object'])
+    add_closure(
+        kg_archive=f"{output_dir}/{name}.tar.gz",
+        closure_file=closure_file,
+        output_file=output_file,
+        fields=[
+            "subject",
+            "object",
+            "qualifiers",
+            "frequency_qualifier",
+            "onset_qualifier",
+            "sex_qualifier",
+            "stage_qualifier",
+        ],
+        evidence_fields=["has_evidence", "publications"],
+        grouping_fields=["subject", "negated", "predicate", "object"],
+    )

     sh.pigz(output_file, force=True)

@@ -397,9 +401,11 @@ def load_jsonl():
     os.remove("output/monarch-kg_nodes.jsonl")
     os.remove("output/monarch-kg_edges.jsonl")

+
 def export_tsv():
     export()

+
 def do_release(dir: str = OUTPUT_DIR, kghub: bool = False):
     import datetime

diff --git a/src/monarch_ingest/download.yaml b/src/monarch_ingest/download.yaml
index cb2103a1..0920c0c1 100644
--- a/src/monarch_ingest/download.yaml
+++ b/src/monarch_ingest/download.yaml
@@ -293,16 +293,22 @@
   local_name: data/mgi/MRK_Reference.rpt
   tag: mgi_publication_to_gene

-### Monarch
+### Mapping files
 -
-  url: 'http://data.monarchinitiative.org/monarch-gene-mapping/latest/gene_mappings.sssom.tsv'
-  # url: 'http://storage.googleapis.com/data-public-monarchinitiative/monarch-gene-mapping/latest/gene_mappings.tsv'
+  url: 'https://data.monarchinitiative.org/mappings/latest/gene_mappings.sssom.tsv'
   local_name: data/monarch/gene_mappings.sssom.tsv
   tag: mapping
 -
-  url: https://raw.githubusercontent.com/monarch-initiative/mondo/master/src/ontology/mappings/mondo.sssom.tsv
+  url: https://data.monarchinitiative.org/mappings/latest/mondo.sssom.tsv
   local_name: data/monarch/mondo.sssom.tsv
   tag: mapping
+
+-
+  url: https://data.monarchinitiative.org/mappings/latest/mesh_chebi_biomappings.sssom.tsv
+  local_name: data/monarch/mesh_chebi_biomappings.sssom.tsv
+  tag: mapping
+
+### Monarch
 -
   url: https://kg-hub.berkeleybop.io/kg-phenio/current/kg-phenio.tar.gz
   local_name: data/monarch/kg-phenio.tar.gz
@@ -315,7 +321,6 @@
   url: https://s3.amazonaws.com/bbop-sqlite/phenio.db.gz
   local_name: data/monarch/phenio.db.gz
   tag: phenio
-
 # -
 # url: https://ci.monarchinitiative.org/job/monarch-ontology-json-sri/lastSuccessfulBuild/artifact/build/monarch-ontology-final.json
 # local_name: data/monarch/monarch.json
diff --git a/src/monarch_ingest/main.py b/src/monarch_ingest/main.py
index 9396dc8a..6d97991f 100644
--- a/src/monarch_ingest/main.py
+++ b/src/monarch_ingest/main.py
@@ -2,19 +2,20 @@

 from kghub_downloader.download_utils import download_from_yaml
 from monarch_ingest.cli_utils import (
-    apply_closure,
+    apply_closure,
     do_release,
     export_tsv,
     load_jsonl,
-    load_sqlite,
-    load_solr,
-    merge_files,
-    transform_one,
-    transform_phenio,
-    transform_all,
+    load_sqlite,
+    load_solr,
+    merge_files,
+    transform_one,
+    transform_phenio,
+    transform_all,
 )

 import typer
+
 typer_app = typer.Typer()

 OUTPUT_DIR = "output"
@@ -24,28 +25,26 @@
 def callback(version: Optional[bool] = typer.Option(None, "--version", is_eager=True)):
     if version:
         from monarch_ingest import __version__
+
         typer.echo(f"monarch_ingest version: {__version__}")
-        raise typer.Exit()
+        raise typer.Exit()


 @typer_app.command()
 def download(
-    ingests: Optional[List[str]] = typer.Option(None, help="Which ingests to download data for"),
-    all: bool = typer.Option(False, help="Download all ingest datasets")
-    ):
+    ingests: Optional[List[str]] = typer.Option(None, help="Which ingests to download data for"),
+    all: bool = typer.Option(False, help="Download all ingest datasets"),
+):
     """Downloads data defined in download.yaml"""
     if ingests:
         download_from_yaml(
-            yaml_file='src/monarch_ingest/download.yaml',
-            output_dir='.',
+            yaml_file="src/monarch_ingest/download.yaml",
+            output_dir=".",
             tags=ingests,
         )
     elif all:
-        download_from_yaml(
-            yaml_file='src/monarch_ingest/download.yaml',
-            output_dir='.'
-        )
+        download_from_yaml(yaml_file="src/monarch_ingest/download.yaml", output_dir=".")


@@ -55,9 +54,16 @@ def transform(
     ingest: str = typer.Option(None, "--ingest", "-i", help="Run a single ingest (see ingests.yaml for a list)"),
     phenio: bool = typer.Option(False, help="Run the phenio transform"),
     all: bool = typer.Option(False, "--all", "-a", help="Ingest all sources"),
-    force: bool = typer.Option(False, "--force", "-f", help="Force ingest, even if output exists (on by default for single ingests)"),
+    force: bool = typer.Option(
+        False, "--force", "-f", help="Force ingest, even if output exists (on by default for single ingests)"
+    ),
     rdf: bool = typer.Option(False, help="Output rdf files along with tsv"),
-    verbose: Optional[bool] = typer.Option(None, "--debug/--quiet", "-d/-q", help="Use --quiet to suppress log output, --debug for verbose, including Koza logs"),
+    verbose: Optional[bool] = typer.Option(
+        None,
+        "--debug/--quiet",
+        "-d/-q",
+        help="Use --quiet to suppress log output, --debug for verbose, including Koza logs",
+    ),
     log: bool = typer.Option(False, "--log", "-l", help="Write DEBUG level logs to ./logs/ for each ingest"),
     row_limit: int = typer.Option(None, "--row-limit", "-n", help="Number of rows to process"),
     # parallel: int = typer.Option(None, "--parallel", "-p", help="Utilize Dask to perform multiple ingests in parallel"),
@@ -65,38 +71,39 @@
     """Run Koza transformation on specified Monarch ingests"""

     if phenio:
-        transform_phenio(
-            output_dir=output_dir,
-            force=force,
-            verbose=verbose
-        )
+        transform_phenio(output_dir=output_dir, force=force, verbose=verbose)
     elif ingest:
         transform_one(
-            ingest = ingest,
-            output_dir = output_dir,
-            row_limit = row_limit,
-            rdf = rdf,
-            force = True if force is None else force,
-            verbose = verbose,
-            log = log,
+            ingest=ingest,
+            output_dir=output_dir,
+            row_limit=row_limit,
+            rdf=rdf,
+            force=True if force is None else force,
+            verbose=verbose,
+            log=log,
         )
     elif all:
         transform_all(
-            output_dir = output_dir,
-            row_limit = row_limit,
-            rdf = rdf,
-            force = force, verbose=verbose,
-            log = log,
+            output_dir=output_dir,
+            row_limit=row_limit,
+            rdf=rdf,
+            force=force, verbose=verbose,
+            log=log,
         )


 @typer_app.command()
 def merge(
-    input_dir: str = typer.Option(f"{OUTPUT_DIR}/transform_output", help="Directory with nodes and edges to be merged",),
+    input_dir: str = typer.Option(
+        f"{OUTPUT_DIR}/transform_output",
+        help="Directory with nodes and edges to be merged",
+    ),
     output_dir: str = typer.Option(f"{OUTPUT_DIR}", help="Directory to output data"),
-    verbose: Optional[bool] = typer.Option(None, "--debug/--quiet", "-d/-q", help="Use --quiet to suppress log output, --debug for verbose"),
-    ):
+    verbose: Optional[bool] = typer.Option(
+        None, "--debug/--quiet", "-d/-q", help="Use --quiet to suppress log output, --debug for verbose"
+    ),
+):
     """Merge nodes and edges into kg"""
     merge_files(input_dir=input_dir, output_dir=output_dir, verbose=verbose)

@@ -105,10 +112,12 @@ def merge(
 def closure():
     apply_closure()

+
 @typer_app.command()
 def jsonl():
     load_jsonl()

+
 @typer_app.command()
 def sqlite():
     load_sqlite()
@@ -118,15 +127,17 @@ def sqlite():
 def solr():
     load_solr()

+
 @typer_app.command()
 def export():
     export_tsv()

+
 @typer_app.command()
 def release(
     dir: str = typer.Option(f"{OUTPUT_DIR}", help="Directory with kg to be released"),
-    kghub: bool = typer.Option(False, help="Also release to kghub S3 bucket")
-    ):
+    kghub: bool = typer.Option(False, help="Also release to kghub S3 bucket"),
+):
     """Copy data to Monarch GCP data buckets"""
     do_release(dir, kghub)

From ded65ef4e3f0886922e9289621e4ab46c85ddf37 Mon Sep 17 00:00:00 2001
From: glass-ships
Date: Wed, 24 Jan 2024 11:34:50 -0700
Subject: [PATCH 2/2] add biogrid to docs in mkdocs.yaml

---
 mkdocs.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mkdocs.yaml b/mkdocs.yaml
index e6186a72..7130af0b 100644
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -20,6 +20,7 @@ nav:
     - Overview: 'Sources/index.md'
     - Alliance: 'Sources/alliance.md'
     - BGee: 'Sources/bgee.md'
+    - BioGrid: 'Sources/biogrid.md'
     - CTD: 'Sources/ctd.md'
     # - Dictybase: 'Sources/dictybase.md'
     # - Flybase: 'Sources/flybase.md'
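
Note on exercising the new mapping downloads outside the Typer CLI: the same kghub_downloader helper that main.py wraps can be called directly. A minimal sketch, assuming it is run from the repository root with kghub-downloader installed:

    from kghub_downloader.download_utils import download_from_yaml

    # Fetch only the entries tagged "mapping" in download.yaml: the gene_mappings,
    # mondo, and mesh_chebi_biomappings SSSOM files land under data/monarch/,
    # which is where merge_files() in cli_utils.py now expects them.
    download_from_yaml(
        yaml_file="src/monarch_ingest/download.yaml",
        output_dir=".",
        tags=["mapping"],
    )

This is the same call the download command above makes when "mapping" is passed as one of its ingests options.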