Skip to content

Commit

Permalink
Merge pull request #588 from monarch-initiative/duckdb-export
Browse files Browse the repository at this point in the history
Create exploded association exports that traverse closures
  • Loading branch information
kevinschaper authored Jun 7, 2024
2 parents 134ae2f + 7761c93 commit 2cbd1b1
Show file tree
Hide file tree
Showing 6 changed files with 187 additions and 242 deletions.
5 changes: 5 additions & 0 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@ pipeline {
sh 'poetry run ingest export'
}
}
// Runs the `ingest prepare-release` CLI command (after export, before the upload stage).
stage('prepare release') {
steps {
sh 'poetry run ingest prepare-release'
}
}
stage('upload files') {
steps {
sh 'poetry run ingest release --kghub'
Expand Down
2 changes: 1 addition & 1 deletion scripts/load_solr.sh
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,4 @@ chmod -R a+rX solr-data

tar czf solr.tar.gz -C solr-data data
mv solr.tar.gz output/
pigz --force output/monarch-kg-denormalized-edges.tsv

35 changes: 25 additions & 10 deletions src/monarch_ingest/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,10 +390,6 @@ def apply_closure(
grouping_fields=["subject", "negated", "predicate", "object"],
)
sh.mv(database, f"{output_dir}/")
sh.pigz(f"{output_dir}/{database}", force=True)
sh.pigz(edges_output_file, force=True)
sh.pigz(nodes_output_file, force=True)


def load_sqlite():
sh.bash("scripts/load_sqlite.sh")
Expand All @@ -420,7 +416,11 @@ def load_jsonl():
edge_path = tar.getmember("monarch-kg_edges.tsv")

with tar.extractfile(node_path) as node_file: # type: ignore
nodes_df = pandas.read_csv(node_file, sep="\t", dtype="string", lineterminator="\n", quoting=csv.QUOTE_NONE)
nodes_df = pandas.read_csv(node_file, sep="\t",
dtype="string",
lineterminator="\n",
quoting=csv.QUOTE_NONE,
comment=None)
nodes_df["category"] = nodes_df["category"].map(class_ancestor_dict)
# for each column in nodes_df, if schemaview says it's multivalued, convert the contents to a list splitting on |
for col in nodes_df.columns:
Expand All @@ -434,7 +434,12 @@ def load_jsonl():

with tar.extractfile(edge_path) as edge_file: # type: ignore
edges_df = pandas.read_csv(
edge_file, sep="\t", dtype="string", lineterminator="\n", quoting=csv.QUOTE_NONE, comment="#"
edge_file,
sep="\t",
dtype="string",
lineterminator="\n",
quoting=csv.QUOTE_NONE,
comment=None
)
edges_df["category"] = edges_df["category"].map(class_ancestor_dict)

Expand All @@ -445,10 +450,6 @@ def load_jsonl():
if slot and slot.multivalued and col != "category":
edges_df[col] = edges_df[col].str.split("|")

# Prefixing only these two fields is an odd thing that Translator needs, so
# they're being duplicated with the prefixes here
edges_df["biolink:primary_knowledge_source"] = edges_df["primary_knowledge_source"]
edges_df["biolink:aggregator_knowledge_source"] = edges_df["aggregator_knowledge_source"]
edges_df.to_json("output/monarch-kg_edges.jsonl", orient="records", lines=True)
del edges_df
gc.collect()
Expand All @@ -465,8 +466,22 @@ def load_jsonl():
def export_tsv():
    """Delegate to ``export()``; takes no arguments and returns nothing."""
    export()

def do_prepare_release(dir: str = OUTPUT_DIR):
    """Compress the large release artifacts found in ``dir`` using pigz.

    Each artifact is compressed only when the uncompressed file exists and
    no ``.gz`` sibling is present yet, so calling this repeatedly is a no-op
    once the artifacts have been compressed.

    Args:
        dir: Directory holding the release artifacts. Previously this
            argument was ignored and ``output/`` was always used; paths are
            now derived from it, matching how ``do_release`` uses ``dir``.
    """
    artifact_names = (
        "monarch-kg.duckdb",
        "monarch-kg-denormalized-edges.tsv",
        "monarch-kg-denormalized-nodes.tsv",
    )

    for name in artifact_names:
        artifact = Path(dir) / name
        compressed = artifact.with_name(f"{artifact.name}.gz")
        # Leave missing artifacts alone, and do not clobber an existing .gz.
        if artifact.exists() and not compressed.exists():
            sh.pigz(str(artifact), force=True)

def do_release(dir: str = OUTPUT_DIR, kghub: bool = False):

# NOTE: files that should be compressed are expected to have been compressed already by do_prepare_release

with open(f"{dir}/metadata.yaml", "r") as f:
versions = yaml.load(f, Loader=yaml.FullLoader)

Expand Down
44 changes: 23 additions & 21 deletions src/monarch_ingest/data-dump-config.yaml
Original file line number Diff line number Diff line change
@@ -1,34 +1,36 @@
---
disease_associations:
disease_phenotype.all.tsv.gz:
- 'category:"biolink:DiseaseToPhenotypicFeatureAssociation"'
exploded: true
category: 'biolink:DiseaseToPhenotypicFeatureAssociation'
gene_associations:
gene_phenotype.all.tsv.gz:
- 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
exploded: true
category: "biolink:GeneToPhenotypicFeatureAssociation"
gene_phenotype.4896.tsv.gz:
- 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
- 'subject_taxon:"NCBITaxon:4896"'
category: "biolink:GeneToPhenotypicFeatureAssociation"
subject_taxon: "NCBITaxon:4896"
gene_phenotype.6239.tsv.gz:
- 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
- 'subject_taxon:"NCBITaxon:6239"'
category: "biolink:GeneToPhenotypicFeatureAssociation"
subject_taxon: "NCBITaxon:6239"
gene_phenotype.7955.tsv.gz:
- 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
- 'subject_taxon:"NCBITaxon:7955"'
category: "biolink:GeneToPhenotypicFeatureAssociation"
subject_taxon: "NCBITaxon:7955"
gene_phenotype.8355.tsv.gz:
- 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
- 'subject_taxon:"NCBITaxon:8355"'
category: "biolink:GeneToPhenotypicFeatureAssociation"
subject_taxon: "NCBITaxon:8355"
gene_phenotype.8364.tsv.gz:
- 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
- 'subject_taxon:"NCBITaxon:8364"'
category: "biolink:GeneToPhenotypicFeatureAssociation"
subject_taxon: "NCBITaxon:8364"
gene_phenotype.9606.tsv.gz:
- 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
- 'subject_taxon:"NCBITaxon:9606"'
category: "biolink:GeneToPhenotypicFeatureAssociation"
subject_taxon: "NCBITaxon:9606"
gene_phenotype.10090.tsv.gz:
- 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
- 'subject_taxon:"NCBITaxon:10090"'
category: "biolink:GeneToPhenotypicFeatureAssociation"
subject_taxon: "NCBITaxon:10090"
gene_phenotype.10116.tsv.gz:
- 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
- 'subject_taxon:"NCBITaxon:10116"'
category: "biolink:GeneToPhenotypicFeatureAssociation"
subject_taxon: "NCBITaxon:10116"
# Leaving as a placeholder because we don't have fly g2p right now
# gene_phenotype.7227.tsv.gz:
# - 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
Expand All @@ -46,7 +48,7 @@ gene_associations:
# - '-subject_taxon:"NCBITaxon:10090"'
# - '-subject_taxon:"NCBITaxon:10116"'
gene_disease.9606.tsv.gz:
- 'category:"biolink:CausalGeneToDiseaseAssociation"'
- 'subject_taxon:"NCBITaxon:9606"'
category: "biolink:CausalGeneToDiseaseAssociation"
subject_taxon: "NCBITaxon:9606"
gene_disease.noncausal.tsv.gz:
- 'category:"biolink:CorrelatedGeneToDiseaseAssociation"'
category: "biolink:CorrelatedGeneToDiseaseAssociation"
4 changes: 4 additions & 0 deletions src/monarch_ingest/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from kghub_downloader.download_utils import download_from_yaml
from monarch_ingest.cli_utils import (
apply_closure,
do_prepare_release,
do_release,
export_tsv,
get_data_versions,
Expand Down Expand Up @@ -163,6 +164,9 @@ def solr():
def export():
export_tsv()

@typer_app.command()
def prepare_release():
    """Compress release artifacts so they are ready to be uploaded."""
    do_prepare_release()

@typer_app.command()
def release(
Expand Down
Loading

0 comments on commit 2cbd1b1

Please sign in to comment.