Merge pull request #588 from monarch-initiative/duckdb-export

Create exploded association exports that traverse closures
monarch-initiative · Jun 7, 2024 · 2cbd1b1 · 2cbd1b1
2 parents 134ae2f + 7761c93
commit 2cbd1b1
Show file tree

Hide file tree

Showing 6 changed files with 187 additions and 242 deletions.
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -99,6 +99,11 @@ pipeline {
                 sh 'poetry run ingest export'
             }
         }
+        stage('prepare release') {
+            steps {
+                sh 'poetry run ingest prepare-release'
+            }
+        }
         stage('upload files') {
             steps {
                 sh 'poetry run ingest release --kghub'

diff --git a/scripts/load_solr.sh b/scripts/load_solr.sh
@@ -86,4 +86,4 @@ chmod -R a+rX solr-data
 
 tar czf solr.tar.gz -C solr-data data
 mv solr.tar.gz output/
-pigz --force output/monarch-kg-denormalized-edges.tsv
+
diff --git a/src/monarch_ingest/cli_utils.py b/src/monarch_ingest/cli_utils.py
@@ -390,10 +390,6 @@ def apply_closure(
         grouping_fields=["subject", "negated", "predicate", "object"],
     )
     sh.mv(database, f"{output_dir}/")
-    sh.pigz(f"{output_dir}/{database}", force=True)
-    sh.pigz(edges_output_file, force=True)
-    sh.pigz(nodes_output_file, force=True)
-
 
 def load_sqlite():
     sh.bash("scripts/load_sqlite.sh")
@@ -420,7 +416,11 @@ def load_jsonl():
         edge_path = tar.getmember("monarch-kg_edges.tsv")
 
         with tar.extractfile(node_path) as node_file:  # type: ignore
-            nodes_df = pandas.read_csv(node_file, sep="\t", dtype="string", lineterminator="\n", quoting=csv.QUOTE_NONE)
+            nodes_df = pandas.read_csv(node_file, sep="\t",
+                                       dtype="string",
+                                       lineterminator="\n",
+                                       quoting=csv.QUOTE_NONE,
+                                       comment=None)
             nodes_df["category"] = nodes_df["category"].map(class_ancestor_dict)
             # for each column in nodes_df, if schemaview says it's multivalued, convert the contents to a list splitting on |
             for col in nodes_df.columns:
@@ -434,7 +434,12 @@ def load_jsonl():
 
         with tar.extractfile(edge_path) as edge_file:  # type: ignore
             edges_df = pandas.read_csv(
-                edge_file, sep="\t", dtype="string", lineterminator="\n", quoting=csv.QUOTE_NONE, comment="#"
+                edge_file,
+                sep="\t",
+                dtype="string",
+                lineterminator="\n",
+                quoting=csv.QUOTE_NONE,
+                comment=None
             )
             edges_df["category"] = edges_df["category"].map(class_ancestor_dict)
 
@@ -445,10 +450,6 @@ def load_jsonl():
                     if slot and slot.multivalued and col != "category":
                         edges_df[col] = edges_df[col].str.split("|")
 
-            # Prefixing only these two fields is an odd thing that Translator needs, so
-            # they're being duplicated with the prefixes here
-            edges_df["biolink:primary_knowledge_source"] = edges_df["primary_knowledge_source"]
-            edges_df["biolink:aggregator_knowledge_source"] = edges_df["aggregator_knowledge_source"]
             edges_df.to_json("output/monarch-kg_edges.jsonl", orient="records", lines=True)
             del edges_df
             gc.collect()
@@ -465,8 +466,22 @@ def load_jsonl():
 def export_tsv():
     export()
 
+def do_prepare_release(dir: str = OUTPUT_DIR):
+
+    compressed_artifacts = [
+        'output/monarch-kg.duckdb',
+        'output/monarch-kg-denormalized-edges.tsv',
+        'output/monarch-kg-denormalized-nodes.tsv',
+    ]
+
+    for artifact in compressed_artifacts:
+        if Path(artifact).exists() and not Path(f"{artifact}.gz").exists():
+            sh.pigz(artifact, force=True)
 
 def do_release(dir: str = OUTPUT_DIR, kghub: bool = False):
+
+    # files that should be compressed are expected to be compressed already (see do_prepare_release)
+
     with open(f"{dir}/metadata.yaml", "r") as f:
         versions = yaml.load(f, Loader=yaml.FullLoader)
 

diff --git a/src/monarch_ingest/data-dump-config.yaml b/src/monarch_ingest/data-dump-config.yaml
@@ -1,34 +1,36 @@
 ---
 disease_associations:
   disease_phenotype.all.tsv.gz:
-  - 'category:"biolink:DiseaseToPhenotypicFeatureAssociation"'
+    exploded: true
+    category: 'biolink:DiseaseToPhenotypicFeatureAssociation'
 gene_associations:
   gene_phenotype.all.tsv.gz:
-  - 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
+    exploded: true
+    category: "biolink:GeneToPhenotypicFeatureAssociation"
   gene_phenotype.4896.tsv.gz:
-  - 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
-  - 'subject_taxon:"NCBITaxon:4896"'
+      category: "biolink:GeneToPhenotypicFeatureAssociation"
+      subject_taxon: "NCBITaxon:4896"
   gene_phenotype.6239.tsv.gz:
-  - 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
-  - 'subject_taxon:"NCBITaxon:6239"'
+    category: "biolink:GeneToPhenotypicFeatureAssociation"
+    subject_taxon: "NCBITaxon:6239"
   gene_phenotype.7955.tsv.gz:
-  - 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
-  - 'subject_taxon:"NCBITaxon:7955"'
+    category: "biolink:GeneToPhenotypicFeatureAssociation"
+    subject_taxon: "NCBITaxon:7955"
   gene_phenotype.8355.tsv.gz:
-  - 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
-  - 'subject_taxon:"NCBITaxon:8355"'
+    category: "biolink:GeneToPhenotypicFeatureAssociation"
+    subject_taxon: "NCBITaxon:8355"
   gene_phenotype.8364.tsv.gz:
-  - 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
-  - 'subject_taxon:"NCBITaxon:8364"'
+    category: "biolink:GeneToPhenotypicFeatureAssociation"
+    subject_taxon: "NCBITaxon:8364"
   gene_phenotype.9606.tsv.gz:
-  - 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
-  - 'subject_taxon:"NCBITaxon:9606"'
+    category: "biolink:GeneToPhenotypicFeatureAssociation"
+    subject_taxon: "NCBITaxon:9606"
   gene_phenotype.10090.tsv.gz:
-  - 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
-  - 'subject_taxon:"NCBITaxon:10090"'
+    category: "biolink:GeneToPhenotypicFeatureAssociation"
+    subject_taxon: "NCBITaxon:10090"
   gene_phenotype.10116.tsv.gz:
-  - 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
-  - 'subject_taxon:"NCBITaxon:10116"'
+    category: "biolink:GeneToPhenotypicFeatureAssociation"
+    subject_taxon: "NCBITaxon:10116"
 # Leaving as a placeholder because we don't have fly g2p right now
 #  gene_phenotype.7227.tsv.gz:
 #  - 'category:"biolink:GeneToPhenotypicFeatureAssociation"'
@@ -46,7 +48,7 @@ gene_associations:
 #  - '-subject_taxon:"NCBITaxon:10090"'
 #  - '-subject_taxon:"NCBITaxon:10116"'
   gene_disease.9606.tsv.gz:
-  - 'category:"biolink:CausalGeneToDiseaseAssociation"'
-  - 'subject_taxon:"NCBITaxon:9606"'
+    category: "biolink:CausalGeneToDiseaseAssociation"
+    subject_taxon: "NCBITaxon:9606"
   gene_disease.noncausal.tsv.gz:
-  - 'category:"biolink:CorrelatedGeneToDiseaseAssociation"'
+    category: "biolink:CorrelatedGeneToDiseaseAssociation"
diff --git a/src/monarch_ingest/main.py b/src/monarch_ingest/main.py
@@ -5,6 +5,7 @@
 from kghub_downloader.download_utils import download_from_yaml
 from monarch_ingest.cli_utils import (
     apply_closure,
+    do_prepare_release,
     do_release,
     export_tsv,
     get_data_versions,
@@ -163,6 +164,9 @@ def solr():
 def export():
     export_tsv()
 
+@typer_app.command()
+def prepare_release():
+    do_prepare_release();
 
 @typer_app.command()
 def release(
Original file line number	Diff line number	Diff line change
Expand Up		@@ -86,4 +86,4 @@ chmod -R a+rX solr-data

		tar czf solr.tar.gz -C solr-data data
		mv solr.tar.gz output/
		pigz --force output/monarch-kg-denormalized-edges.tsv