Commit 7761c93
Remove the last bit of solr from the export code; add a prepare-release step to compress anything that needs to be compressed, to eventually get rid of all of the scattered compressing and decompressing steps.
kevinschaper committed Jun 5, 2024
1 parent 45c7470 commit 7761c93
Showing 5 changed files with 31 additions and 26 deletions.
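
In outline, the Jenkins pipeline now exports uncompressed artifacts, compresses whatever still needs it in a dedicated prepare-release stage, and only then uploads. A rough Python equivalent of the three stages (the subprocess loop is a hypothetical illustration; the commands themselves are the ones in the Jenkinsfile diff below):

```python
# Sketch of the release flow after this commit. The wrapper loop is
# hypothetical; the three commands are the ones run by the Jenkinsfile.
import subprocess

for step in (
    "poetry run ingest export",           # write artifacts, uncompressed
    "poetry run ingest prepare-release",  # pigz anything not yet compressed
    "poetry run ingest release --kghub",  # upload the release
):
    subprocess.run(step.split(), check=True)
```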
5 changes: 5 additions & 0 deletions Jenkinsfile
```diff
@@ -99,6 +99,11 @@ pipeline {
         sh 'poetry run ingest export'
       }
     }
+    stage('prepare release') {
+      steps {
+        sh 'poetry run ingest prepare-release'
+      }
+    }
     stage('upload files') {
       steps {
         sh 'poetry run ingest release --kghub'
```
2 changes: 1 addition & 1 deletion scripts/load_solr.sh
```diff
@@ -86,4 +86,4 @@ chmod -R a+rX solr-data

 tar czf solr.tar.gz -C solr-data data
 mv solr.tar.gz output/
-pigz --force output/monarch-kg-denormalized-edges.tsv
+
```
18 changes: 13 additions & 5 deletions src/monarch_ingest/cli_utils.py
```diff
@@ -390,11 +390,6 @@ def apply_closure(
         grouping_fields=["subject", "negated", "predicate", "object"],
     )
     sh.mv(database, f"{output_dir}/")
-    # TODO: need to move this compress step to being after the export
-    # sh.pigz(f"{output_dir}/{database}", force=True)
-    sh.pigz(edges_output_file, force=True)
-    sh.pigz(nodes_output_file, force=True)
-

 def load_sqlite():
     sh.bash("scripts/load_sqlite.sh")
@@ -471,9 +466,22 @@ def load_jsonl():
 def export_tsv():
     export()

+def do_prepare_release(dir: str = OUTPUT_DIR):
+
+    compressed_artifacts = [
+        'output/monarch-kg.duckdb',
+        'output/monarch-kg-denormalized-edges.tsv',
+        'output/monarch-kg-denormalized-nodes.tsv',
+    ]
+
+    for artifact in compressed_artifacts:
+        if Path(artifact).exists() and not Path(f"{artifact}.gz").exists():
+            sh.pigz(artifact, force=True)

 def do_release(dir: str = OUTPUT_DIR, kghub: bool = False):

+    # ensure that files that should be compressed are
+
     with open(f"{dir}/metadata.yaml", "r") as f:
         versions = yaml.load(f, Loader=yaml.FullLoader)
```
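The new do_prepare_release is idempotent: each artifact is compressed only if it exists and its .gz twin does not, so re-running the stage after a partial release is safe. The same check, pulled out as a standalone sketch (compress_if_needed is a hypothetical helper name; it assumes the `sh` and `pathlib` imports that cli_utils.py already uses):

```python
from pathlib import Path

import sh  # the `sh` command wrapper already used by cli_utils.py

def compress_if_needed(artifact: str) -> bool:
    """Compress artifact with pigz unless it is missing or already compressed."""
    if Path(artifact).exists() and not Path(f"{artifact}.gz").exists():
        sh.pigz(artifact, force=True)  # pigz replaces artifact with artifact.gz
        return True
    return False
```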
4 changes: 4 additions & 0 deletions src/monarch_ingest/main.py
```diff
@@ -5,6 +5,7 @@
 from kghub_downloader.download_utils import download_from_yaml
 from monarch_ingest.cli_utils import (
     apply_closure,
+    do_prepare_release,
     do_release,
     export_tsv,
     get_data_versions,
@@ -163,6 +164,9 @@ def solr():
 def export():
     export_tsv()

+@typer_app.command()
+def prepare_release():
+    do_prepare_release()

 @typer_app.command()
 def release(
```
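The command name Typer derives from the function is prepare-release (underscores become hyphens), matching the Jenkinsfile stage above. A hypothetical in-process smoke test using Typer's test runner, not part of this commit:

```python
from typer.testing import CliRunner

from monarch_ingest.main import typer_app

runner = CliRunner()
result = runner.invoke(typer_app, ["prepare-release"])
assert result.exit_code == 0  # compression pass ran (or found nothing to do)
```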
28 changes: 8 additions & 20 deletions src/monarch_ingest/utils/export_utils.py
```diff
@@ -75,11 +75,17 @@ def export(
     config_file: str = "./src/monarch_ingest/data-dump-config.yaml",
     output_dir: str = "./output/tsv/",
     output_format: OutputType = OutputType.tsv,
-    solr_url: str = "http://localhost:8983/solr/association/select",
+    database_file = 'output/monarch-kg.duckdb'
 ):

     if output_format not in OUTPUT_TYPES:
         raise ValueError(f"output format not supported, supported formats are {OUTPUT_TYPES}")

+    if Path(f'{database_file}.gz').exists():
+        with gzip.open(f'{database_file}.gz', 'rb') as f_in:
+            with open(database_file, 'wb') as f_out:
+                f_out.write(f_in.read())
+
     database = duckdb.connect('output/monarch-kg.duckdb')
     dir_path = Path(output_dir)
@@ -88,28 +94,10 @@
     dump_dir = dir_path / association_dir
     dump_dir.mkdir(parents=True, exist_ok=True)

-    # wt=json&facet=true&json.nl=arrarr&rows=0&q=*:*&facet.field=association_type
-    assoc_params = {
-        'q': '*:*',
-        'wt': 'json',
-        'json.nl': 'arrarr',
-        'rows': 0,
-        'facet': 'true',
-        'facet.field': 'category',
-    }
-
-    solr_request = requests.get(solr_url, params=assoc_params)
-    response = solr_request.json()
-    solr_request.close()
-
     for association_category in get_association_categories(database):
         category_name = camel_to_snake(re.sub(r'biolink:', '', association_category))
-        # quote the facet value because of the biolink: prefix
-        association_category = f'"{association_category}"'
-        print(association_category)
         file = f"{category_name}.all.{output_format.value}.gz"
         dump_file = str(dump_dir / file)
-        filters = ['category:{}'.format(association_category)]
         export_annotations(database=database,
                            fields=get_fields(association_category),
                            category=association_category,
@@ -166,7 +154,7 @@ def export_annotations(database, fields: List[str], output_file: str, category:
         WHERE category = '{category}' {taxon_filter}
     ) to '{output_file}' (header, delimiter '\t')
     """
-    # database.execute(sql)
+    database.execute(sql)

 def export_exploded_annotations(database,
                                 fields: List[str],
```
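The decompression step added above reads the entire .gz archive into memory before writing it back out. For a multi-gigabyte DuckDB file, a streaming copy would do the same work in constant memory; a sketch of that alternative (gunzip_streaming is a hypothetical helper; shutil.copyfileobj is stdlib):

```python
import gzip
import shutil

def gunzip_streaming(src_gz: str, dest: str, chunk_size: int = 16 * 1024 * 1024) -> None:
    """Decompress src_gz to dest in fixed-size chunks rather than one full read."""
    with gzip.open(src_gz, "rb") as f_in, open(dest, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out, chunk_size)
```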
