diff --git a/Jenkinsfile b/Jenkinsfile index 9d1b27cd..3c431e67 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -99,6 +99,11 @@ pipeline { sh 'poetry run ingest export' } } + stage('prepare release') { + steps { + sh 'poetry run ingest prepare-release' + } + } stage('upload files') { steps { sh 'poetry run ingest release --kghub' diff --git a/scripts/load_solr.sh b/scripts/load_solr.sh index 6434af62..1cd08883 100755 --- a/scripts/load_solr.sh +++ b/scripts/load_solr.sh @@ -86,4 +86,4 @@ chmod -R a+rX solr-data tar czf solr.tar.gz -C solr-data data mv solr.tar.gz output/ -pigz --force output/monarch-kg-denormalized-edges.tsv + diff --git a/src/monarch_ingest/cli_utils.py b/src/monarch_ingest/cli_utils.py index 23f45ac3..e36a7e8c 100644 --- a/src/monarch_ingest/cli_utils.py +++ b/src/monarch_ingest/cli_utils.py @@ -390,11 +390,6 @@ def apply_closure( grouping_fields=["subject", "negated", "predicate", "object"], ) sh.mv(database, f"{output_dir}/") - # TODO: need to move this compress step to being after the export - # sh.pigz(f"{output_dir}/{database}", force=True) - sh.pigz(edges_output_file, force=True) - sh.pigz(nodes_output_file, force=True) - def load_sqlite(): sh.bash("scripts/load_sqlite.sh") @@ -471,9 +466,22 @@ def load_jsonl(): def export_tsv(): export() +def do_prepare_release(dir: str = OUTPUT_DIR): + + compressed_artifacts = [ + 'output/monarch-kg.duckdb', + 'output/monarch-kg-denormalized-edges.tsv', + 'output/monarch-kg-denormalized-nodes.tsv', + ] + for artifact in compressed_artifacts: + if Path(artifact).exists() and not Path(f"{artifact}.gz").exists(): + sh.pigz(artifact, force=True) def do_release(dir: str = OUTPUT_DIR, kghub: bool = False): + + # ensure that files that should be compressed are + with open(f"{dir}/metadata.yaml", "r") as f: versions = yaml.load(f, Loader=yaml.FullLoader) diff --git a/src/monarch_ingest/main.py b/src/monarch_ingest/main.py index fdbb4c84..0e9ceb71 100644 --- a/src/monarch_ingest/main.py +++ 
b/src/monarch_ingest/main.py @@ -5,6 +5,7 @@ from kghub_downloader.download_utils import download_from_yaml from monarch_ingest.cli_utils import ( apply_closure, + do_prepare_release, do_release, export_tsv, get_data_versions, @@ -163,6 +164,9 @@ def solr(): def export(): export_tsv() +@typer_app.command() +def prepare_release(): + do_prepare_release() @typer_app.command() def release( diff --git a/src/monarch_ingest/utils/export_utils.py b/src/monarch_ingest/utils/export_utils.py index a9c217fb..4af7ea9c 100755 --- a/src/monarch_ingest/utils/export_utils.py +++ b/src/monarch_ingest/utils/export_utils.py @@ -75,11 +75,17 @@ def export( config_file: str = "./src/monarch_ingest/data-dump-config.yaml", output_dir: str = "./output/tsv/", output_format: OutputType = OutputType.tsv, - solr_url: str = "http://localhost:8983/solr/association/select", + database_file: str = 'output/monarch-kg.duckdb' ): if output_format not in OUTPUT_TYPES: raise ValueError(f"output format not supported, supported formats are {OUTPUT_TYPES}") + + if Path(f'{database_file}.gz').exists(): + with gzip.open(f'{database_file}.gz', 'rb') as f_in: + with open(database_file, 'wb') as f_out: + f_out.write(f_in.read()) + database = duckdb.connect('output/monarch-kg.duckdb') dir_path = Path(output_dir) @@ -88,28 +94,10 @@ def export( dump_dir = dir_path / association_dir dump_dir.mkdir(parents=True, exist_ok=True) - # wt=json&facet=true&json.nl=arrarr&rows=0&q=*:*&facet.field=association_type - assoc_params = { - 'q': '*:*', - 'wt': 'json', - 'json.nl': 'arrarr', - 'rows': 0, - 'facet': 'true', - 'facet.field': 'category', - } - - solr_request = requests.get(solr_url, params=assoc_params) - response = solr_request.json() - solr_request.close() - for association_category in get_association_categories(database): category_name = camel_to_snake(re.sub(r'biolink:', '', association_category)) - # quote the facet value because of the biolink: prefix - association_category = f'"{association_category}"' - 
print(association_category) file = f"{category_name}.all.{output_format.value}.gz" dump_file = str(dump_dir / file) - filters = ['category:{}'.format(association_category)] export_annotations(database=database, fields=get_fields(association_category), category=association_category, @@ -166,7 +154,7 @@ def export_annotations(database, fields: List[str], output_file: str, category: WHERE category = '{category}' {taxon_filter} ) to '{output_file}' (header, delimiter '\t') """ - # database.execute(sql) + database.execute(sql) def export_exploded_annotations(database, fields: List[str],