Commit 7761c93
Remove the last bit of solr from the export code; add a prepare-release step to compress anything that needs to be compressed, to eventually get rid of all of the scattered compressing and decompressing steps.
kevinschaper committed Jun 5, 2024
1 parent 45c7470 commit 7761c93
Showing 5 changed files with 31 additions and 26 deletions.
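
In outline, the Jenkins pipeline now exports uncompressed artifacts, compresses whatever still needs it in a dedicated prepare-release stage, and only then uploads. A rough Python equivalent of the three stages (the subprocess loop is a hypothetical illustration; the commands themselves are the ones in the Jenkinsfile diff below):

```python
# Sketch of the release flow after this commit. The wrapper loop is
# hypothetical; the three commands are the ones run by the Jenkinsfile.
import subprocess

for step in (
    "poetry run ingest export",           # write artifacts, uncompressed
    "poetry run ingest prepare-release",  # pigz anything not yet compressed
    "poetry run ingest release --kghub",  # upload the release
):
    subprocess.run(step.split(), check=True)
```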
5 changes: 5 additions & 0 deletions Jenkinsfile
```diff
@@ -99,6 +99,11 @@ pipeline {
         sh 'poetry run ingest export'
       }
     }
+    stage('prepare release') {
+      steps {
+        sh 'poetry run ingest prepare-release'
+      }
+    }
     stage('upload files') {
       steps {
         sh 'poetry run ingest release --kghub'
```
2 changes: 1 addition & 1 deletion scripts/load_solr.sh
```diff
@@ -86,4 +86,4 @@ chmod -R a+rX solr-data

 tar czf solr.tar.gz -C solr-data data
 mv solr.tar.gz output/
-pigz --force output/monarch-kg-denormalized-edges.tsv
+
```
18 changes: 13 additions & 5 deletions src/monarch_ingest/cli_utils.py
```diff
@@ -390,11 +390,6 @@ def apply_closure(
         grouping_fields=["subject", "negated", "predicate", "object"],
     )
     sh.mv(database, f"{output_dir}/")
-    # TODO: need to move this compress step to being after the export
-    # sh.pigz(f"{output_dir}/{database}", force=True)
-    sh.pigz(edges_output_file, force=True)
-    sh.pigz(nodes_output_file, force=True)
-

 def load_sqlite():
     sh.bash("scripts/load_sqlite.sh")
@@ -471,9 +466,22 @@ def load_jsonl():
 def export_tsv():
     export()

+def do_prepare_release(dir: str = OUTPUT_DIR):
+
+    compressed_artifacts = [
+        'output/monarch-kg.duckdb',
+        'output/monarch-kg-denormalized-edges.tsv',
+        'output/monarch-kg-denormalized-nodes.tsv',
+    ]
+
+    for artifact in compressed_artifacts:
+        if Path(artifact).exists() and not Path(f"{artifact}.gz").exists():
+            sh.pigz(artifact, force=True)

 def do_release(dir: str = OUTPUT_DIR, kghub: bool = False):

+    # ensure that files that should be compressed are
+
     with open(f"{dir}/metadata.yaml", "r") as f:
         versions = yaml.load(f, Loader=yaml.FullLoader)
```
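The new do_prepare_release is idempotent: each artifact is compressed only if it exists and its .gz twin does not, so re-running the stage after a partial release is safe. The same check, pulled out as a standalone sketch (compress_if_needed is a hypothetical helper name; it assumes the `sh` and `pathlib` imports that cli_utils.py already uses):

```python
from pathlib import Path

import sh  # the `sh` command wrapper already used by cli_utils.py

def compress_if_needed(artifact: str) -> bool:
    """Compress artifact with pigz unless it is missing or already compressed."""
    if Path(artifact).exists() and not Path(f"{artifact}.gz").exists():
        sh.pigz(artifact, force=True)  # pigz replaces artifact with artifact.gz
        return True
    return False
```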
4 changes: 4 additions & 0 deletions src/monarch_ingest/main.py
```diff
@@ -5,6 +5,7 @@
 from kghub_downloader.download_utils import download_from_yaml
 from monarch_ingest.cli_utils import (
     apply_closure,
+    do_prepare_release,
     do_release,
     export_tsv,
     get_data_versions,
@@ -163,6 +164,9 @@ def solr():
 def export():
     export_tsv()

+@typer_app.command()
+def prepare_release():
+    do_prepare_release()

 @typer_app.command()
 def release(
```
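The command name Typer derives from the function is prepare-release (underscores become hyphens), matching the Jenkinsfile stage above. A hypothetical in-process smoke test using Typer's test runner, not part of this commit:

```python
from typer.testing import CliRunner

from monarch_ingest.main import typer_app

runner = CliRunner()
result = runner.invoke(typer_app, ["prepare-release"])
assert result.exit_code == 0  # compression pass ran (or found nothing to do)
```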
28 changes: 8 additions & 20 deletions src/monarch_ingest/utils/export_utils.py
```diff
@@ -75,11 +75,17 @@ def export(
     config_file: str = "./src/monarch_ingest/data-dump-config.yaml",
     output_dir: str = "./output/tsv/",
     output_format: OutputType = OutputType.tsv,
-    solr_url: str = "http://localhost:8983/solr/association/select",
+    database_file = 'output/monarch-kg.duckdb'
 ):

     if output_format not in OUTPUT_TYPES:
         raise ValueError(f"output format not supported, supported formats are {OUTPUT_TYPES}")

+    if Path(f'{database_file}.gz').exists():
+        with gzip.open(f'{database_file}.gz', 'rb') as f_in:
+            with open(database_file, 'wb') as f_out:
+                f_out.write(f_in.read())
+
     database = duckdb.connect('output/monarch-kg.duckdb')
     dir_path = Path(output_dir)
@@ -88,28 +94,10 @@
     dump_dir = dir_path / association_dir
     dump_dir.mkdir(parents=True, exist_ok=True)

-    # wt=json&facet=true&json.nl=arrarr&rows=0&q=*:*&facet.field=association_type
-    assoc_params = {
-        'q': '*:*',
-        'wt': 'json',
-        'json.nl': 'arrarr',
-        'rows': 0,
-        'facet': 'true',
-        'facet.field': 'category',
-    }
-
-    solr_request = requests.get(solr_url, params=assoc_params)
-    response = solr_request.json()
-    solr_request.close()
-
     for association_category in get_association_categories(database):
         category_name = camel_to_snake(re.sub(r'biolink:', '', association_category))
-        # quote the facet value because of the biolink: prefix
-        association_category = f'"{association_category}"'
-        print(association_category)
         file = f"{category_name}.all.{output_format.value}.gz"
         dump_file = str(dump_dir / file)
-        filters = ['category:{}'.format(association_category)]
         export_annotations(database=database,
                            fields=get_fields(association_category),
                            category=association_category,
@@ -166,7 +154,7 @@ def export_annotations(database, fields: List[str], output_file: str, category:
         WHERE category = '{category}' {taxon_filter}
     ) to '{output_file}' (header, delimiter '\t')
     """
-    # database.execute(sql)
+    database.execute(sql)

 def export_exploded_annotations(database,
                                 fields: List[str],
```
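The decompression step added above reads the entire .gz archive into memory before writing it back out. For a multi-gigabyte DuckDB file, a streaming copy would do the same work in constant memory; a sketch of that alternative (gunzip_streaming is a hypothetical helper; shutil.copyfileobj is stdlib):

```python
import gzip
import shutil

def gunzip_streaming(src_gz: str, dest: str, chunk_size: int = 16 * 1024 * 1024) -> None:
    """Decompress src_gz to dest in fixed-size chunks rather than one full read."""
    with gzip.open(src_gz, "rb") as f_in, open(dest, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out, chunk_size)
```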
