From 5e9151b953c583cac48393cf0c375f5d95f48947 Mon Sep 17 00:00:00 2001
From: Daniel Korn
Date: Thu, 19 Sep 2024 13:03:18 -0400
Subject: [PATCH] Changed Monarch KG ingest to automatically pull from the latest version. Also made process to check the metadata yaml file for the publishing date of the latest version.

---
 parsers/monarchkg/src/loadMonarchKG.py | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/parsers/monarchkg/src/loadMonarchKG.py b/parsers/monarchkg/src/loadMonarchKG.py
index 2417cb2f..8cb23931 100644
--- a/parsers/monarchkg/src/loadMonarchKG.py
+++ b/parsers/monarchkg/src/loadMonarchKG.py
@@ -2,11 +2,12 @@
 import os
 import tarfile
 import orjson
+import requests
 
 from Common.loader_interface import SourceDataLoader
 from Common.kgxmodel import kgxedge
 from Common.biolink_constants import *
-from Common.utils import GetData
+from Common.utils import GetData, GetDataPullError
 
 
 ##############
@@ -29,7 +30,7 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None):
 
-        # there is a /latest/ for this url, but without a valid get_latest_source_version function,
-        # it could create a mismatch, pin to this version for now
-        self.data_url = 'https://data.monarchinitiative.org/monarch-kg-dev/2024-03-18/'
+        # pull from /latest/ - get_latest_source_version reads kg-version from the metadata.yaml published
+        # there, so the version label comes from the same release location as the downloaded data
+        self.data_url = 'https://data.monarchinitiative.org/monarch-kg-dev/latest/'
         self.monarch_graph_archive = 'monarch-kg.jsonl.tar.gz'
         self.monarch_edge_file_archive_path = 'monarch-kg_edges.jsonl'
         self.data_files = [self.monarch_graph_archive]
@@ -63,9 +64,28 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None):
         }
 
     def get_latest_source_version(self) -> str:
-        # possible to retrieve from /latest/index.html with beautifulsoup or some html parser but not ideal,
-        # planning to try to set up a better method with owners
-        latest_version = '2024-03-18'
+        """
+        Gets the name of the latest Monarch KG version from the release metadata.
+
+        :return: the value of the 'kg-version' entry in the /latest/ metadata.yaml
+        :raises GetDataPullError: if the metadata cannot be retrieved or has no 'kg-version' value
+        """
+        metadata_url = 'https://data.monarchinitiative.org/monarch-kg-dev/latest/metadata.yaml'
+        latest_version = None
+        try:
+            # a timeout keeps a stalled connection from hanging the ingest
+            metadata_yaml: requests.Response = requests.get(metadata_url, timeout=60)
+            # fail on HTTP errors rather than scanning an error page for the key
+            metadata_yaml.raise_for_status()
+            for line in metadata_yaml.text.splitlines():
+                key, _, value = line.strip().partition(':')
+                if key == 'kg-version':
+                    latest_version = value.strip()
+                    break
+            if not latest_version:
+                raise ValueError("Cannot find 'kg-version' in Monarch KG metadata yaml.")
+        except Exception as e:
+            raise GetDataPullError(error_message=f'Unable to determine latest version for Monarch KG: {e}') from e
         return latest_version
 
     def get_data(self) -> bool: