From 3a51dc8737cf59912be734ad36dcdd22a361e2a5 Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Tue, 14 Jan 2025 02:55:19 -0500 Subject: [PATCH] big refactor to improve graph spec handling and versioning - implements the intent of Dan Korn's PR to lazy load source versions only when needed - improves data source and subgraph version handling - adds ability to change local graph specs directory - improved example graph spec and docs - general clean up and refactoring of build_manager --- Common/build_manager.py | 453 +++++++++++----------- Common/kgx_file_merger.py | 9 +- Common/kgxmodel.py | 71 ++-- Common/load_manager.py | 18 +- Common/metadata.py | 58 +-- README.md | 88 +++-- graph_specs/default-graph-spec.yml | 4 +- graph_specs/example-graph-spec.yaml | 18 + graph_specs/testing-graph-spec.yml | 28 -- set_up_test_env.sh | 2 +- tests/graph_specs/testing-graph-spec.yaml | 10 + tests/test_graph_spec.py | 73 ++++ 12 files changed, 461 insertions(+), 371 deletions(-) create mode 100644 graph_specs/example-graph-spec.yaml delete mode 100644 graph_specs/testing-graph-spec.yml create mode 100644 tests/graph_specs/testing-graph-spec.yaml create mode 100644 tests/test_graph_spec.py diff --git a/Common/build_manager.py b/Common/build_manager.py index 74bd1776..453cddf4 100644 --- a/Common/build_manager.py +++ b/Common/build_manager.py @@ -3,7 +3,8 @@ import argparse import datetime import requests -import json + +from pathlib import Path from xxhash import xxh64_hexdigest from collections import defaultdict from Common.biolink_utils import BiolinkInformationResources, INFORES_STATUS_INVALID, INFORES_STATUS_DEPRECATED @@ -28,51 +29,54 @@ class GraphBuilder: - def __init__(self): + def __init__(self, + graph_specs_dir=None): self.logger = LoggingUtil.init_logging("ORION.Common.GraphBuilder", line_format='medium', log_file_path=os.environ['ORION_LOGS']) - # This dictionary holds the versions of graphs from the graph spec. 
- # This is more temperamental than it seems because the only way to get the current version for many sources - # is to retrieve them online. Graph versions are generated from underlying data source versions, so if versions - # are not explicitly specified in the graph spec, they may need to be retrieved. - self.graph_id_to_version = {} - - self.graphs_dir = self.init_graphs_dir() # path to the graphs output directory + self.graphs_dir = self.get_graphs_dir() # path to the graphs output directory self.source_data_manager = SourceDataManager() # access to the data sources and their metadata - self.graph_specs = self.load_graph_specs() # list of potential graphs to build (GraphSpec objects) + self.graph_specs = {} # graph_id -> GraphSpec all potential graphs that could be built, including sub-graphs + self.load_graph_specs(graph_specs_dir=graph_specs_dir) self.build_results = {} - def build_graph(self, graph_id: str): + def build_graph(self, graph_spec: GraphSpec): - self.logger.info(f'Building graph {graph_id}. Checking dependencies...') - graph_spec = self.get_graph_spec(graph_id) - graph_version = self.get_graph_version(graph_id) + graph_id = graph_spec.graph_id + self.logger.info(f'Building graph {graph_id}...') - if self.build_dependencies(graph_spec): - self.logger.info(f'Building graph {graph_id}. Dependencies are ready...') - else: - self.logger.warning(f'Aborting graph {graph_spec.graph_id}, building dependencies failed.') - return - - # check the status for previous builds of this version + graph_version = self.determine_graph_version(graph_spec) graph_metadata = self.get_graph_metadata(graph_id, graph_version) + graph_output_dir = self.get_graph_dir_path(graph_id, graph_version) + + # check for previous builds of this same graph build_status = graph_metadata.get_build_status() if build_status == Metadata.IN_PROGRESS: - self.logger.info(f'Graph {graph_id} version {graph_version} is already in progress. 
Skipping..') - return + self.logger.info(f'Graph {graph_id} version {graph_version} has status: in progress. ' + f'This means either the graph is already in the process of being built, ' + f'or an error occurred previously that could not be handled. ' + f'You may need to clean up and/or remove the failed build.') + return False if build_status == Metadata.BROKEN or build_status == Metadata.FAILED: self.logger.info(f'Graph {graph_id} version {graph_version} previously failed to build. Skipping..') - return - - graph_output_dir = self.get_graph_dir_path(graph_id, graph_version) - if build_status != Metadata.STABLE: + return False + if build_status == Metadata.STABLE: + self.logger.info(f'Graph {graph_id} version {graph_version} was already built.') + return True + else: # if we get here we need to build the graph - self.logger.info(f'Building graph {graph_id} version {graph_version}. Merging sources...') + self.logger.info(f'Building graph {graph_id} version {graph_version}, checking dependencies...') + if not self.build_dependencies(graph_spec): + self.logger.warning(f'Aborting graph {graph_spec.graph_id} version {graph_version}, building ' + f'dependencies failed.') + return False + + self.logger.info(f'Building graph {graph_id} version {graph_version}. 
' + f'Dependencies ready, merging sources...') graph_metadata.set_build_status(Metadata.IN_PROGRESS) graph_metadata.set_graph_version(graph_version) graph_metadata.set_graph_name(graph_spec.graph_name) @@ -90,16 +94,14 @@ def build_graph(self, graph_id: str): if "merge_error" in merge_metadata: graph_metadata.set_build_error(merge_metadata["merge_error"], current_time) graph_metadata.set_build_status(Metadata.FAILED) - self.logger.error(f'Error building graph {graph_id}.') - return + self.logger.error(f'Merge error occured while building graph {graph_id}: ' + f'{merge_metadata["merge_error"]}') + return False graph_metadata.set_build_info(merge_metadata, current_time) graph_metadata.set_build_status(Metadata.STABLE) self.logger.info(f'Building graph {graph_id} complete!') self.build_results[graph_id] = {'version': graph_version, 'success': True} - else: - self.logger.info(f'Graph {graph_id} version {graph_version} was already built.') - self.build_results[graph_id] = {'version': graph_version, 'success': False} if not graph_metadata.has_qc(): self.logger.info(f'Running QC for graph {graph_id}...') @@ -108,8 +110,8 @@ def build_graph(self, graph_id: str): if qc_results['pass']: self.logger.info(f'QC passed for graph {graph_id}.') else: - self.logger.info(f'QC failed for graph {graph_id}, bailing..') - return + self.logger.warning(f'QC failed for graph {graph_id}.') + self.build_results[graph_id] = {'version': graph_version, 'success': False} needs_meta_kg = not self.has_meta_kg(graph_directory=graph_output_dir) needs_test_data = not self.has_test_data(graph_directory=graph_output_dir) @@ -132,7 +134,7 @@ def build_graph(self, graph_id: str): logger=self.logger) if dump_success: - graph_output_url = self.get_graph_output_URL(graph_id, graph_version) + graph_output_url = self.get_graph_output_url(graph_id, graph_version) graph_metadata.set_dump_url(f'{graph_output_url}graph_{graph_version}.db.dump') if 'redundant_jsonl' in output_formats: @@ -140,100 +142,117 @@ def 
build_graph(self, graph_id: str): redundant_filepath = edges_filepath.replace(EDGES_FILENAME, REDUNDANT_EDGES_FILENAME) generate_redundant_kg(edges_filepath, redundant_filepath) - def get_graph_version(self, graph_id: str) -> str: - if graph_id not in self.graph_id_to_version: - graph_spec = self.get_graph_spec(graph_id) - if graph_spec is not None: - if graph_spec.graph_version is None: - try: - graph_spec.graph_version = self.generate_graph_version(graph_spec) - except (GetDataPullError, DataVersionError) as e: - raise GraphSpecError(error_message=e.error_message) - self.graph_id_to_version[graph_id] = graph_spec.graph_version - else: - raise GraphSpecError(error_message=f'Tried to determine the version for a ' - f'graph that was not found in the Graph Spec: {graph_id}.') - return self.graph_id_to_version[graph_id] + return True + + # determine a graph version utilizing versions of data sources, or just return the graph version specified + def determine_graph_version(self, graph_spec: GraphSpec): + # if the version was set or previously determined just back out + if graph_spec.graph_version: + return graph_spec.graph_version + try: + # go out and find the latest version for any data source that doesn't have a version specified + for source in graph_spec.sources: + if not source.source_version: + source.source_version = self.source_data_manager.get_latest_source_version(source.id) + self.logger.info(f'Source version - {source.id}: {source.version}') + + # for sub-graphs, if a graph version isn't specified, + # use the graph spec for that subgraph to determine a graph version + for subgraph in graph_spec.subgraphs: + if not subgraph.graph_version: + subgraph_graph_spec = self.graph_specs.get(subgraph.id, None) + if subgraph_graph_spec: + subgraph.graph_version = self.determine_graph_version(subgraph_graph_spec) + self.logger.info(f'found subgraph version, {graph_spec.graph_id}: {subgraph.graph_version}') + else: + raise GraphSpecError(f'Subgraph {subgraph.id} 
requested for graph {graph_spec.graph_id} ' + f'but the version was not specified and could not be determined without ' + f'a graph spec for {subgraph.id}.') + except (GetDataPullError, DataVersionError) as e: + raise GraphSpecError(error_message=e.error_message) + + # make a string that is a composite of versions and their merge strategy for each source + composite_version_string = "" + if graph_spec.sources: + composite_version_string += '_'.join([graph_source.version + '_' + graph_source.merge_strategy + if graph_source.merge_strategy else graph_source.version + for graph_source in graph_spec.sources]) + if graph_spec.subgraphs: + if composite_version_string: + composite_version_string += '_' + composite_version_string += '_'.join([sub_graph_source.version + '_' + sub_graph_source.merge_strategy + if sub_graph_source.merge_strategy else sub_graph_source.version + for sub_graph_source in graph_spec.subgraphs]) + graph_version = xxh64_hexdigest(composite_version_string) + graph_spec.graph_version = graph_version + self.logger.info(f'Version determined for graph {graph_spec.graph_id}: {graph_version} ({composite_version_string})') + return graph_version def build_dependencies(self, graph_spec: GraphSpec): + graph_id = graph_spec.graph_id for subgraph_source in graph_spec.subgraphs: subgraph_id = subgraph_source.id subgraph_version = subgraph_source.version - # Get the subgraph version from the subgraph source spec, - # which will either be one specified in the graph spec or None. - if subgraph_version is None: - try: - # if one was not specified, retrieve or generate it like we would any graph version - subgraph_version = self.get_graph_version(subgraph_id) - except GraphSpecError: - self.logger.error(f'Could not determine version of subgraph {subgraph_id}. 
' - f'Either specify an existing version of the graph, or the subgraph must ' - f'be defined in the same Graph Spec.') - return False - if self.check_for_existing_graph_dir(subgraph_id, subgraph_version): - # load previous metadata if the specified subgraph version was already built - graph_metadata = self.get_graph_metadata(subgraph_id, subgraph_version) - subgraph_source.graph_metadata = graph_metadata.metadata - else: + if not self.check_for_existing_graph_dir(subgraph_id, subgraph_version): # If the subgraph doesn't already exist, we need to make sure it matches the current version of the # subgraph as generated by the current graph spec, otherwise we won't be able to build it. - current_subgraph_version = self.get_graph_version(subgraph_id) - if subgraph_version == current_subgraph_version: - self.logger.warning(f'For graph {graph_spec.graph_id} subgraph dependency ' - f'{subgraph_id} is not ready. Building now...') - self.build_graph(subgraph_id) - else: - self.logger.error(f'Subgraph ({subgraph_id}) version ({subgraph_version}) was specified, but that ' + subgraph_graph_spec = self.graph_specs.get(subgraph_id, None) + if not subgraph_graph_spec: + self.logger.warning(f'Subgraph {subgraph_id} version {subgraph_version} was requested for graph ' + f'{graph_id} but it was not found and could not be built without a Graph Spec.') + return False + + if subgraph_version != subgraph_graph_spec.graph_version: + self.logger.error(f'Subgraph {subgraph_id} version {subgraph_version} was specified, but that ' f'version of the graph could not be found. It can not be built now because the ' - f'current version is {current_subgraph_version}. Either specify a version that ' - f'is already built, or leave the subgraph version blank to automatically ' - f'build the new one.') + f'current version is {subgraph_graph_spec.graph_version}. 
Either specify a ' + f'version that is already built, or remove the subgraph version specification to ' + f'automatically include the latest one.') + return False - graph_metadata = self.get_graph_metadata(subgraph_id, subgraph_version) - if graph_metadata.get_build_status() == Metadata.STABLE: - # we found the subgraph and it's stable - update the GraphSource in preparation for building the graph + # here the graph specs and versions all look right, but we still need to build the subgraph + self.logger.warning(f'Graph {graph_id}, subgraph dependency {subgraph_id} is not ready. Building now..') + subgraph_build_success = self.build_graph(subgraph_graph_spec) + if not subgraph_build_success: + return False + + # confirm the subgraph build worked and update the DataSource object in preparation for merging + subgraph_metadata = self.get_graph_metadata(subgraph_id, subgraph_version) + subgraph_source.graph_metadata = subgraph_metadata + if subgraph_metadata.get_build_status() == Metadata.STABLE: subgraph_dir = self.get_graph_dir_path(subgraph_id, subgraph_version) subgraph_nodes_path = self.get_graph_nodes_file_path(subgraph_dir) subgraph_edges_path = self.get_graph_edges_file_path(subgraph_dir) subgraph_source.file_paths = [subgraph_nodes_path, subgraph_edges_path] else: - self.logger.warning( - f'Attempting to build graph {graph_spec.graph_id} failed, dependency ' - f'subgraph {subgraph_id} version {subgraph_version} was not built successfully.') + self.logger.warning(f'Attempting to build graph {graph_id} failed, dependency subgraph {subgraph_id} ' + f'version {subgraph_version} was not built successfully.') return False for data_source in graph_spec.sources: source_id = data_source.id - source_version = data_source.source_version - if source_id not in get_available_data_sources(): - self.logger.warning( - f'Attempting to build graph {graph_spec.graph_id} failed: ' - f'{source_id} is not a valid data source id. 
') - return False - source_metadata: SourceMetadata = self.source_data_manager.get_source_metadata(source_id, - source_version) - release_version = source_metadata.get_release_version(parsing_version=data_source.parsing_version, - normalization_version=data_source.normalization_scheme.get_composite_normalization_version(), - supplementation_version=data_source.supplementation_version) - if release_version is None: + data_source.source_version) + release_version = data_source.generate_version() + release_metadata = source_metadata.get_release_info(release_version) + if release_metadata is None: self.logger.info( - f'Attempting to build graph {graph_spec.graph_id}, ' + f'Attempting to build graph {graph_id}, ' f'dependency {source_id} is not ready. Building now...') - release_version = self.source_data_manager.run_pipeline(source_id, - source_version=source_version, + pipeline_sucess = self.source_data_manager.run_pipeline(source_id, + source_version=data_source.source_version, parsing_version=data_source.parsing_version, normalization_scheme=data_source.normalization_scheme, supplementation_version=data_source.supplementation_version) - if not release_version: - self.logger.info( - f'While attempting to build {graph_spec.graph_id}, dependency pipeline failed for {source_id}...') + if not pipeline_sucess: + self.logger.info(f'While attempting to build {graph_spec.graph_id}, ' + f'data source pipeline failed for dependency {source_id}...') return False + release_metadata = source_metadata.get_release_info(release_version) - data_source.version = release_version - data_source.release_info = source_metadata.get_release_info(release_version) + data_source.release_info = release_metadata data_source.file_paths = self.source_data_manager.get_final_file_paths(source_id, - source_version, + data_source.source_version, data_source.parsing_version, data_source.normalization_scheme.get_composite_normalization_version(), data_source.supplementation_version) @@ -333,66 +352,69 @@ 
def run_qc(self, qc_metadata['warnings']['invalid_knowledge_sources'] = invalid_infores_ids return qc_metadata - def load_graph_specs(self): - if 'ORION_GRAPH_SPEC' in os.environ and os.environ['ORION_GRAPH_SPEC']: - # this is a messy way to find the graph spec path, mainly for testing - URL is preferred - graph_spec_file = os.environ['ORION_GRAPH_SPEC'] - graph_spec_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'graph_specs', graph_spec_file) + def load_graph_specs(self, graph_specs_dir=None): + graph_spec_file = os.environ.get('ORION_GRAPH_SPEC', None) + graph_spec_url = os.environ.get('ORION_GRAPH_SPEC_URL', None) + + if graph_spec_file and graph_spec_url: + raise GraphSpecError(f'Configuration Error - the environment variables ORION_GRAPH_SPEC and ' + f'ORION_GRAPH_SPEC_URL were set. Please choose one or the other. See the README for ' + f'details.') + + if graph_spec_file: + if not graph_specs_dir: + graph_specs_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'graph_specs') + graph_spec_path = os.path.join(graph_specs_dir, graph_spec_file) if os.path.exists(graph_spec_path): self.logger.info(f'Loading graph spec: {graph_spec_file}') with open(graph_spec_path) as graph_spec_file: - graph_spec_yaml = yaml.full_load(graph_spec_file) - return self.parse_graph_spec(graph_spec_yaml) + graph_spec_yaml = yaml.safe_load(graph_spec_file) + self.parse_graph_spec(graph_spec_yaml) + return else: - raise Exception(f'Configuration Error - Graph Spec could not be found: {graph_spec_file}') - elif 'ORION_GRAPH_SPEC_URL' in os.environ: - graph_spec_url = os.environ['ORION_GRAPH_SPEC_URL'] + raise GraphSpecError(f'Configuration Error - Graph Spec could not be found: {graph_spec_file}') + + if graph_spec_url: graph_spec_request = requests.get(graph_spec_url) graph_spec_request.raise_for_status() - graph_spec_yaml = yaml.full_load(graph_spec_request.text) - return self.parse_graph_spec(graph_spec_yaml) - else: - raise 
Exception(f'Configuration Error - No Graph Spec was configured. Set the environment variable ' - f'ORION_GRAPH_SPEC_URL to a URL with a valid Graph Spec yaml file. ' - f'See the README for more info.') + graph_spec_yaml = yaml.safe_load(graph_spec_request.text) + self.parse_graph_spec(graph_spec_yaml) + return + + raise GraphSpecError(f'Configuration Error - No Graph Spec was configured. Set the environment variable ' + f'ORION_GRAPH_SPEC to the name of a graph spec included in this package, or ' + f'ORION_GRAPH_SPEC_URL to a URL of a valid Graph Spec yaml file. ' + f'See the README for more info.') def parse_graph_spec(self, graph_spec_yaml): - graph_specs = [] - graph_id = "" + graph_id = None try: for graph_yaml in graph_spec_yaml['graphs']: graph_id = graph_yaml['graph_id'] - graph_name = graph_yaml['graph_name'] if 'graph_name' in graph_yaml else "" - graph_description = graph_yaml['graph_description'] if 'graph_description' in graph_yaml else "" - graph_url = graph_yaml['graph_url'] if 'graph_url' in graph_yaml else "" + graph_name = graph_yaml.get('graph_name', '') + graph_description = graph_yaml.get('graph_description', '') + graph_url = graph_yaml.get('graph_url', '') # parse the list of data sources - data_sources = [self.parse_data_source_spec(data_source) for data_source in graph_yaml['sources']] \ - if 'sources' in graph_yaml else [] + data_sources = [self.parse_data_source_spec(data_source) + for data_source in graph_yaml.get('sources', [])] # parse the list of subgraphs - subgraph_sources = [self.parse_subgraph_spec(subgraph) for subgraph in graph_yaml['subgraphs']] \ - if 'subgraphs' in graph_yaml else [] + subgraph_sources = [self.parse_subgraph_spec(subgraph) + for subgraph in graph_yaml.get('subgraphs', [])] if not data_sources and not subgraph_sources: - self.logger.error(f'Error: No sources were provided for graph: {graph_id}.') - continue + raise GraphSpecError('Error: No sources were provided for graph: {graph_id}.') - # take any 
normalization scheme parameters specified at the graph level - graph_wide_node_norm_version = graph_yaml['node_normalization_version'] \ - if 'node_normalization_version' in graph_yaml else None + # see if there are any normalization scheme parameters specified at the graph level + graph_wide_node_norm_version = graph_yaml.get('node_normalization_version', None) + graph_wide_edge_norm_version = graph_yaml.get('edge_normalization_version', None) + graph_wide_conflation = graph_yaml.get('conflation', None) + graph_wide_strict_norm = graph_yaml.get('strict_normalization', None) if graph_wide_node_norm_version == 'latest': graph_wide_node_norm_version = self.source_data_manager.get_latest_node_normalization_version() - graph_wide_edge_norm_version = graph_yaml['edge_normalization_version'] \ - if 'edge_normalization_version' in graph_yaml else None if graph_wide_edge_norm_version == 'latest': graph_wide_edge_norm_version = self.source_data_manager.get_latest_edge_normalization_version() - graph_wide_conflation = graph_yaml['conflation'] \ - if 'conflation' in graph_yaml else None - graph_wide_strict_norm = graph_yaml['strict_normalization'] \ - if 'strict_normalization' in graph_yaml else None - graph_wide_normalization_code_version = graph_yaml['normalization_code_version'] \ - if 'normalization_code_version' in graph_yaml else None # apply them to all the data sources, this will overwrite anything defined at the source level for data_source in data_sources: @@ -404,100 +426,101 @@ def parse_graph_spec(self, graph_spec_yaml): data_source.normalization_scheme.conflation = graph_wide_conflation if graph_wide_strict_norm is not None: data_source.normalization_scheme.strict = graph_wide_strict_norm - if graph_wide_normalization_code_version is not None: - data_source.normalization_scheme.normalization_code_version = graph_wide_normalization_code_version - graph_output_format = graph_yaml['output_format'] if 'output_format' in graph_yaml else "" + graph_output_format = 
graph_yaml.get('output_format', '') graph_spec = GraphSpec(graph_id=graph_id, graph_name=graph_name, graph_description=graph_description, graph_url=graph_url, - graph_version=None, # this will get populated later + graph_version=None, # this will get populated when a build is triggered graph_output_format=graph_output_format, subgraphs=subgraph_sources, sources=data_sources) - graph_specs.append(graph_spec) - except Exception as e: - self.logger.error(f'Error parsing Graph Spec ({graph_id}), formatting error or missing information: {repr(e)}') - raise e - return graph_specs + self.graph_specs[graph_id] = graph_spec + except KeyError as e: + error_message = f'Graph Spec missing required field: {e}' + if graph_id is not None: + error_message += f"(in graph {graph_id})" + raise GraphSpecError(error_message) def parse_subgraph_spec(self, subgraph_yml): subgraph_id = subgraph_yml['graph_id'] - subgraph_version = subgraph_yml['graph_version'] if 'graph_version' in subgraph_yml else None - merge_strategy = subgraph_yml['merge_strategy'] if 'merge_strategy' in subgraph_yml else 'default' + subgraph_version = subgraph_yml.get('graph_version', None) + merge_strategy = subgraph_yml.get('merge_strategy', None) + if merge_strategy == 'default': + merge_strategy = None subgraph_source = SubGraphSource(id=subgraph_id, - version=subgraph_version, + graph_version=subgraph_version, merge_strategy=merge_strategy) return subgraph_source def parse_data_source_spec(self, source_yml): + # get the source id and make sure it's valid source_id = source_yml['source_id'] if source_id not in get_available_data_sources(): error_message = f'Data source {source_id} is not a valid data source id.' 
self.logger.error(error_message + " " + f'Valid sources are: {", ".join(get_available_data_sources())}') - raise Exception(error_message) - - parsing_version = source_yml['parsing_version'] if 'parsing_version' in source_yml \ - else self.source_data_manager.get_latest_parsing_version(source_id) - merge_strategy = source_yml['merge_strategy'] if 'merge_strategy' in source_yml else 'default' - node_normalization_version = source_yml['node_normalization_version'] \ - if 'node_normalization_version' in source_yml \ - else self.source_data_manager.get_latest_node_normalization_version() - edge_normalization_version = source_yml['edge_normalization_version'] \ - if 'edge_normalization_version' in source_yml \ - else self.source_data_manager.get_latest_edge_normalization_version() - strict_normalization = source_yml['strict_normalization'] \ - if 'strict_normalization' in source_yml else True - normalization_code_version = source_yml['normalization_code_version'] \ - if 'normalization_code_version' in source_yml else NORMALIZATION_CODE_VERSION - conflation = source_yml['conflation'] \ - if 'conflation' in source_yml else False + raise GraphSpecError(error_message) + + # read version and normalization specifications from the graph spec + source_version = source_yml.get('source_version', None) + parsing_version = source_yml.get('parsing_version', None) + merge_strategy = source_yml.get('merge_strategy', None) + node_normalization_version = source_yml.get('node_normalization_version', None) + edge_normalization_version = source_yml.get('edge_normalization_version', None) + strict_normalization = source_yml.get('strict_normalization', True) + conflation = source_yml.get('conflation', False) + + # supplementation and normalization code version cannot be specified, set them to the current version + supplementation_version = SequenceVariantSupplementation.SUPPLEMENTATION_VERSION + normalization_code_version = NORMALIZATION_CODE_VERSION + + # if normalization versions are not 
specified, set them to the current latest + # source_version is intentionally not handled here because we want to do it lazily and avoid if not needed + if not parsing_version or parsing_version == 'latest': + parsing_version = self.source_data_manager.get_latest_parsing_version(source_id) + if not node_normalization_version or node_normalization_version == 'latest': + node_normalization_version = self.source_data_manager.get_latest_node_normalization_version() + if not edge_normalization_version or edge_normalization_version == 'latest': + edge_normalization_version = self.source_data_manager.get_latest_edge_normalization_version() + + # do some validation + if type(strict_normalization) != bool: + raise GraphSpecError(f'Invalid type (strict_normalization: {strict_normalization}), must be true or false.') + if type(conflation) != bool: + raise GraphSpecError(f'Invalid type (conflation: {conflation}), must be true or false.') + if merge_strategy == 'default': + merge_strategy = None + normalization_scheme = NormalizationScheme(node_normalization_version=node_normalization_version, edge_normalization_version=edge_normalization_version, normalization_code_version=normalization_code_version, strict=strict_normalization, conflation=conflation) - supplementation_version = SequenceVariantSupplementation.SUPPLEMENTATION_VERSION - - # The DataSource() will get initialized with either a specific source version, if specified, - # or a callable function which can determine the latest source version. This is for a lazy initialization - # technique, so that we don't call get_latest_source_version until we need to, if at all. 
- if 'source_version' not in source_yml or source_yml['source_version'] == 'latest': - get_source_version = self.source_data_manager.get_latest_source_version - source_version = None - else: - source_version = str(source_yml['source_version']) - get_source_version = None data_source = DataSource(id=source_id, source_version=source_version, - get_source_version=get_source_version, merge_strategy=merge_strategy, normalization_scheme=normalization_scheme, parsing_version=parsing_version, supplementation_version=supplementation_version) return data_source - def get_graph_spec(self, graph_id: str): - for graph_spec in self.graph_specs: - if graph_spec.graph_id == graph_id: - return graph_spec - return None - def get_graph_dir_path(self, graph_id: str, graph_version: str): return os.path.join(self.graphs_dir, graph_id, graph_version) - def get_graph_output_URL(self, graph_id: str, graph_version: str): + def get_graph_output_url(self, graph_id: str, graph_version: str): graph_output_url = os.environ.get('ORION_OUTPUT_URL', "https://localhost/") if graph_output_url[-1] != '/': graph_output_url += '/' return f'{graph_output_url}{graph_id}/{graph_version}/' - def get_graph_nodes_file_path(self, graph_output_dir: str): + @staticmethod + def get_graph_nodes_file_path(graph_output_dir: str): return os.path.join(graph_output_dir, NODES_FILENAME) - def get_graph_edges_file_path(self, graph_output_dir: str): + @staticmethod + def get_graph_edges_file_path(graph_output_dir: str): return os.path.join(graph_output_dir, EDGES_FILENAME) def check_for_existing_graph_dir(self, graph_id: str, graph_version: str): @@ -514,54 +537,32 @@ def get_graph_metadata(self, graph_id: str, graph_version: str): return GraphMetadata(graph_id, graph_output_dir) @staticmethod - def generate_graph_version(graph_spec: GraphSpec): - sources_string = ''.join( - [json.dumps(graph_source.get_metadata_representation()) - for graph_source in graph_spec.sources]) - subgraphs_string = ''.join( - 
[''.join([subgraph.id, subgraph.version, subgraph.merge_strategy]) - for subgraph in graph_spec.subgraphs]) - graph_version = xxh64_hexdigest(sources_string + subgraphs_string) - return graph_version - - @staticmethod - def init_graphs_dir(): - # use the directory specified by the environment variable ORION_GRAPHS - if 'ORION_GRAPHS' in os.environ and os.path.isdir(os.environ['ORION_GRAPHS']): + def get_graphs_dir(): + # confirm the directory specified by the environment variable ORION_GRAPHS is valid + graphs_dir = os.environ.get('ORION_GRAPHS', None) + if graphs_dir and Path(graphs_dir).is_dir(): return os.environ['ORION_GRAPHS'] - else: - # if graph dir is invalid or not specified back out - raise IOError( - 'GraphBuilder graphs directory not found. ' - 'Specify a valid directory with environment variable ORION_GRAPHS.') + + # if invalid or not specified back out + raise IOError('ORION graphs directory not configured properly. ' + 'Specify a valid directory with environment variable ORION_GRAPHS.') if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Merge data source files into complete graphs.") + parser = argparse.ArgumentParser(description="Merge data sources into complete graphs.") parser.add_argument('graph_id', help='ID of the graph to build. 
Must match an ID from the configured Graph Spec.') - parser.add_argument('-v', '--version', - action='store_true', - help='Only retrieve a generated version for graphs from the graph spec.') args = parser.parse_args() graph_id_arg = args.graph_id - retrieve_version = args.version graph_builder = GraphBuilder() if graph_id_arg == "all": - if retrieve_version: - graph_versions = [graph_spec.graph_version for graph_spec in graph_builder.graph_specs] - print('\n'.join(graph_versions)) - else: - for g_id in [graph_spec.graph_id for graph_spec in graph_builder.graph_specs]: - graph_builder.build_graph(g_id) + for graph_spec in graph_builder.graph_specs.values(): + graph_builder.build_graph(graph_spec) else: - graph_spec = graph_builder.get_graph_spec(graph_id_arg) + graph_spec = graph_builder.graph_specs.get(graph_id_arg, None) if graph_spec: - if retrieve_version: - print(graph_spec.graph_version) - else: - graph_builder.build_graph(graph_id_arg) + graph_builder.build_graph(graph_spec) else: print(f'Invalid graph spec requested: {graph_id_arg}') for results_graph_id, results in graph_builder.build_results.items(): diff --git a/Common/kgx_file_merger.py b/Common/kgx_file_merger.py index b6d54159..19abb14d 100644 --- a/Common/kgx_file_merger.py +++ b/Common/kgx_file_merger.py @@ -43,10 +43,12 @@ def merge(self, primary_sources = [] secondary_sources = [] for graph_source in chain(graph_spec.sources, graph_spec.subgraphs): - if graph_source.merge_strategy == 'default': + if not graph_source.merge_strategy: primary_sources.append(graph_source) elif graph_source.merge_strategy == 'connected_edge_subset': secondary_sources.append(graph_source) + else: + return {'merge_error': f'Unsupported merge strategy specified: {graph_source.merge_strategy}'} # TODO we should be able to process a single primary source more efficiently (ie copy and paste it) # if len(primary_sources) == 1: @@ -73,8 +75,9 @@ def merge(self, all_source_ids = [graph_source.id for graph_source in 
chain(graph_spec.sources, graph_spec.subgraphs)] missing_data_sets = [source_id for source_id in all_source_ids if source_id not in merge_metadata['sources'].keys()] - self.logger.error(f"Error merging graph {graph_spec.graph_id}! could not merge: {missing_data_sets}") - + error_message = f"Error merging graph {graph_spec.graph_id}! could not merge: {missing_data_sets}" + self.logger.error(error_message) + merge_metadata["merge_error"] = error_message return merge_metadata def merge_primary_sources(self, diff --git a/Common/kgxmodel.py b/Common/kgxmodel.py index 1ce3b41a..b161e6a0 100644 --- a/Common/kgxmodel.py +++ b/Common/kgxmodel.py @@ -1,7 +1,6 @@ -from dataclasses import dataclass, InitVar -from typing import Callable +from dataclasses import dataclass from Common.biolink_constants import NAMED_THING -from Common.metadata import GraphMetadata +from Common.metadata import GraphMetadata, get_source_release_version from Common.normalization import NormalizationScheme @@ -62,67 +61,65 @@ def get_metadata_representation(self): @dataclass class GraphSource: id: str - version: str = None - merge_strategy: str = 'default' + merge_strategy: str = None file_paths: list = None + # Version may be generated when requested and differs for subclasses of GraphSource. 
+ def __getattribute__(self, name): + if name == "version": + return self.generate_version() + else: + return object.__getattribute__(self, name) + @dataclass class SubGraphSource(GraphSource): + graph_version: str = None graph_metadata: GraphMetadata = None def get_metadata_representation(self): return {'graph_id': self.id, - 'release_version': self.version, + 'graph_version': self.graph_version, 'merge_strategy:': self.merge_strategy, 'graph_metadata': self.graph_metadata.metadata if self.graph_metadata else None} + def generate_version(self): + return self.graph_version + @dataclass class DataSource(GraphSource): normalization_scheme: NormalizationScheme = None - source_version: InitVar[str] = None + source_version: str = None parsing_version: str = None supplementation_version: str = None release_info: dict = None - # This function serves as an optional way to provide a callable function which can determine the source version, - # instead of setting it during initialization. This is used like lazy initialization, because determining the - # source version of a data source can be expensive and error-prone, and we don't want to do it if we don't need to. 
- get_source_version: InitVar[Callable[[str], str]] = None - _source_version: str = None - _get_source_version: Callable[[str], str] = None - - def __post_init__(self, source_version, get_source_version): - self._get_source_version = get_source_version - # if a source_version is provided in initialization, just store that and return it - if source_version: - self._source_version = source_version - # if neither the source version nor a function to determine it is provided, throw an error - if not source_version and not get_source_version: - raise Exception(f'Invalid DataSource initialization - ' - f'source_version or get_source_version must be provided.') - - # when the source_version attribute is accessed either return _source_version if it is set - # or call the function supplied to retrieve it - def __getattribute__(self, name): - if name == "source_version": - if self._source_version is None: - self._source_version = self._get_source_version(self.id) - return self._source_version - else: - return object.__getattribute__(self, name) - def get_metadata_representation(self): metadata = {'source_id': self.id, - 'source_version': self.source_version, # this may produce an IDE warning but it's right - 'release_version': self.version, + 'source_version': self.source_version, 'parsing_version': self.parsing_version, 'supplementation_version': self.supplementation_version, 'normalization_scheme': self.normalization_scheme.get_metadata_representation(), + 'release_version': self.generate_version(), 'merge_strategy': self.merge_strategy} if self.release_info: metadata.update(self.release_info) return metadata - + # We can use generate_version to see if a source_version was already set. If not, we don't try to generate an + # overall version because we can't. 
Typical usage would be a lazy instantiation approach, first setting + # source_version to None, then checking this and retrieving/setting the source_version if needed, + # after which the overall version can be generated. + # + # We use get_source_release_version to generate versions for data sources the same deterministic way that + # the data source pipeline uses, so a version generated by a graph spec will match the release version generated by + # previous runs of the pipeline. + def generate_version(self): + if self.source_version is None: + return None + return get_source_release_version(self.id, + self.source_version, + self.parsing_version, + self.normalization_scheme.get_composite_normalization_version(), + self.supplementation_version) diff --git a/Common/load_manager.py b/Common/load_manager.py index 15029941..b4f4b168 100644 --- a/Common/load_manager.py +++ b/Common/load_manager.py @@ -504,20 +504,18 @@ def run_qc_and_metadata_stage(self, parsing_version: str, supplementation_version: str, normalization_scheme: NormalizationScheme): - # source data QC here - source_metadata = self.get_source_metadata(source_id, source_version) - normalization_version = normalization_scheme.get_composite_normalization_version() + # source data QC should go here self.logger.info(f'Generating release for {source_id}') + source_metadata = self.get_source_metadata(source_id, source_version) loader = SOURCE_DATA_LOADER_CLASSES[source_id](test_mode=self.test_mode) source_meta_information = loader.get_source_meta_information() - source_metadata.generate_release_metadata(parsing_version=parsing_version, - supplementation_version=supplementation_version, - normalization_version=normalization_version, - source_meta_information=source_meta_information) - return source_metadata.get_release_version(parsing_version=parsing_version, - supplementation_version=supplementation_version, - normalization_version=normalization_version) + normalization_version = 
normalization_scheme.get_composite_normalization_version() + release_version = source_metadata.generate_release_metadata(parsing_version=parsing_version, + supplementation_version=supplementation_version, + normalization_version=normalization_version, + source_meta_information=source_meta_information) + return release_version def get_source_metadata(self, source_id: str, source_version): if source_id not in self.source_metadata or source_version not in self.source_metadata[source_id]: diff --git a/Common/metadata.py b/Common/metadata.py index b2f9ca32..58f8f447 100644 --- a/Common/metadata.py +++ b/Common/metadata.py @@ -297,18 +297,6 @@ def has_supplemental_data(self, parsing_version: str, normalization_version: str except KeyError: return False - def get_release_version(self, - parsing_version: str, - normalization_version: str, - supplementation_version: str): - if "releases" in self.metadata: - for release_version, release in self.metadata["releases"].items(): - if ((release["parsing_version"] == parsing_version) and - (release["normalization_version"] == normalization_version) and - (release["supplementation_version"] == supplementation_version)): - return release_version - return None - def generate_release_metadata(self, parsing_version: str, normalization_version: str, @@ -316,12 +304,11 @@ def generate_release_metadata(self, source_meta_information: dict): if "releases" not in self.metadata: self.metadata["releases"] = {} - release_info = "".join([self.source_id, - self.source_version, - parsing_version, - normalization_version, - supplementation_version]) - release_version = xxh64_hexdigest(release_info) + release_version = get_source_release_version(self.source_id, + self.source_version, + parsing_version, + normalization_version, + supplementation_version) if release_version not in self.metadata["releases"]: self.metadata["releases"][release_version] = { "source_version": self.source_version, @@ -331,31 +318,22 @@ def generate_release_metadata(self, } 
self.metadata["releases"][release_version].update(source_meta_information) self.save_metadata() + return release_version def get_release_info(self, release_version: str): if 'releases' in self.metadata and release_version in self.metadata['releases']: return self.metadata['releases'][release_version] + return None - ''' - these need to be updated for the new versioning format, but we may not need them - def get_final_node_count(self): - try: - node_count = 0 - node_count += self.metadata['normalization_info']['final_normalized_nodes'] - if self.has_supplemental_data(): - node_count += self.metadata['supplementation_info']['normalization_info']['final_normalized_nodes'] - return node_count - except KeyError as k: - raise RuntimeError(f'Required metadata was not available: {k}') - - def get_final_edge_count(self): - try: - node_count = 0 - node_count += self.metadata['normalization_info']['final_normalized_edges'] - if self.has_supplemental_data(): - node_count += self.metadata['supplementation_info']['normalization_info']['final_normalized_edges'] - return node_count - except KeyError as k: - raise RuntimeError(f'Required metadata was not available: {k}') - ''' +def get_source_release_version(source_id, + source_version, + parsing_version, + normalization_version, + supplementation_version): + release_string = "_".join([source_id, + source_version, + parsing_version, + normalization_version, + supplementation_version]) + return xxh64_hexdigest(release_string) diff --git a/README.md b/README.md index 56dbbe1b..e1d9400c 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ To build a graph use a Graph Spec yaml file to specify the sources you want. ORION will automatically run each data source specified through the necessary pipeline. Then it will merge the specified sources into a Knowledge Graph. 
-### Using ORION +### Installing and Configuring ORION Create a parent directory: ``` @@ -30,13 +30,13 @@ git clone https://github.com/RobokopU24/ORION.git Next create directories where data sources, graphs, and logs will be stored. -ORION_STORAGE - for storing data sources +**ORION_STORAGE** - for storing data sources -ORION_GRAPHS - for storing knowledge graphs +**ORION_GRAPHS** - for storing knowledge graphs -ORION_LOGS - for storing logs +**ORION_LOGS** - for storing logs -You can do this manually, or use the script indicated below to set up a default configuration. +You can do this manually, or use the script indicated below to set up a default workspace. Option 1: Use this script to create the directories and set the environment variables: ``` @@ -44,7 +44,7 @@ cd ~/ORION_root/ORION/ source ./set_up_test_env.sh ``` -Option 2: Create three directories and manually set environment variables specifying paths to the locations of those directories. +Option 2: Create three directories and set environment variables specifying paths to the locations of those directories. ``` mkdir ~/ORION_root/storage/ export ORION_STORAGE=~/ORION_root/storage/ @@ -56,52 +56,92 @@ mkdir ~/ORION_root/logs/ export ORION_LOGS=~/ORION_root/logs/ ``` -Next create or select a Graph Spec yaml file where the content of knowledge graphs to be built will be specified. +Next create or select a Graph Spec yaml file, where the content of knowledge graphs to be built is specified. -Use either of the following options, but not both: - -Note that running the setup script set_up_test_env.sh will perform Option 1 for you. 
+Set either of the following environment variables, but not both: Option 1: ORION_GRAPH_SPEC - the name of a Graph Spec file located in the graph_specs directory of ORION ``` -export ORION_GRAPH_SPEC=testing-graph-spec.yml +export ORION_GRAPH_SPEC=example-graph-spec.yaml +``` +Option 2: ORION_GRAPH_SPEC_URL - a URL pointing to a Graph Spec yaml file +``` +export ORION_GRAPH_SPEC_URL=https://stars.renci.org/var/data_services/graph_specs/default-graph-spec.yaml ``` -Option 2: ORION_GRAPH_SPEC_URL - a URL pointing to a Graph Spec file + +To build a custom graph, alter a Graph Spec file, which is composed of a list of graphs. + +For each graph, specify: + +**graph_id** - a unique identifier string for the graph, with no spaces + +**sources** - a list of source identifiers for data sources to include in the graph + +See the full list of data sources and their identifiers in the [data sources file](https://github.com/RobokopU24/ORION/blob/master/Common/data_sources.py). + +Here is a simple example. ``` -export ORION_GRAPH_SPEC_URL=https://example.com/example-graph-spec.yml +graphs: + - graph_id: Example_Graph + graph_name: Example Graph + graph_description: A free text description of what is in the graph. + output_format: neo4j + sources: + - source_id: CTD + - source_id: HGNC ``` -To build a custom graph, alter the Graph Spec file. See the graph_specs directory for examples. +There are a variety of ways to further customize a knowledge graph. The following are parameters you can set for a particular data source. Mostly, these parameters are used to indicate that you'd like to use a previously built version of a data source or a specific normalization of a source. If you specify versions that are not the latest, and haven't previously built a data source or graph with those versions, it probably won't work. 
+ +**source_version** - the version of the data source, as determined by ORION + +**parsing_version** - the version of the parsing code in ORION for this source + +**merge_strategy** - used to specify alternative merge strategies + +The following are parameters you can set for the entire graph, or for an individual data source: -TODO: explain options available in the graph spec (normalization version, source data version can be specified) +**node_normalization_version** - the version of the node normalizer API (see: https://nodenormalization-sri.renci.org/openapi.json) + +**edge_normalization_version** - the version of biolink model used to normalize predicates and validate the KG + +**strict_normalization** - True or False specifying whether to discard nodes, node types, and edges connected to those nodes when they fail to normalize + +**conflation** - True or False flag specifying whether to conflate genes with proteins and chemicals with drugs + +For example, we could customize the previous example: ``` graphs: - - graph_id: Example_Graph_ID + - graph_id: Example_Graph graph_name: Example Graph - graph_description: This is a description of what is in the graph. + graph_description: A free text description of what is in the graph. output_format: neo4j sources: - - source_id: Biolink + - source_id: CTD - source_id: HGNC ``` +See the graph_specs directory for more examples. + +### Running ORION + Install Docker to create and run the necessary containers. -By default using docker-compose up will build every graph in your Graph Spec. It runs the command: python /ORION/Common/build_manager.py all. +By default, using docker-compose up will build every graph in your Graph Spec. It runs the command: python /ORION/Common/build_manager.py all ``` docker-compose up ``` -If you want to specify an individual graph you can override the default command with a graph id from your Spec. 
+If you want to build an individual graph, you can override the default command with a graph_id from the Graph Spec: ``` -docker-compose run --rm orion python /ORION/Common/build_manager.py Example_Graph_ID +docker-compose run --rm orion python /ORION/Common/build_manager.py Example_Graph ``` -To run the ORION pipeline for a single data source, you can use: +To run the ORION pipeline for a single data source, you can use the load manager: ``` -docker-compose run --rm orion python /ORION/Common/load_manager.py Example_Source +docker-compose run --rm orion python /ORION/Common/load_manager.py CTD ``` To see available arguments and a list of supported data sources: ``` -python /ORION/Common/load_manager.py -h +docker-compose run --rm orion python /ORION/Common/load_manager.py -h ``` ### For Developers diff --git a/graph_specs/default-graph-spec.yml b/graph_specs/default-graph-spec.yml index aa47ce33..62900368 100644 --- a/graph_specs/default-graph-spec.yml +++ b/graph_specs/default-graph-spec.yml @@ -3,8 +3,8 @@ graphs: - graph_id: Baseline # optional parameters to be applied to the entire graph - see README for more info - # node_normalization_version: latest - # edge_normalization_version: latest + # node_normalization_version: 2.3.5 + # edge_normalization_version: 4.2.5 # graph_name: Robokop Baseline # graph_description: 'The baseline graph from which RobokopKG and other graphs are built.' # conflation: True # (whether to conflate node types like Genes and Proteins) diff --git a/graph_specs/example-graph-spec.yaml b/graph_specs/example-graph-spec.yaml new file mode 100644 index 00000000..35992210 --- /dev/null +++ b/graph_specs/example-graph-spec.yaml @@ -0,0 +1,18 @@ +# Example graph spec +graphs: + - graph_id: Example_Graph + graph_name: Example Graph + graph_description: 'This is a small graph that can be used as an example.' 
+ graph_url: '' + output_format: neo4j + sources: + - source_id: CTD + # source_version: November_2024 + # parsing_version: 1.5 + # node_normalization_version: 2.3.18 + # edge_normalization_version: 4.2.1 + # conflation: False + # strict_normalization: True + # merge_strategy: + - source_id: HGNC + - source_id: GtoPdb \ No newline at end of file diff --git a/graph_specs/testing-graph-spec.yml b/graph_specs/testing-graph-spec.yml deleted file mode 100644 index a3fc82e5..00000000 --- a/graph_specs/testing-graph-spec.yml +++ /dev/null @@ -1,28 +0,0 @@ -# testing graph spec -graphs: - - graph_id: Testing_Baseline - graph_name: Testing Baseline - graph_description: 'A fake description for the testing baseline!' - graph_url: 'http://localhost/fake_graph_url_for_tesing_baseline' - output_format: none - sources: - - source_id: CTD - # source_version: - # parsing_version: - # node_normalization_version: latest - # edge_normalization_version: latest - # conflation: False - # strict_normalization: True (whether or not data should be discarded when it can not be normalized) - # merge_strategy: default (used to specify alternative merge strategies) - - source_id: HGNC - - source_id: PANTHER - - - graph_id: Testing_Graph_2 - graph_name: Testing Graph 2 - graph_description: 'Another fake description for testing graph 2!' 
- graph_url: 'http://localhost/fake_graph_url_for_tesing_graph_2' - output_format: neo4j - subgraphs: - - graph_id: Testing_Baseline - sources: - - source_id: GtoPdb \ No newline at end of file diff --git a/set_up_test_env.sh b/set_up_test_env.sh index 1ef6edd3..e5fc26a2 100644 --- a/set_up_test_env.sh +++ b/set_up_test_env.sh @@ -17,7 +17,7 @@ export ORION_LOGS="$PWD/../ORION_logs/" #Use EITHER of the following, ORION_GRAPH_SPEC or ORION_GRAPH_SPEC_URL #ORION_GRAPH_SPEC - the name of a Graph Spec file located in the graph_specs directory of ORION -export ORION_GRAPH_SPEC=testing-graph-spec.yml +export ORION_GRAPH_SPEC=example-graph-spec.yaml #ORION_GRAPH_SPEC_URL - a URL pointing to a Graph Spec file #export ORION_GRAPH_SPEC_URL=https://raw.githubusercontent.com/RENCI-AUTOMAT/ORION/helm_deploy/graph_specs/yeast-graph-spec.yml diff --git a/tests/graph_specs/testing-graph-spec.yaml b/tests/graph_specs/testing-graph-spec.yaml new file mode 100644 index 00000000..f619130f --- /dev/null +++ b/tests/graph_specs/testing-graph-spec.yaml @@ -0,0 +1,10 @@ +# Testing graph spec +graphs: + - graph_id: Testing_Graph + graph_name: Testing Graph + graph_description: 'This is a small graph spec that can be used for testing.' 
+ graph_url: '' + output_format: + sources: + - source_id: CTD + - source_id: HGNC \ No newline at end of file diff --git a/tests/test_graph_spec.py b/tests/test_graph_spec.py new file mode 100644 index 00000000..21cb25f0 --- /dev/null +++ b/tests/test_graph_spec.py @@ -0,0 +1,73 @@ +import os +import pytest +import requests.exceptions + +from Common.build_manager import GraphBuilder, GraphSpecError + + +def clear_graph_spec_config(): + os.environ['ORION_GRAPH_SPEC'] = '' + os.environ['ORION_GRAPH_SPEC_URL'] = '' + + +def reset_graph_spec_config(): + os.environ['ORION_GRAPH_SPEC'] = 'testing-graph-spec.yaml' + os.environ['ORION_GRAPH_SPEC_URL'] = '' + + +def get_testing_graph_spec_dir(): + # this is ORION/tests/graph_specs not ORION/graph_specs + testing_specs_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'graph_specs') + print(testing_specs_dir) + return testing_specs_dir + + +def test_empty_graph_spec_config(): + clear_graph_spec_config() + with pytest.raises(GraphSpecError): + graph_builder = GraphBuilder(graph_specs_dir=get_testing_graph_spec_dir()) + + +def test_invalid_graph_spec_config(): + clear_graph_spec_config() + os.environ['ORION_GRAPH_SPEC'] = 'invalid-spec.yaml' + with pytest.raises(GraphSpecError): + graph_builder = GraphBuilder(graph_specs_dir=get_testing_graph_spec_dir()) + + +def test_invalid_graph_spec_url_config(): + clear_graph_spec_config() + os.environ['ORION_GRAPH_SPEC_URL'] = 'http://localhost/invalid_graph_spec_url' + with pytest.raises(requests.exceptions.ConnectionError): + graph_builder = GraphBuilder() + + +def test_valid_graph_spec_config(): + reset_graph_spec_config() + os.environ['ORION_GRAPH_SPEC'] = 'testing-graph-spec.yaml' + graph_builder = GraphBuilder(graph_specs_dir=get_testing_graph_spec_dir()) + assert len(graph_builder.graph_specs) + + testing_graph_spec = graph_builder.graph_specs.get('Testing_Graph', None) + assert testing_graph_spec is not None + + assert len(testing_graph_spec.sources) == 2 + + for 
source in testing_graph_spec.sources: + assert source.version is None + + +def test_graph_spec_lazy_versions(): + reset_graph_spec_config() + os.environ['ORION_GRAPH_SPEC'] = 'testing-graph-spec.yaml' + graph_builder = GraphBuilder(graph_specs_dir=get_testing_graph_spec_dir()) + testing_graph_spec = graph_builder.graph_specs.get('Testing_Graph', None) + for source in testing_graph_spec.sources: + assert source.version is None + for source in testing_graph_spec.sources: + source.source_version = source.id + "_1" + for source in testing_graph_spec.sources: + assert source.version is not None + + +