From f7cbef575ce50d7933a2dca417863be1217ea3a2 Mon Sep 17 00:00:00 2001 From: VincentCauchois Date: Fri, 23 Aug 2024 18:20:36 +0200 Subject: [PATCH] [WIP]fix(mtd): fixing several anomalies for mtd sync with inpn mtd --- .../geonature/core/gn_meta/mtd/__init__.py | 34 +++- .../geonature/core/gn_meta/mtd/mtd_utils.py | 177 +++++++++++++++--- .../geonature/core/gn_meta/mtd/xml_parser.py | 32 +++- backend/geonature/core/gn_meta/routes.py | 2 + 4 files changed, 213 insertions(+), 32 deletions(-) diff --git a/backend/geonature/core/gn_meta/mtd/__init__.py b/backend/geonature/core/gn_meta/mtd/__init__.py index fb7b11ec05..c2e973799c 100644 --- a/backend/geonature/core/gn_meta/mtd/__init__.py +++ b/backend/geonature/core/gn_meta/mtd/__init__.py @@ -36,6 +36,7 @@ class MTDInstanceApi: af_path = "/mtd/cadre/export/xml/GetRecordsByInstanceId?id={ID_INSTANCE}" ds_path = "/mtd/cadre/jdd/export/xml/GetRecordsByInstanceId?id={ID_INSTANCE}" + # TODO: check if there are endpoints to retrieve metadata for a given user and instance, and not only a given user and whatever instance ds_user_path = "/mtd/cadre/jdd/export/xml/GetRecordsByUserId?id={ID_ROLE}" af_user_path = "/mtd/cadre/export/xml/GetRecordsByUserId?id={ID_ROLE}" single_af_path = "/mtd/cadre/export/xml/GetRecordById?id={ID_AF}" # NOTE: `ID_AF` is actually an UUID and not an ID from the point of view of geonature database. @@ -60,7 +61,10 @@ def _get_xml(self, path): def _get_af_xml(self): return self._get_xml(self.af_path) - def get_af_list(self): + # TODO: make the functions `get_af_list` and `get_ds_list` homogeneous + # - Use functions `parse_acquisition_framworks_xml`, and `parse_datasets_xml`, OR `parse_acquisition_framework`, and `parse_dataset` + + def get_af_list(self) -> list: xml = self._get_af_xml() _xml_parser = etree.XMLParser(ns_clean=True, recover=True, encoding="utf-8") root = etree.fromstring(xml, parser=_xml_parser) @@ -73,7 +77,7 @@ def get_af_list(self): def _get_ds_xml(self): return self._get_xml(self.ds_path) - def get_ds_list(self): + def get_ds_list(self) -> list: xml = self._get_ds_xml() return parse_jdd_xml(xml) @@ -206,11 +210,19 @@ def process_af_and_ds(af_list, ds_list, id_role=None): add_unexisting_digitizer(af["id_digitizer"] if not id_role else id_role) user_add_total_time += time.time() - start_add_user_time af = sync_af(af) + # TODO: choose whether or not to commit retrieval of the AF before association of actors + # and possibly retrieve an AF without any actor associated to it + db.session.commit() + # If AF has not been synchronized ; due to the lack of a UUID ; actor cannot be associated to it + # and thus we skip to the next AF + if not af: + continue associate_actors( actors, CorAcquisitionFrameworkActor, "id_acquisition_framework", af.id_acquisition_framework, + af.unique_acquisition_framework_id, ) # TODO: remove actors removed from MTD db.session.commit() @@ -227,7 +239,13 @@ def process_af_and_ds(af_list, ds_list, id_role=None): user_add_total_time += time.time() - start_add_user_time ds = sync_ds(ds, list_cd_nomenclature) if ds is not None: - associate_actors(actors, CorDatasetActor, "id_dataset", ds.id_dataset) + associate_actors( + actors, + CorDatasetActor, + "id_dataset", + ds.id_dataset, + ds.unique_dataset_id, + ) user_add_total_time = round(user_add_total_time, 2) db.session.commit() @@ -253,11 +271,13 @@ def sync_af_and_ds_by_user(id_role, id_af=None): """ Method to trigger MTD sync on user authentication. - Args: - id_role (int): The ID of the role (group or user). - id_af (str, optional): The ID of the AF (Acquisition Framework). Defaults to None. + Parameters + ----------- + id_role : int + The ID of the role (group or user). + id_af : str, optional + The ID of an AF (Acquisition Framework). """ - logger.info("MTD - SYNC USER : START") # Create an instance of MTDInstanceApi diff --git a/backend/geonature/core/gn_meta/mtd/mtd_utils.py b/backend/geonature/core/gn_meta/mtd/mtd_utils.py index e8bde8bf04..a877e801e5 100644 --- a/backend/geonature/core/gn_meta/mtd/mtd_utils.py +++ b/backend/geonature/core/gn_meta/mtd/mtd_utils.py @@ -1,15 +1,17 @@ import logging import json from copy import copy +import pprint +from typing import Literal from flask import current_app from sqlalchemy import select, exists -from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.exc import SQLAlchemyError, IntegrityError from sqlalchemy.sql import func, update from sqlalchemy.dialects.postgresql import insert as pg_insert -from geonature.utils.env import DB +from geonature.utils.env import DB, db from geonature.core.gn_meta.models import ( TDatasets, CorDatasetActor, @@ -32,7 +34,8 @@ } # get the root logger -log = logging.getLogger() +# log = logging.getLogger() +logger = logging.getLogger("MTD_SYNC") def sync_ds(ds, cd_nomenclatures): @@ -43,12 +46,22 @@ def sync_ds(ds, cd_nomenclatures): :param ds: DS infos :param cd_nomenclatures: cd_nomenclature from ref_normenclatures.t_nomenclatures """ + + uuid_ds = ds["unique_dataset_id"] + name_ds = ds["dataset_name"] + + logger.debug("MTD - PROCESSING DS WITH UUID '%s' AND NAME '%s'" % (uuid_ds, name_ds)) + if not ds["cd_nomenclature_data_origin"]: ds["cd_nomenclature_data_origin"] = "NSP" # FIXME: the following temporary fix was added due to possible differences in referential of nomenclatures values between INPN and GeoNature # should be fixed by ensuring that the two referentials are identical, at least for instances that integrates with INPN and thus rely on MTD synchronization from INPN Métadonnées: GINCO and DEPOBIO instances. - if ds["cd_nomenclature_data_origin"] not in cd_nomenclatures: + ds_cd_nomenclature_data_origin = ds["cd_nomenclature_data_origin"] + if ds_cd_nomenclature_data_origin not in cd_nomenclatures: + logger.warning( + f"MTD - Nomenclature with code '{ds_cd_nomenclature_data_origin}' not found in database - SKIPPING SYNCHRONIZATION OF DATASET WITH UUID '{uuid_ds}' AND NAME '{name_ds}'" + ) return # CONTROL AF @@ -62,7 +75,9 @@ def sync_ds(ds, cd_nomenclatures): ) if af is None: - log.warning(f"AF with UUID '{af_uuid}' not found in database.") + logger.warning( + f"MTD - AF with UUID '{af_uuid}' not found in database - SKIPPING SYNCHRONIZATION OF DATASET WITH UUID '{uuid_ds}' AND NAME '{name_ds}'" + ) return ds["id_acquisition_framework"] = af.id_acquisition_framework @@ -121,7 +136,19 @@ def sync_af(af): TAcquisitionFramework The updated or inserted acquisition framework. """ + # TODO: handle case where af_uuid is None ; as will raise an error at database level when executing the statement below ; + # af_uuid being None, i.e. af UUID is missing, could be due to no UUID specified in `` tag in the XML file + # Solutions - if UUID is missing: + # - Just pass the sync of the AF + # - Generate a UUID for the AF af_uuid = af["unique_acquisition_framework_id"] + + if not af_uuid: + logger.warning( + f"No UUID provided for the AF with UUID '{af_uuid}' - SKIPPING SYNCHRONIZATION FOR THIS AF." + ) + return None + af_exists = DB.session.scalar( exists().where(TAcquisitionFramework.unique_acquisition_framework_id == af_uuid).select() ) @@ -184,24 +211,39 @@ def add_or_update_organism(uuid, nom, email): return DB.session.execute(statement).scalar() -def associate_actors(actors, CorActor, pk_name, pk_value): +def associate_actors( + actors, + CorActor: CorAcquisitionFrameworkActor | CorDatasetActor, + pk_name: Literal["id_acquisition_framework", "id_dataset"], + pk_value: str, + uuid_mtd: str, +): """ - Associate actor and DS or AF according to CorActor value. + Associate actors with either a given : + - Acquisition framework - writing to the table `gn_meta.cor_acquisition_framework_actor`. + - Dataset - writing to the table `gn_meta.cor_dataset_actor`. Parameters ---------- actors : list list of actors - CorActor : db.Model - table model - pk_name : str - pk attribute name + CorActor : CorAcquisitionFrameworkActor | CorDatasetActor + the SQLAlchemy model corresponding to the destination table + pk_name : Literal['id_acquisition_framework', 'id_dataset'] + pk attribute name: + - 'id_acquisition_framework' for AF + - 'id_dataset' for DS pk_value : str - pk value + pk value: ID of the AF or DS + uuid_mtd : str + UUID of the AF or DS """ + type_mtd = "AF" if pk_name == "id_acquisition_framework" else "DS" for actor in actors: id_organism = None uuid_organism = actor["uuid_organism"] + # TODO: choose whether to add or update an organism with no UUID specified + # - add or update it using organism name only - field `organism` if uuid_organism: with DB.session.begin_nested(): # create or update organisme @@ -211,26 +253,65 @@ def associate_actors(actors, CorActor, pk_name, pk_value): nom=actor["organism"] if actor["organism"] else "", email=actor["email"], ) + # else: + # # Create a new organism in database from organism name + # # /!\ Do not use actor email as organism email - create the organism with a name only and generating a new UUID + # raise NotImplementedError( + # f"Creation of new organism, if no UUID provided for the organism actor, not implemented yet." + # ) values = dict( id_nomenclature_actor_role=func.ref_nomenclatures.get_id_nomenclature( "ROLE_ACTEUR", actor["actor_role"] ), **{pk_name: pk_value}, ) - if not id_organism: - values["id_role"] = DB.session.scalar( - select(User.id_role).filter_by(email=actor["email"]) - ) - else: + # TODO: choose wether to: + # - (retained) Try to associate to an organism first and then to a user + # - Try to associate to a user first and then to an organism + if id_organism: values["id_organism"] = id_organism - statement = ( - pg_insert(CorActor) - .values(**values) - .on_conflict_do_nothing( - index_elements=[pk_name, "id_organism", "id_nomenclature_actor_role"], + # TODO: handle case where no user is retrieved for the actor email: + # - (retained) Just do not try to associate the actor with the metadata + # - Try to retrieve and id_organism from the organism name - field `organism` + # - Try to retrieve and id_organism from the actor email considered as an organism email - field `email` + # - Try to insert a new user from the actor name - field `name` - and possibly also email - field `email` + else: + id_user_from_email = DB.session.scalar( + select(User.id_role).filter_by(email=actor["email"]).where(User.groupe.is_(False)) + ) + if id_user_from_email: + values["id_role"] = id_user_from_email + else: + # TODO: if actor role is "Contact Principal" ; id_nomenclature_actor_role = ? ; then a new user is created with a UUID and an ID only, but with no name nor email + # the metadata is then associated with this new user + raise NotImplementedError( + f"If actor role is 'Contact Principal': creation of a new user, if no organism retrieved nor known user from email retrieved, not implemented yet." + ) + logger.warning( + f"MTD - actor association impossible for {type_mtd} with UUID '{uuid_mtd}' because no id_organism nor id_role could be retrieved - with the following actor information:\n" + + format_str_dict_actor_for_logging(actor) + ) + continue + try: + statement = ( + pg_insert(CorActor) + .values(**values) + .on_conflict_do_nothing( + index_elements=[ + pk_name, + "id_organism" if id_organism else "id_role", + "id_nomenclature_actor_role", + ], + ) + ) + DB.session.execute(statement) + except IntegrityError as I: + db.session.rollback() + logger.error( + f"MTD - DB INTEGRITY ERROR - actor association failed for {type_mtd} with UUID '{uuid_mtd}' and following actor information:\n" + + format_sqlalchemy_error_for_logging(I) + + format_str_dict_actor_for_logging(actor) ) - ) - DB.session.execute(statement) def associate_dataset_modules(dataset): @@ -246,3 +327,51 @@ def associate_dataset_modules(dataset): ) ).all() ) + + +def format_sqlalchemy_error_for_logging(error: SQLAlchemyError): + """ + Format SQLAlchemy error information in a nice way for MTD logging + + Parameters + ---------- + error : SQLAlchemyError + the SQLAlchemy error + + Returns + ------- + str + formatted error information + """ + indented_original_error_message = str(error.orig).replace("\n", "\n\t") + + formatted_error_message = "".join( + [ + f"\t{indented_original_error_message}", + f"SQL QUERY: {error.statement}\n", + f"\tSQL PARAMS: {error.params}\n", + ] + ) + + return formatted_error_message + + +def format_str_dict_actor_for_logging(actor: dict): + """ + Format actor information in a nice way for MTD logging + + Parameters + ---------- + actor : dict + actor information: actor_role, email, name, organism, uuid_organism, ... + + Returns + ------- + str + formatted actor information + """ + formatted_str_dict_actor = "\tACTOR:\n\t\t" + pprint.pformat(actor).replace( + "\n", "\n\t\t" + ).rstrip("\t") + + return formatted_str_dict_actor diff --git a/backend/geonature/core/gn_meta/mtd/xml_parser.py b/backend/geonature/core/gn_meta/mtd/xml_parser.py index efcf98d6be..63dc51494e 100644 --- a/backend/geonature/core/gn_meta/mtd/xml_parser.py +++ b/backend/geonature/core/gn_meta/mtd/xml_parser.py @@ -61,6 +61,13 @@ def parse_actors_xml(actors): return actor_list +# TODO: IMPORTANT - filter the list of acquisition frameworks with `ID_INSTANCE_FILTER` as made for the list of datasets + +# TODO: make functions for AF and DS homogeneous: refactorize +# - Have functions `parse_acquisition_frameworks_xml`, and `parse_datasets_xml`, OR `parse_acquisition_framework`, and `parse_dataset` +# - Eventually split into distinct functions the XML parsing and the mapping of fields + + def parse_acquisition_framwork_xml(xml): """ Parse an xml of AF from a string @@ -130,10 +137,33 @@ def parse_jdd_xml(xml): root = ET.fromstring(xml, parser=_xml_parser) jdd_list = [] + + def format_acquisition_framework_id_from_xml(provided_af_uuid) -> str | None: + """ + Format the acquisition framework UUID provided for the dataset + i.e. the value for the tag `` in the XML file + + Args: + provided_af_uuid (str): The acquisition framework UUID + Returns: + str | None: The formatted acquisition framework UUID, or None if none was provided + """ + if not provided_af_uuid: + return None + + if provided_af_uuid.startswith("http://oafs.fr/meta/ca/"): + return provided_af_uuid.split("/")[-1] + + return provided_af_uuid + for jdd in root.findall(".//" + namespace + "JeuDeDonnees"): # We extract all the required informations from the different tags of the XML file jdd_uuid = get_tag_content(jdd, "identifiantJdd") - ca_uuid = get_tag_content(jdd, "identifiantCadre") + # TODO: handle case where value for the tag `` in the XML file is not of the form `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` + # Solutions - if in the form `http://oafs.fr/meta/ca/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` (has some entries for INPN MTD PREPROD and instance 'Nationale') : + # - (retained) Format by keeping only the `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` part + # - Add a check further in the MTD sync to process only if ca_uuid is in the right format + ca_uuid = format_acquisition_framework_id_from_xml(get_tag_content(jdd, "identifiantCadre")) dataset_name = get_tag_content(jdd, "libelle") dataset_shortname = get_tag_content(jdd, "libelleCourt", default_value="") dataset_desc = get_tag_content(jdd, "description", default_value="") diff --git a/backend/geonature/core/gn_meta/routes.py b/backend/geonature/core/gn_meta/routes.py index 5bb91bbee0..15af2e15ae 100644 --- a/backend/geonature/core/gn_meta/routes.py +++ b/backend/geonature/core/gn_meta/routes.py @@ -79,12 +79,14 @@ @routes.before_request def synchronize_mtd(): + # TODO: add `if request.method != "OPTIONS" and [...]` in the following condition if request.endpoint in ["gn_meta.get_datasets", "gn_meta.get_acquisition_frameworks_list"]: from flask_login import current_user if current_user.is_authenticated: params = request.json if request.is_json else request.args try: + # TODO: trigger a user sync without id_af in case no id_af is provided list_id_af = params.get("id_acquisition_frameworks", []) for id_af in list_id_af: sync_af_and_ds_by_user(id_role=current_user.id_role, id_af=id_af)