From 970e13bbbdea1d0d41301bc5f4c1edd67201db2d Mon Sep 17 00:00:00 2001 From: VincentCauchois Date: Tue, 29 Oct 2024 17:01:34 +0100 Subject: [PATCH] [WIP](mtd): several modifications for mtd module --- .../geonature/core/gn_meta/mtd/__init__.py | 17 +- .../geonature/core/gn_meta/mtd/mtd_utils.py | 323 ++++++++++++++++-- .../geonature/core/gn_meta/mtd/xml_parser.py | 33 +- backend/geonature/core/gn_meta/routes.py | 2 + 4 files changed, 342 insertions(+), 33 deletions(-) diff --git a/backend/geonature/core/gn_meta/mtd/__init__.py b/backend/geonature/core/gn_meta/mtd/__init__.py index fe119862ab..c499491b10 100644 --- a/backend/geonature/core/gn_meta/mtd/__init__.py +++ b/backend/geonature/core/gn_meta/mtd/__init__.py @@ -36,6 +36,7 @@ class MTDInstanceApi: af_path = "/mtd/cadre/export/xml/GetRecordsByInstanceId?id={ID_INSTANCE}" ds_path = "/mtd/cadre/jdd/export/xml/GetRecordsByInstanceId?id={ID_INSTANCE}" + # TODO: check if there are endpoints to retrieve metadata for a given user and instance, and not only a given user and whatever instance ds_user_path = "/mtd/cadre/jdd/export/xml/GetRecordsByUserId?id={ID_ROLE}" af_user_path = "/mtd/cadre/export/xml/GetRecordsByUserId?id={ID_ROLE}" single_af_path = "/mtd/cadre/export/xml/GetRecordById?id={ID_AF}" # NOTE: `ID_AF` is actually an UUID and not an ID from the point of view of geonature database. @@ -60,6 +61,9 @@ def _get_xml(self, path): def _get_af_xml(self): return self._get_xml(self.af_path) + # TODO: make the functions `get_af_list` and `get_ds_list` homogeneous + # - Use functions `parse_acquisition_framworks_xml`, and `parse_datasets_xml`, OR `parse_acquisition_framework`, and `parse_dataset` + def get_af_list(self) -> list: xml = self._get_af_xml() _xml_parser = etree.XMLParser(ns_clean=True, recover=True, encoding="utf-8") @@ -206,6 +210,9 @@ def process_af_and_ds(af_list, ds_list, id_role=None): add_unexisting_digitizer(af["id_digitizer"] if not id_role else id_role) user_add_total_time += time.time() - start_add_user_time af = sync_af(af) + # TODO: choose whether or not to commit retrieval of the AF before association of actors + # and possibly retrieve an AF without any actor associated to it + db.session.commit() # If the AF has not been retrieved, associated actors cannot be retrieved either # and thus we continue to the next AF if not af: @@ -215,6 +222,7 @@ def process_af_and_ds(af_list, ds_list, id_role=None): CorAcquisitionFrameworkActor, "id_acquisition_framework", af.id_acquisition_framework, + af.unique_acquisition_framework_id, ) # TODO: remove actors removed from MTD db.session.commit() @@ -231,7 +239,13 @@ def process_af_and_ds(af_list, ds_list, id_role=None): user_add_total_time += time.time() - start_add_user_time ds = sync_ds(ds, list_cd_nomenclature) if ds is not None: - associate_actors(actors, CorDatasetActor, "id_dataset", ds.id_dataset) + associate_actors( + actors, + CorDatasetActor, + "id_dataset", + ds.id_dataset, + ds.unique_dataset_id, + ) user_add_total_time = round(user_add_total_time, 2) db.session.commit() @@ -264,7 +278,6 @@ def sync_af_and_ds_by_user(id_role, id_af=None): id_af : str, optional The ID of an AF (Acquisition Framework). """ - logger.info("MTD - SYNC USER : START") # Create an instance of MTDInstanceApi diff --git a/backend/geonature/core/gn_meta/mtd/mtd_utils.py b/backend/geonature/core/gn_meta/mtd/mtd_utils.py index e8bde8bf04..a3e5a654fa 100644 --- a/backend/geonature/core/gn_meta/mtd/mtd_utils.py +++ b/backend/geonature/core/gn_meta/mtd/mtd_utils.py @@ -1,15 +1,19 @@ import logging import json from copy import copy +import pprint +from typing import Literal, Union +import uuid from flask import current_app +from pypnusershub.routes import insert_or_update_role from sqlalchemy import select, exists -from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.exc import SQLAlchemyError, IntegrityError from sqlalchemy.sql import func, update from sqlalchemy.dialects.postgresql import insert as pg_insert -from geonature.utils.env import DB +from geonature.utils.env import DB, db from geonature.core.gn_meta.models import ( TDatasets, CorDatasetActor, @@ -32,7 +36,8 @@ } # get the root logger -log = logging.getLogger() +# log = logging.getLogger() +logger = logging.getLogger("MTD_SYNC") def sync_ds(ds, cd_nomenclatures): @@ -43,12 +48,22 @@ def sync_ds(ds, cd_nomenclatures): :param ds: DS infos :param cd_nomenclatures: cd_nomenclature from ref_normenclatures.t_nomenclatures """ + + uuid_ds = ds["unique_dataset_id"] + name_ds = ds["dataset_name"] + + logger.debug("MTD - PROCESSING DS WITH UUID '%s' AND NAME '%s'" % (uuid_ds, name_ds)) + if not ds["cd_nomenclature_data_origin"]: ds["cd_nomenclature_data_origin"] = "NSP" # FIXME: the following temporary fix was added due to possible differences in referential of nomenclatures values between INPN and GeoNature # should be fixed by ensuring that the two referentials are identical, at least for instances that integrates with INPN and thus rely on MTD synchronization from INPN Métadonnées: GINCO and DEPOBIO instances. - if ds["cd_nomenclature_data_origin"] not in cd_nomenclatures: + ds_cd_nomenclature_data_origin = ds["cd_nomenclature_data_origin"] + if ds_cd_nomenclature_data_origin not in cd_nomenclatures: + logger.warning( + f"MTD - Nomenclature with code '{ds_cd_nomenclature_data_origin}' not found in database - SKIPPING SYNCHRONIZATION OF DATASET WITH UUID '{uuid_ds}' AND NAME '{name_ds}'" + ) return # CONTROL AF @@ -62,7 +77,9 @@ def sync_ds(ds, cd_nomenclatures): ) if af is None: - log.warning(f"AF with UUID '{af_uuid}' not found in database.") + logger.warning( + f"MTD - AF with UUID '{af_uuid}' not found in database - SKIPPING SYNCHRONIZATION OF DATASET WITH UUID '{uuid_ds}' AND NAME '{name_ds}'" + ) return ds["id_acquisition_framework"] = af.id_acquisition_framework @@ -109,7 +126,8 @@ def sync_ds(ds, cd_nomenclatures): def sync_af(af): - """Will update a given AF (Acquisition Framework) if already exists in database according to UUID, else update the AF. + """ + Will update a given AF (Acquisition Framework) if already exists in database according to UUID, else update the AF. Parameters ---------- @@ -121,7 +139,22 @@ def sync_af(af): TAcquisitionFramework The updated or inserted acquisition framework. """ + # TODO: handle case where af_uuid is None ; as will raise an error at database level when executing the statement below ; + # af_uuid being None, i.e. af UUID is missing, could be due to no UUID specified in `` tag in the XML file + # Solutions - if UUID is missing: + # - Just pass the sync of the AF + # - Generate a UUID for the AF af_uuid = af["unique_acquisition_framework_id"] + name_af = af["acquisition_framework_name"] + + logger.debug("MTD - PROCESSING AF WITH UUID '%s' AND NAME '%s'" % (af_uuid, name_af)) + + if not af_uuid: + logger.warning( + f"No UUID provided for the AF with UUID '{af_uuid}' and name '{name_af}' - SKIPPING SYNCHRONIZATION FOR THIS AF." + ) + return None + af_exists = DB.session.scalar( exists().where(TAcquisitionFramework.unique_acquisition_framework_id == af_uuid).select() ) @@ -184,53 +217,235 @@ def add_or_update_organism(uuid, nom, email): return DB.session.execute(statement).scalar() -def associate_actors(actors, CorActor, pk_name, pk_value): +def associate_actors( + actors, + CorActor: Union[CorAcquisitionFrameworkActor, CorDatasetActor], + pk_name: Literal["id_acquisition_framework", "id_dataset"], + pk_value: str, + uuid_mtd: str, +): """ - Associate actor and DS or AF according to CorActor value. + Associate actors with either a given : + - Acquisition framework - writing to the table `gn_meta.cor_acquisition_framework_actor`. + - Dataset - writing to the table `gn_meta.cor_dataset_actor`. Parameters ---------- actors : list list of actors - CorActor : db.Model - table model - pk_name : str - pk attribute name + CorActor : Union[CorAcquisitionFrameworkActor, CorDatasetActor] + the SQLAlchemy model corresponding to the destination table + pk_name : Literal['id_acquisition_framework', 'id_dataset'] + pk attribute name: + - 'id_acquisition_framework' for AF + - 'id_dataset' for DS pk_value : str - pk value + pk value: ID of the AF or DS + uuid_mtd : str + UUID of the AF or DS """ + type_mtd = "AF" if pk_name == "id_acquisition_framework" else "DS" for actor in actors: id_organism = None uuid_organism = actor["uuid_organism"] + organism_name = actor["organism"] + email_actor = actor["email"] + # TODO: choose whether to add or update an organism with no UUID specified + # - add or update it using organism name only - field `organism` if uuid_organism: with DB.session.begin_nested(): # create or update organisme # FIXME: prevent update of organism email from actor email ! Several actors may be associated to the same organism and still have different mails ! id_organism = add_or_update_organism( uuid=uuid_organism, - nom=actor["organism"] if actor["organism"] else "", - email=actor["email"], + nom=organism_name if organism_name else None, + email=email_actor, + ) + else: + # Create a new organism in database with the organism name `name_organism` and a new UUID + # /!\ We do not use the actor email as the organism email - field `email_organisme` will be empty + # Only the three non-null fields will be written: `id_organisme`, `uuid_organisme`, `nom_organisme`. + if organism_name: + # TODO: /!\ Handle case where there is also an organism with the name equals to the value of `name_organism` + # - check if there already is an organism with the name `name_organism` + # - if there is: + # - (optional ?) update the organism with the values `organism_name` and `email_actor` ? + # - set the variable `id_organism` with the ID of the existing organism + # - if there is not: + # - create a new organism with the value of `organism_name` for the name, but with no email and with newly generated ID and UUID + is_exists_organism = DB.session.scalar( + exists().where(BibOrganismes.nom_organisme == organism_name).select() ) + if is_exists_organism: + # TODO: chose whether to keep the following section - update of the organism with the values `organism_name` and `email_actor` ? + # --- If yes: uncomment the two following statements + # uuid_organism = DB.session.scalar( + # select(BibOrganismes.uuid_organisme) + # .where(BibOrganismes.nom_organisme == organism_name) + # .limit(1) + # ) + # id_organism = add_or_update_organism( + # uuid=uuid_organism, + # nom=organism_name, + # email=email_actor, + # ) + id_organism = DB.session.scalar( + select(BibOrganismes.id_organisme) + .where(BibOrganismes.nom_organisme == organism_name) + .limit(1) + ) + else: + with DB.session.begin_nested(): + # create or update organisme + # FIXME: prevent update of organism email from actor email ! Several actors may be associated to the same organism and still have different mails ! + id_organism = add_or_update_organism( + uuid=str(uuid.uuid4()), + nom=organism_name, + email=None, + ) + cd_nomenclature_actor_role = actor["actor_role"] + id_nomenclature_actor_role = func.ref_nomenclatures.get_id_nomenclature( + "ROLE_ACTEUR", cd_nomenclature_actor_role + ) values = dict( - id_nomenclature_actor_role=func.ref_nomenclatures.get_id_nomenclature( - "ROLE_ACTEUR", actor["actor_role"] - ), + id_nomenclature_actor_role=id_nomenclature_actor_role, **{pk_name: pk_value}, ) - if not id_organism: - values["id_role"] = DB.session.scalar( - select(User.id_role).filter_by(email=actor["email"]) - ) - else: + # TODO: choose wether to: + # - (retained) Try to associate to an organism first and then to a user + # - Try to associate to a user first and then to an organism + if id_organism: values["id_organism"] = id_organism - statement = ( - pg_insert(CorActor) - .values(**values) - .on_conflict_do_nothing( - index_elements=[pk_name, "id_organism", "id_nomenclature_actor_role"], + # TODO: handle case where no user is retrieved for the actor email: + # - (retained) If the actor role is "Contact Principal" associate to a new user with only a UUID and an ID, else just do not try to associate the actor with the metadata + # - Try to retrieve an id_organism from the organism name - field `organism` + # - Try to retrieve an id_organism from the actor email considered as the organism email - field `email` + # - Try to insert a new user from the actor name - field `name` - and possibly also email - field `email` + else: + id_user_from_email = DB.session.scalar( + select(User.id_role).filter_by(email=email_actor).where(User.groupe.is_(False)) + ) + if id_user_from_email: + values["id_role"] = id_user_from_email + else: + # If actor role is "Contact Principal", i.e. cd_nomenclature_actor_role = '1' , + # then we create a new user with no specific information + # the three non-null fields for `utilisateurs.t_roles` will be set to default: + # - `groupe`: False - the role is a user and not a group + # - `id_role`: generated by the nextval sequence + # - `uuid_role`: generated by uuid_generate_v4() + # in particular: we do not specify field `email` even if `email_actor` is to be set + # TODO: FUNCTIONAL - verify: + # - whether to: + # - (alternative 0) systematically add a new user for each concerned metadata + # - (RETAINED FOR THE MOMENT) or rather use a single user for all metadata without retrieved "Contact Principal" + # - whether it is right to use only (strictly) negative values for those new users: + # - must ensure that we will not already have (strictly) negative values in the `utilisateurs.t_roles` table for concerned instances (GINCO, DEPOBIO, ...) + # - what are the limits for 'serial4' PostgreSQL type: minimum and maximum values, ... + # - alternatives: + # - using the nextval sequence several times until an unused value is found BUT possible future conflict with new entries from INPN users + # - take the first value superior to 0 and which is not already used BUT possible future conflict with new entries from INPN users + # - take a base value which is enough high to avoid conflicts with INPN users IDs + # - start from the maximum or minimum value allowed by 'serial4' PostgreSQL type + # - whether to add a new user or a new organism... + cd_nomenclature_actor_role_for_contact_principal = "1" + if cd_nomenclature_actor_role == cd_nomenclature_actor_role_for_contact_principal: + ### Alternative 0 (see TODO above): + ### UNCOMMENT the folowing section if this alternative is eventually chosen + # # Get an integer that is equals to 1 less than the minimum of `utilisateurs.id_role` field in database + # # to ensure that the new ID is not already used + # # - we cannot rely on the next value from the sequence `utilisateurs.t_roles.t_roles_id_role_seq` because we can have, + # # and actually have for GINCO and DEPOBIO instances, ID values higher than this next value: this is due + # # to the fact that entries are inserted in the table `utilisateurs.t_roles` specifying the ID rather than + # # using the nextval sequence. + # # - moreover, we cannot actually take a (strictly) positive integer value as we risk to take an ID that could + # # later be needed for the retrieval of a user from the INPN as users are retrieved from the INPN taking + # # values associated to them in the INPN, especially the ID, for the insert in `utilisateurs.t_roles`. + # # - in other words, the field `utilisateurs.t_roles.id_roles` is actually not localized to the GeoNature instance, at + # # least for instances integrated with the INPN such as GINCO and DEPOBIO instances. + # # We take a value that is at most equal to -1 so as to clearly identify those generated users as the users + # # with (strictly) negative IDs. + # min_id_role = DB.session.scalar( + # select(func.min(User.id_role)) + # ) + # if min_id_role == 1: + # min_id_role = -1 + # if min_id_role: + # id_generated_user = min_id_role - 1 + # else: + # id_generated_user = 1 + ### RETAINED FOR THE MOMENT (see TODO above): + # TODO: + # - i. TODO: Choose of ID for user created for metadata with no "Contact Principal" that could be retrieved - alternatives: + # - (RETAINED FOR THE MOMENT)(alternative 0.1) value 0 - assuming that 0 is never used amongst the different instances + # - (alternative 0.2) value -1 - assuming that -1 is never used amongst the different instances AND that it is a valid value for 'serial4' + # actually it should not be possible to have negative values for the 'serial4' PostgreSQL type, + # which should allow values ranging from 1 to 2147483647 + # // documentation PG 16: https://www.postgresql.org/docs/current/datatype-numeric.html#DATATYPE-NUMERIC + # BUT a test creating a user with ID = -1 in a GN database has been done successfully... + # TODO: determiner why is it possible to have a value of -1, and other negative values (-2 has been tested also) + # - (alternative 0.3) value 2147483647 - maximum value for 'serial4' PostgreSQL type + # - (alternative 0.4) minimum negative value [to be determined] + # - ii. TODO: Choose whether to + # - add textual information to the created user + # - which information to provide: + # - (RETAINED FOR THE MOMENT) "Contact Principal for 'orphan' metadata - with no 'Contact Principal' that could be retrieved during MTD INPN synchronisation" + # - (alternatives) ... + # - which fields in `utilisateurs.t_roles` to be written: + # - (?) `identifiant` + # - (?) `nom_role` + # - (?) `prenom_role` + # - (?) `email` + # - (?) `id_organisme` + # - (RETAINED FOR THE MOMENT) `desc_role` (text) + # - (RETAINED FOR THE MOMENT) "Contact Principal for 'orphan' metadata - i.e. with no 'Contact Principal' that could be retrieved during INPN MTD synchronisation" + # - (??) `champs_addi` + # - (?) `remarques` (text) + # - (alternative) or not - the user will only have: an ID, a UUID + # Retrieve information for the role with ID 0 + desc_role_for_new_user = "Contact Principal for 'orphan' metadata - i.e. with no 'Contact Principal' that could be retrieved during MTD INPN synchronisation" + id_new_user = 0 + new_user = DB.session.get(User, id_new_user) + if new_user: + assert new_user.desc_role == desc_role_for_new_user + else: + dict_data_generated_user = { + "id_role": id_new_user, + "desc_role": desc_role_for_new_user, + } + dict_data_generated_user = insert_or_update_role( + data=dict_data_generated_user + ) + # TODO: verify it is ok to commit here + # Done to ensure that the insert from previous statement is actually committed + db.session.commit() + values["id_role"] = id_new_user + else: + logger.warning( + f"MTD - actor association impossible for {type_mtd} with UUID '{uuid_mtd}' because no id_organism nor id_role could be retrieved - with the following actor information:\n" + + format_str_dict_actor_for_logging(actor) + ) + continue + try: + statement = ( + pg_insert(CorActor) + .values(**values) + .on_conflict_do_nothing( + index_elements=[ + pk_name, + "id_organism" if id_organism else "id_role", + "id_nomenclature_actor_role", + ], + ) + ) + DB.session.execute(statement) + except IntegrityError as I: + db.session.rollback() + logger.error( + f"MTD - DB INTEGRITY ERROR - actor association failed for {type_mtd} with UUID '{uuid_mtd}' and following actor information:\n" + + format_sqlalchemy_error_for_logging(I) + + format_str_dict_actor_for_logging(actor) ) - ) - DB.session.execute(statement) def associate_dataset_modules(dataset): @@ -246,3 +461,51 @@ def associate_dataset_modules(dataset): ) ).all() ) + + +def format_sqlalchemy_error_for_logging(error: SQLAlchemyError): + """ + Format SQLAlchemy error information in a nice way for MTD logging + + Parameters + ---------- + error : SQLAlchemyError + the SQLAlchemy error + + Returns + ------- + str + formatted error information + """ + indented_original_error_message = str(error.orig).replace("\n", "\n\t") + + formatted_error_message = "".join( + [ + f"\t{indented_original_error_message}", + f"SQL QUERY: {error.statement}\n", + f"\tSQL PARAMS: {error.params}\n", + ] + ) + + return formatted_error_message + + +def format_str_dict_actor_for_logging(actor: dict): + """ + Format actor information in a nice way for MTD logging + + Parameters + ---------- + actor : dict + actor information: actor_role, email, name, organism, uuid_organism, ... + + Returns + ------- + str + formatted actor information + """ + formatted_str_dict_actor = "\tACTOR:\n\t\t" + pprint.pformat(actor).replace( + "\n", "\n\t\t" + ).rstrip("\t") + + return formatted_str_dict_actor diff --git a/backend/geonature/core/gn_meta/mtd/xml_parser.py b/backend/geonature/core/gn_meta/mtd/xml_parser.py index efcf98d6be..5ada7e3d20 100644 --- a/backend/geonature/core/gn_meta/mtd/xml_parser.py +++ b/backend/geonature/core/gn_meta/mtd/xml_parser.py @@ -1,5 +1,6 @@ import datetime import json +from typing import Union from flask import current_app from lxml import etree as ET @@ -61,6 +62,13 @@ def parse_actors_xml(actors): return actor_list +# TODO: IMPORTANT - filter the list of acquisition frameworks with `ID_INSTANCE_FILTER` as made for the list of datasets + +# TODO: make functions for AF and DS homogeneous: refactorize +# - Have functions `parse_acquisition_frameworks_xml`, and `parse_datasets_xml`, OR `parse_acquisition_framework`, and `parse_dataset` +# - Eventually split into distinct functions the XML parsing and the mapping of fields + + def parse_acquisition_framwork_xml(xml): """ Parse an xml of AF from a string @@ -130,10 +138,33 @@ def parse_jdd_xml(xml): root = ET.fromstring(xml, parser=_xml_parser) jdd_list = [] + + def format_acquisition_framework_id_from_xml(provided_af_uuid) -> Union[str, None]: + """ + Format the acquisition framework UUID provided for the dataset + i.e. the value for the tag `` in the XML file + + Args: + provided_af_uuid (str): The acquisition framework UUID + Returns: + Union[str, None]: The formatted acquisition framework UUID, or None if none was provided + """ + if not provided_af_uuid: + return None + + if provided_af_uuid.startswith("http://oafs.fr/meta/ca/"): + return provided_af_uuid.split("/")[-1] + + return provided_af_uuid + for jdd in root.findall(".//" + namespace + "JeuDeDonnees"): # We extract all the required informations from the different tags of the XML file jdd_uuid = get_tag_content(jdd, "identifiantJdd") - ca_uuid = get_tag_content(jdd, "identifiantCadre") + # TODO: handle case where value for the tag `` in the XML file is not of the form `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` + # Solutions - if in the form `http://oafs.fr/meta/ca/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` (has some entries for INPN MTD PREPROD and instance 'Nationale') : + # - (retained) Format by keeping only the `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` part + # - Add a check further in the MTD sync to process only if ca_uuid is in the right format + ca_uuid = format_acquisition_framework_id_from_xml(get_tag_content(jdd, "identifiantCadre")) dataset_name = get_tag_content(jdd, "libelle") dataset_shortname = get_tag_content(jdd, "libelleCourt", default_value="") dataset_desc = get_tag_content(jdd, "description", default_value="") diff --git a/backend/geonature/core/gn_meta/routes.py b/backend/geonature/core/gn_meta/routes.py index 5bb91bbee0..15af2e15ae 100644 --- a/backend/geonature/core/gn_meta/routes.py +++ b/backend/geonature/core/gn_meta/routes.py @@ -79,12 +79,14 @@ @routes.before_request def synchronize_mtd(): + # TODO: add `if request.method != "OPTIONS" and [...]` in the following condition if request.endpoint in ["gn_meta.get_datasets", "gn_meta.get_acquisition_frameworks_list"]: from flask_login import current_user if current_user.is_authenticated: params = request.json if request.is_json else request.args try: + # TODO: trigger a user sync without id_af in case no id_af is provided list_id_af = params.get("id_acquisition_frameworks", []) for id_af in list_id_af: sync_af_and_ds_by_user(id_role=current_user.id_role, id_af=id_af)