bis-med-it · javihern98 · Dec 18, 2024 · Dec 18, 2024 · Dec 19, 2024 · Dec 19, 2024
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,7 +23,7 @@ classifiers = [
 
 [tool.poetry.dependencies]
 python = "^3.9"
-httpx = {version = "0.*", optional = true}
+httpx = "^0.27.0"
 msgspec = "0.*"
 lxml = {version = "5.*", optional = true}
 xmltodict = {version = "0.*", optional = true}
@@ -34,7 +34,6 @@ pandas = {version = "^2.2.2", optional = true}
 
 [tool.poetry.extras]
 dc = ["python-dateutil"]
-fmr = ["httpx"]
 xml = ["lxml", "xmltodict", "sdmxschemas"]
 data = ["pandas"]
 

diff --git a/src/pysdmx/io/__init__.py b/src/pysdmx/io/__init__.py
@@ -1 +1,5 @@
 """IO module for SDMX data."""
+
+from pysdmx.io.reader import get_datasets, read_sdmx
+
+__all__ = ["read_sdmx", "get_datasets"]
diff --git a/src/pysdmx/io/csv/sdmx10/reader/__init__.py b/src/pysdmx/io/csv/sdmx10/reader/__init__.py
@@ -14,10 +14,7 @@ def __generate_dataset_from_sdmx_csv(data: pd.DataFrame) -> PandasDataset:
     structure_id = data["DATAFLOW"].iloc[0]
     # Drop 'DATAFLOW' column from DataFrame
     df_csv = data.drop(["DATAFLOW"], axis=1)
-    urn = (
-        f"urn:sdmx:org.sdmx.infomodel.datastructure."
-        f"DataFlow={structure_id}"
-    )
+    urn = f"DataFlow={structure_id}"
 
     # Extract dataset attributes from sdmx-csv (all values are the same)
     attributes = {

diff --git a/src/pysdmx/io/csv/sdmx20/__init__.py b/src/pysdmx/io/csv/sdmx20/__init__.py
@@ -1,6 +1,6 @@
 """SDMX 2.0 CSV reader and writer."""
 
-from pysdmx.model.message import ActionType
+from pysdmx.model.dataset import ActionType
 
 SDMX_CSV_ACTION_MAPPER = {
     ActionType.Append: "A",

diff --git a/src/pysdmx/io/csv/sdmx20/reader/__init__.py b/src/pysdmx/io/csv/sdmx20/reader/__init__.py
@@ -7,7 +7,7 @@
 
 from pysdmx.errors import Invalid
 from pysdmx.io.pd import PandasDataset
-from pysdmx.model.message import ActionType
+from pysdmx.model.dataset import ActionType
 
 ACTION_SDMX_CSV_MAPPER_READING = {
     "A": ActionType.Append,
@@ -49,20 +49,11 @@ def __generate_dataset_from_sdmx_csv(data: pd.DataFrame) -> PandasDataset:
     df_csv = data.drop(["STRUCTURE", "STRUCTURE_ID"], axis=1)
 
     if structure_type == "DataStructure".lower():
-        urn = (
-            "urn:sdmx:org.sdmx.infomodel.datastructure."
-            f"DataStructure={structure_id}"
-        )
+        urn = f"DataStructure={structure_id}"
     elif structure_type == "DataFlow".lower():
-        urn = (
-            "urn:sdmx:org.sdmx.infomodel.datastructure."
-            f"DataFlow={structure_id}"
-        )
+        urn = f"DataFlow={structure_id}"
     elif structure_type == "dataprovision":
-        urn = (
-            f"urn:sdmx:org.sdmx.infomodel.registry."
-            f"ProvisionAgreement={structure_id}"
-        )
+        urn = f"ProvisionAgreement={structure_id}"
     else:
         raise Invalid(
             "Invalid value on STRUCTURE column",

diff --git a/src/pysdmx/io/enums.py b/src/pysdmx/io/enums.py
@@ -0,0 +1,21 @@
+"""IO Enumerations for SDMX files."""
+
+from enum import Enum
+
+
+class SDMXFormat(Enum):
+    """Enumeration of supported SDMX read formats."""
+
+    SDMX_ML_2_1_STRUCTURE = "SDMX-ML 2.1 Structure"
+    SDMX_ML_2_1_DATA_STRUCTURE_SPECIFIC = "SDMX-ML 2.1 StructureSpecific"
+    SDMX_ML_2_1_DATA_GENERIC = "SDMX-ML 2.1 Generic"
+    SDMX_ML_2_1_SUBMISSION = "SDMX-ML 2.1 Submission"
+    SDMX_ML_2_1_ERROR = "SDMX-ML 2.1 Error"
+    SDMX_JSON_2 = "SDMX-JSON 2.0.0"
+    FUSION_JSON = "FusionJSON"
+    SDMX_CSV_1_0 = "SDMX-CSV 1.0"
+    SDMX_CSV_2_0 = "SDMX-CSV 2.0"
+
+    def __str__(self) -> str:
+        """Return the string representation of the format."""
+        return self.value
diff --git a/src/pysdmx/io/input_processor.py b/src/pysdmx/io/input_processor.py
@@ -1,12 +1,18 @@
 """Processes the input that comes into read_sdmx function."""
 
-from io import BytesIO, TextIOWrapper
+import csv
+import os.path
+from io import BytesIO, StringIO, TextIOWrapper
 from json import JSONDecodeError, loads
 from os import PathLike
 from pathlib import Path
 from typing import Tuple, Union
 
-from pysdmx.errors import Invalid
+import pandas as pd
+from httpx import get as httpx_get
+
+from pysdmx.errors import Invalid, NotImplemented
+from pysdmx.io.enums import SDMXFormat
 
 
 def __remove_bom(input_string: str) -> str:
@@ -17,50 +23,113 @@ def __check_xml(infile: str) -> bool:
     return infile[:5] == "<?xml"
 
 
+def __check_csv(infile: str) -> bool:
+    try:
+        pd.read_csv(StringIO(infile), nrows=2)
+        if (
+            len(infile.splitlines()) > 1
+            or infile.splitlines()[0].count(",") > 1
+        ):
+            return True
+    except Exception:
+        return False
+    return False
+
+
+def __check_json(infile: str) -> bool:
+    try:
+        loads(infile)
+        return True
+    except JSONDecodeError:
+        return False
+
+
+def __get_sdmx_ml_flavour(infile: str) -> Tuple[str, SDMXFormat]:
+    flavour_check = infile[:1000].lower()
+    if ":generic" in flavour_check:
+        return infile, SDMXFormat.SDMX_ML_2_1_DATA_GENERIC
+    if ":structurespecificdata" in flavour_check:
+        return infile, SDMXFormat.SDMX_ML_2_1_DATA_STRUCTURE_SPECIFIC
+    if ":structure" in flavour_check:
+        return infile, SDMXFormat.SDMX_ML_2_1_STRUCTURE
+    if ":registryinterface" in flavour_check:
+        return infile, SDMXFormat.SDMX_ML_2_1_SUBMISSION
+    if ":error" in flavour_check:
+        return infile, SDMXFormat.SDMX_ML_2_1_ERROR
+    raise Invalid("Validation Error", "Cannot parse input as SDMX-ML.")
+
+
+def __get_sdmx_csv_flavour(infile: str) -> Tuple[str, SDMXFormat]:
+    headers = csv.reader(StringIO(infile)).__next__()
+    if "DATAFLOW" in headers:
+        return infile, SDMXFormat.SDMX_CSV_1_0
+    elif "STRUCTURE" in headers and "STRUCTURE_ID" in headers:
+        return infile, SDMXFormat.SDMX_CSV_2_0
+    raise Invalid("Validation Error", "Cannot parse input as SDMX-CSV.")
+
+
+def __check_sdmx_str(infile: str) -> Tuple[str, SDMXFormat]:
+    """Attempts to infer the SDMX format of the input string."""
+    if __check_xml(infile):
+        return __get_sdmx_ml_flavour(infile)
+    if __check_csv(infile):
+        return __get_sdmx_csv_flavour(infile)
+    if __check_json(infile):
+        raise NotImplemented("JSON formats reading are not supported yet")
+    raise Invalid("Validation Error", "Cannot parse input as SDMX.")
+
+
 def process_string_to_read(
-    infile: Union[str, Path, BytesIO],
-) -> Tuple[str, str]:
+    input: Union[str, Path, BytesIO],
+) -> Tuple[str, SDMXFormat]:
     """Processes the input that comes into read_sdmx function.
 
+    Automatically detects the format of the input. The input can be a file,
+    URL, or string.
+
     Args:
-        infile: Path to file, URL, or string.
+        input: Path to file, URL, or string.
 
     Returns:
         tuple: Tuple containing the parsed input and the format of the input.
 
     Raises:
         Invalid: If the input cannot be parsed as SDMX.
     """
+    if isinstance(input, str) and os.path.exists(input):
+        input = Path(input)
     # Read file as string
-    if isinstance(infile, (Path, PathLike)):
-        with open(infile, "r", encoding="utf-8-sig", errors="replace") as f:
+    if isinstance(input, (Path, PathLike)):
+        with open(input, "r", encoding="utf-8-sig", errors="replace") as f:
             out_str = f.read()
 
     # Read from BytesIO
-    elif isinstance(infile, BytesIO):
-        text_wrap = TextIOWrapper(infile, encoding="utf-8", errors="replace")
+    elif isinstance(input, BytesIO):
+        text_wrap = TextIOWrapper(input, encoding="utf-8", errors="replace")
         out_str = text_wrap.read()
 
-    elif isinstance(infile, str):
-        out_str = infile
+    elif isinstance(input, str):
+        if input.startswith("http"):
+            try:
+                response = httpx_get(input, timeout=60)
+                if (
+                    response.status_code != 200
+                    and "<?xml" not in response.text
+                ):
+                    raise Exception("Invalid URL, no SDMX Error found")
+                out_str = response.text
+            except Exception:
+                raise Invalid(
+                    "Validation Error",
+                    f"Cannot retrieve a SDMX Message from URL: {input}.",
+                ) from None
+        else:
+            out_str = input
     else:
         raise Invalid(
-            "Validation Error", f"Cannot parse input of type {type(infile)}."
+            "Validation Error", f"Cannot parse input of type {type(input)}."
         )
 
     out_str = __remove_bom(out_str)
 
-    # Check if string is a valid JSON
-    try:
-        loads(out_str)
-        return out_str, "json"
-    except JSONDecodeError:
-        pass
-
-    # Check if string is a valid XML
-    if __check_xml(out_str):
-        return out_str, "xml"
-
-    raise Invalid(
-        "Validation Error", f"Cannot parse input as SDMX. Found {infile}"
-    )
+    return __check_sdmx_str(out_str)
diff --git a/src/pysdmx/io/reader.py b/src/pysdmx/io/reader.py
@@ -0,0 +1,126 @@
+"""SDMX All formats reader module."""
+
+from io import BytesIO
+from pathlib import Path
+from typing import Sequence, Union
+
+from pysdmx.errors import Invalid, NotFound
+from pysdmx.io.enums import SDMXFormat
+from pysdmx.io.input_processor import process_string_to_read
+from pysdmx.model import Schema
+from pysdmx.model.dataset import Dataset
+from pysdmx.model.message import Message
+from pysdmx.util import parse_short_urn
+
+
+def read_sdmx(
+    infile: Union[str, Path, BytesIO],
+    validate: bool = True,
+    use_dataset_id: bool = False,
+) -> Message:
+    """Reads any SDMX file, URL or buffer and returns a Message.
+
+    Supported metadata formats are:
+    - SDMX-ML 2.1
+
+    Supported data formats are:
+    - SDMX-ML 2.1
+    - SDMX-CSV 1.0
+    - SDMX-CSV 2.0
+
+    Args:
+        infile: Path to file (pathlib.Path), URL, string or BytesIO buffer.
+        use_dataset_id: Whether to use the dataset ID as
+            the key in the resulting dictionary (only for SDMX-ML).
+        validate: Validate the input file (only for SDMX-ML).
+
+    Returns:
+        A Message containing the parsed SDMX data or structures.
+
+    Raises:
+        Invalid: If the file is empty or the format is not supported.
+    """
+    input_str, read_format = process_string_to_read(infile)
+
+    if read_format in (
+        SDMXFormat.SDMX_ML_2_1_DATA_GENERIC,
+        SDMXFormat.SDMX_ML_2_1_DATA_STRUCTURE_SPECIFIC,
+        SDMXFormat.SDMX_ML_2_1_STRUCTURE,
+        SDMXFormat.SDMX_ML_2_1_SUBMISSION,
+        SDMXFormat.SDMX_ML_2_1_ERROR,
+    ):
+        # SDMX-ML 2.1
+        from pysdmx.io.xml.sdmx21.reader import read_xml
+
+        result = read_xml(
+            input_str, validate=validate, use_dataset_id=use_dataset_id
+        )
+    elif read_format == SDMXFormat.SDMX_CSV_1_0:
+        # SDMX-CSV 1.0
+        from pysdmx.io.csv.sdmx10.reader import read
+
+        result = read(input_str)
+    else:
+        # SDMX-CSV 2.0
+        from pysdmx.io.csv.sdmx20.reader import read
+
+        result = read(input_str)
+
+    if len(result) == 0:
+        raise Invalid("Empty SDMX Message")
+
+    # Returning a Message class
+    if read_format in (
+        SDMXFormat.SDMX_CSV_1_0,
+        SDMXFormat.SDMX_CSV_2_0,
+        SDMXFormat.SDMX_ML_2_1_DATA_GENERIC,
+        SDMXFormat.SDMX_ML_2_1_DATA_STRUCTURE_SPECIFIC,
+    ):
+        # TODO: Add here the Schema download for Datasets, based on structure
+        # TODO: Ensure we have changed the signature of the data readers
+        return Message(data=result)
+
+    # TODO: Ensure we have changed the signature of the structure readers
+    return Message(structures=result)
+
+
+def get_datasets(
+    data: Union[str, Path, BytesIO],
+    structure: Union[str, Path, BytesIO],
+    validate: bool = True,
+) -> Sequence[Dataset]:
+    """Reads a data message and a structure message and returns datasets.
+
+    Args:
+        data: Path to file (pathlib.Path), URL, or string for the data message.
+        structure:
+          Path to file (pathlib.Path), URL, or string
+          for the structure message.
+        validate: Validate the input file (only for SDMX-ML).
+
+    Returns:
+        A sequence of Datasets
+    """
+    data_msg = read_sdmx(data, validate=validate)
+    if not data_msg.data:
+        raise Invalid("No data found in the data message")
+
+    structure_msg = read_sdmx(structure, validate=validate)
+    if structure_msg.structures is None:
+        raise Invalid("No structure found in the structure message")
+
+    for dataset in data_msg.data.values():
+        short_urn: str = (
+            dataset.structure.short_urn
+            if isinstance(dataset.structure, Schema)
+            else dataset.structure
+        )
+        sdmx_type = parse_short_urn(short_urn).sdmx_type
+        if sdmx_type == "DataStructure":
+            try:
+                dsd = structure_msg.get_data_structure_definition(short_urn)
+                dataset.structure = dsd.to_schema()
+            except NotFound:
+                continue
+
+    return list(data_msg.data.values())