S3pathparser and contribution readme.md #14

Open
wants to merge 3 commits into base: main

3 changes: 2 additions & 1 deletion .gitignore
@@ -5,4 +5,5 @@ build/
dist/
.pytest_cache
.ipynb_*
*.__pyc
*.__pyc
venv/
34 changes: 34 additions & 0 deletions CONTRIBUTING.md
@@ -0,0 +1,34 @@
## Development Setup

To set up a development environment and install an editable version of the `impresso_essentials` package, follow these steps:

1. **Create a virtual environment**:

```sh
python3 -m venv venv
```

2. **Activate the virtual environment**:

- On Windows:
```sh
venv\Scripts\activate
```
- On macOS/Linux:
```sh
source venv/bin/activate
```

3. **Install the package in editable mode**:

```sh
pip install -e .
```

4. **Run the tests**:
```sh
pytest
```

This will discover and run all tests in the `tests` directory.
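
As a quick sanity check that the editable install is active (a minimal sketch following the setup steps above, not part of this PR), Python should resolve the package to the local checkout rather than to `site-packages`:

```python
# Minimal sanity check for the editable install created with `pip install -e .`.
# Assumes the virtual environment from the steps above is activated.
import impresso_essentials

# For an editable install this path should point into the local checkout,
# not into the environment's site-packages directory.
print(impresso_essentials.__file__)
```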
159 changes: 159 additions & 0 deletions impresso_essentials/io/s3_path_parser.py
@@ -0,0 +1,159 @@
# -*- coding: utf-8 -*-
"""S3 file path parser.

Automatically generated by Colab; the original notebook is located at
https://colab.research.google.com/drive/1_-fTugHJfr7HsPFjnGTelvdC7E97NJ-K
"""

import re

# Define the possible values for various components
phases = "sandbox|staging|final"
processing_labels = (
"embeddings|entities|langident|lingproc|ocrqa|newsagencies|topics|textreuse"
)
processing_subtype_labels = "embeddings|images|entities"
tasks = "ner|nel|tm|emb|lid|pos"
subtasks = "newsagency"
langs = "de|fr|en|lb|multilingual"


# Construct the regex pattern using re.VERBOSE
pattern = rf"""
^s3://
(?P<bucket>
(?P<stage_number>\d{{2}})
-processed-data-
(?P<phase>{phases})
)
/ (?P<processing_label>{processing_labels})
(?: / (?P<processing_subtype_label>{processing_subtype_labels}) )?
/
(?P<run_id>
(?P=processing_label)
-
(?P<model_id>
(?P<task>{tasks})
(?:_(?P<subtask>{subtasks}))?
-
(?P<model_specificity>[A-Za-z][A-Za-z_-]*)
(?:_(?P<model_version>v(?P<model_major>\d+)\.(?P<model_minor>\d+)\.(?P<model_patch>\d+)))?
-
(?P<lang>{langs})
)
_
(?P<run_version>v(?P<run_major>\d+)-(?P<run_minor>\d+)-(?P<run_patch>\d+))
)
(?:/(?P<provider_alias>[A-Za-z]+))?
(?:/(?P<media_alias>[A-Za-z0-9]+))
/
(?P<file_stem>
(?: (?P=media_alias) - )  # backreference: the file stem starts with the media alias
(?P<year>\d{{4}})
(?: - (?P=processing_label) )?  # optional backreference: suffix repeating the processing label
)
\.jsonl\.bz2$
"""

# Compile the regex pattern
regex = re.compile(pattern, re.VERBOSE)


def parse_s3_path(path):
"""
Parses the given S3 path according to the defined pattern.

Args:
path (str): The S3 path to parse.

Returns:
dict: A dictionary of the matched components, or None if no match is found.
"""
match = regex.match(path)
if match:
return match.groupdict()
else:
return None


def test_matching_paths(test_paths, verbose=True):
"""
Tests paths that are expected to match the regex pattern.
"""

print("Testing paths expected to match:")
for path in test_paths:
result = parse_s3_path(path)
if result:
print(f"✅ Passed: {path}")
            # Print the matched groups when verbose is enabled
if verbose:
for key, value in result.items():
print(f" {key}: {value}")
else:
print(f"❌ Failed: {path} (Expected to match but did not)")
print("-" * 80)


def test_non_matching_paths(test_paths, verbose=True):
"""
Tests paths that are expected not to match the regex pattern.
"""

print("Testing paths expected not to match:")
for path in test_paths:
result = parse_s3_path(path)
        if not result:
            print(f"✅ Passed: {path} (Correctly did not match)")
        else:
            print(f"❌ Failed: {path} (Expected not to match but did)")
            # Show the unexpectedly matched groups to help debug the pattern
            if verbose:
                for key, value in result.items():
                    print(f"    {key}: {value}")
print("-" * 80)


correct_test_paths = [
# Full path with all components
"s3://01-processed-data-final/entities/embeddings/entities-ner-en_core_web_sm_v3.1.0-en_v1-0-0/Reuters/UK/UK-2021.jsonl.bz2",
# Path without opt_processing_subtype_label
"s3://02-processed-data-staging/langident/langident-lid-fasttext_v1.0.0-multilingual_v2-0-1/BBC/BBC-2020-langident.jsonl.bz2",
# Path without provider_alias and model version
"s3://03-processed-data-sandbox/topics/topics-tm-lda_model-en_v3-2-4/EXP/EXP-2021-topics.jsonl.bz2",
# Path with optional subtask
"s3://01-processed-data-final/newsagencies/newsagencies-ner_newsagency-model_v1.2.0-en_v1-0-0/AFP/AFP-2021-newsagencies.jsonl.bz2",
# Path missing optional components
"s3://42-processed-data-final/embeddings/embeddings-tm-mallet-de_v4-0-0/MEDIA/MEDIA-2022.jsonl.bz2",
# Path with the processing_label repeated in the file stem, no provider_alias
"s3://42-processed-data-final/topics/topics-tm-bert_v3.0.0-en_v3-0-0/CNN/CNN-2024-topics.jsonl.bz2",
# Path with a pos task and a file stem without the processing_label suffix
"s3://41-processed-data-staging/lingproc/lingproc-pos-spacy_v3.6.0-multilingual_v1-0-2/IMP/IMP-2024.jsonl.bz2",
# Suggested layout for image embeddings (the test suite currently expects this path not to match)
"s3://42-processed-data-final/embeddings/images/image-embeddings/embeddings-resnet_dino_clip-v0-0-1/bnl/actionfem/actionfem-1927-image-embeddings.jsonl.bz2",
]
# Run the test functions
# test_matching_paths(correct_test_paths,verbose=True)

incorrect_test_paths = [
# Path with missing year in file_stem
"s3://06-processed-data-final/ocrqa/ocrqa-ner-en_core_web_lg_v2.2.2-en_v2-1-0/NewYorkTimes/USA/USA-ocrqa.jsonl.bz2",
# Path with missing processing_label_file_stem
"s3://07-processed-data-final/lingproc/lingproc-lid-fasttext_v1.0.0-en_v1-0-0/2023.jsonl.bz2",
# Path with invalid phase
"s3://10-processed-data-production/entities/entities-ner-en_core_web_sm_v3.1.0-en_v1-0-0/2021-entities.jsonl.bz2",
# Path with incorrect file extension
"s3://11-processed-data-final/embeddings/embeddings-emb-word2vec_v4.0.0-multilingual_v4-0-0/2022-embeddings.txt",
# Path missing processing_label in run_id
"s3://12-processed-data-final/topics/topics-tm-lda_v1.0.0-en_v1-0-0/2021-topics.jsonl.bz2",
# Path with incorrect model_id format
"s3://13-processed-data-final/entities/entities-unknownmodel_v1.0.0-en_v1-0-0/2021-entities.jsonl.bz2",
# Path with provider_alias but no media_alias
"s3://09-processed-data-final/lingproc/lingproc-tm-lda_v1.0.0-en_v1-0-0/PROVIDER/MEDIA-2025-lingproc.jsonl.bz2",
]


# test_non_matching_paths(incorrect_test_paths,verbose=False)
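
For reference, a minimal usage sketch of the parser added above (not part of the diff). It reuses the first entry of `correct_test_paths` and the group values asserted in the accompanying test file:

```python
# Minimal usage sketch for the parser introduced in this PR.
from impresso_essentials.io.s3_path_parser import parse_s3_path

path = (
    "s3://01-processed-data-final/entities/embeddings/"
    "entities-ner-en_core_web_sm_v3.1.0-en_v1-0-0/Reuters/UK/UK-2021.jsonl.bz2"
)

components = parse_s3_path(path)
if components is None:
    raise ValueError(f"Path does not follow the expected layout: {path}")

# A few of the named groups captured by the verbose regex defined above.
print(components["phase"])  # final
print(components["task"])   # ner
print(components["lang"])   # en
print(components["year"])   # 2021
```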
136 changes: 136 additions & 0 deletions tests/test_s3_path_parser.py
@@ -0,0 +1,136 @@
import re
import pytest
from impresso_essentials.io.s3_path_parser import pattern


@pytest.mark.parametrize(
"s3_path, expected",
[
        # Full path with all components (provider_alias and processing_subtype_label present)
(
"s3://01-processed-data-final/entities/embeddings/entities-ner-en_core_web_sm_v3.1.0-en_v1-0-0/Reuters/UK/UK-2021.jsonl.bz2",
{
"bucket": "01-processed-data-final",
"stage_number": "01",
"phase": "final",
"processing_label": "entities",
"processing_subtype_label": "embeddings",
"run_id": "entities-ner-en_core_web_sm_v3.1.0-en_v1-0-0",
"model_id": "ner-en_core_web_sm_v3.1.0-en",
"task": "ner",
"subtask": None,
"model_specificity": "en_core_web_sm",
"model_version": "v3.1.0",
"model_major": "3",
"model_minor": "1",
"model_patch": "0",
"lang": "en",
"run_version": "v1-0-0",
"run_major": "1",
"run_minor": "0",
"run_patch": "0",
"provider_alias": "Reuters",
"media_alias": "UK",
"file_stem": "UK-2021",
"year": "2021",
},
),
(
"s3://02-processed-data-staging/langident/langident-lid-fasttext_v1.0.0-multilingual_v2-0-1/BBC/BBC-2020-langident.jsonl.bz2",
{
"bucket": "02-processed-data-staging",
"stage_number": "02",
"phase": "staging",
"processing_label": "langident",
"processing_subtype_label": None,
"run_id": "langident-lid-fasttext_v1.0.0-multilingual_v2-0-1",
"model_id": "lid-fasttext_v1.0.0-multilingual",
"task": "lid",
"subtask": None,
"model_specificity": "fasttext",
"model_version": "v1.0.0",
"model_major": "1",
"model_minor": "0",
"model_patch": "0",
"lang": "multilingual",
"run_version": "v2-0-1",
"run_major": "2",
"run_minor": "0",
"run_patch": "1",
"provider_alias": None,
"media_alias": "BBC",
"file_stem": "BBC-2020-langident",
"year": "2020",
},
),
(
"s3://03-processed-data-sandbox/topics/topics-tm-lda_model-en_v3-2-4/EXP/EXP-2021-topics.jsonl.bz2",
{
"bucket": "03-processed-data-sandbox",
"stage_number": "03",
"phase": "sandbox",
"processing_label": "topics",
"processing_subtype_label": None,
"run_id": "topics-tm-lda_model-en_v3-2-4",
"model_id": "tm-lda_model-en",
"task": "tm",
"subtask": None,
"model_specificity": "lda_model",
"model_version": None,
"model_major": None,
"model_minor": None,
"model_patch": None,
"lang": "en",
"run_version": "v3-2-4",
"run_major": "3",
"run_minor": "2",
"run_patch": "4",
"provider_alias": None,
"media_alias": "EXP",
"file_stem": "EXP-2021-topics",
"year": "2021",
},
),
(
"s3://42-processed-data-final/topics/topics-tm-bert_v3.0.0-en_v3-0-0/CNN/CNN-2024-topics.jsonl.bz2",
{
"bucket": "42-processed-data-final",
"stage_number": "42",
"phase": "final",
"processing_label": "topics",
"processing_subtype_label": None,
"run_id": "topics-tm-bert_v3.0.0-en_v3-0-0",
"model_id": "tm-bert_v3.0.0-en",
"task": "tm",
"subtask": None,
"model_specificity": "bert",
"model_version": "v3.0.0",
"model_major": "3",
"model_minor": "0",
"model_patch": "0",
"lang": "en",
"run_version": "v3-0-0",
"run_major": "3",
"run_minor": "0",
"run_patch": "0",
"provider_alias": None,
"media_alias": "CNN",
"file_stem": "CNN-2024-topics",
"year": "2024",
},
),
],
)
def test_successful_s3_path_matches(s3_path, expected):
match = re.match(pattern, s3_path, re.VERBOSE)
assert match is not None, f"Pattern did not match for {s3_path}"
for key, value in expected.items():
assert (
match.group(key) == value
), f"For {key}: expected {value}, got {match.group(key)}"


def test_failed_s3_path_match():
path = "s3://42-processed-data-final/embeddings/images/image-embeddings/embeddings-resnet_dino_clip-v0-0-1/bnl/actionfem/actionfem-1927-image-embeddings.jsonl.bz2"
match = re.match(pattern, path, re.VERBOSE)
assert match is None, f"Pattern should not match for {path}"
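
A possible follow-up (a sketch, not part of this PR): the module-level `incorrect_test_paths` list defined in `s3_path_parser.py` could be parametrized as well, so that each rejected path is reported individually:

```python
# Sketch: parametrize the negative cases using the module-level list of paths
# that the parser module itself marks as invalid (incorrect_test_paths).
import re

import pytest

from impresso_essentials.io.s3_path_parser import incorrect_test_paths, pattern


@pytest.mark.parametrize("s3_path", incorrect_test_paths)
def test_incorrect_paths_do_not_match(s3_path):
    match = re.match(pattern, s3_path, re.VERBOSE)
    assert match is None, f"Pattern should not match for {s3_path}"
```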