S3pathparser and contribution readme.md #14

Open
wants to merge 3 commits into base: main

3 changes: 2 additions & 1 deletion .gitignore
@@ -5,4 +5,5 @@ build/
dist/
.pytest_cache
.ipynb_*
*.__pyc
*.__pyc
venv/
34 changes: 34 additions & 0 deletions CONTRIBUTING.md
@@ -0,0 +1,34 @@
## Development Setup

To set up a development environment and install an editable version of the `impresso_essentials` package, follow these steps:

1. **Create a virtual environment**:

```sh
python3 -m venv venv
```

2. **Activate the virtual environment**:

- On Windows:
```sh
venv\Scripts\activate
```
- On macOS/Linux:
```sh
source venv/bin/activate
```

3. **Install the package in editable mode**:

```sh
pip install -e .
```

4. **Run the tests**:
```sh
pytest
```

This will discover and run all tests in the `tests` directory.
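
As a quick sanity check that the editable install is active (a minimal sketch following the setup steps above, not part of this PR), Python should resolve the package to the local checkout rather than to `site-packages`:

```python
# Minimal sanity check for the editable install created with `pip install -e .`.
# Assumes the virtual environment from the steps above is activated.
import impresso_essentials

# For an editable install this path should point into the local checkout,
# not into the environment's site-packages directory.
print(impresso_essentials.__file__)
```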
159 changes: 159 additions & 0 deletions impresso_essentials/io/s3_path_parser.py
@@ -0,0 +1,159 @@
# -*- coding: utf-8 -*-
"""S3 file path parser.

Automatically generated by Colab; the original notebook is located at
https://colab.research.google.com/drive/1_-fTugHJfr7HsPFjnGTelvdC7E97NJ-K
"""

import re

# Define the possible values for various components
phases = "sandbox|staging|final"
processing_labels = (
"embeddings|entities|langident|lingproc|ocrqa|newsagencies|topics|textreuse"
)
processing_subtype_labels = "embeddings|images|entities"
tasks = "ner|nel|tm|emb|lid|pos"
subtasks = "newsagency"
langs = "de|fr|en|lb|multilingual"


# Construct the regex pattern using re.VERBOSE
pattern = rf"""
^s3://
(?P<bucket>
(?P<stage_number>\d{{2}})
-processed-data-
(?P<phase>{phases})
)
/ (?P<processing_label>{processing_labels})
(?: / (?P<processing_subtype_label>{processing_subtype_labels}) )?
/
(?P<run_id>
(?P=processing_label)
-
(?P<model_id>
(?P<task>{tasks})
(?:_(?P<subtask>{subtasks}))?
-
(?P<model_specificity>[A-Za-z][A-Za-z_-]*)
(?:_(?P<model_version>v(?P<model_major>\d+)\.(?P<model_minor>\d+)\.(?P<model_patch>\d+)))?
-
(?P<lang>{langs})
)
_
(?P<run_version>v(?P<run_major>\d+)-(?P<run_minor>\d+)-(?P<run_patch>\d+))
)
(?:/(?P<provider_alias>[A-Za-z]+))?
(?:/(?P<media_alias>[A-Za-z0-9]+))
/
(?P<file_stem>
(?: (?P=media_alias) - )  # backreference: the file stem starts with the media alias
(?P<year>\d{{4}})
(?: - (?P=processing_label) )?  # optional backreference: suffix repeating the processing label
)
\.jsonl\.bz2$
"""

# Compile the regex pattern
regex = re.compile(pattern, re.VERBOSE)


def parse_s3_path(path):
"""
Parses the given S3 path according to the defined pattern.

Args:
path (str): The S3 path to parse.

Returns:
dict: A dictionary of the matched components, or None if no match is found.
"""
match = regex.match(path)
if match:
return match.groupdict()
else:
return None


def test_matching_paths(test_paths, verbose=True):
"""
Tests paths that are expected to match the regex pattern.
"""

print("Testing paths expected to match:")
for path in test_paths:
result = parse_s3_path(path)
if result:
print(f"✅ Passed: {path}")
            # Print the matched groups when verbose is enabled
if verbose:
for key, value in result.items():
print(f" {key}: {value}")
else:
print(f"❌ Failed: {path} (Expected to match but did not)")
print("-" * 80)


def test_non_matching_paths(test_paths, verbose=True):
"""
Tests paths that are expected not to match the regex pattern.
"""

print("Testing paths expected not to match:")
for path in test_paths:
result = parse_s3_path(path)
        if not result:
            print(f"✅ Passed: {path} (Correctly did not match)")
        else:
            print(f"❌ Failed: {path} (Expected not to match but did)")
            # Show the unexpectedly matched groups to help debug the pattern
            if verbose:
                for key, value in result.items():
                    print(f"    {key}: {value}")
print("-" * 80)


correct_test_paths = [
# Full path with all components
"s3://01-processed-data-final/entities/embeddings/entities-ner-en_core_web_sm_v3.1.0-en_v1-0-0/Reuters/UK/UK-2021.jsonl.bz2",
# Path without opt_processing_subtype_label
"s3://02-processed-data-staging/langident/langident-lid-fasttext_v1.0.0-multilingual_v2-0-1/BBC/BBC-2020-langident.jsonl.bz2",
# Path without provider_alias and model version
"s3://03-processed-data-sandbox/topics/topics-tm-lda_model-en_v3-2-4/EXP/EXP-2021-topics.jsonl.bz2",
# Path with optional subtask
"s3://01-processed-data-final/newsagencies/newsagencies-ner_newsagency-model_v1.2.0-en_v1-0-0/AFP/AFP-2021-newsagencies.jsonl.bz2",
# Path missing optional components
"s3://42-processed-data-final/embeddings/embeddings-tm-mallet-de_v4-0-0/MEDIA/MEDIA-2022.jsonl.bz2",
# Path with the processing_label repeated in the file stem, no provider_alias
"s3://42-processed-data-final/topics/topics-tm-bert_v3.0.0-en_v3-0-0/CNN/CNN-2024-topics.jsonl.bz2",
# Path with a pos task and a file stem without the processing_label suffix
"s3://41-processed-data-staging/lingproc/lingproc-pos-spacy_v3.6.0-multilingual_v1-0-2/IMP/IMP-2024.jsonl.bz2",
# Suggested layout for image embeddings (the test suite currently expects this path not to match)
"s3://42-processed-data-final/embeddings/images/image-embeddings/embeddings-resnet_dino_clip-v0-0-1/bnl/actionfem/actionfem-1927-image-embeddings.jsonl.bz2",
]
# Run the test functions
# test_matching_paths(correct_test_paths,verbose=True)

incorrect_test_paths = [
# Path with missing year in file_stem
"s3://06-processed-data-final/ocrqa/ocrqa-ner-en_core_web_lg_v2.2.2-en_v2-1-0/NewYorkTimes/USA/USA-ocrqa.jsonl.bz2",
# Path with missing processing_label_file_stem
"s3://07-processed-data-final/lingproc/lingproc-lid-fasttext_v1.0.0-en_v1-0-0/2023.jsonl.bz2",
# Path with invalid phase
"s3://10-processed-data-production/entities/entities-ner-en_core_web_sm_v3.1.0-en_v1-0-0/2021-entities.jsonl.bz2",
# Path with incorrect file extension
"s3://11-processed-data-final/embeddings/embeddings-emb-word2vec_v4.0.0-multilingual_v4-0-0/2022-embeddings.txt",
# Path missing processing_label in run_id
"s3://12-processed-data-final/topics/topics-tm-lda_v1.0.0-en_v1-0-0/2021-topics.jsonl.bz2",
# Path with incorrect model_id format
"s3://13-processed-data-final/entities/entities-unknownmodel_v1.0.0-en_v1-0-0/2021-entities.jsonl.bz2",
# Path with provider_alias but no media_alias
"s3://09-processed-data-final/lingproc/lingproc-tm-lda_v1.0.0-en_v1-0-0/PROVIDER/MEDIA-2025-lingproc.jsonl.bz2",
]


# test_non_matching_paths(incorrect_test_paths,verbose=False)
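
For reference, a minimal usage sketch of the parser added above (not part of the diff). It reuses the first entry of `correct_test_paths` and the group values asserted in the accompanying test file:

```python
# Minimal usage sketch for the parser introduced in this PR.
from impresso_essentials.io.s3_path_parser import parse_s3_path

path = (
    "s3://01-processed-data-final/entities/embeddings/"
    "entities-ner-en_core_web_sm_v3.1.0-en_v1-0-0/Reuters/UK/UK-2021.jsonl.bz2"
)

components = parse_s3_path(path)
if components is None:
    raise ValueError(f"Path does not follow the expected layout: {path}")

# A few of the named groups captured by the verbose regex defined above.
print(components["phase"])  # final
print(components["task"])   # ner
print(components["lang"])   # en
print(components["year"])   # 2021
```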
136 changes: 136 additions & 0 deletions tests/test_s3_path_parser.py
@@ -0,0 +1,136 @@
import re
import pytest
from impresso_essentials.io.s3_path_parser import pattern


@pytest.mark.parametrize(
"s3_path, expected",
[
        # Full path with all components (provider_alias and processing_subtype_label present)
(
"s3://01-processed-data-final/entities/embeddings/entities-ner-en_core_web_sm_v3.1.0-en_v1-0-0/Reuters/UK/UK-2021.jsonl.bz2",
{
"bucket": "01-processed-data-final",
"stage_number": "01",
"phase": "final",
"processing_label": "entities",
"processing_subtype_label": "embeddings",
"run_id": "entities-ner-en_core_web_sm_v3.1.0-en_v1-0-0",
"model_id": "ner-en_core_web_sm_v3.1.0-en",
"task": "ner",
"subtask": None,
"model_specificity": "en_core_web_sm",
"model_version": "v3.1.0",
"model_major": "3",
"model_minor": "1",
"model_patch": "0",
"lang": "en",
"run_version": "v1-0-0",
"run_major": "1",
"run_minor": "0",
"run_patch": "0",
"provider_alias": "Reuters",
"media_alias": "UK",
"file_stem": "UK-2021",
"year": "2021",
},
),
(
"s3://02-processed-data-staging/langident/langident-lid-fasttext_v1.0.0-multilingual_v2-0-1/BBC/BBC-2020-langident.jsonl.bz2",
{
"bucket": "02-processed-data-staging",
"stage_number": "02",
"phase": "staging",
"processing_label": "langident",
"processing_subtype_label": None,
"run_id": "langident-lid-fasttext_v1.0.0-multilingual_v2-0-1",
"model_id": "lid-fasttext_v1.0.0-multilingual",
"task": "lid",
"subtask": None,
"model_specificity": "fasttext",
"model_version": "v1.0.0",
"model_major": "1",
"model_minor": "0",
"model_patch": "0",
"lang": "multilingual",
"run_version": "v2-0-1",
"run_major": "2",
"run_minor": "0",
"run_patch": "1",
"provider_alias": None,
"media_alias": "BBC",
"file_stem": "BBC-2020-langident",
"year": "2020",
},
),
(
"s3://03-processed-data-sandbox/topics/topics-tm-lda_model-en_v3-2-4/EXP/EXP-2021-topics.jsonl.bz2",
{
"bucket": "03-processed-data-sandbox",
"stage_number": "03",
"phase": "sandbox",
"processing_label": "topics",
"processing_subtype_label": None,
"run_id": "topics-tm-lda_model-en_v3-2-4",
"model_id": "tm-lda_model-en",
"task": "tm",
"subtask": None,
"model_specificity": "lda_model",
"model_version": None,
"model_major": None,
"model_minor": None,
"model_patch": None,
"lang": "en",
"run_version": "v3-2-4",
"run_major": "3",
"run_minor": "2",
"run_patch": "4",
"provider_alias": None,
"media_alias": "EXP",
"file_stem": "EXP-2021-topics",
"year": "2021",
},
),
(
"s3://42-processed-data-final/topics/topics-tm-bert_v3.0.0-en_v3-0-0/CNN/CNN-2024-topics.jsonl.bz2",
{
"bucket": "42-processed-data-final",
"stage_number": "42",
"phase": "final",
"processing_label": "topics",
"processing_subtype_label": None,
"run_id": "topics-tm-bert_v3.0.0-en_v3-0-0",
"model_id": "tm-bert_v3.0.0-en",
"task": "tm",
"subtask": None,
"model_specificity": "bert",
"model_version": "v3.0.0",
"model_major": "3",
"model_minor": "0",
"model_patch": "0",
"lang": "en",
"run_version": "v3-0-0",
"run_major": "3",
"run_minor": "0",
"run_patch": "0",
"provider_alias": None,
"media_alias": "CNN",
"file_stem": "CNN-2024-topics",
"year": "2024",
},
),
],
)
def test_successful_s3_path_matches(s3_path, expected):
match = re.match(pattern, s3_path, re.VERBOSE)
assert match is not None, f"Pattern did not match for {s3_path}"
for key, value in expected.items():
assert (
match.group(key) == value
), f"For {key}: expected {value}, got {match.group(key)}"


def test_failed_s3_path_match():
path = "s3://42-processed-data-final/embeddings/images/image-embeddings/embeddings-resnet_dino_clip-v0-0-1/bnl/actionfem/actionfem-1927-image-embeddings.jsonl.bz2"
match = re.match(pattern, path, re.VERBOSE)
assert match is None, f"Pattern should not match for {path}"
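
A possible follow-up (a sketch, not part of this PR): the module-level `incorrect_test_paths` list defined in `s3_path_parser.py` could be parametrized as well, so that each rejected path is reported individually:

```python
# Sketch: parametrize the negative cases using the module-level list of paths
# that the parser module itself marks as invalid (incorrect_test_paths).
import re

import pytest

from impresso_essentials.io.s3_path_parser import incorrect_test_paths, pattern


@pytest.mark.parametrize("s3_path", incorrect_test_paths)
def test_incorrect_paths_do_not_match(s3_path):
    match = re.match(pattern, s3_path, re.VERBOSE)
    assert match is None, f"Pattern should not match for {s3_path}"
```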