Commit 47a5d05

moving towards implementation of logic for 'pushing model to hub' tasks
aurelienmorgan committed Dec 29, 2024
1 parent 0af22f3 commit 47a5d05
Showing 8 changed files with 302 additions and 257 deletions.
179 changes: 6 additions & 173 deletions pkg_src/retrain_pipelines/dataset/hf_utils/hf_utils.py
@@ -12,108 +12,16 @@
import pandas as pd
import polars as pl

from datetime import datetime

from typing import Optional, Callable, Iterator

from huggingface_hub import list_repo_refs, list_repo_commits, \
list_repo_files, hf_hub_download, HfApi
from huggingface_hub.utils import RevisionNotFoundError, \
EntryNotFoundError, HfHubHTTPError

from datasets import IterableDataset, DatasetDict

from retrain_pipelines import __version__
from retrain_pipelines.utils.hf_utils import local_repo_folder_to_hub


def _dataset_repo_branch_commits_files(
repo_id: str,
repo_branch: str
) -> dict:
"""
Params:
- repo_id (str):
Path to the HuggingFace dataset.
- repo_branch (str):
Branch (of the repository of interest)
to be considered.
Results:
- (dict)
maps each commit_hash to
{'created_at', 'title', 'files'}
"""
commits = list_repo_commits(repo_id, revision=repo_branch,
repo_type="dataset",
token=os.environ["HF_TOKEN"])
commits_dict = {}
for commit in commits:
files = list_repo_files(
repo_id, revision=commit.commit_id,
repo_type="dataset",
token=os.environ["HF_TOKEN"])

commits_dict[commit.commit_id] = {
"created_at": commit.created_at.strftime(
"%Y-%m-%d %H:%M:%S UTC"),
"title": commit.title,
"files": files
}

return commits_dict
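
For reference, a hedged sketch of the shape of the dict returned above (the hash, date, title and filenames are illustrative placeholders, not values from this repo):

commits_dict = {
    "abc123...": {
        "created_at": "2024-12-29 10:00:00 UTC",
        "title": "initial commit",
        "files": ["README.md", "data/train.parquet"],
    },
}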


def get_dataset_branches_commits_files(
repo_id: str
) -> dict:
"""
Selection of metadata for (literally)
all files of all commits of a given
HF dataset repo.
Params:
- repo_id (str):
Path to the HuggingFace dataset.
Results:
- (dict)
'branches'
(
'branch_name', 'commits',
(
'commit_hash', 'created_at',
'title', 'files'
)
)
"""

refs = list_repo_refs(repo_id, repo_type="dataset",
token=os.environ["HF_TOKEN"])

dataset_repo_branches = {
"repo_standard_branches": {},
"repo_convert_branches": {}
}
for repo_standard_branches in refs.branches:
dataset_repo_branches[
"repo_standard_branches"
][repo_standard_branches.name] = {
"branch_name": repo_standard_branches.ref,
"commits": _dataset_repo_branch_commits_files(
repo_id, repo_standard_branches.ref)
}

for repo_convert_branch in refs.converts:
dataset_repo_branches[
"repo_convert_branches"
][repo_convert_branch.name] = {
"branch_name": repo_convert_branch.ref,
"commits": _dataset_repo_branch_commits_files(
repo_id, repo_convert_branch.ref)
}

return dataset_repo_branches
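
Before its relocation by this commit (see the import change just below), a call to this helper could be sketched as follows; the repo_id is hypothetical, and HF_TOKEN is assumed to be set, since the helpers read it from the environment:

branches = get_dataset_branches_commits_files("some-user/some-dataset")
for branch_type, branch_dict in branches.items():
    for branch_name, branch_data in branch_dict.items():
        for commit_hash, commit in branch_data["commits"].items():
            # one entry per commit: creation date, title, listed files
            print(branch_type, branch_name, commit_hash[:7],
                  commit["created_at"], len(commit["files"]))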
from retrain_pipelines.utils.hf_utils import \
get_repo_branches_commits_files, local_repo_folder_to_hub


def get_latest_commit(
@@ -140,7 +48,8 @@ def get_latest_commit(
"""

dataset_repo_branches = \
get_dataset_branches_commits_files(repo_id)
get_repo_branches_commits_files(
repo_id=repo_id, repo_type="dataset")

latest_matching_commit = None
regex_pattern = re.compile(files_filter)
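
As context for the regex filter above: files_filter is compiled and matched against file paths in each commit. A hedged sketch of a call, assuming the (truncated) signature accepts repo_id and files_filter as shown:

latest = get_latest_commit(
    repo_id="some-user/some-dataset",   # hypothetical repo
    files_filter=r"data/.*\.parquet$",  # hypothetical filter
)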
@@ -208,7 +117,8 @@ def get_commit(
return matching_commit
else:
dataset_repo_branches = \
get_dataset_branches_commits_files(repo_id)
get_repo_branches_commits_files(
repo_id=repo_id, repo_type="dataset")
for \
branch_type, branches \
in dataset_repo_branches.items() \
@@ -521,83 +431,6 @@ def dataset_dict_to_config_str(
return result


def get_latest_README_commit(
repo_id: str,
target_commit_hash: str,
verbose: bool = True
) -> (str, datetime):
"""
Starting from a given commit,
look for the latest prior commit for which
a README.md file was present.
This addresses cases where the commit
corresponding to this commit_hash
didn't include a README and
many entries are missing from `dataset_info`.
This is typical, for instance, of
'auto-convert bot' commits
(think duckdb or parquet,
@see https://huggingface.co/docs/dataset-viewer/en/parquet#conversion-to-parquet).
Params:
- repo_id (str):
Path to the HuggingFace dataset.
- target_commit_hash (str):
the "revision" of the dataset
from which to start scanning.
- verbose (bool):
whether or not to print commit
hash and date (target vs latest README)
Results:
- (str, datetime):
latest_README_commit_hash,
latest_README_commit_date
"""
hf_dataset_branches_commits_files = \
get_dataset_branches_commits_files(repo_id=repo_id)

target_date = None
for repo, repo_data in hf_dataset_branches_commits_files.items():
for branch, branch_data in repo_data.items():
for commit_hash, commit_data in branch_data['commits'].items():
if commit_hash == target_commit_hash:
target_date = datetime.strptime(
commit_data['created_at'], '%Y-%m-%d %H:%M:%S UTC')
break
if target_date:
break
if target_date:
break
if verbose:
print("target commit : ".ljust(25), target_commit_hash, target_date)

README_date = None
README_commit_hash = None
for repo, repo_data in hf_dataset_branches_commits_files.items():
for branch, branch_data in repo_data.items():
for commit_hash, commit_data in branch_data['commits'].items():
if 'README.md' in commit_data['files']:
commit_date = datetime.strptime(
commit_data['created_at'], '%Y-%m-%d %H:%M:%S UTC')
if commit_date <= target_date:
README_date = datetime.strptime(
commit_data['created_at'], '%Y-%m-%d %H:%M:%S UTC')
README_commit_hash = commit_hash
if verbose:
print("lastest README commit : ".ljust(25),
README_commit_hash, README_date)
break
else:
continue
break
else:
continue
break

return README_commit_hash, README_date
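
A hedged usage sketch for the helper above (the repo_id and commit hash are placeholders):

README_commit_hash, README_date = get_latest_README_commit(
    repo_id="some-user/some-dataset",
    target_commit_hash="abc123...",  # commit to start scanning from
    verbose=True,
)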


def push_dataset_version_to_hub(
repo_id: str,
dataset_dict: DatasetDict,
@@ -11,11 +11,10 @@

from retrain_pipelines import __version__
from retrain_pipelines.dataset.hf_utils import \
get_latest_README_commit, get_size_category, \
dataset_dict_to_config_str
get_size_category, dataset_dict_to_config_str
from retrain_pipelines.utils.hf_utils import \
get_arxiv_codes, get_license_label, \
get_pretty_name
get_latest_README_commit, get_arxiv_codes, \
get_license_label, get_pretty_name


def _dataset_readme_params(
@@ -80,13 +79,15 @@ def _dataset_readme_params(
main_commit_hash, main_commit_utc_date_str = \
get_latest_README_commit(
repo_id=hf_dataset_dict["repo_id"],
target_commit_hash=hf_dataset_dict["commit_hash"]
target_commit_hash=hf_dataset_dict["commit_hash"],
repo_type="dataset"
)
enrich_commit_hash, enrich_commit_utc_date_str = \
get_latest_README_commit(
repo_id=hf_enrich_dataset_dict["repo_id"],
target_commit_hash=\
hf_enrich_dataset_dict["commit_hash"]
hf_enrich_dataset_dict["commit_hash"],
repo_type="dataset"
)

main_pretty_name = get_pretty_name(
@@ -256,7 +257,8 @@ def get_dataset_readme_content(
version_label=version_label,
utc_timestamp_str=utc_timestamp_str,
mf_flow_name=mf_flow_name,
mf_run_id=mf_run_id
mf_run_id=mf_run_id,
engine=engine
)

env = Environment(loader=FileSystemLoader(template_folder))
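
The rendering step that presumably follows the Jinja2 Environment set-up above; the template filename here is an assumption, not taken from this diff:

template = env.get_template("dataset_readme_template.md")  # name assumed
readme_content = template.render(**params)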
Expand Up @@ -96,7 +96,9 @@ Data-enrichment rate&nbsp;: +{{ (enrichment_rate * 100)|round(1) ~ '%' }}<br />

<hr />
Powered by
<a target="_blank" href="https://pypi.org/project/retrain-pipelines/">retrain-pipelines {{ __version__ }}</a> -
<code><a target="_blank"
href="https://pypi.org/project/retrain-pipelines/">retrain-pipelines
{{ __version__ }}</a></code> -
<code>Run by <a target="_blank" href="https://huggingface.co/{{ run_user }}">{{ run_user }}</a></code> -
<em><b>{{ mf_flow_name }}</b></em> - mf_run_id&nbsp;: <code>{{ mf_run_id }}</code>
