Skip to content

Commit

Permalink
complete rework on commits datetime handling for HF repos
Browse files Browse the repository at this point in the history
  • Loading branch information
aurelienmorgan committed Dec 30, 2024
1 parent 47a5d05 commit 4d8a940
Show file tree
Hide file tree
Showing 8 changed files with 89 additions and 83 deletions.
25 changes: 13 additions & 12 deletions pkg_src/retrain_pipelines/dataset/hf_utils/hf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def get_latest_commit(
Results:
- (dict):
'commit_hash', 'commit_date',
'commit_hash', 'commit_datetime',
'branch_name', 'files'
"""

Expand All @@ -65,15 +65,15 @@ def get_latest_commit(
if regex_pattern.search(f)
]
if matching_files:
commit_date = commit_data["created_at"]
commit_datetime = commit_data["created_at"]
if (
not latest_matching_commit
or commit_date >
latest_matching_commit["commit_date"]
or commit_datetime >
latest_matching_commit["commit_datetime"]
):
latest_matching_commit = {
"commit_hash": commit_hash,
"commit_date": commit_date,
"commit_datetime": commit_datetime,
"branch_name": \
branch_data["branch_name"],
"files": matching_files,
Expand All @@ -100,11 +100,12 @@ def get_commit(
Particular "revision" of the dataset
to scan.
- files_filter (str):
Only consider files matching this regex pattern.
Only consider files matching
this regex pattern.
Results:
- (dict):
'commit_hash', 'commit_date',
'commit_hash', 'commit_datetime',
'branch_name', 'files'
"""

Expand Down Expand Up @@ -137,7 +138,7 @@ def get_commit(
if len(matching_files) > 0:
matching_commit = {
"commit_hash": commit_hash,
"commit_date": \
"commit_datetime": \
branch_commit_data["created_at"],
"branch_name": \
branch_data["branch_name"],
Expand Down Expand Up @@ -200,9 +201,8 @@ def get_lazy_df(
- commit_hash (str):
gets handy when no input value
is given as input.
- commit_date (str):
24hrs, UTC format.
- lazydf (pl.lazyframe.frame.LazyFrame):
- commit_datetime (datetime)
- lazydf (pl.lazyframe.frame.LazyFrame)
"""

parquet_commit = get_commit(
Expand Down Expand Up @@ -234,7 +234,8 @@ def get_lazy_df(
return {
"repo_id": repo_id,
"commit_hash": parquet_commit['commit_hash'],
"commit_utc_date_str": parquet_commit['commit_date'],
"commit_datetime": \
parquet_commit['commit_datetime'],
"lazy_df": lazy_df
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json

from ast import literal_eval
from datetime import datetime

from jinja2 import Environment, FileSystemLoader

Expand All @@ -24,7 +25,7 @@ def _dataset_readme_params(
augmentation_rate: float,
enrichment_rate: float,
version_label: str,
utc_timestamp_str: str,
commit_datetime: datetime,
mf_flow_name: str,
mf_run_id: str,
engine:str = "cpu"
Expand All @@ -39,12 +40,12 @@ def _dataset_readme_params(
- hf_dataset_dict (dict):
- repo_id
- commit_hash
- commit_utc_date_str
- commit_datetime
- lazy_df
- hf_enrich_dataset_dict (dict)
- repo_id
- commit_hash
- commit_utc_date_str
- commit_datetime
- dataset_dict (DatasetDict):
the dataset version to be pushed
to the HF hub.
Expand All @@ -58,8 +59,8 @@ def _dataset_readme_params(
- version_label (str):
typical `retrain-pipelines`
version label are of format "major.minor"
- utc_timestamp_str (str):
timestampt for the new dataset version.
- commit_datetime (datetime):
timestamp for the new dataset version.
- mf_flow_name (str)
- mf_run_id (str)
- engine (str):
Expand All @@ -76,13 +77,13 @@ def _dataset_readme_params(
dataset_dict["supervised_finetuning"]["validation"].num_rows
size_category = get_size_category(records_count)

main_commit_hash, main_commit_utc_date_str = \
main_commit_hash, main_commit_datetime = \
get_latest_README_commit(
repo_id=hf_dataset_dict["repo_id"],
target_commit_hash=hf_dataset_dict["commit_hash"],
repo_type="dataset"
)
enrich_commit_hash, enrich_commit_utc_date_str = \
enrich_commit_hash, enrich_commit_datetime = \
get_latest_README_commit(
repo_id=hf_enrich_dataset_dict["repo_id"],
target_commit_hash=\
Expand Down Expand Up @@ -150,7 +151,7 @@ def _build_keys(d, parent='', output_str=''):
return {
"configs": dataset_dict_to_config_str(dataset_dict),
"new_version_label": version_label,
"utc_timestamp": utc_timestamp_str,
"commit_datetime": commit_datetime,

"pretty_name": pretty_name,

Expand All @@ -162,10 +163,10 @@ def _build_keys(d, parent='', output_str=''):
"main_commit_hash": main_commit_hash,
"enrich_commit_hash": enrich_commit_hash,

"main_commit_utc_date_str": \
main_commit_utc_date_str,
"enrich_commit_utc_date_str": \
enrich_commit_utc_date_str,
"main_commit_datetime": \
main_commit_datetime,
"enrich_commit_datetime": \
enrich_commit_datetime,

"main_pretty_name": main_pretty_name,
"enrich_pretty_name": enrich_pretty_name,
Expand Down Expand Up @@ -198,7 +199,7 @@ def get_dataset_readme_content(
augmentation_rate: float,
enrichment_rate: float,
version_label: str,
utc_timestamp_str: str,
commit_datetime: datetime,
mf_flow_name: str,
mf_run_id: str,
engine:str = "cpu"
Expand All @@ -218,12 +219,12 @@ def get_dataset_readme_content(
- hf_dataset_dict (dict):
- repo_id
- commit_hash
- commit_utc_date_str
- commit_datetime
- lazy_df
- hf_enrich_dataset_dict (dict)
- repo_id
- commit_hash
- commit_utc_date_str
- commit_datetime
- dataset_dict (DatasetDict):
the dataset version to be pushed
to the HF hub.
Expand All @@ -237,8 +238,8 @@ def get_dataset_readme_content(
- version_label (str):
typical `retrain-pipelines`
version label are of format "major.minor"
- utc_timestamp_str (str):
timestampt for the new dataset version.
- commit_datetime (datetime):
timestamp for the new dataset version.
- mf_flow_name (str)
- mf_run_id (str)
- engine (str):
Expand All @@ -255,7 +256,7 @@ def get_dataset_readme_content(
augmentation_rate=augmentation_rate,
enrichment_rate=enrichment_rate,
version_label=version_label,
utc_timestamp_str=utc_timestamp_str,
commit_datetime=commit_datetime,
mf_flow_name=mf_flow_name,
mf_run_id=mf_run_id,
engine=engine
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@

version: '{{ new_version_label }}'

timestamp: {{ utc_timestamp }}
timestamp: '{{ commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}'

pretty_name: {{ pretty_name }}

source_datasets:
- {{ main_repo_id }}
- {{ enrich_repo_id }}

license: {{license_label}}
license: {{ license_label }}

language:
- en
Expand All @@ -41,14 +41,15 @@ size_categories:

# {{ pretty_name }}

`version {{ new_version_label }}` - `{{ utc_timestamp }}`
`version {{ new_version_label }}` - `{{ commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}`

Source datasets :
- main :
- <b>{{ main_pretty_name }}</b><br />
`{{ main_repo_id }}`
(<a href="https://huggingface.co/datasets/{{ main_repo_id }}/blob/{{ main_commit_hash }}/README.md"
target="_blank">{{ main_commit_hash[:7] }}</a> - {{ main_commit_utc_date_str }})
target="_blank">{{ main_commit_hash[:7] }}</a> -
{{ main_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }})
<br />
license&nbsp;:
{% if main_license_label -%}
Expand All @@ -68,7 +69,8 @@ Source datasets :
- <b>{{ enrich_pretty_name }}</b><br />
`{{ enrich_repo_id }}`
(<a href="https://huggingface.co/datasets/{{ enrich_repo_id }}/blob/{{ enrich_commit_hash }}/README.md"
target="_blank">{{ enrich_commit_hash[:7] }}</a> - {{ enrich_commit_utc_date_str }})
target="_blank">{{ enrich_commit_hash[:7] }}</a> -
{{ enrich_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }})
<br />
license&nbsp;:
{% if enrich_license_label -%}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json

from ast import literal_eval
from datetime import datetime

from jinja2 import Environment, FileSystemLoader

Expand All @@ -20,7 +21,7 @@ def _model_readme_params(
base_model_dict: dict,
training_dataset_dict: dict,
version_label: str,
utc_timestamp_str: str,
commit_datetime: datetime,
mf_flow_name: str,
mf_run_id: str,
) -> dict:
Expand All @@ -34,14 +35,14 @@ def _model_readme_params(
- base_model_dict (dict)
- training_dataset_dict (dict):
- repo_id
- version_label
- commit_hash
- commit_utc_date_str
-
- commit_datetime
- version_label (str):
typical `retrain-pipelines`
version label are of format "major.minor"
- utc_timestamp_str (str):
timestampt for the new dataset version.
- commit_datetime (datetime):
timestamp for the new model version.
- mf_flow_name (str)
- mf_run_id (str)
Expand All @@ -51,7 +52,7 @@ def _model_readme_params(

pretty_name = "retrain-pipelines Function Caller"

base_model_commit_hash, base_model_commit_utc_date_str = \
base_model_commit_hash, base_model_commit_datetime = \
get_latest_README_commit(
repo_id=base_model_dict["repo_id"],
target_commit_hash=base_model_dict["commit_hash"],
Expand All @@ -77,7 +78,7 @@ def _model_readme_params(

return {
"new_version_label": version_label,
"utc_timestamp": utc_timestamp_str,
"commit_datetime": commit_datetime,

"pretty_name": pretty_name,

Expand All @@ -87,13 +88,13 @@ def _model_readme_params(
training_dataset_dict["version_label"],
"dataset_commit_hash": \
training_dataset_dict["commit_hash"],
"dataset_utc_timestamp_str": \
training_dataset_dict["utc_timestamp_str"],
"dataset_commit_datetime": \
training_dataset_dict["commit_datetime"],

"base_model_repo_id": base_model_dict["repo_id"],
"base_model_pretty_name": base_model_pretty_name,
"base_model_commit_hash": base_model_commit_hash,
"base_model_commit_utc_date_str": base_model_commit_utc_date_str,
"base_model_commit_datetime": base_model_commit_datetime,
"base_model_arxiv_codes": base_model_arxiv_codes,
"base_model_license_label": base_model_license_label,

Expand All @@ -111,7 +112,7 @@ def get_model_readme_content(
training_dataset_dict: dict,

version_label: str,
utc_timestamp_str: str,
commit_datetime: datetime,

mf_flow_name: str,
mf_run_id: str,
Expand All @@ -134,12 +135,12 @@ def get_model_readme_content(
- training_dataset_dict (dict)
- repo_id
- commit_hash
- commit_utc_date_str
- commit_datetime
- version_label (str):
typical `retrain-pipelines`
version label are of format "major.minor"
- utc_timestamp_str (str):
timestampt for the new dataset version.
- commit_datetime (datetime):
timestamp for the new model version.
- mf_flow_name (str)
- mf_run_id (str)
Expand All @@ -151,7 +152,7 @@ def get_model_readme_content(
base_model_dict=base_model_dict,
training_dataset_dict=training_dataset_dict,
version_label=version_label,
utc_timestamp_str=utc_timestamp_str,
commit_datetime=commit_datetime,
mf_flow_name=mf_flow_name,
mf_run_id=mf_run_id
)
Expand Down
Loading

0 comments on commit 4d8a940

Please sign in to comment.