From 4d8a9409b564e5a277d69f98e06e82c8766bb500 Mon Sep 17 00:00:00 2001 From: Aurelien-Morgan Date: Mon, 30 Dec 2024 11:28:53 +0100 Subject: [PATCH] complete rework on commits datetime handling for HF repos --- .../dataset/hf_utils/hf_utils.py | 25 +++++++------ .../dataset_readme.py | 37 ++++++++++--------- .../dataset_readme_template.md | 12 +++--- .../model_readme.py | 31 ++++++++-------- .../model_readme_template.md | 30 ++++++++------- .../pipeline_card.py | 16 ++++---- .../template.html | 9 +++-- pkg_src/retrain_pipelines/utils/hf_utils.py | 12 ++---- 8 files changed, 89 insertions(+), 83 deletions(-) diff --git a/pkg_src/retrain_pipelines/dataset/hf_utils/hf_utils.py b/pkg_src/retrain_pipelines/dataset/hf_utils/hf_utils.py index 2b54018..d6cc3cd 100644 --- a/pkg_src/retrain_pipelines/dataset/hf_utils/hf_utils.py +++ b/pkg_src/retrain_pipelines/dataset/hf_utils/hf_utils.py @@ -43,7 +43,7 @@ def get_latest_commit( Results: - (dict): - 'commit_hash', 'commit_date', + 'commit_hash', 'commit_datetime', 'branch_name', 'files' """ @@ -65,15 +65,15 @@ def get_latest_commit( if regex_pattern.search(f) ] if matching_files: - commit_date = commit_data["created_at"] + commit_datetime = commit_data["created_at"] if ( not latest_matching_commit - or commit_date > - latest_matching_commit["commit_date"] + or commit_datetime > + latest_matching_commit["commit_datetime"] ): latest_matching_commit = { "commit_hash": commit_hash, - "commit_date": commit_date, + "commit_datetime": commit_datetime, "branch_name": \ branch_data["branch_name"], "files": matching_files, @@ -100,11 +100,12 @@ def get_commit( Particular "revision" of the dataset to scan. - files_filter (str): - Only consider files matching this regex pattern. + Only consider files matching + this regex pattern. 
Results: - (dict): - 'commit_hash', 'commit_date', + 'commit_hash', 'commit_datetime', 'branch_name', 'files' """ @@ -137,7 +138,7 @@ def get_commit( if len(matching_files) > 0: matching_commit = { "commit_hash": commit_hash, - "commit_date": \ + "commit_datetime": \ branch_commit_data["created_at"], "branch_name": \ branch_data["branch_name"], @@ -200,9 +201,8 @@ def get_lazy_df( - commit_hash (str): gets handy when no input value is given as input. - - commit_date (str): - 24hrs, UTC format. - - lazydf (pl.lazyframe.frame.LazyFrame): + - commit_datetime (datetime) + - lazy_df (pl.lazyframe.frame.LazyFrame) """ parquet_commit = get_commit( @@ -234,7 +234,8 @@ def get_lazy_df( return { "repo_id": repo_id, "commit_hash": parquet_commit['commit_hash'], - "commit_utc_date_str": parquet_commit['commit_date'], + "commit_datetime": \ + parquet_commit['commit_datetime'], "lazy_df": lazy_df } diff --git a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/dataset_readme.py b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/dataset_readme.py index 855fdde..53a4cfa 100644 --- a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/dataset_readme.py +++ b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/dataset_readme.py @@ -3,6 +3,7 @@ import json from ast import literal_eval +from datetime import datetime from jinja2 import Environment, FileSystemLoader @@ -24,7 +25,7 @@ def _dataset_readme_params( augmentation_rate: float, enrichment_rate: float, version_label: str, - utc_timestamp_str: str, + commit_datetime: datetime, mf_flow_name: str, mf_run_id: str, engine:str = "cpu" @@ -39,12 +40,12 @@ def _dataset_readme_params( - hf_dataset_dict (dict): - repo_id - commit_hash - - commit_utc_date_str + - commit_datetime - lazy_df - hf_enrich_dataset_dict (dict) - repo_id - commit_hash - - commit_utc_date_str + - commit_datetime - dataset_dict (DatasetDict): the dataset version to be pushed to the HF hub. 
@@ -58,8 +59,8 @@ def _dataset_readme_params( - version_label (str): typical `retrain-pipelines` version label are of format "major.minor" - - utc_timestamp_str (str): - timestampt for the new dataset version. + - commit_datetime (datetime): + timestamp for the new dataset version. - mf_flow_name (str) - mf_run_id (str) - engine (str): @@ -76,13 +77,13 @@ def _dataset_readme_params( dataset_dict["supervised_finetuning"]["validation"].num_rows size_category = get_size_category(records_count) - main_commit_hash, main_commit_utc_date_str = \ + main_commit_hash, main_commit_datetime = \ get_latest_README_commit( repo_id=hf_dataset_dict["repo_id"], target_commit_hash=hf_dataset_dict["commit_hash"], repo_type="dataset" ) - enrich_commit_hash, enrich_commit_utc_date_str = \ + enrich_commit_hash, enrich_commit_datetime = \ get_latest_README_commit( repo_id=hf_enrich_dataset_dict["repo_id"], target_commit_hash=\ @@ -150,7 +151,7 @@ def _build_keys(d, parent='', output_str=''): return { "configs": dataset_dict_to_config_str(dataset_dict), "new_version_label": version_label, - "utc_timestamp": utc_timestamp_str, + "commit_datetime": commit_datetime, "pretty_name": pretty_name, @@ -162,10 +163,10 @@ def _build_keys(d, parent='', output_str=''): "main_commit_hash": main_commit_hash, "enrich_commit_hash": enrich_commit_hash, - "main_commit_utc_date_str": \ - main_commit_utc_date_str, - "enrich_commit_utc_date_str": \ - enrich_commit_utc_date_str, + "main_commit_datetime": \ + main_commit_datetime, + "enrich_commit_datetime": \ + enrich_commit_datetime, "main_pretty_name": main_pretty_name, "enrich_pretty_name": enrich_pretty_name, @@ -198,7 +199,7 @@ def get_dataset_readme_content( augmentation_rate: float, enrichment_rate: float, version_label: str, - utc_timestamp_str: str, + commit_datetime: datetime, mf_flow_name: str, mf_run_id: str, engine:str = "cpu" @@ -218,12 +219,12 @@ def get_dataset_readme_content( - hf_dataset_dict (dict): - repo_id - commit_hash - - 
commit_utc_date_str + - commit_datetime - lazy_df - hf_enrich_dataset_dict (dict) - repo_id - commit_hash - - commit_utc_date_str + - commit_datetime - dataset_dict (DatasetDict): the dataset version to be pushed to the HF hub. @@ -237,8 +238,8 @@ def get_dataset_readme_content( - version_label (str): typical `retrain-pipelines` version label are of format "major.minor" - - utc_timestamp_str (str): - timestampt for the new dataset version. + - commit_datetime (datetime): + timestamp for the new dataset version. - mf_flow_name (str) - mf_run_id (str) - engine (str): @@ -255,7 +256,7 @@ def get_dataset_readme_content( augmentation_rate=augmentation_rate, enrichment_rate=enrichment_rate, version_label=version_label, - utc_timestamp_str=utc_timestamp_str, + commit_datetime=commit_datetime, mf_flow_name=mf_flow_name, mf_run_id=mf_run_id, engine=engine diff --git a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/dataset_readme_template.md b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/dataset_readme_template.md index a1b9500..6186857 100644 --- a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/dataset_readme_template.md +++ b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/dataset_readme_template.md @@ -10,7 +10,7 @@ version: '{{ new_version_label }}' -timestamp: {{ utc_timestamp }} +timestamp: '{{ commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}' pretty_name: {{ pretty_name }} @@ -18,7 +18,7 @@ source_datasets: - {{ main_repo_id }} - {{ enrich_repo_id }} -license: {{license_label}} +license: {{ license_label }} language: - en @@ -41,14 +41,15 @@ size_categories: # {{ pretty_name }} -`version {{ new_version_label }}` - `{{ utc_timestamp }}` +`version {{ new_version_label }}` - `{{ commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}` Source datasets : - main : - {{ main_pretty_name }}
`{{ main_repo_id }}` ({{ main_commit_hash[:7] }} - {{ main_commit_utc_date_str }}) + target="_blank">{{ main_commit_hash[:7] }} - + {{ main_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }})
license : {% if main_license_label -%} @@ -68,7 +69,8 @@ Source datasets : - {{ enrich_pretty_name }}
`{{ enrich_repo_id }}` ({{ enrich_commit_hash[:7] }} - {{ enrich_commit_utc_date_str }}) + target="_blank">{{ enrich_commit_hash[:7] }} - + {{ enrich_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }})
license : {% if enrich_license_label -%} diff --git a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/model_readme.py b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/model_readme.py index db8bd6e..922120e 100644 --- a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/model_readme.py +++ b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/model_readme.py @@ -3,6 +3,7 @@ import json from ast import literal_eval +from datetime import datetime from jinja2 import Environment, FileSystemLoader @@ -20,7 +21,7 @@ def _model_readme_params( base_model_dict: dict, training_dataset_dict: dict, version_label: str, - utc_timestamp_str: str, + commit_datetime: datetime, mf_flow_name: str, mf_run_id: str, ) -> dict: @@ -34,14 +35,14 @@ def _model_readme_params( - base_model_dict (dict) - training_dataset_dict (dict): - repo_id + - version_label - commit_hash - - commit_utc_date_str - - + - commit_datetime - version_label (str): typical `retrain-pipelines` version label are of format "major.minor" - - utc_timestamp_str (str): - timestampt for the new dataset version. + - commit_datetime (datetime): + timestamp for the new model version. 
- mf_flow_name (str) - mf_run_id (str) @@ -51,7 +52,7 @@ def _model_readme_params( pretty_name = "retrain-pipelines Function Caller" - base_model_commit_hash, base_model_commit_utc_date_str = \ + base_model_commit_hash, base_model_commit_datetime = \ get_latest_README_commit( repo_id=base_model_dict["repo_id"], target_commit_hash=base_model_dict["commit_hash"], @@ -77,7 +78,7 @@ def _model_readme_params( return { "new_version_label": version_label, - "utc_timestamp": utc_timestamp_str, + "commit_datetime": commit_datetime, "pretty_name": pretty_name, @@ -87,13 +88,13 @@ def _model_readme_params( training_dataset_dict["version_label"], "dataset_commit_hash": \ training_dataset_dict["commit_hash"], - "dataset_utc_timestamp_str": \ - training_dataset_dict["utc_timestamp_str"], + "dataset_commit_datetime": \ + training_dataset_dict["commit_datetime"], "base_model_repo_id": base_model_dict["repo_id"], "base_model_pretty_name": base_model_pretty_name, "base_model_commit_hash": base_model_commit_hash, - "base_model_commit_utc_date_str": base_model_commit_utc_date_str, + "base_model_commit_datetime": base_model_commit_datetime, "base_model_arxiv_codes": base_model_arxiv_codes, "base_model_license_label": base_model_license_label, @@ -111,7 +112,7 @@ def get_model_readme_content( training_dataset_dict: dict, version_label: str, - utc_timestamp_str: str, + commit_datetime: datetime, mf_flow_name: str, mf_run_id: str, @@ -134,12 +135,12 @@ def get_model_readme_content( - training_dataset_dict (dict) - repo_id - commit_hash - - commit_utc_date_str + - commit_datetime - version_label (str): typical `retrain-pipelines` version label are of format "major.minor" - - utc_timestamp_str (str): - timestampt for the new dataset version. + - commit_datetime (datetime): + timestamp for the new model version. 
- mf_flow_name (str) - mf_run_id (str) @@ -151,7 +152,7 @@ def get_model_readme_content( base_model_dict=base_model_dict, training_dataset_dict=training_dataset_dict, version_label=version_label, - utc_timestamp_str=utc_timestamp_str, + commit_datetime=commit_datetime, mf_flow_name=mf_flow_name, mf_run_id=mf_run_id ) diff --git a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/model_readme_template.md b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/model_readme_template.md index 1ca94ef..b3fd103 100644 --- a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/model_readme_template.md +++ b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/model_readme_template.md @@ -4,7 +4,7 @@ version: '{{ new_version_label }}' -timestamp: {{ utc_timestamp }} +timestamp: '{{ commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}' model_name: {{ pretty_name }} @@ -56,17 +56,21 @@ model-index: --- -            !!! TEMPLATE UNDER CONSTRUCTION !!! - # {{ pretty_name }} -`version {{ new_version_label }}` - `{{ utc_timestamp }}` +`version {{ new_version_label }}` - `{{ commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}` +(retraining +source-code | +pipeline-card) -Training dataset : +Training dataset :     {{ dataset_repo_id }} v{{ dataset_version_label }} ({{ dataset_commit_hash[:7] }} - {{ dataset_utc_timestamp_str }}) + target="_blank">{{ dataset_commit_hash[:7] }} - + {{ dataset_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}) Base model : {% if base_model_arxiv_codes -%} @@ -77,14 +81,14 @@ arxiv :
{% endfor -%} {% endif -%} -Source code : -https://huggingface.co/retrain-pipelines/function_caller/tree/retrain-pipelines_source-code/{{ new_version_label }} - -Pipeline-card : -https://huggingface.co/retrain-pipelines/function_caller/tree/retrain-pipelines_pipeline-card/{{ new_version_label }} - - +
+
+
+            !! Section on Eval goes here !! +
+
+

Powered by diff --git a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/pipeline_card.py b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/pipeline_card.py index 7bcd2d2..d30acd1 100644 --- a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/pipeline_card.py +++ b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/pipeline_card.py @@ -262,8 +262,8 @@ def get_source(self, environment, template): main_dataset_repo_id=params['main_dataset_repo_id'], main_dataset_commit_hash=\ params['main_dataset_commit_hash'], - main_dataset_commit_utc_date_str=\ - params['main_dataset_commit_utc_date_str'], + main_dataset_commit_datetime=\ + params['main_dataset_commit_datetime'], records_count="{:,}".format(params['records_count']), data_schema_table=indent(data_schema_table, ' '*36), answers_tools_count_curve=answers_tools_count_curve, @@ -274,14 +274,14 @@ def get_source(self, environment, template): # model training => # dataset_repo_id=params['dataset_repo_id'], dataset_version_label=params['dataset_version_label'], - dataset_utc_timestamp_str=\ - params['dataset_utc_timestamp_str'], + dataset_commit_datetime=\ + params['dataset_commit_datetime'], dataset_commit_hash=params['dataset_commit_hash'], - model_repo_id="", - model_commit_hash=" ", - model_version_label="", - model_utc_timestamp_str="", + model_repo_id=params['model_repo_id'], + model_commit_hash=params['model_commit_hash'], + model_version_label=params['model_version_label'], + model_commit_datetime=params['model_commit_datetime'], cpt_log_history_curve=cpt_log_history_curve, # target_class_curves=target_class_curves, diff --git a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/template.html b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/template.html index 527c1ee..b9d2456 100644 --- a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/template.html +++ 
b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/template.html @@ -1,4 +1,5 @@ + @@ -510,8 +511,8 @@

Exploratory Data Analysis

commit {{ main_dataset_commit_hash[:7] }} i - - {{ main_dataset_commit_utc_date_str }} + + {{ main_dataset_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }} [ v{{ dataset_version_label }} i - {{ dataset_utc_timestamp_str }} + {{ dataset_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }} [HuggingFace 🤗] @@ -592,7 +593,7 @@

v{{ model_version_label }} ({{ model_commit_hash[:7] }} - - {{ model_utc_timestamp_str }}) + - {{ model_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }})