From cb7c48837eec0a2a87cb89655417c78601855cfe Mon Sep 17 00:00:00 2001 From: Aurelien-Morgan Date: Mon, 30 Dec 2024 15:14:57 +0100 Subject: [PATCH] implemented logic for HF Hub publication of model safetensor and tokenizer files, with custom model card --- .../dataset/hf_utils/hf_utils.py | 22 +++-- pkg_src/retrain_pipelines/model/hf_utils.py | 73 +++++++++++++++ .../dataset_readme_template.md | 5 +- .../model_readme.py | 3 + .../model_readme_template.md | 30 ++++--- .../pipeline_card.py | 8 +- .../template.html | 89 ++++++++++++++----- pkg_src/retrain_pipelines/utils/hf_utils.py | 26 +++++- 8 files changed, 207 insertions(+), 49 deletions(-) create mode 100644 pkg_src/retrain_pipelines/model/hf_utils.py diff --git a/pkg_src/retrain_pipelines/dataset/hf_utils/hf_utils.py b/pkg_src/retrain_pipelines/dataset/hf_utils/hf_utils.py index d6cc3cd..e0f2bd3 100644 --- a/pkg_src/retrain_pipelines/dataset/hf_utils/hf_utils.py +++ b/pkg_src/retrain_pipelines/dataset/hf_utils/hf_utils.py @@ -3,7 +3,6 @@ import re import sys import json -import yaml import random import shutil import tempfile @@ -434,6 +433,8 @@ def dataset_dict_to_config_str( def push_dataset_version_to_hub( repo_id: str, + version_label: str, + timestamp_str: str, dataset_dict: DatasetDict, dataset_readme_content: str, hf_token: str = None, @@ -446,7 +447,7 @@ def push_dataset_version_to_hub( Custom `retrain-pipelines` README. Uploaded dataset version superseeds entirely - any existing version (any previously file + any existing version (any previous file not anymore present is excluded from new remote dataset snapshot). @@ -454,6 +455,12 @@ def push_dataset_version_to_hub( - repo_id (str): Path to the HuggingFace dataset version (is created if needed and if authorized). + - version_label (str): + value associated to the version + to be published on the HF hub. + - timestamp_str (str): + value associated to the version + to be published on the HF hub - dataset_dict (DatasetDict): The new version to be pushed. 
- dataset_readme_content (str): @@ -492,13 +499,10 @@ def push_dataset_version_to_hub( "w") as f: f.write(dataset_readme_content) - data = yaml.safe_load( - dataset_readme_content.split('---')[1]) - version, timestamp = data['version'], data['timestamp'] - commit_message = f"v{version} - {timestamp} - " + \ - f"retrain-pipelines v{__version__} - "+ \ - "Upload multi-table dataset "+ \ - "with README." + commit_message = \ + f"v{version_label} - {timestamp_str} - " + \ + f"retrain-pipelines v{__version__} - "+ \ + "Upload multi-table dataset with README." print(commit_message) dataset_version_commit_hash = \ diff --git a/pkg_src/retrain_pipelines/model/hf_utils.py b/pkg_src/retrain_pipelines/model/hf_utils.py new file mode 100644 index 0000000..34ba9e3 --- /dev/null +++ b/pkg_src/retrain_pipelines/model/hf_utils.py @@ -0,0 +1,73 @@ + +import os + +from retrain_pipelines import __version__ +from retrain_pipelines.utils.hf_utils import \ + local_repo_folder_to_hub + + +def push_model_version_to_hub( + repo_id: str, + version_label: str, + timestamp_str: str, + model_dir: str, + model_readme_content: str, + hf_token: str = None, +) -> str: + """ + Loads locally-serialized model safetensor + and tokenizer. + Includes `retrain-pipelines` README. + + Uploaded model version supersedes entirely + any existing version (any previous file + not anymore present is excluded from + new remote model snapshot). + + Params: + - repo_id (str): + Path to the HuggingFace model version + (is created if needed and if authorized). + - version_label (str): + value associated to the version + to be published on the HF hub. + - timestamp_str (str): + value associated to the version + to be published on the HF hub. + - model_dir (str): + Path to the serialized + new version to be pushed. + - model_readme_content (str): + The full content (yaml header + body) + of the 'README.md' to be pushed + alongside the model files. + - hf_token (Optional, str): + "create on namespace" permission required. 
+ + Results: + - (str): + commit_hash on the HF hub + for the new model version + """ + + with open(os.path.join(model_dir, "README.md"), + "w") as f: + f.write(model_readme_content) + + commit_message = \ + f"v{version_label} - {timestamp_str} - " + \ + f"retrain-pipelines v{__version__} - "+ \ + "Upload model and tokenizer with README." + print(commit_message) + + model_version_commit_hash = \ + local_repo_folder_to_hub( + repo_id=repo_id, + local_folder=model_dir, + commit_message=commit_message, + repo_type="model", + hf_token=hf_token + ) + + return model_version_commit_hash + diff --git a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/dataset_readme_template.md b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/dataset_readme_template.md index 6186857..23b6b4e 100644 --- a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/dataset_readme_template.md +++ b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/dataset_readme_template.md @@ -10,7 +10,7 @@ version: '{{ new_version_label }}' -timestamp: '{{ commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}' +timestamp: {{ commit_datetime.strftime('%Y%m%d_%H%M%S') ~ '%03d'|format(commit_datetime.microsecond // 1000) ~ '_UTC' }} pretty_name: {{ pretty_name }} @@ -29,11 +29,14 @@ task_categories: - reinforcement-learning tags: +- retrain-pipelines - function-calling - LLM Agent - code - synthetic +thumbnail: https://cdn-avatars.huggingface.co/v1/production/uploads/651e93137b2a2e027f9e55df/96hzBved0YMjCq--s0kad.png + size_categories: - {{ size_category }} diff --git a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/model_readme.py b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/model_readme.py index 922120e..58c8a08 100644 --- a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/model_readme.py +++ 
b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/model_readme.py @@ -61,16 +61,19 @@ def _model_readme_params( base_model_pretty_name = get_pretty_name( repo_id=base_model_dict["repo_id"], + repo_type="model", commit_hash=base_model_commit_hash ) base_model_arxiv_codes = get_arxiv_codes( repo_id=base_model_dict["repo_id"], + repo_type="model", commit_hash=base_model_commit_hash ) base_model_license_label = get_license_label( repo_id=base_model_dict["repo_id"], + repo_type="model", commit_hash=base_model_commit_hash ) if not base_model_license_label: diff --git a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/model_readme_template.md b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/model_readme_template.md index b3fd103..8cc4e73 100644 --- a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/model_readme_template.md +++ b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/model_readme_template.md @@ -2,14 +2,18 @@ # @see https://github.com/huggingface/hub-docs/blob/main/modelcard.md # @see https://huggingface.co/docs/huggingface_hub/guides/model-cards#update-metadata +{% set timestamp_str = commit_datetime.strftime('%Y%m%d_%H%M%S') ~ '%03d'|format(commit_datetime.microsecond // 1000) ~ '_UTC' -%} + version: '{{ new_version_label }}' -timestamp: '{{ commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}' +timestamp: '{{ timestamp_str }}' model_name: {{ pretty_name }} base_model: {{ base_model_repo_id }} library_name: peft +datasets: +- {{ dataset_repo_id }} license: {{ base_model_license_label }} @@ -20,17 +24,14 @@ task_categories: - question-answering tags: +- retrain-pipelines - function-calling - LLM Agent - code - Unsloth - - thumbnail: https://cdn-avatars.huggingface.co/v1/production/uploads/651e93137b2a2e027f9e55df/96hzBved0YMjCq--s0kad.png -datasets: -- {{ dataset_repo_id }} # @see https://huggingface.co/docs/hub/models-widgets#enabling-a-widget widget: @@ 
-61,18 +62,21 @@ model-index: `version {{ new_version_label }}` - `{{ commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}` (retraining source-code | + href="https://huggingface.co/retrain-pipelines/function_caller/tree/retrain-pipelines_source-code/v{{ new_version_label }}_{{ timestamp_str }}">source-code | pipeline-card) + href="https://huggingface.co/retrain-pipelines/function_caller/tree/retrain-pipelines_pipeline-card/v{{ new_version_label }}_{{ timestamp_str }}">pipeline-card) Training dataset : -    {{ dataset_repo_id }} -v{{ dataset_version_label }} +- {{ dataset_repo_id }} v{{ dataset_version_label }} ({{ dataset_commit_hash[:7] }} - {{ dataset_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}) Base model : +- {{ base_model_repo_id }} +({{ base_model_commit_hash[:7] }} - + {{ base_model_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}) {% if base_model_arxiv_codes -%} arxiv :
{%- for base_model_arxiv_code in base_model_arxiv_codes %} @@ -85,7 +89,13 @@ arxiv :



-            !! Section on Eval goes here !! +
+
+
+              !! Section on Eval goes here !! +
+
+



diff --git a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/pipeline_card.py b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/pipeline_card.py index d30acd1..3998167 100644 --- a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/pipeline_card.py +++ b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/pipeline_card.py @@ -79,12 +79,6 @@ def get_html( ########################## # model training # ########################## - if not params['buckets_dict']: - buckets_table = NONE_HTML_STR - else: - buckets_table = pd.DataFrame( - [params['buckets_dict']]).to_html(classes='wide', - escape=False, index = False) if params['hyperparameters_dict']: hyperparameters_table = \ pd.DataFrame([params['hyperparameters_dict']] @@ -277,6 +271,8 @@ def get_source(self, environment, template): dataset_commit_datetime=\ params['dataset_commit_datetime'], dataset_commit_hash=params['dataset_commit_hash'], + dataset_augmentation_rate=params['dataset_augmentation_rate'], + dataset_enrichment_rate=params['dataset_enrichment_rate'], model_repo_id=params['model_repo_id'], model_commit_hash=params['model_commit_hash'], diff --git a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/template.html b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/template.html index b9d2456..0161e50 100644 --- a/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/template.html +++ b/pkg_src/retrain_pipelines/pipeline_card/mf_unsloth_func_call_litserve/template.html @@ -578,23 +578,73 @@

aria-labelledby="innerHeadingOne" data-parent="#innerAccordion">
- - training dataset : {{ dataset_repo_id }} - v{{ dataset_version_label }} - - i - {{ dataset_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }} - - [HuggingFace 🤗] -
- - model version : {{ model_repo_id }} - v{{ model_version_label }} - ({{ model_commit_hash[:7] }} - - {{ model_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}) -
+ + + + + + + + + + + + + + +
+ training dataset : + + {{ dataset_repo_id }}   + v{{ dataset_version_label }} + + i + {{ dataset_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }} + + + [HuggingFace 🤗] +
+ Data-augmentation rate : + +{{ (dataset_augmentation_rate * 100)|round(1) ~ '%' }}, + Data-enrichment rate : + +{{ (dataset_enrichment_rate * 100)|round(1) ~ '%' }} +
+ model version : + + {{ model_repo_id }}   + v{{ model_version_label }} + + i + {{ model_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }} + + + [HuggingFace 🤗] +
+
@@ -609,11 +659,6 @@

-
-

- Features bucketization : -

-{{buckets_table}}

Hyperparameters set : diff --git a/pkg_src/retrain_pipelines/utils/hf_utils.py b/pkg_src/retrain_pipelines/utils/hf_utils.py index 12edd9c..640b1cb 100644 --- a/pkg_src/retrain_pipelines/utils/hf_utils.py +++ b/pkg_src/retrain_pipelines/utils/hf_utils.py @@ -313,7 +313,9 @@ def get_pretty_name( repo_info = api_info_method( repo_id=repo_id, revision=commit_hash ) - pretty_name = repo_info.card_data["pretty_name"] + pretty_name = repo_info.card_data[ + "model_name" if "model" == repo_type + else "pretty_name"] except (ReadTimeout, HfHubHTTPError) as err: stack_trace = \ ''.join(traceback.format_exception( @@ -462,6 +464,19 @@ def _create_repo_if_not_exists( repo_type="model", token=os.environ["HF_TOKEN"] ) + except HfHubHTTPError as err: + if ( + f"Reference already exists: refs/heads/{branch_name}" + != err.server_message.strip() + ): + print(f"Failed to create branch {branch_name} for " + + f"{repo_type} `{repo_id.split('/')[1]}` "+ + f"under the `{repo_id.split('/')[0]}` namespace " + + "on the HuggingFace Hub.", + file=sys.stderr) + print(''.join(traceback.format_exception( + type(err), err, err.__traceback__))) + return False except Exception as err: print(f"Failed to create branch {branch_name} for " + f"{repo_type} `{repo_id.split('/')[1]}` "+ @@ -485,6 +500,15 @@ def local_repo_folder_to_hub( """ Upload all files in a single commit. + Note : We do not go the "run_as_future" way + (for asynchronous upload) despite + it being Advisable when publishing + models to the HF hub (since usually + models with many params are slow to upload). + We do rely on commit_hash and need it + to continue the documentation process + for the `retrain-pipelines` run. + Params: - repo_id (str): Path to the HuggingFace repository version