Skip to content

Commit

Permalink
implemented logic for HF Hub publication of model safetensor and toke…
Browse files Browse the repository at this point in the history
…nizer files, with custom model card
  • Loading branch information
aurelienmorgan committed Dec 30, 2024
1 parent 020f834 commit cb7c488
Show file tree
Hide file tree
Showing 8 changed files with 207 additions and 49 deletions.
22 changes: 13 additions & 9 deletions pkg_src/retrain_pipelines/dataset/hf_utils/hf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import re
import sys
import json
import yaml
import random
import shutil
import tempfile
Expand Down Expand Up @@ -434,6 +433,8 @@ def dataset_dict_to_config_str(

def push_dataset_version_to_hub(
repo_id: str,
version_label: str,
timestamp_str: str,
dataset_dict: DatasetDict,
dataset_readme_content: str,
hf_token: str = None,
Expand All @@ -446,14 +447,20 @@ def push_dataset_version_to_hub(
Custom `retrain-pipelines` README.
Uploaded dataset version supersedes entirely
any existing version (any previously file
any existing version (any previous file
not anymore present is excluded from
new remote dataset snapshot).
Params:
- repo_id (str):
Path to the HuggingFace dataset version
(is created if needed and if authorized).
- version_label (str):
value associated to the version
to be published on the HF hub.
- timestamp_str (str):
value associated to the version
to be published on the HF hub
- dataset_dict (DatasetDict):
The new version to be pushed.
- dataset_readme_content (str):
Expand Down Expand Up @@ -492,13 +499,10 @@ def push_dataset_version_to_hub(
"w") as f:
f.write(dataset_readme_content)

data = yaml.safe_load(
dataset_readme_content.split('---')[1])
version, timestamp = data['version'], data['timestamp']
commit_message = f"v{version} - {timestamp} - " + \
f"retrain-pipelines v{__version__} - "+ \
"Upload multi-table dataset "+ \
"with README."
commit_message = \
f"v{version_label} - {timestamp_str} - " + \
f"retrain-pipelines v{__version__} - "+ \
"Upload multi-table dataset with README."
print(commit_message)

dataset_version_commit_hash = \
Expand Down
73 changes: 73 additions & 0 deletions pkg_src/retrain_pipelines/model/hf_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@

import os

from retrain_pipelines import __version__
from retrain_pipelines.utils.hf_utils import \
local_repo_folder_to_hub


def push_model_version_to_hub(
    repo_id: str,
    version_label: str,
    timestamp_str: str,
    model_dir: str,
    model_readme_content: str,
    hf_token: str = None,
) -> str:
    """
    Publishes a locally-serialized model version
    (the content of `model_dir`, e.g. model
    safetensor and tokenizer files) to the HF Hub,
    together with a `retrain-pipelines` README.

    The README is written into `model_dir` as
    'README.md' (overwriting any file already
    there under that name), then the whole folder
    is handed over to `local_repo_folder_to_hub`.
    The uploaded model version is meant to
    supersede entirely any existing version
    (any previous file not present anymore
    is excluded from the new remote model snapshot).

    Params:
        - repo_id (str):
            Path to the HuggingFace model repository
            (is created if needed and if authorized).
        - version_label (str):
            Version label of the model version
            to be published on the HF hub
            (included in the commit message).
        - timestamp_str (str):
            Timestamp of the model version
            to be published on the HF hub
            (included in the commit message).
        - model_dir (str):
            Path to the local folder holding
            the serialized new version to be pushed.
        - model_readme_content (str):
            The full content (yaml header + body)
            of the 'README.md' to be pushed
            alongside the model files.
        - hf_token (Optional, str):
            "create on namespace" permission required.

    Results:
        - (str):
            commit_hash on the HF hub
            for the new model version
    """

    # Explicit UTF-8 : model cards may hold
    # non-ASCII characters (emoji, accents) and
    # the platform default encoding is not
    # guaranteed to be able to encode them.
    readme_path = os.path.join(model_dir, "README.md")
    with open(readme_path, "w", encoding="utf-8") as f:
        f.write(model_readme_content)

    commit_message = (
        f"v{version_label} - {timestamp_str} - "
        f"retrain-pipelines v{__version__} - "
        "Upload model and tokenizer with README."
    )
    print(commit_message)

    model_version_commit_hash = local_repo_folder_to_hub(
        repo_id=repo_id,
        local_folder=model_dir,
        commit_message=commit_message,
        repo_type="model",
        hf_token=hf_token,
    )

    return model_version_commit_hash

Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

version: '{{ new_version_label }}'

timestamp: '{{ commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}'
timestamp: {{ commit_datetime.strftime('%Y%m%d_%H%M%S') ~ '%03d'|format(commit_datetime.microsecond // 1000) ~ '_UTC' }}

pretty_name: {{ pretty_name }}

Expand All @@ -29,11 +29,14 @@ task_categories:
- reinforcement-learning

tags:
- retrain-pipelines
- function-calling
- LLM Agent
- code
- synthetic

thumbnail: https://cdn-avatars.huggingface.co/v1/production/uploads/651e93137b2a2e027f9e55df/96hzBved0YMjCq--s0kad.png

size_categories:
- {{ size_category }}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,16 +61,19 @@ def _model_readme_params(

base_model_pretty_name = get_pretty_name(
repo_id=base_model_dict["repo_id"],
repo_type="model",
commit_hash=base_model_commit_hash
)

base_model_arxiv_codes = get_arxiv_codes(
repo_id=base_model_dict["repo_id"],
repo_type="model",
commit_hash=base_model_commit_hash
)

base_model_license_label = get_license_label(
repo_id=base_model_dict["repo_id"],
repo_type="model",
commit_hash=base_model_commit_hash
)
if not base_model_license_label:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,18 @@
# @see https://github.com/huggingface/hub-docs/blob/main/modelcard.md
# @see https://huggingface.co/docs/huggingface_hub/guides/model-cards#update-metadata

{% set timestamp_str = commit_datetime.strftime('%Y%m%d_%H%M%S') ~ '%03d'|format(commit_datetime.microsecond // 1000) ~ '_UTC' -%}

version: '{{ new_version_label }}'

timestamp: '{{ commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}'
timestamp: '{{ timestamp_str }}'

model_name: {{ pretty_name }}

base_model: {{ base_model_repo_id }}
library_name: peft
datasets:
- {{ dataset_repo_id }}

license: {{ base_model_license_label }}

Expand All @@ -20,17 +24,14 @@ task_categories:
- question-answering

tags:
- retrain-pipelines
- function-calling
- LLM Agent
- code
- Unsloth



thumbnail: https://cdn-avatars.huggingface.co/v1/production/uploads/651e93137b2a2e027f9e55df/96hzBved0YMjCq--s0kad.png

datasets:
- {{ dataset_repo_id }}

# @see https://huggingface.co/docs/hub/models-widgets#enabling-a-widget
widget:
Expand Down Expand Up @@ -61,18 +62,21 @@ model-index:
`version {{ new_version_label }}` - `{{ commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}`
(retraining
<a target="_blank"
href="https://huggingface.co/retrain-pipelines/function_caller/tree/retrain-pipelines_source-code/{{ new_version_label }}">source-code</a> |
href="https://huggingface.co/retrain-pipelines/function_caller/tree/retrain-pipelines_source-code/v{{ new_version_label }}_{{ timestamp_str }}">source-code</a> |
<a target="_blank"
href="https://huggingface.co/retrain-pipelines/function_caller/tree/retrain-pipelines_pipeline-card/{{ new_version_label }}">pipeline-card</a>)
href="https://huggingface.co/retrain-pipelines/function_caller/tree/retrain-pipelines_pipeline-card/v{{ new_version_label }}_{{ timestamp_str }}">pipeline-card</a>)

Training dataset&nbsp;:
&nbsp; &nbsp; <code>{{ dataset_repo_id }}
v{{ dataset_version_label }}</code>
- <code>{{ dataset_repo_id }} v{{ dataset_version_label }}</code>
(<a href="https://huggingface.co/datasets/{{ dataset_repo_id }}/blob/{{ dataset_commit_hash }}/README.md"
target="_blank">{{ dataset_commit_hash[:7] }}</a> -
{{ dataset_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }})

Base model&nbsp;:
- <code>{{ base_model_repo_id }}</code>
(<a href="https://huggingface.co/{{ base_model_repo_id }}/blob/{{ base_model_commit_hash }}/README.md"
target="_blank">{{ base_model_commit_hash[:7] }}</a> -
{{ base_model_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }})
{% if base_model_arxiv_codes -%}
arxiv&nbsp;:<br />
{%- for base_model_arxiv_code in base_model_arxiv_codes %}
Expand All @@ -85,7 +89,13 @@ arxiv&nbsp;:<br />
<br />
<br />
<br />
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; !! Section on Eval goes here !!
<br />
<br />
<br />
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; !! Section on Eval goes here !!
<br />
<br />
<br />
<br />
<br />
<br />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,6 @@ def get_html(
##########################
# model training #
##########################
if not params['buckets_dict']:
buckets_table = NONE_HTML_STR
else:
buckets_table = pd.DataFrame(
[params['buckets_dict']]).to_html(classes='wide',
escape=False, index = False)
if params['hyperparameters_dict']:
hyperparameters_table = \
pd.DataFrame([params['hyperparameters_dict']]
Expand Down Expand Up @@ -277,6 +271,8 @@ def get_source(self, environment, template):
dataset_commit_datetime=\
params['dataset_commit_datetime'],
dataset_commit_hash=params['dataset_commit_hash'],
dataset_augmentation_rate=params['dataset_augmentation_rate'],
dataset_enrichment_rate=params['dataset_enrichment_rate'],

model_repo_id=params['model_repo_id'],
model_commit_hash=params['model_commit_hash'],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -578,23 +578,73 @@ <h2 class="btn-sub" style="color: #6082B6;">
aria-labelledby="innerHeadingOne"
data-parent="#innerAccordion">
<div class="card-body">
<font color="#C0C0C0">
training dataset&nbsp: <b>{{ dataset_repo_id }}
v{{ dataset_version_label }}</b>
<span class="info-container">
<span class="info-icon" style="background-color: #C0C0C0;">i</span>
<span class="info-content" style="width: 175px; white-space: nowrap;">{{ dataset_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}</span>
</span>
[<a href="https://hf.co/datasets/{{ dataset_repo_id }}/blob/{{ dataset_commit_hash }}/README.md"
target="_blank">HuggingFace &#x1F917;</a>]
</font><br />
<font color="#C0C0C0">
model version&nbsp: <b>{{ model_repo_id }}
v{{ model_version_label }}</b>
(<a href="https://hf.co/{{ model_repo_id }}/blob/{{ model_commit_hash }}/README.md"
target="_blank">{{ model_commit_hash[:7] }}</a>
- {{ model_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }})
</font><br />
<table style="border: none;
background: transparent;
margin: 0 auto 0 0;">
<tr style="all: initial !important;
display: table-row !important;
color: #C0C0C0 !important;">
<td style="border: none; text-align: left;
padding-bottom: 0;">
training dataset&nbsp;:
</td>
<td style="border: none; text-align: left;
padding-bottom: 0;">
<b>{{ dataset_repo_id }} &nbsp;
v{{ dataset_version_label }}</b>
<span class="info-container">
<span style="background-color: #C0C0C0;"
class="info-icon">i</span>
<span style="width: 175px; white-space: nowrap;"
class="info-content">{{ dataset_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}</span>
</span>
</td>
<td style="border: none; text-align: left;
padding-bottom: 0;">
[<a href="https://hf.co/datasets/{{ dataset_repo_id }}/blob/{{ dataset_commit_hash }}/README.md"
target="_blank">HuggingFace &#x1F917;</a>]
</td>
</tr>
<tr style="all: initial !important;
display: table-row !important;
color: #C0C0C0 !important;;">
<td colspan="3"
style="border: none; font-style: italic;
text-align: left; font-size: smaller;
padding-top: 0;">
Data-augmentation rate&nbsp;:
+{{ (dataset_augmentation_rate * 100)|round(1) ~ '%' }},
Data-enrichment rate&nbsp;:
+{{ (dataset_enrichment_rate * 100)|round(1) ~ '%' }}
</td>
</tr>
<tr style="all: initial !important;
display: table-row !important;
color: #C0C0C0 !important;">
<td style="border: none; text-align: left;
padding-top: 0;">
model version&nbsp;:
</td>
<td style="border: none; text-align: left;
padding-top: 0;">
<b>{{ model_repo_id }} &nbsp;
v{{ model_version_label }}</b>
<span class="info-container">
<span style="background-color: #C0C0C0;"
class="info-icon">i</span>
<span style="width: 175px; white-space: nowrap;"
class="info-content">{{ model_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}</span>
</span>
</td>
<td style="border: none; text-align: left;
padding-top: 0;">
[<a href="https://hf.co/{{ model_repo_id }}/blob/{{ model_commit_hash }}/README.md"
target="_blank">HuggingFace &#x1F917;</a>]
</td>
</tr>
</table>
<hr style="width: 50%; border: none;
border-top: 1px solid lightgrey;">
<table style="border: none;
background: transparent;
align: center;">
Expand All @@ -609,11 +659,6 @@ <h2 class="btn-sub" style="color: #6082B6;">
</td>
</tr>
</table>
<br />
<h3 style="color:#FFD700;">
Features bucketization&nbsp;:
</h3>
{{buckets_table}}
<br />
<h3 style="color:#FFD700;">
Hyperparameters set&nbsp;:
Expand Down
Loading

0 comments on commit cb7c488

Please sign in to comment.