Skip to content

Commit

Permalink
complete rework on commits datetime handling for HF repos
Browse files Browse the repository at this point in the history
  • Loading branch information
aurelienmorgan committed Dec 30, 2024
1 parent 47a5d05 commit 4d8a940
Show file tree
Hide file tree
Showing 8 changed files with 89 additions and 83 deletions.
25 changes: 13 additions & 12 deletions pkg_src/retrain_pipelines/dataset/hf_utils/hf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def get_latest_commit(
Results:
- (dict):
'commit_hash', 'commit_date',
'commit_hash', 'commit_datetime',
'branch_name', 'files'
"""

Expand All @@ -65,15 +65,15 @@ def get_latest_commit(
if regex_pattern.search(f)
]
if matching_files:
commit_date = commit_data["created_at"]
commit_datetime = commit_data["created_at"]
if (
not latest_matching_commit
or commit_date >
latest_matching_commit["commit_date"]
or commit_datetime >
latest_matching_commit["commit_datetime"]
):
latest_matching_commit = {
"commit_hash": commit_hash,
"commit_date": commit_date,
"commit_datetime": commit_datetime,
"branch_name": \
branch_data["branch_name"],
"files": matching_files,
Expand All @@ -100,11 +100,12 @@ def get_commit(
Particular "revision" of the dataset
to scan.
- files_filter (str):
Only consider files matching this regex pattern.
Only consider files matching
this regex pattern.
Results:
- (dict):
'commit_hash', 'commit_date',
'commit_hash', 'commit_datetime',
'branch_name', 'files'
"""

Expand Down Expand Up @@ -137,7 +138,7 @@ def get_commit(
if len(matching_files) > 0:
matching_commit = {
"commit_hash": commit_hash,
"commit_date": \
"commit_datetime": \
branch_commit_data["created_at"],
"branch_name": \
branch_data["branch_name"],
Expand Down Expand Up @@ -200,9 +201,8 @@ def get_lazy_df(
- commit_hash (str):
gets handy when no input value
is given as input.
- commit_date (str):
24hrs, UTC format.
- lazydf (pl.lazyframe.frame.LazyFrame):
- commit_datetime (datetime)
- lazydf (pl.lazyframe.frame.LazyFrame)
"""

parquet_commit = get_commit(
Expand Down Expand Up @@ -234,7 +234,8 @@ def get_lazy_df(
return {
"repo_id": repo_id,
"commit_hash": parquet_commit['commit_hash'],
"commit_utc_date_str": parquet_commit['commit_date'],
"commit_datetime": \
parquet_commit['commit_datetime'],
"lazy_df": lazy_df
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json

from ast import literal_eval
from datetime import datetime

from jinja2 import Environment, FileSystemLoader

Expand All @@ -24,7 +25,7 @@ def _dataset_readme_params(
augmentation_rate: float,
enrichment_rate: float,
version_label: str,
utc_timestamp_str: str,
commit_datetime: datetime,
mf_flow_name: str,
mf_run_id: str,
engine:str = "cpu"
Expand All @@ -39,12 +40,12 @@ def _dataset_readme_params(
- hf_dataset_dict (dict):
- repo_id
- commit_hash
- commit_utc_date_str
- commit_datetime
- lazy_df
- hf_enrich_dataset_dict (dict)
- repo_id
- commit_hash
- commit_utc_date_str
- commit_datetime
- dataset_dict (DatasetDict):
the dataset version to be pushed
to the HF hub.
Expand All @@ -58,8 +59,8 @@ def _dataset_readme_params(
- version_label (str):
typical `retrain-pipelines`
version label are of format "major.minor"
- utc_timestamp_str (str):
timestampt for the new dataset version.
- commit_datetime (datetime):
timestamp for the new dataset version.
- mf_flow_name (str)
- mf_run_id (str)
- engine (str):
Expand All @@ -76,13 +77,13 @@ def _dataset_readme_params(
dataset_dict["supervised_finetuning"]["validation"].num_rows
size_category = get_size_category(records_count)

main_commit_hash, main_commit_utc_date_str = \
main_commit_hash, main_commit_datetime = \
get_latest_README_commit(
repo_id=hf_dataset_dict["repo_id"],
target_commit_hash=hf_dataset_dict["commit_hash"],
repo_type="dataset"
)
enrich_commit_hash, enrich_commit_utc_date_str = \
enrich_commit_hash, enrich_commit_datetime = \
get_latest_README_commit(
repo_id=hf_enrich_dataset_dict["repo_id"],
target_commit_hash=\
Expand Down Expand Up @@ -150,7 +151,7 @@ def _build_keys(d, parent='', output_str=''):
return {
"configs": dataset_dict_to_config_str(dataset_dict),
"new_version_label": version_label,
"utc_timestamp": utc_timestamp_str,
"commit_datetime": commit_datetime,

"pretty_name": pretty_name,

Expand All @@ -162,10 +163,10 @@ def _build_keys(d, parent='', output_str=''):
"main_commit_hash": main_commit_hash,
"enrich_commit_hash": enrich_commit_hash,

"main_commit_utc_date_str": \
main_commit_utc_date_str,
"enrich_commit_utc_date_str": \
enrich_commit_utc_date_str,
"main_commit_datetime": \
main_commit_datetime,
"enrich_commit_datetime": \
enrich_commit_datetime,

"main_pretty_name": main_pretty_name,
"enrich_pretty_name": enrich_pretty_name,
Expand Down Expand Up @@ -198,7 +199,7 @@ def get_dataset_readme_content(
augmentation_rate: float,
enrichment_rate: float,
version_label: str,
utc_timestamp_str: str,
commit_datetime: datetime,
mf_flow_name: str,
mf_run_id: str,
engine:str = "cpu"
Expand All @@ -218,12 +219,12 @@ def get_dataset_readme_content(
- hf_dataset_dict (dict):
- repo_id
- commit_hash
- commit_utc_date_str
- commit_datetime
- lazy_df
- hf_enrich_dataset_dict (dict)
- repo_id
- commit_hash
- commit_utc_date_str
- commit_datetime
- dataset_dict (DatasetDict):
the dataset version to be pushed
to the HF hub.
Expand All @@ -237,8 +238,8 @@ def get_dataset_readme_content(
- version_label (str):
typical `retrain-pipelines`
version label are of format "major.minor"
- utc_timestamp_str (str):
timestampt for the new dataset version.
- commit_datetime (datetime):
timestamp for the new dataset version.
- mf_flow_name (str)
- mf_run_id (str)
- engine (str):
Expand All @@ -255,7 +256,7 @@ def get_dataset_readme_content(
augmentation_rate=augmentation_rate,
enrichment_rate=enrichment_rate,
version_label=version_label,
utc_timestamp_str=utc_timestamp_str,
commit_datetime=commit_datetime,
mf_flow_name=mf_flow_name,
mf_run_id=mf_run_id,
engine=engine
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@

version: '{{ new_version_label }}'

timestamp: {{ utc_timestamp }}
timestamp: '{{ commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}'

pretty_name: {{ pretty_name }}

source_datasets:
- {{ main_repo_id }}
- {{ enrich_repo_id }}

license: {{license_label}}
license: {{ license_label }}

language:
- en
Expand All @@ -41,14 +41,15 @@ size_categories:

# {{ pretty_name }}

`version {{ new_version_label }}` - `{{ utc_timestamp }}`
`version {{ new_version_label }}` - `{{ commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }}`

Source datasets :
- main :
- <b>{{ main_pretty_name }}</b><br />
`{{ main_repo_id }}`
(<a href="https://huggingface.co/datasets/{{ main_repo_id }}/blob/{{ main_commit_hash }}/README.md"
target="_blank">{{ main_commit_hash[:7] }}</a> - {{ main_commit_utc_date_str }})
target="_blank">{{ main_commit_hash[:7] }}</a> -
{{ main_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }})
<br />
license&nbsp;:
{% if main_license_label -%}
Expand All @@ -68,7 +69,8 @@ Source datasets :
- <b>{{ enrich_pretty_name }}</b><br />
`{{ enrich_repo_id }}`
(<a href="https://huggingface.co/datasets/{{ enrich_repo_id }}/blob/{{ enrich_commit_hash }}/README.md"
target="_blank">{{ enrich_commit_hash[:7] }}</a> - {{ enrich_commit_utc_date_str }})
target="_blank">{{ enrich_commit_hash[:7] }}</a> -
{{ enrich_commit_datetime.strftime("%Y-%m-%d %H:%M:%S UTC") }})
<br />
license&nbsp;:
{% if enrich_license_label -%}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json

from ast import literal_eval
from datetime import datetime

from jinja2 import Environment, FileSystemLoader

Expand All @@ -20,7 +21,7 @@ def _model_readme_params(
base_model_dict: dict,
training_dataset_dict: dict,
version_label: str,
utc_timestamp_str: str,
commit_datetime: datetime,
mf_flow_name: str,
mf_run_id: str,
) -> dict:
Expand All @@ -34,14 +35,14 @@ def _model_readme_params(
- base_model_dict (dict)
- training_dataset_dict (dict):
- repo_id
- version_label
- commit_hash
- commit_utc_date_str
-
- commit_datetime
- version_label (str):
typical `retrain-pipelines`
version label are of format "major.minor"
- utc_timestamp_str (str):
timestampt for the new dataset version.
- commit_datetime (datetime):
timestamp for the new model version.
- mf_flow_name (str)
- mf_run_id (str)
Expand All @@ -51,7 +52,7 @@ def _model_readme_params(

pretty_name = "retrain-pipelines Function Caller"

base_model_commit_hash, base_model_commit_utc_date_str = \
base_model_commit_hash, base_model_commit_datetime = \
get_latest_README_commit(
repo_id=base_model_dict["repo_id"],
target_commit_hash=base_model_dict["commit_hash"],
Expand All @@ -77,7 +78,7 @@ def _model_readme_params(

return {
"new_version_label": version_label,
"utc_timestamp": utc_timestamp_str,
"commit_datetime": commit_datetime,

"pretty_name": pretty_name,

Expand All @@ -87,13 +88,13 @@ def _model_readme_params(
training_dataset_dict["version_label"],
"dataset_commit_hash": \
training_dataset_dict["commit_hash"],
"dataset_utc_timestamp_str": \
training_dataset_dict["utc_timestamp_str"],
"dataset_commit_datetime": \
training_dataset_dict["commit_datetime"],

"base_model_repo_id": base_model_dict["repo_id"],
"base_model_pretty_name": base_model_pretty_name,
"base_model_commit_hash": base_model_commit_hash,
"base_model_commit_utc_date_str": base_model_commit_utc_date_str,
"base_model_commit_datetime": base_model_commit_datetime,
"base_model_arxiv_codes": base_model_arxiv_codes,
"base_model_license_label": base_model_license_label,

Expand All @@ -111,7 +112,7 @@ def get_model_readme_content(
training_dataset_dict: dict,

version_label: str,
utc_timestamp_str: str,
commit_datetime: datetime,

mf_flow_name: str,
mf_run_id: str,
Expand All @@ -134,12 +135,12 @@ def get_model_readme_content(
- training_dataset_dict (dict)
- repo_id
- commit_hash
- commit_utc_date_str
- commit_datetime
- version_label (str):
typical `retrain-pipelines`
version label are of format "major.minor"
- utc_timestamp_str (str):
timestampt for the new dataset version.
- commit_datetime (datetime):
timestamp for the new model version.
- mf_flow_name (str)
- mf_run_id (str)
Expand All @@ -151,7 +152,7 @@ def get_model_readme_content(
base_model_dict=base_model_dict,
training_dataset_dict=training_dataset_dict,
version_label=version_label,
utc_timestamp_str=utc_timestamp_str,
commit_datetime=commit_datetime,
mf_flow_name=mf_flow_name,
mf_run_id=mf_run_id
)
Expand Down
Loading

0 comments on commit 4d8a940

Please sign in to comment.