Issue #179: first commit allowing the MLflow model of a model version to be exported as a run model artifact
amesar committed May 12, 2024
1 parent 7b31821 commit 6184159
Showing 7 changed files with 169 additions and 68 deletions.
43 changes: 21 additions & 22 deletions README.md
@@ -7,7 +7,7 @@ For more details:
* [JSON export file format](README_export_format.md).
* [MLflow Object Relationships](https://github.com/amesar/mlflow-resources/blob/master/slides/Databricks_MLflow_Object_Relationships.pdf) slide deck.

Last updated: _2023-12-10_.
Last updated: _2024-05-10_

## High Level Architecture

@@ -23,7 +23,7 @@ Last updated: _2023-12-10_.
* For example, copy an experiment from one user to another.
* Backup your MLflow objects to external storage so they can be restored if needed.
* Disaster recovery. Save your MLflow objects to external storage so they can be replicated to another tracking server.
* Supports new Databricks Unity Catalog models.
* Supports registered models in both the Databricks Workspace Model Registry and Unity Catalog Model Registry.

### MLflow Export Import scenarios

@@ -71,7 +71,7 @@ Full object referential integrity is maintained (e.g. an imported registered model
simply invoke the corresponding Python classes.

Copy tools simply invoke the appropriate export and import on a temporary directory.
## New Copy Tools
## Copy Tools

See [README_copy](README_copy.md) on how to copy model versions or runs.

@@ -83,10 +83,7 @@ See [README_limitations.md](README_limitations.md).
## Quick Start

#### Setup
```
pip install mlflow-export-import
```
or, for the latest version - _recommended_:

```
pip install git+https://github.com/mlflow/mlflow-export-import/#egg=mlflow-export-import
```
@@ -111,7 +108,7 @@ import-experiment \

## Setup

Supports python 3.8.
Supports Python 3.8 and above.


### Local setup
@@ -124,21 +121,16 @@ source mlflow-export-import/bin/activate

There are several different ways to install the package.

#### 1. Install from PyPI

```
pip install git+https://github.com/mlflow/mlflow-export-import/#egg=mlflow-export-import
```

#### 2. Install from github directly
#### 1. Install from github

Due to the fast pace of MLflow Export Import development, it is recommended that you install from github for the latest bug fixes.
Recommended.

```
pip install git+https://github.com/mlflow/mlflow-export-import/#egg=mlflow-export-import
```

#### 3. Install from specific commit
#### 3. Install from a specific commit

```
pip install git+https://github.com/mlflow/mlflow-export-import@a334f8003a3c9c3b9cd0173827be692a39355fd8
@@ -151,9 +143,13 @@ cd mlflow-export-import
pip install -e .
```

#### 5. Install from PyPI

Legacy. Due to the quick turnaround time for bug and feature fixes, installing from PyPI is deprecated.

### Databricks notebook setup

Make sure your cluster has MLflow 2.2.1 (Databricks Runtime ML version 13.0) or later installed.
Make sure your cluster has the latest MLflow and Databricks Runtime ML version installed.

There are two different ways to install the mlflow-export-import package in a Databricks notebook.

@@ -202,17 +198,17 @@ export DATABRICKS_TOKEN=MY_TOKEN
See the Databricks documentation page `Access the MLflow tracking server from outside Databricks` - [AWS](https://docs.databricks.com/applications/mlflow/access-hosted-tracking-server.html) or [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/access-hosted-tracking-server).


## Running tools
## Running mlflow-export-import tools

The main tool scripts can be executed either as a standard Python script or console script.

Python [console scripts](https://python-packaging.readthedocs.io/en/latest/command-line-scripts.html#the-console-scripts-entry-point) (such as export-run, import-run, etc.) are provided as a convenience. For a list of scripts see [setup.py](setup.py).
Python [console scripts](https://python-packaging.readthedocs.io/en/latest/command-line-scripts.html#the-console-scripts-entry-point) are provided as a convenience. For a list of scripts see [setup.py](setup.py).

This allows you to use:
For example:
```
export-experiment --help
```
instead of:
or:
```
python -u -m mlflow_export_import.experiment.export_experiment --help
```
@@ -242,13 +238,16 @@ export MLFLOW_EXPORT_IMPORT_LOG_OUTPUT_FILE=/dbfs/mlflow_export_import/logs/expo
export MLFLOW_EXPORT_IMPORT_LOG_FORMAT="%(asctime)s-%(levelname)s - %(message)s"
```

Multithreading:
## Multithreading

If you use the `use-threads` option on exports, you can use the `threadName` format option:
```
export MLFLOW_EXPORT_IMPORT_LOG_FORMAT="%(threadName)s-%(levelname)s-%(message)s"
```

Note that multithreading is experimental.
Log output is currently interleaved across threads, which can make it hard to follow.
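As a sketch of how the `threadName` format field behaves (logger name, thread names, and experiment names here are illustrative, not part of the tool):

```python
import logging
import threading

# Mirrors MLFLOW_EXPORT_IMPORT_LOG_FORMAT="%(threadName)s-%(levelname)s-%(message)s":
# each log record is tagged with the name of the thread that emitted it.
logging.basicConfig(format="%(threadName)s-%(levelname)s-%(message)s", level=logging.INFO)
logger = logging.getLogger("mlflow_export_import")

def export_one(exp_name):
    logger.info(f"exporting {exp_name}")

threads = [
    threading.Thread(target=export_one, args=(name,), name=f"worker-{idx}")
    for idx, name in enumerate(["exp-a", "exp-b"])
]
for t in threads:
    t.start()
for t in threads:
    t.join()
# Emits lines such as: worker-0-INFO-exporting exp-a
```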

## Other

* [README_options.md](README_options.md) - advanced options.
4 changes: 4 additions & 0 deletions mlflow_export_import/common/mlflow_utils.py
@@ -28,6 +28,8 @@ def set_experiment(mlflow_client, dbx_client, exp_name, tags=None):
:return: Experiment
"""
if utils.calling_databricks():
if not exp_name.startswith("/"):
raise MlflowExportImportException(f"Cannot create experiment '{exp_name}'. Databricks experiment must start with '/'.")
create_workspace_dir(dbx_client, os.path.dirname(exp_name))
try:
if not tags: tags = {}
@@ -83,6 +85,8 @@ def create_workspace_dir(dbx_client, workspace_dir):
Create Databricks workspace directory.
"""
_logger.info(f"Creating Databricks workspace directory '{workspace_dir}'")
if not workspace_dir.startswith("/"):
raise MlflowExportImportException(f"Cannot create workspace directory '{workspace_dir}'. Databricks directory must start with '/'.")
dbx_client.post("workspace/mkdirs", { "path": workspace_dir })
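The leading-slash checks added in this commit can be exercised in isolation. This sketch uses a minimal stand-in for the project's `MlflowExportImportException`; the helper name is mine:

```python
class MlflowExportImportException(Exception):
    # Minimal stand-in for the project's exception class (illustrative)
    pass

def check_workspace_path(path):
    # Databricks workspace paths must be absolute, e.g. "/Users/me/experiments"
    if not path.startswith("/"):
        raise MlflowExportImportException(
            f"Cannot create workspace directory '{path}'. Databricks directory must start with '/'."
        )
    return path

check_workspace_path("/Users/me/experiments")  # accepted
try:
    check_workspace_path("Users/me/experiments")
except MlflowExportImportException as e:
    print(e)  # rejected: relative path
```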


17 changes: 16 additions & 1 deletion mlflow_export_import/common/timestamp_utils.py
@@ -41,10 +41,25 @@ def utc_str_to_seconds(sdt):


def adjust_timestamps(dct, keys):
"""
"""
Add human readable keys for millisecond timestamps.
"""
keys = set(keys)
for key in keys:
if key in dct:
dct[f"_{key}"] = fmt_ts_millis(dct[key])
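For illustration, here is what `adjust_timestamps` produces, using a simplified stand-in for `fmt_ts_millis` (the real formatter may differ in format string and timezone handling):

```python
from datetime import datetime, timezone

def fmt_ts_millis(millis):
    # Simplified stand-in for timestamp_utils.fmt_ts_millis (assumes UTC formatting)
    return datetime.fromtimestamp(millis / 1000, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

def adjust_timestamps(dct, keys):
    # Add human readable "_<key>" companions for millisecond timestamp keys
    for key in set(keys):
        if key in dct:
            dct[f"_{key}"] = fmt_ts_millis(dct[key])

vr = {"creation_timestamp": 1715500800000}
adjust_timestamps(vr, ["creation_timestamp", "last_updated_timestamp"])
print(vr["_creation_timestamp"])  # 2024-05-12 08:00:00
```

Keys absent from the dict (here `last_updated_timestamp`) are simply skipped.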


def format_seconds(seconds):
"""
Format a duration in seconds as 'Xm Ys' or 'Ys', e.g. '6m 40s' or '40s'.
"""
minutes, seconds = divmod(seconds, 60)
minutes = round(minutes)
if minutes:
seconds = round(seconds)
return f"{minutes}m {seconds}s"
else:
prec = 2 if seconds < .1 else 1
seconds = round(seconds, prec)
return f"{seconds}s"
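A quick check of the duration formatting above (function copied into a self-contained snippet):

```python
def format_seconds(seconds):
    # Format a duration in seconds as 'Xm Ys' or 'Ys'
    minutes, seconds = divmod(seconds, 60)
    minutes = round(minutes)
    if minutes:
        seconds = round(seconds)
        return f"{minutes}m {seconds}s"
    else:
        # Sub-minute durations keep one decimal, or two below 0.1s
        prec = 2 if seconds < .1 else 1
        seconds = round(seconds, prec)
        return f"{seconds}s"

print(format_seconds(400))   # 6m 40s
print(format_seconds(40))    # 40s
print(format_seconds(0.05))  # 0.05s
```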
26 changes: 26 additions & 0 deletions mlflow_export_import/model_version/click_options.py
@@ -11,6 +11,32 @@ def opt_version(function):
)(function)
return function


def opt_vrm_export_version_model(function):
function = click.option("--vrm-export-version-model",
help="Export the MLflow model (from model registry) of a model version.",
type=bool,
default=False
)(function)
return function

def opt_vrm_model_artifact_path(function):
function = click.option("--vrm-model-artifact-path",
help="Destination artifact path of the MLflow model of a model version.",
type=str,
required=False
)(function)
return function

def opt_skip_download_run_artifacts(function):
function = click.option("--skip-download-run-artifacts",
help="Skip downloading run artifacts (for fine-tuned LLM models)",
type=bool,
default=False
)(function)
return function
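Each helper above wraps `click.option` in a reusable decorator that can be stacked onto any command. A minimal sketch of how such a decorator attaches to a command (the command and its output are illustrative, not part of the module):

```python
import click
from click.testing import CliRunner

def opt_skip_download_run_artifacts(function):
    # Same decorator pattern as the click_options helpers above
    function = click.option("--skip-download-run-artifacts",
        help="Skip downloading run artifacts.",
        type=bool,
        default=False
    )(function)
    return function

@click.command()
@opt_skip_download_run_artifacts
def export(skip_download_run_artifacts):
    # click coerces "true"/"false" strings to a Python bool
    click.echo(f"skip={skip_download_run_artifacts}")

result = CliRunner().invoke(export, ["--skip-download-run-artifacts", "true"])
print(result.output)  # skip=True
```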


# == Import model version

def opt_create_model(function):
83 changes: 63 additions & 20 deletions mlflow_export_import/model_version/export_model_version.py
@@ -3,40 +3,52 @@
"""

import os
import time
import click

from mlflow_export_import.client.client_utils import create_mlflow_client, create_http_client
from mlflow_export_import.common import utils, io_utils, model_utils
from mlflow_export_import.common.timestamp_utils import adjust_timestamps
from mlflow_export_import.common.timestamp_utils import adjust_timestamps, format_seconds
from mlflow_export_import.run.export_run import export_run
from mlflow_export_import.common.click_options import (
opt_model,
opt_output_dir,
opt_export_permissions,
opt_notebook_formats,
opt_export_version_model
opt_notebook_formats
)
from . click_options import (
opt_version,
opt_vrm_export_version_model,
opt_vrm_model_artifact_path,
opt_skip_download_run_artifacts
)
from . click_options import opt_version

_logger = utils.getLogger(__name__)

VERSION_MODEL_ARTIFACT_PATH = "version_model"


def export_model_version(
model_name,
version,
output_dir,
export_version_model = False,
vrm_model_artifact_path = "",
skip_download_run_artifacts = False,
export_permissions = False,
notebook_formats = None,
mlflow_client = None
):
"""
Exports model version.
Exports a model version.
:param model_name: Registered model name.
:param version: Registered model version.
:param output_dir: Export directory.
:param export_version_model: Export model version's 'cached' MLflow model clone.
:param export_version_model: Export model version's MLflow model.
:param vrm_model_artifact_path: Destination artifact path of the MLflow model of a model version.
:param skip_download_run_artifacts: Don't download the run's artifacts.
:param export_permissions: Export Databricks permissions.
:param notebook_formats: List of Databricks notebook formats. Values are SOURCE, HTML, JUPYTER or DBC (comma separated)
:param mlflow_client: MlflowClient (optional).
@@ -49,32 +61,59 @@ def export_model_version(
vr = mlflow_client.get_model_version(model_name, version)
vr_dct = model_utils.model_version_to_dict(vr)

_export_registered_model(mlflow_client, model_name, export_permissions, output_dir)

run = export_run(
run_id = vr.run_id,
output_dir = os.path.join(output_dir, "run"),
notebook_formats = notebook_formats,
export_deleted_runs = True, # NOTE: Important since the default is to not export a deleted run
skip_download_run_artifacts = skip_download_run_artifacts,
mlflow_client = mlflow_client
)
if not run:
msg = f"Cannot get run ID '{vr.run_id}' of model version '{model_name}/{version}'"
_logger.error(msg)
else:
_export_experiment(mlflow_client, run.info.experiment_id, output_dir)

info_attr = {}
if export_version_model:
_output_dir = os.path.join(output_dir, "mlflow_model")
vr_dct["_download_uri"] = model_utils.export_version_model(mlflow_client, vr, _output_dir)
info_attr["export_version_model"] = True

export_experiment(mlflow_client, run.info.experiment_id, output_dir)
_export_registered_model(mlflow_client, model_name, export_permissions, output_dir)
_export_version_model(mlflow_client, output_dir, vr, vr_dct, info_attr, export_version_model, vrm_model_artifact_path)

adjust_timestamps(vr_dct, ["creation_timestamp", "last_updated_timestamp"])
mlflow_attr = { "model_version": vr_dct}
msg = utils.get_obj_key_values(vr, [ "name", "version", "current_stage", "status", "run_id" ])
msg = utils.get_obj_key_values(vr, [ "name", "version", "alias", "current_stage", "status", "run_id" ])
_logger.info(f"Exporting model version: {msg}")

io_utils.write_export_file(output_dir, "version.json", __file__, mlflow_attr, info_attr)
return vr


def export_experiment(mlflow_client, experiment_id, output_dir):
def _export_version_model(mlflow_client,
output_dir, vr,
vr_dct,
info_attr,
export_version_model,
vrm_model_artifact_path
):
start_time = time.time()
if vrm_model_artifact_path:
_output_dir = os.path.join(output_dir, f"run/artifacts/{vrm_model_artifact_path}")
vr_dct["source"] = vrm_model_artifact_path
vr_dct["_source"] = vr.source
else:
_output_dir = os.path.join(output_dir, VERSION_MODEL_ARTIFACT_PATH)
vr_dct["_download_uri"] = model_utils.export_version_model(mlflow_client, vr, _output_dir)
info_attr["vrm_export_version_model"] = export_version_model
info_attr["vrm_model_artifact_path"] = vrm_model_artifact_path
info_attr["vrm_model_artifact_full_path"] = _output_dir

dur = format_seconds(time.time()-start_time)
msg = { "model": f'{vr_dct["name"]}/{vr_dct["version"]}', "output_dir": _output_dir }
_logger.info(f"Exported registry MLflow model in {dur}: {msg}")
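The destination logic of `_export_version_model` can be sketched in isolation (the helper name below is mine, not part of the module):

```python
import os

VERSION_MODEL_ARTIFACT_PATH = "version_model"

def resolve_model_output_dir(output_dir, vrm_model_artifact_path):
    # Sketch of the destination choice: with an artifact path, the MLflow
    # model lands under the run's artifacts so it can be imported as a run
    # model artifact; otherwise it goes to the standalone version_model dir.
    if vrm_model_artifact_path:
        return os.path.join(output_dir, f"run/artifacts/{vrm_model_artifact_path}")
    return os.path.join(output_dir, VERSION_MODEL_ARTIFACT_PATH)

print(resolve_model_output_dir("/tmp/export", "my_model"))
print(resolve_model_output_dir("/tmp/export", ""))
```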


def _export_experiment(mlflow_client, experiment_id, output_dir):
http_client = create_http_client(mlflow_client)
exp = http_client.get("experiments/get", {"experiment_id": experiment_id})
exp = exp["experiment"]
@@ -88,12 +127,9 @@ def export_experiment(mlflow_client, experiment_id, output_dir):

def _export_registered_model(mlflow_client, model_name, export_permissions, output_dir):
model = model_utils.get_registered_model(mlflow_client, model_name, export_permissions)

msg = {"name": model["name"] }
_logger.info(f"Exporting registered model: {msg}")

adjust_timestamps(model, ["creation_timestamp", "last_updated_timestamp"])

mlflow_attr = { "registered_model": model }
io_utils.write_export_file(output_dir, "model.json", __file__, mlflow_attr, {})

@@ -102,13 +138,18 @@ def _export_registered_model(mlflow_client, model_name, export_permissions, outp
@opt_model
@opt_version
@opt_output_dir
@opt_export_version_model
@opt_vrm_export_version_model
@opt_vrm_model_artifact_path
@opt_skip_download_run_artifacts
@opt_export_permissions
@opt_notebook_formats

def main(model,
version,
output_dir,
export_version_model,
vrm_export_version_model,
vrm_model_artifact_path,
skip_download_run_artifacts,
export_permissions,
notebook_formats,
):
@@ -122,7 +163,9 @@ def main(model,
model_name = model,
version = version,
output_dir = output_dir,
export_version_model = export_version_model,
export_version_model = vrm_export_version_model,
vrm_model_artifact_path = vrm_model_artifact_path,
skip_download_run_artifacts = skip_download_run_artifacts,
export_permissions = export_permissions,
notebook_formats = notebook_formats
)