From dc49970c829654e36b9c24a0ecb40fd38d431886 Mon Sep 17 00:00:00 2001 From: SDKAuto Date: Wed, 22 Jan 2025 16:43:08 +0000 Subject: [PATCH] CodeGen from PR 32081 in Azure/azure-rest-api-specs Merge 3d1d279e04137e17a25c199ce8a948b9758cc63f into cba2d23614eab1c481fb0ff397b3eef4773865d2 --- sdk/ai/azure-ai-inference/_meta.json | 6 + .../azure/ai/inference/_client.py | 12 +- .../azure/ai/inference/_configuration.py | 12 +- .../azure/ai/inference/_model_base.py | 2 +- .../ai/inference/_operations/_operations.py | 13 +- .../azure/ai/inference/_patch.py | 1345 +---------------- .../azure/ai/inference/_serialization.py | 70 +- .../azure/ai/inference/_version.py | 2 +- .../azure/ai/inference/aio/_client.py | 12 +- .../azure/ai/inference/aio/_configuration.py | 12 +- .../inference/aio/_operations/_operations.py | 13 +- .../azure/ai/inference/aio/_patch.py | 1287 +--------------- .../azure/ai/inference/models/__init__.py | 18 + .../azure/ai/inference/models/_models.py | 468 +++--- .../azure/ai/inference/models/_patch.py | 478 +----- .../azure/ai/inference/prompts/__init__.py | 8 - .../azure/ai/inference/prompts/_core.py | 312 ---- .../azure/ai/inference/prompts/_invoker.py | 295 ---- .../azure/ai/inference/prompts/_mustache.py | 671 -------- .../azure/ai/inference/prompts/_parsers.py | 156 -- .../azure/ai/inference/prompts/_patch.py | 124 -- .../ai/inference/prompts/_prompty_utils.py | 415 ----- .../azure/ai/inference/prompts/_renderers.py | 30 - .../azure/ai/inference/prompts/_tracer.py | 316 ---- .../azure/ai/inference/prompts/_utils.py | 100 -- .../azure/ai/inference/tracing.py | 850 ----------- sdk/ai/azure-ai-inference/sdk_packaging.toml | 2 + sdk/ai/azure-ai-inference/setup.py | 8 +- sdk/ai/azure-ai-inference/tsp-location.yaml | 4 +- 29 files changed, 331 insertions(+), 6710 deletions(-) create mode 100644 sdk/ai/azure-ai-inference/_meta.json delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/__init__.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_core.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_invoker.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_mustache.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_parsers.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_patch.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_prompty_utils.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_renderers.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_tracer.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_utils.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/tracing.py create mode 100644 sdk/ai/azure-ai-inference/sdk_packaging.toml diff --git a/sdk/ai/azure-ai-inference/_meta.json b/sdk/ai/azure-ai-inference/_meta.json new file mode 100644 index 000000000000..e62f3588ea70 --- /dev/null +++ b/sdk/ai/azure-ai-inference/_meta.json @@ -0,0 +1,6 @@ +{ + "commit": "7dca60dfb7fda9a5e4aaeb4494db564b992c5c43", + "repository_url": "https://github.com/Azure/azure-rest-api-specs", + "typespec_src": "specification/ai/ModelClient", + "@azure-tools/typespec-python": "0.38.1" +} \ No newline at end of file diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_client.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_client.py index 5e73e91ea2b2..0cde08ffa7cc 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/_client.py +++ 
b/sdk/ai/azure-ai-inference/azure/ai/inference/_client.py @@ -36,8 +36,8 @@ class ChatCompletionsClient(ChatCompletionsClientOperationsMixin): :param endpoint: Service host. Required. :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is one of the - following types: AzureKeyCredential, AzureKeyCredential, TokenCredential Required. + :param credential: Credential used to authenticate requests to the service. Is either a key + credential type or a token credential type. Required. :type credential: ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials.TokenCredential :keyword api_version: The API version to use for this operation. Default value is @@ -114,8 +114,8 @@ class EmbeddingsClient(EmbeddingsClientOperationsMixin): :param endpoint: Service host. Required. :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is one of the - following types: AzureKeyCredential, AzureKeyCredential, TokenCredential Required. + :param credential: Credential used to authenticate requests to the service. Is either a key + credential type or a token credential type. Required. :type credential: ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials.TokenCredential :keyword api_version: The API version to use for this operation. Default value is @@ -192,8 +192,8 @@ class ImageEmbeddingsClient(ImageEmbeddingsClientOperationsMixin): :param endpoint: Service host. Required. :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is one of the - following types: AzureKeyCredential, AzureKeyCredential, TokenCredential Required. + :param credential: Credential used to authenticate requests to the service. Is either a key + credential type or a token credential type. Required. :type credential: ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials.TokenCredential :keyword api_version: The API version to use for this operation. Default value is diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_configuration.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_configuration.py index 8158dd310196..894ec657140f 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/_configuration.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_configuration.py @@ -25,8 +25,8 @@ class ChatCompletionsClientConfiguration: # pylint: disable=too-many-instance-a :param endpoint: Service host. Required. :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is one of the - following types: AzureKeyCredential, AzureKeyCredential, TokenCredential Required. + :param credential: Credential used to authenticate requests to the service. Is either a key + credential type or a token credential type. Required. :type credential: ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials.TokenCredential :keyword api_version: The API version to use for this operation. Default value is @@ -82,8 +82,8 @@ class EmbeddingsClientConfiguration: # pylint: disable=too-many-instance-attrib :param endpoint: Service host. Required. :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is one of the - following types: AzureKeyCredential, AzureKeyCredential, TokenCredential Required. 
+ :param credential: Credential used to authenticate requests to the service. Is either a key + credential type or a token credential type. Required. :type credential: ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials.TokenCredential :keyword api_version: The API version to use for this operation. Default value is @@ -139,8 +139,8 @@ class ImageEmbeddingsClientConfiguration: # pylint: disable=too-many-instance-a :param endpoint: Service host. Required. :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is one of the - following types: AzureKeyCredential, AzureKeyCredential, TokenCredential Required. + :param credential: Credential used to authenticate requests to the service. Is either a key + credential type or a token credential type. Required. :type credential: ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials.TokenCredential :keyword api_version: The API version to use for this operation. Default value is diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_model_base.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_model_base.py index 4122dde84de9..7f73b97b23ef 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/_model_base.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_model_base.py @@ -1,4 +1,4 @@ -# pylint: disable=too-many-lines,arguments-differ,signature-differs,no-member +# pylint: disable=too-many-lines # coding=utf-8 # -------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/_operations.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/_operations.py index 9c41e5e3f0d8..d79af04a49bf 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/_operations.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/_operations.py @@ -1,4 +1,3 @@ -# pylint: disable=too-many-locals # coding=utf-8 # -------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. 
@@ -194,7 +193,7 @@ def _complete( def _complete( self, *, - messages: List[_models._models.ChatRequestMessage], + messages: List[_models.ChatRequestMessage], extra_params: Optional[Union[str, _models._enums.ExtraParameters]] = None, content_type: str = "application/json", frequency_penalty: Optional[float] = None, @@ -203,7 +202,7 @@ def _complete( temperature: Optional[float] = None, top_p: Optional[float] = None, max_tokens: Optional[int] = None, - response_format: Optional[_models._models.ChatCompletionsResponseFormat] = None, + response_format: Optional[_models.ChatCompletionsResponseFormat] = None, stop: Optional[List[str]] = None, tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, tool_choice: Optional[ @@ -228,7 +227,7 @@ def _complete( self, body: Union[JSON, IO[bytes]] = _Unset, *, - messages: List[_models._models.ChatRequestMessage] = _Unset, + messages: List[_models.ChatRequestMessage] = _Unset, extra_params: Optional[Union[str, _models._enums.ExtraParameters]] = None, frequency_penalty: Optional[float] = None, stream_parameter: Optional[bool] = None, @@ -236,7 +235,7 @@ def _complete( temperature: Optional[float] = None, top_p: Optional[float] = None, max_tokens: Optional[int] = None, - response_format: Optional[_models._models.ChatCompletionsResponseFormat] = None, + response_format: Optional[_models.ChatCompletionsResponseFormat] = None, stop: Optional[List[str]] = None, tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, tool_choice: Optional[ @@ -259,7 +258,7 @@ def _complete( Typical usage begins with a chat message for the System role that provides instructions for the behavior of the assistant, followed by alternating messages between the User and Assistant roles. Required. - :paramtype messages: list[~azure.ai.inference.models._models.ChatRequestMessage] + :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage] :keyword extra_params: Controls what happens if extra parameters, undefined by the REST API, are passed in the JSON request payload. This sets the HTTP request header ``extra-parameters``. Known values are: "error", "drop", and @@ -316,7 +315,7 @@ def _complete( seemingly "stuck" request. Also note that the message content may be partially cut off if ``finish_reason="length"``\\ , which indicates the generation exceeded ``max_tokens`` or the conversation exceeded the max context length. Default value is None. - :paramtype response_format: ~azure.ai.inference.models._models.ChatCompletionsResponseFormat + :paramtype response_format: ~azure.ai.inference.models.ChatCompletionsResponseFormat :keyword stop: A collection of textual sequences that will end completions generation. Default value is None. :paramtype stop: list[str] diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_patch.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_patch.py index 0862989beef2..f7dd32510333 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/_patch.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_patch.py @@ -1,4 +1,3 @@ -# pylint: disable=too-many-lines # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. @@ -6,1350 +5,10 @@ """Customize generated code here. Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize - -Why do we patch auto-generated code? Below is a summary of the changes made in all _patch files (not just this one): -1. Add support for input argument `model_extras` (all clients) -2. 
Add support for function load_client -3. Add support for setting sticky chat completions/embeddings input arguments in the client constructor -4. Add support for get_model_info, while caching the result (all clients) -5. Add support for chat completion streaming (ChatCompletionsClient client only) -6. Add support for friendly print of result objects (__str__ method) (all clients) -7. Add support for load() method in ImageUrl class (see /models/_patch.py) -8. Add support for sending two auth headers for api-key auth (all clients) -9. Simplify how chat completions "response_format" is set. Define "response_format" as a flat Union of strings and - JsonSchemaFormat object, instead of using auto-generated base/derived classes named - ChatCompletionsResponseFormatXxxInternal. -10. Allow UserMessage("my message") in addition to UserMessage(content="my message"). Same applies to -AssistantMessage, SystemMessage and ToolMessage. - """ -import json -import logging -import sys - -from io import IOBase -from typing import Any, Dict, Union, IO, List, Literal, Optional, overload, Type, TYPE_CHECKING, Iterable - -from azure.core.pipeline import PipelineResponse -from azure.core.credentials import AzureKeyCredential -from azure.core.tracing.decorator import distributed_trace -from azure.core.utils import case_insensitive_dict -from azure.core.exceptions import ( - ClientAuthenticationError, - HttpResponseError, - map_error, - ResourceExistsError, - ResourceNotFoundError, - ResourceNotModifiedError, -) -from . import models as _models -from ._model_base import SdkJSONEncoder, _deserialize -from ._serialization import Serializer -from ._operations._operations import ( - build_chat_completions_complete_request, - build_embeddings_embed_request, - build_image_embeddings_embed_request, -) -from ._client import ChatCompletionsClient as ChatCompletionsClientGenerated -from ._client import EmbeddingsClient as EmbeddingsClientGenerated -from ._client import ImageEmbeddingsClient as ImageEmbeddingsClientGenerated - -if sys.version_info >= (3, 9): - from collections.abc import MutableMapping -else: - from typing import MutableMapping # type: ignore # pylint: disable=ungrouped-imports - -if TYPE_CHECKING: - # pylint: disable=unused-import,ungrouped-imports - from azure.core.credentials import TokenCredential - -JSON = MutableMapping[str, Any] # pylint: disable=unsubscriptable-object -_Unset: Any = object() - -_SERIALIZER = Serializer() -_SERIALIZER.client_side_validation = False - -_LOGGER = logging.getLogger(__name__) - - -def _get_internal_response_format( - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] -) -> Optional[_models._models.ChatCompletionsResponseFormat]: - """ - Internal helper method to convert between the public response format type that's supported in the `complete` method, - and the internal response format type that's used in the generated code. - - :param response_format: Response format. Required. - :type response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] - :return: Internal response format. 
- :rtype: ~azure.ai.inference._models._models.ChatCompletionsResponseFormat - """ - if response_format is not None: - - # To make mypy tool happy, start by declaring the type as the base class - internal_response_format: _models._models.ChatCompletionsResponseFormat - - if isinstance(response_format, str) and response_format == "text": - internal_response_format = ( - _models._models.ChatCompletionsResponseFormatText() # pylint: disable=protected-access - ) - elif isinstance(response_format, str) and response_format == "json_object": - internal_response_format = ( - _models._models.ChatCompletionsResponseFormatJsonObject() # pylint: disable=protected-access - ) - elif isinstance(response_format, _models.JsonSchemaFormat): - internal_response_format = _models._models.ChatCompletionsResponseFormatJsonSchema( # pylint: disable=protected-access - json_schema=response_format - ) - else: - raise ValueError(f"Unsupported `response_format` {response_format}") - - return internal_response_format - - return None - - -def load_client( - endpoint: str, credential: Union[AzureKeyCredential, "TokenCredential"], **kwargs: Any -) -> Union["ChatCompletionsClient", "EmbeddingsClient", "ImageEmbeddingsClient"]: - """ - Load a client from a given endpoint URL. The method makes a REST API call to the `/info` route - on the given endpoint, to determine the model type and therefore which client to instantiate. - Keyword arguments are passed to the appropriate client's constructor, so if you need to set things like - `api_version`, `logging_enable`, `user_agent`, etc., you can do so here. - This method will only work when using Serverless API or Managed Compute endpoint. - It will not work for GitHub Models endpoint or Azure OpenAI endpoint. - - :param endpoint: Service host. Required. - :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is either a - AzureKeyCredential type or a TokenCredential type. Required. - :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.TokenCredential - :return: The appropriate synchronous client associated with the given endpoint - :rtype: ~azure.ai.inference.ChatCompletionsClient or ~azure.ai.inference.EmbeddingsClient - or ~azure.ai.inference.ImageEmbeddingsClient - :raises ~azure.core.exceptions.HttpResponseError: - """ - - with ChatCompletionsClient( - endpoint, credential, **kwargs - ) as client: # Pick any of the clients, it does not matter. - model_info = client.get_model_info() # type: ignore - - _LOGGER.info("model_info=%s", model_info) - if not model_info.model_type: - raise ValueError( - "The AI model information is missing a value for `model type`. Cannot create an appropriate client." 
- ) - - # TODO: Remove "completions", "chat-comletions" and "embedding" once Mistral Large and Cohere fixes their model type - if model_info.model_type in ( - _models.ModelType.CHAT_COMPLETION, - "chat_completions", - "chat", - "completion", - "chat-completion", - "chat-completions", - "chat completion", - "chat completions", - ): - chat_completion_client = ChatCompletionsClient(endpoint, credential, **kwargs) - chat_completion_client._model_info = ( # pylint: disable=protected-access,attribute-defined-outside-init - model_info - ) - return chat_completion_client - - if model_info.model_type in ( - _models.ModelType.EMBEDDINGS, - "embedding", - "text_embedding", - "text-embeddings", - "text embedding", - "text embeddings", - ): - embedding_client = EmbeddingsClient(endpoint, credential, **kwargs) - embedding_client._model_info = model_info # pylint: disable=protected-access,attribute-defined-outside-init - return embedding_client - - if model_info.model_type in ( - _models.ModelType.IMAGE_EMBEDDINGS, - "image_embedding", - "image-embeddings", - "image-embedding", - "image embedding", - "image embeddings", - ): - image_embedding_client = ImageEmbeddingsClient(endpoint, credential, **kwargs) - image_embedding_client._model_info = ( # pylint: disable=protected-access,attribute-defined-outside-init - model_info - ) - return image_embedding_client - - raise ValueError(f"No client available to support AI model type `{model_info.model_type}`") - - -class ChatCompletionsClient(ChatCompletionsClientGenerated): # pylint: disable=too-many-instance-attributes - """ChatCompletionsClient. - - :param endpoint: Service host. Required. - :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is either a - AzureKeyCredential type or a TokenCredential type. Required. - :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.TokenCredential - :keyword frequency_penalty: A value that influences the probability of generated tokens - appearing based on their cumulative frequency in generated text. - Positive values will make tokens less likely to appear as their frequency increases and - decrease the likelihood of the model repeating the same statements verbatim. - Supported range is [-2, 2]. - Default value is None. - :paramtype frequency_penalty: float - :keyword presence_penalty: A value that influences the probability of generated tokens - appearing based on their existing - presence in generated text. - Positive values will make tokens less likely to appear when they already exist and increase - the model's likelihood to output new topics. - Supported range is [-2, 2]. - Default value is None. - :paramtype presence_penalty: float - :keyword temperature: The sampling temperature to use that controls the apparent creativity of - generated completions. - Higher values will make output more random while lower values will make results more focused - and deterministic. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype temperature: float - :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value - causes the - model to consider the results of tokens with the provided probability mass. As an example, a - value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be - considered. 
- It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype top_p: float - :keyword max_tokens: The maximum number of tokens to generate. Default value is None. - :paramtype max_tokens: int - :keyword response_format: The format that the AI model must output. AI chat completions models typically output - unformatted text by default. This is equivalent to setting "text" as the response_format. - To output JSON format, without adhering to any schema, set to "json_object". - To output JSON format adhering to a provided schema, set this to an object of the class - ~azure.ai.inference.models.JsonSchemaFormat. Default value is None. - :paramtype response_format: Union[Literal['text', 'json_object'], ~azure.ai.inference.models.JsonSchemaFormat] - :keyword stop: A collection of textual sequences that will end completions generation. Default - value is None. - :paramtype stop: list[str] - :keyword tools: The available tool definitions that the chat completions request can use, - including caller-defined functions. Default value is None. - :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] - :keyword tool_choice: If specified, the model will configure which of the provided tools it can - use for the chat completions response. Is either a Union[str, - "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type. - Default value is None. - :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or - ~azure.ai.inference.models.ChatCompletionsNamedToolChoice - :keyword seed: If specified, the system will make a best effort to sample deterministically - such that repeated requests with the - same seed and parameters should return the same result. Determinism is not guaranteed. - Default value is None. - :paramtype seed: int - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :keyword api_version: The API version to use for this operation. Default value is - "2024-05-01-preview". Note that overriding this default value may result in unsupported - behavior. 
- :paramtype api_version: str - """ - - def __init__( - self, - endpoint: str, - credential: Union[AzureKeyCredential, "TokenCredential"], - *, - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> None: - - self._model_info: Optional[_models.ModelInfo] = None - - # Store default chat completions settings, to be applied in all future service calls - # unless overridden by arguments in the `complete` method. - self._frequency_penalty = frequency_penalty - self._presence_penalty = presence_penalty - self._temperature = temperature - self._top_p = top_p - self._max_tokens = max_tokens - self._internal_response_format = _get_internal_response_format(response_format) - self._stop = stop - self._tools = tools - self._tool_choice = tool_choice - self._seed = seed - self._model = model - self._model_extras = model_extras - - # For Key auth, we need to send these two auth HTTP request headers simultaneously: - # 1. "Authorization: Bearer " - # 2. "api-key: " - # This is because Serverless API, Managed Compute and GitHub endpoints support the first header, - # and Azure OpenAI and the new Unified Inference endpoints support the second header. - # The first header will be taken care of by auto-generated code. - # The second one is added here. - if isinstance(credential, AzureKeyCredential): - headers = kwargs.pop("headers", {}) - if "api-key" not in headers: - headers["api-key"] = credential.key - kwargs["headers"] = headers - - super().__init__(endpoint, credential, **kwargs) - - @overload - def complete( - self, - *, - messages: Union[List[_models.ChatRequestMessage], List[Dict[str, Any]]], - stream: Literal[False] = False, - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.ChatCompletions: ... 
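
The removed `__init__` above stores keyword arguments such as `temperature` and `max_tokens` as sticky defaults that later `complete` calls apply unless overridden per request, and the overloads that follow cover the non-streaming and streaming cases. A minimal usage sketch of that pattern, inferred from the docstrings in this patch (the endpoint/key environment variables and the `choices[0].message.content` access are illustrative assumptions, not taken from this diff):

import os

from azure.core.credentials import AzureKeyCredential
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage

# Sticky defaults (temperature, max_tokens) are set once on the client.
client = ChatCompletionsClient(
    endpoint=os.environ["AZURE_AI_CHAT_ENDPOINT"],  # placeholder env var
    credential=AzureKeyCredential(os.environ["AZURE_AI_CHAT_KEY"]),  # placeholder env var
    temperature=0.7,
    max_tokens=256,
)

# The defaults above apply to this non-streaming call.
response = client.complete(
    messages=[
        SystemMessage(content="You are a helpful assistant."),
        UserMessage(content="How many feet are in a mile?"),
    ]
)
print(response.choices[0].message.content)

# Any default can be overridden per request.
response = client.complete(
    messages=[UserMessage(content="Answer again, as a single number.")],
    temperature=0.0,
)
print(response.choices[0].message.content)
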
- - @overload - def complete( - self, - *, - messages: Union[List[_models.ChatRequestMessage], List[Dict[str, Any]]], - stream: Literal[True], - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> Iterable[_models.StreamingChatCompletionsUpdate]: ... - - @overload - def complete( - self, - *, - messages: Union[List[_models.ChatRequestMessage], List[Dict[str, Any]]], - stream: Optional[bool] = None, - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> Union[Iterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]: - # pylint: disable=line-too-long - """Gets chat completions for the provided chat messages. - Completions support a wide variety of tasks and generate text that continues from or - "completes" provided prompt data. The method makes a REST API call to the `/chat/completions` route - on the given endpoint. - When using this method with `stream=True`, the response is streamed - back to the client. Iterate over the resulting StreamingChatCompletions - object to get content updates as they arrive. By default, the response is a ChatCompletions object - (non-streaming). - - :keyword messages: The collection of context messages associated with this chat completions - request. - Typical usage begins with a chat message for the System role that provides instructions for - the behavior of the assistant, followed by alternating messages between the User and - Assistant roles. Required. - :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage] or list[dict[str, Any]] - :keyword stream: A value indicating whether chat completions should be streamed for this request. - Default value is False. If streaming is enabled, the response will be a StreamingChatCompletions. - Otherwise the response will be a ChatCompletions. - :paramtype stream: bool - :keyword frequency_penalty: A value that influences the probability of generated tokens - appearing based on their cumulative frequency in generated text. - Positive values will make tokens less likely to appear as their frequency increases and - decrease the likelihood of the model repeating the same statements verbatim. - Supported range is [-2, 2]. - Default value is None. 
- :paramtype frequency_penalty: float - :keyword presence_penalty: A value that influences the probability of generated tokens - appearing based on their existing - presence in generated text. - Positive values will make tokens less likely to appear when they already exist and increase - the model's likelihood to output new topics. - Supported range is [-2, 2]. - Default value is None. - :paramtype presence_penalty: float - :keyword temperature: The sampling temperature to use that controls the apparent creativity of - generated completions. - Higher values will make output more random while lower values will make results more focused - and deterministic. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype temperature: float - :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value - causes the - model to consider the results of tokens with the provided probability mass. As an example, a - value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be - considered. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype top_p: float - :keyword max_tokens: The maximum number of tokens to generate. Default value is None. - :paramtype max_tokens: int - :keyword response_format: The format that the AI model must output. AI chat completions models typically output - unformatted text by default. This is equivalent to setting "text" as the response_format. - To output JSON format, without adhering to any schema, set to "json_object". - To output JSON format adhering to a provided schema, set this to an object of the class - ~azure.ai.inference.models.JsonSchemaFormat. Default value is None. - :paramtype response_format: Union[Literal['text', 'json_object'], ~azure.ai.inference.models.JsonSchemaFormat] - :keyword stop: A collection of textual sequences that will end completions generation. Default - value is None. - :paramtype stop: list[str] - :keyword tools: The available tool definitions that the chat completions request can use, - including caller-defined functions. Default value is None. - :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] - :keyword tool_choice: If specified, the model will configure which of the provided tools it can - use for the chat completions response. Is either a Union[str, - "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type. - Default value is None. - :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or - ~azure.ai.inference.models.ChatCompletionsNamedToolChoice - :keyword seed: If specified, the system will make a best effort to sample deterministically - such that repeated requests with the - same seed and parameters should return the same result. Determinism is not guaranteed. - Default value is None. - :paramtype seed: int - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. 
- How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: ChatCompletions for non-streaming, or Iterable[StreamingChatCompletionsUpdate] for streaming. - :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.StreamingChatCompletions - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - def complete( - self, - body: JSON, - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> Union[Iterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]: - # pylint: disable=line-too-long - """Gets chat completions for the provided chat messages. - Completions support a wide variety of tasks and generate text that continues from or - "completes" provided prompt data. - - :param body: An object of type MutableMapping[str, Any], such as a dictionary, that - specifies the full request payload. Required. - :type body: JSON - :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. - Default value is "application/json". - :paramtype content_type: str - :return: ChatCompletions for non-streaming, or Iterable[StreamingChatCompletionsUpdate] for streaming. - :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.StreamingChatCompletions - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - def complete( - self, - body: IO[bytes], - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> Union[Iterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]: - # pylint: disable=line-too-long - # pylint: disable=too-many-locals - """Gets chat completions for the provided chat messages. - Completions support a wide variety of tasks and generate text that continues from or - "completes" provided prompt data. - - :param body: Specifies the full request payload. Required. - :type body: IO[bytes] - :keyword content_type: Body Parameter content-type. Content type parameter for binary body. - Default value is "application/json". - :paramtype content_type: str - :return: ChatCompletions for non-streaming, or Iterable[StreamingChatCompletionsUpdate] for streaming. 
- :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.StreamingChatCompletions - :raises ~azure.core.exceptions.HttpResponseError: - """ - - # pylint:disable=client-method-missing-tracing-decorator - def complete( - self, - body: Union[JSON, IO[bytes]] = _Unset, - *, - messages: Union[List[_models.ChatRequestMessage], List[Dict[str, Any]]] = _Unset, - stream: Optional[bool] = None, - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> Union[Iterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]: - # pylint: disable=line-too-long - # pylint: disable=too-many-locals - """Gets chat completions for the provided chat messages. - Completions support a wide variety of tasks and generate text that continues from or - "completes" provided prompt data. When using this method with `stream=True`, the response is streamed - back to the client. Iterate over the resulting :class:`~azure.ai.inference.models.StreamingChatCompletions` - object to get content updates as they arrive. - - :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type - that specifies the full request payload. Required. - :type body: JSON or IO[bytes] - :keyword messages: The collection of context messages associated with this chat completions - request. - Typical usage begins with a chat message for the System role that provides instructions for - the behavior of the assistant, followed by alternating messages between the User and - Assistant roles. Required. - :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage] or list[dict[str, Any]] - :keyword stream: A value indicating whether chat completions should be streamed for this request. - Default value is False. If streaming is enabled, the response will be a StreamingChatCompletions. - Otherwise the response will be a ChatCompletions. - :paramtype stream: bool - :keyword frequency_penalty: A value that influences the probability of generated tokens - appearing based on their cumulative frequency in generated text. - Positive values will make tokens less likely to appear as their frequency increases and - decrease the likelihood of the model repeating the same statements verbatim. - Supported range is [-2, 2]. - Default value is None. - :paramtype frequency_penalty: float - :keyword presence_penalty: A value that influences the probability of generated tokens - appearing based on their existing - presence in generated text. - Positive values will make tokens less likely to appear when they already exist and increase - the model's likelihood to output new topics. - Supported range is [-2, 2]. - Default value is None. - :paramtype presence_penalty: float - :keyword temperature: The sampling temperature to use that controls the apparent creativity of - generated completions. 
- Higher values will make output more random while lower values will make results more focused - and deterministic. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype temperature: float - :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value - causes the - model to consider the results of tokens with the provided probability mass. As an example, a - value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be - considered. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype top_p: float - :keyword max_tokens: The maximum number of tokens to generate. Default value is None. - :paramtype max_tokens: int - :keyword response_format: The format that the AI model must output. AI chat completions models typically output - unformatted text by default. This is equivalent to setting "text" as the response_format. - To output JSON format, without adhering to any schema, set to "json_object". - To output JSON format adhering to a provided schema, set this to an object of the class - ~azure.ai.inference.models.JsonSchemaFormat. Default value is None. - :paramtype response_format: Union[Literal['text', 'json_object'], ~azure.ai.inference.models.JsonSchemaFormat] - :keyword stop: A collection of textual sequences that will end completions generation. Default - value is None. - :paramtype stop: list[str] - :keyword tools: The available tool definitions that the chat completions request can use, - including caller-defined functions. Default value is None. - :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] - :keyword tool_choice: If specified, the model will configure which of the provided tools it can - use for the chat completions response. Is either a Union[str, - "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type. - Default value is None. - :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or - ~azure.ai.inference.models.ChatCompletionsNamedToolChoice - :keyword seed: If specified, the system will make a best effort to sample deterministically - such that repeated requests with the - same seed and parameters should return the same result. Determinism is not guaranteed. - Default value is None. - :paramtype seed: int - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: ChatCompletions for non-streaming, or Iterable[StreamingChatCompletionsUpdate] for streaming. 
- :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.StreamingChatCompletions - :raises ~azure.core.exceptions.HttpResponseError: - """ - error_map = { - 401: ClientAuthenticationError, - 404: ResourceNotFoundError, - 409: ResourceExistsError, - 304: ResourceNotModifiedError, - } - error_map.update(kwargs.pop("error_map", {}) or {}) - - _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) - _params = kwargs.pop("params", {}) or {} - _extra_parameters: Union[_models._enums.ExtraParameters, None] = None - - content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) - - internal_response_format = _get_internal_response_format(response_format) - - if body is _Unset: - if messages is _Unset: - raise TypeError("missing required argument: messages") - body = { - "messages": messages, - "stream": stream, - "frequency_penalty": frequency_penalty if frequency_penalty is not None else self._frequency_penalty, - "max_tokens": max_tokens if max_tokens is not None else self._max_tokens, - "model": model if model is not None else self._model, - "presence_penalty": presence_penalty if presence_penalty is not None else self._presence_penalty, - "response_format": ( - internal_response_format if internal_response_format is not None else self._internal_response_format - ), - "seed": seed if seed is not None else self._seed, - "stop": stop if stop is not None else self._stop, - "temperature": temperature if temperature is not None else self._temperature, - "tool_choice": tool_choice if tool_choice is not None else self._tool_choice, - "tools": tools if tools is not None else self._tools, - "top_p": top_p if top_p is not None else self._top_p, - } - if model_extras is not None and bool(model_extras): - body.update(model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - elif self._model_extras is not None and bool(self._model_extras): - body.update(self._model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - body = {k: v for k, v in body.items() if v is not None} - elif isinstance(body, dict) and "stream" in body and isinstance(body["stream"], bool): - stream = body["stream"] - content_type = content_type or "application/json" - _content = None - if isinstance(body, (IOBase, bytes)): - _content = body - else: - _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore - - _request = build_chat_completions_complete_request( - extra_params=_extra_parameters, - content_type=content_type, - api_version=self._config.api_version, - content=_content, - headers=_headers, - params=_params, - ) - path_format_arguments = { - "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), - } - _request.url = self._client.format_url(_request.url, **path_format_arguments) - - _stream = stream or False - pipeline_response: PipelineResponse = self._client._pipeline.run( # pylint: disable=protected-access - _request, stream=_stream, **kwargs - ) - - response = pipeline_response.http_response - - if response.status_code not in [200]: - if _stream: - response.read() # Load the body in memory and close the socket - map_error(status_code=response.status_code, response=response, error_map=error_map) - raise HttpResponseError(response=response) - - if _stream: - return _models.StreamingChatCompletions(response) - - return _deserialize(_models._patch.ChatCompletions, 
response.json()) # pylint: disable=protected-access - - @distributed_trace - def get_model_info(self, **kwargs: Any) -> _models.ModelInfo: - # pylint: disable=line-too-long - """Returns information about the AI model. - The method makes a REST API call to the ``/info`` route on the given endpoint. - This method will only work when using Serverless API or Managed Compute endpoint. - It will not work for GitHub Models endpoint or Azure OpenAI endpoint. - - :return: ModelInfo. The ModelInfo is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.ModelInfo - :raises ~azure.core.exceptions.HttpResponseError: - """ - if not self._model_info: - self._model_info = self._get_model_info(**kwargs) # pylint: disable=attribute-defined-outside-init - return self._model_info - - def __str__(self) -> str: - # pylint: disable=client-method-name-no-double-underscore - return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__() - - -class EmbeddingsClient(EmbeddingsClientGenerated): - """EmbeddingsClient. - - :param endpoint: Service host. Required. - :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is either a - AzureKeyCredential type or a TokenCredential type. Required. - :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.TokenCredential - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :keyword api_version: The API version to use for this operation. Default value is - "2024-05-01-preview". Note that overriding this default value may result in unsupported - behavior. - :paramtype api_version: str - """ - - def __init__( - self, - endpoint: str, - credential: Union[AzureKeyCredential, "TokenCredential"], - *, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> None: - - self._model_info: Optional[_models.ModelInfo] = None - - # Store default embeddings settings, to be applied in all future service calls - # unless overridden by arguments in the `embed` method. 
- self._dimensions = dimensions - self._encoding_format = encoding_format - self._input_type = input_type - self._model = model - self._model_extras = model_extras - - # For Key auth, we need to send these two auth HTTP request headers simultaneously: - # 1. "Authorization: Bearer " - # 2. "api-key: " - # This is because Serverless API, Managed Compute and GitHub endpoints support the first header, - # and Azure OpenAI and the new Unified Inference endpoints support the second header. - # The first header will be taken care of by auto-generated code. - # The second one is added here. - if isinstance(credential, AzureKeyCredential): - headers = kwargs.pop("headers", {}) - if "api-key" not in headers: - headers["api-key"] = credential.key - kwargs["headers"] = headers - - super().__init__(endpoint, credential, **kwargs) - - @overload - def embed( - self, - *, - input: List[str], - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given text prompts. - The method makes a REST API call to the `/embeddings` route on the given endpoint. - - :keyword input: Input text to embed, encoded as a string or array of tokens. - To embed multiple inputs in a single request, pass an array - of strings or array of token arrays. Required. - :paramtype input: list[str] - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - def embed( - self, - body: JSON, - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given text prompts. - The method makes a REST API call to the `/embeddings` route on the given endpoint. - - :param body: An object of type MutableMapping[str, Any], such as a dictionary, that - specifies the full request payload. Required. - :type body: JSON - :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. - Default value is "application/json". 
- :paramtype content_type: str - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - def embed( - self, - body: IO[bytes], - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given text prompts. - The method makes a REST API call to the `/embeddings` route on the given endpoint. - - :param body: Specifies the full request payload. Required. - :type body: IO[bytes] - :keyword content_type: Body Parameter content-type. Content type parameter for binary body. - Default value is "application/json". - :paramtype content_type: str - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @distributed_trace - def embed( - self, - body: Union[JSON, IO[bytes]] = _Unset, - *, - input: List[str] = _Unset, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.EmbeddingsResult: - # pylint: disable=line-too-long - """Return the embedding vectors for given text prompts. - The method makes a REST API call to the `/embeddings` route on the given endpoint. - - :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type - that specifies the full request payload. Required. - :type body: JSON or IO[bytes] - :keyword input: Input text to embed, encoded as a string or array of tokens. - To embed multiple inputs in a single request, pass an array - of strings or array of token arrays. Required. - :paramtype input: list[str] - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: EmbeddingsResult. 
The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - error_map: MutableMapping[int, Type[HttpResponseError]] = { - 401: ClientAuthenticationError, - 404: ResourceNotFoundError, - 409: ResourceExistsError, - 304: ResourceNotModifiedError, - } - error_map.update(kwargs.pop("error_map", {}) or {}) - - _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) - _params = kwargs.pop("params", {}) or {} - _extra_parameters: Union[_models._enums.ExtraParameters, None] = None - - content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) - - if body is _Unset: - if input is _Unset: - raise TypeError("missing required argument: input") - body = { - "input": input, - "dimensions": dimensions if dimensions is not None else self._dimensions, - "encoding_format": encoding_format if encoding_format is not None else self._encoding_format, - "input_type": input_type if input_type is not None else self._input_type, - "model": model if model is not None else self._model, - } - if model_extras is not None and bool(model_extras): - body.update(model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - elif self._model_extras is not None and bool(self._model_extras): - body.update(self._model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - body = {k: v for k, v in body.items() if v is not None} - content_type = content_type or "application/json" - _content = None - if isinstance(body, (IOBase, bytes)): - _content = body - else: - _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore - - _request = build_embeddings_embed_request( - extra_params=_extra_parameters, - content_type=content_type, - api_version=self._config.api_version, - content=_content, - headers=_headers, - params=_params, - ) - path_format_arguments = { - "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), - } - _request.url = self._client.format_url(_request.url, **path_format_arguments) - - _stream = kwargs.pop("stream", False) - pipeline_response: PipelineResponse = self._client._pipeline.run( # pylint: disable=protected-access - _request, stream=_stream, **kwargs - ) - - response = pipeline_response.http_response - - if response.status_code not in [200]: - if _stream: - response.read() # Load the body in memory and close the socket - map_error(status_code=response.status_code, response=response, error_map=error_map) - raise HttpResponseError(response=response) - - if _stream: - deserialized = response.iter_bytes() - else: - deserialized = _deserialize( - _models._patch.EmbeddingsResult, response.json() # pylint: disable=protected-access - ) - - return deserialized # type: ignore - - @distributed_trace - def get_model_info(self, **kwargs: Any) -> _models.ModelInfo: - # pylint: disable=line-too-long - """Returns information about the AI model. - The method makes a REST API call to the ``/info`` route on the given endpoint. - This method will only work when using Serverless API or Managed Compute endpoint. - It will not work for GitHub Models endpoint or Azure OpenAI endpoint. - - :return: ModelInfo. 
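(Illustrative only, not part of the patch: a minimal usage sketch of the synchronous EmbeddingsClient.embed surface whose hand-written wrapper is removed above. The endpoint URL and key are placeholders, and the result fields `data`, `index` and `embedding` are assumed from the published azure-ai-inference models rather than from this diff.)

from azure.ai.inference import EmbeddingsClient
from azure.core.credentials import AzureKeyCredential

client = EmbeddingsClient(
    endpoint="https://<your-endpoint>.models.ai.azure.com",  # placeholder
    credential=AzureKeyCredential("<your-api-key>"),         # placeholder
    dimensions=256,  # client-level default, used when embed() does not override it
)

# Embed several strings in one call; per-call keywords override the client defaults.
result = client.embed(input=["first phrase", "second phrase"])
for item in result.data:
    print(item.index, len(item.embedding))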
The ModelInfo is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.ModelInfo - :raises ~azure.core.exceptions.HttpResponseError: - """ - if not self._model_info: - self._model_info = self._get_model_info(**kwargs) # pylint: disable=attribute-defined-outside-init - return self._model_info - - def __str__(self) -> str: - # pylint: disable=client-method-name-no-double-underscore - return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__() - - -class ImageEmbeddingsClient(ImageEmbeddingsClientGenerated): - """ImageEmbeddingsClient. - - :param endpoint: Service host. Required. - :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is either a - AzureKeyCredential type or a TokenCredential type. Required. - :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.TokenCredential - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :keyword api_version: The API version to use for this operation. Default value is - "2024-05-01-preview". Note that overriding this default value may result in unsupported - behavior. - :paramtype api_version: str - """ - - def __init__( - self, - endpoint: str, - credential: Union[AzureKeyCredential, "TokenCredential"], - *, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> None: - - self._model_info: Optional[_models.ModelInfo] = None - - # Store default embeddings settings, to be applied in all future service calls - # unless overridden by arguments in the `embed` method. - self._dimensions = dimensions - self._encoding_format = encoding_format - self._input_type = input_type - self._model = model - self._model_extras = model_extras - - # For Key auth, we need to send these two auth HTTP request headers simultaneously: - # 1. "Authorization: Bearer " - # 2. "api-key: " - # This is because Serverless API, Managed Compute and GitHub endpoints support the first header, - # and Azure OpenAI and the new Unified Inference endpoints support the second header. - # The first header will be taken care of by auto-generated code. - # The second one is added here. 
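(Illustrative only, not part of the patch: the removed constructors accept either a key credential or an Entra ID token credential, as the comment above describes. With a key credential the wrapper adds an "api-key" header next to the generated "Authorization: Bearer" header. DefaultAzureCredential comes from the separate azure-identity package, and the endpoint values are placeholders; exact token scope configuration depends on the target endpoint.)

from azure.ai.inference import ImageEmbeddingsClient
from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential

# Key auth: both the bearer header and the "api-key" header are sent.
key_client = ImageEmbeddingsClient(
    endpoint="https://<your-endpoint>.models.ai.azure.com",  # placeholder
    credential=AzureKeyCredential("<your-api-key>"),         # placeholder
)

# Entra ID auth: only the bearer token header is sent.
aad_client = ImageEmbeddingsClient(
    endpoint="https://<your-endpoint>.models.ai.azure.com",  # placeholder
    credential=DefaultAzureCredential(),
)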
- if isinstance(credential, AzureKeyCredential): - headers = kwargs.pop("headers", {}) - if "api-key" not in headers: - headers["api-key"] = credential.key - kwargs["headers"] = headers - - super().__init__(endpoint, credential, **kwargs) - - @overload - def embed( - self, - *, - input: List[_models.ImageEmbeddingInput], - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given images. - The method makes a REST API call to the `/images/embeddings` route on the given endpoint. - - :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an - array. - The input must not exceed the max input tokens for the model. Required. - :paramtype input: list[~azure.ai.inference.models.ImageEmbeddingInput] - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - def embed( - self, - body: JSON, - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given images. - The method makes a REST API call to the `/images/embeddings` route on the given endpoint. - - :param body: An object of type MutableMapping[str, Any], such as a dictionary, that - specifies the full request payload. Required. - :type body: JSON - :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. - Default value is "application/json". - :paramtype content_type: str - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - def embed( - self, - body: IO[bytes], - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given images. - The method makes a REST API call to the `/images/embeddings` route on the given endpoint. - - :param body: Specifies the full request payload. 
Required. - :type body: IO[bytes] - :keyword content_type: Body Parameter content-type. Content type parameter for binary body. - Default value is "application/json". - :paramtype content_type: str - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @distributed_trace - def embed( - self, - body: Union[JSON, IO[bytes]] = _Unset, - *, - input: List[_models.ImageEmbeddingInput] = _Unset, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.EmbeddingsResult: - # pylint: disable=line-too-long - """Return the embedding vectors for given images. - The method makes a REST API call to the `/images/embeddings` route on the given endpoint. - - :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type - that specifies the full request payload. Required. - :type body: JSON or IO[bytes] - :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an - array. - The input must not exceed the max input tokens for the model. Required. - :paramtype input: list[~azure.ai.inference.models.ImageEmbeddingInput] - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: EmbeddingsResult. 
The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - error_map: MutableMapping[int, Type[HttpResponseError]] = { - 401: ClientAuthenticationError, - 404: ResourceNotFoundError, - 409: ResourceExistsError, - 304: ResourceNotModifiedError, - } - error_map.update(kwargs.pop("error_map", {}) or {}) - - _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) - _params = kwargs.pop("params", {}) or {} - _extra_parameters: Union[_models._enums.ExtraParameters, None] = None - - content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) - - if body is _Unset: - if input is _Unset: - raise TypeError("missing required argument: input") - body = { - "input": input, - "dimensions": dimensions if dimensions is not None else self._dimensions, - "encoding_format": encoding_format if encoding_format is not None else self._encoding_format, - "input_type": input_type if input_type is not None else self._input_type, - "model": model if model is not None else self._model, - } - if model_extras is not None and bool(model_extras): - body.update(model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - elif self._model_extras is not None and bool(self._model_extras): - body.update(self._model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - body = {k: v for k, v in body.items() if v is not None} - content_type = content_type or "application/json" - _content = None - if isinstance(body, (IOBase, bytes)): - _content = body - else: - _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore - - _request = build_image_embeddings_embed_request( - extra_params=_extra_parameters, - content_type=content_type, - api_version=self._config.api_version, - content=_content, - headers=_headers, - params=_params, - ) - path_format_arguments = { - "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), - } - _request.url = self._client.format_url(_request.url, **path_format_arguments) - - _stream = kwargs.pop("stream", False) - pipeline_response: PipelineResponse = self._client._pipeline.run( # pylint: disable=protected-access - _request, stream=_stream, **kwargs - ) - - response = pipeline_response.http_response - - if response.status_code not in [200]: - if _stream: - response.read() # Load the body in memory and close the socket - map_error(status_code=response.status_code, response=response, error_map=error_map) - raise HttpResponseError(response=response) - - if _stream: - deserialized = response.iter_bytes() - else: - deserialized = _deserialize( - _models._patch.EmbeddingsResult, response.json() # pylint: disable=protected-access - ) - - return deserialized # type: ignore - - @distributed_trace - def get_model_info(self, **kwargs: Any) -> _models.ModelInfo: - # pylint: disable=line-too-long - """Returns information about the AI model. - The method makes a REST API call to the ``/info`` route on the given endpoint. - This method will only work when using Serverless API or Managed Compute endpoint. - It will not work for GitHub Models endpoint or Azure OpenAI endpoint. - - :return: ModelInfo. 
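(Illustrative only, not part of the patch: a sketch of the image-embeddings call removed above. ImageEmbeddingInput is assumed to take a base64 data-URL string in its `image` field, as its docstring in the published package describes; the endpoint, key and base64 payload are placeholders.)

from azure.ai.inference import ImageEmbeddingsClient
from azure.ai.inference.models import ImageEmbeddingInput
from azure.core.credentials import AzureKeyCredential

client = ImageEmbeddingsClient(
    endpoint="https://<your-endpoint>.models.ai.azure.com",  # placeholder
    credential=AzureKeyCredential("<your-api-key>"),         # placeholder
)

# The image is passed as a base64 data URL; the string below is a placeholder.
image_input = ImageEmbeddingInput(image="data:image/png;base64,<base64-bytes>")
result = client.embed(input=[image_input])
print(len(result.data))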
The ModelInfo is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.ModelInfo - :raises ~azure.core.exceptions.HttpResponseError: - """ - if not self._model_info: - self._model_info = self._get_model_info(**kwargs) # pylint: disable=attribute-defined-outside-init - return self._model_info - - def __str__(self) -> str: - # pylint: disable=client-method-name-no-double-underscore - return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__() - +from typing import List -__all__: List[str] = [ - "load_client", - "ChatCompletionsClient", - "EmbeddingsClient", - "ImageEmbeddingsClient", -] # Add all objects you want publicly available to users at this package level +__all__: List[str] = [] # Add all objects you want publicly available to users at this package level def patch_sdk(): diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_serialization.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_serialization.py index b24ab2885450..670738f0789c 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/_serialization.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_serialization.py @@ -185,73 +185,7 @@ def deserialize_from_http_generics(cls, body_bytes: Optional[Union[AnyStr, IO]], except NameError: _long_type = int - -class UTC(datetime.tzinfo): - """Time Zone info for handling UTC""" - - def utcoffset(self, dt): - """UTF offset for UTC is 0. - - :param datetime.datetime dt: The datetime - :returns: The offset - :rtype: datetime.timedelta - """ - return datetime.timedelta(0) - - def tzname(self, dt): - """Timestamp representation. - - :param datetime.datetime dt: The datetime - :returns: The timestamp representation - :rtype: str - """ - return "Z" - - def dst(self, dt): - """No daylight saving for UTC. - - :param datetime.datetime dt: The datetime - :returns: The daylight saving time - :rtype: datetime.timedelta - """ - return datetime.timedelta(hours=1) - - -try: - from datetime import timezone as _FixedOffset # type: ignore -except ImportError: # Python 2.7 - - class _FixedOffset(datetime.tzinfo): # type: ignore - """Fixed offset in minutes east from UTC. - Copy/pasted from Python doc - :param datetime.timedelta offset: offset in timedelta format - """ - - def __init__(self, offset) -> None: - self.__offset = offset - - def utcoffset(self, dt): - return self.__offset - - def tzname(self, dt): - return str(self.__offset.total_seconds() / 3600) - - def __repr__(self): - return "".format(self.tzname(None)) - - def dst(self, dt): - return datetime.timedelta(0) - - def __getinitargs__(self): - return (self.__offset,) - - -try: - from datetime import timezone - - TZ_UTC = timezone.utc -except ImportError: - TZ_UTC = UTC() # type: ignore +TZ_UTC = datetime.timezone.utc _FLATTEN = re.compile(r"(?= (3, 9): - from collections.abc import MutableMapping -else: - from typing import MutableMapping # type: ignore # pylint: disable=ungrouped-imports - -JSON = MutableMapping[str, Any] # pylint: disable=unsubscriptable-object -_Unset: Any = object() -_LOGGER = logging.getLogger(__name__) - - -async def load_client( - endpoint: str, credential: Union[AzureKeyCredential, "AsyncTokenCredential"], **kwargs: Any -) -> Union["ChatCompletionsClient", "EmbeddingsClient", "ImageEmbeddingsClient"]: - """ - Load a client from a given endpoint URL. The method makes a REST API call to the `/info` route - on the given endpoint, to determine the model type and therefore which client to instantiate. 
- Keyword arguments are passed to the appropriate client constructor, so if you need to set things like - `api_version`, `logging_enable`, `user_agent`, etc., you can do so here. - This method will only work when using Serverless API or Managed Compute endpoint. - It will not work for GitHub Models endpoint or Azure OpenAI endpoint. - - :param endpoint: Service host. Required. - :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is either a - AzureKeyCredential type or a AsyncTokenCredential type. Required. - :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials_async.AsyncTokenCredential - :return: The appropriate asynchronous client associated with the given endpoint - :rtype: ~azure.ai.inference.aio.ChatCompletionsClient or ~azure.ai.inference.aio.EmbeddingsClient - or ~azure.ai.inference.aio.ImageEmbeddingsClient - :raises ~azure.core.exceptions.HttpResponseError: - """ - - async with ChatCompletionsClient( - endpoint, credential, **kwargs - ) as client: # Pick any of the clients, it does not matter. - model_info = await client.get_model_info() # type: ignore - - _LOGGER.info("model_info=%s", model_info) - if not model_info.model_type: - raise ValueError( - "The AI model information is missing a value for `model type`. Cannot create an appropriate client." - ) - - # TODO: Remove "completions", "chat-comletions" and "embedding" once Mistral Large and Cohere fixes their model type - if model_info.model_type in ( - _models.ModelType.CHAT_COMPLETION, - "chat_completions", - "chat", - "completion", - "chat-completion", - "chat-completions", - "chat completion", - "chat completions", - ): - chat_completion_client = ChatCompletionsClient(endpoint, credential, **kwargs) - chat_completion_client._model_info = ( # pylint: disable=protected-access,attribute-defined-outside-init - model_info - ) - return chat_completion_client - - if model_info.model_type in ( - _models.ModelType.EMBEDDINGS, - "embedding", - "text_embedding", - "text-embeddings", - "text embedding", - "text embeddings", - ): - embedding_client = EmbeddingsClient(endpoint, credential, **kwargs) - embedding_client._model_info = model_info # pylint: disable=protected-access,attribute-defined-outside-init - return embedding_client - - if model_info.model_type in ( - _models.ModelType.IMAGE_EMBEDDINGS, - "image_embedding", - "image-embeddings", - "image-embedding", - "image embedding", - "image embeddings", - ): - image_embedding_client = ImageEmbeddingsClient(endpoint, credential, **kwargs) - image_embedding_client._model_info = ( # pylint: disable=protected-access,attribute-defined-outside-init - model_info - ) - return image_embedding_client - - raise ValueError(f"No client available to support AI model type `{model_info.model_type}`") - - -class ChatCompletionsClient(ChatCompletionsClientGenerated): # pylint: disable=too-many-instance-attributes - """ChatCompletionsClient. - - :param endpoint: Service host. Required. - :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is either a - AzureKeyCredential type or a AsyncTokenCredential type. Required. - :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials_async.AsyncTokenCredential - :keyword frequency_penalty: A value that influences the probability of generated tokens - appearing based on their cumulative frequency in generated text. 
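(Illustrative only, not part of the patch: a sketch of the asynchronous load_client helper removed above, which calls the /info route and returns whichever client matches the reported model type. It assumes the helper is importable from azure.ai.inference.aio, as in the previously published package; endpoint and key are placeholders.)

import asyncio
from azure.ai.inference.aio import load_client
from azure.core.credentials import AzureKeyCredential

async def main():
    client = await load_client(
        endpoint="https://<your-endpoint>.models.ai.azure.com",  # placeholder
        credential=AzureKeyCredential("<your-api-key>"),         # placeholder
    )
    # `client` is a ChatCompletionsClient, EmbeddingsClient or ImageEmbeddingsClient,
    # chosen from model_info.model_type as in the code above.
    async with client:
        print(type(client).__name__)

asyncio.run(main())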
- Positive values will make tokens less likely to appear as their frequency increases and - decrease the likelihood of the model repeating the same statements verbatim. - Supported range is [-2, 2]. - Default value is None. - :paramtype frequency_penalty: float - :keyword presence_penalty: A value that influences the probability of generated tokens - appearing based on their existing - presence in generated text. - Positive values will make tokens less likely to appear when they already exist and increase - the model's likelihood to output new topics. - Supported range is [-2, 2]. - Default value is None. - :paramtype presence_penalty: float - :keyword temperature: The sampling temperature to use that controls the apparent creativity of - generated completions. - Higher values will make output more random while lower values will make results more focused - and deterministic. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype temperature: float - :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value - causes the - model to consider the results of tokens with the provided probability mass. As an example, a - value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be - considered. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype top_p: float - :keyword max_tokens: The maximum number of tokens to generate. Default value is None. - :paramtype max_tokens: int - :keyword response_format: The format that the AI model must output. AI chat completions models typically output - unformatted text by default. This is equivalent to setting "text" as the response_format. - To output JSON format, without adhering to any schema, set to "json_object". - To output JSON format adhering to a provided schema, set this to an object of the class - ~azure.ai.inference.models.JsonSchemaFormat. Default value is None. - :paramtype response_format: Union[Literal['text', 'json_object'], ~azure.ai.inference.models.JsonSchemaFormat] - :keyword stop: A collection of textual sequences that will end completions generation. Default - value is None. - :paramtype stop: list[str] - :keyword tools: The available tool definitions that the chat completions request can use, - including caller-defined functions. Default value is None. - :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] - :keyword tool_choice: If specified, the model will configure which of the provided tools it can - use for the chat completions response. Is either a Union[str, - "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type. - Default value is None. - :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or - ~azure.ai.inference.models.ChatCompletionsNamedToolChoice - :keyword seed: If specified, the system will make a best effort to sample deterministically - such that repeated requests with the - same seed and parameters should return the same result. Determinism is not guaranteed. - Default value is None. - :paramtype seed: int - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. 
Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :keyword api_version: The API version to use for this operation. Default value is - "2024-05-01-preview". Note that overriding this default value may result in unsupported - behavior. - :paramtype api_version: str - """ - - def __init__( - self, - endpoint: str, - credential: Union[AzureKeyCredential, "AsyncTokenCredential"], - *, - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> None: - - self._model_info: Optional[_models.ModelInfo] = None - - # Store default chat completions settings, to be applied in all future service calls - # unless overridden by arguments in the `complete` method. - self._frequency_penalty = frequency_penalty - self._presence_penalty = presence_penalty - self._temperature = temperature - self._top_p = top_p - self._max_tokens = max_tokens - self._internal_response_format = _get_internal_response_format(response_format) - self._stop = stop - self._tools = tools - self._tool_choice = tool_choice - self._seed = seed - self._model = model - self._model_extras = model_extras - - # For Key auth, we need to send these two auth HTTP request headers simultaneously: - # 1. "Authorization: Bearer " - # 2. "api-key: " - # This is because Serverless API, Managed Compute and GitHub endpoints support the first header, - # and Azure OpenAI and the new Unified Inference endpoints support the second header. - # The first header will be taken care of by auto-generated code. - # The second one is added here. - if isinstance(credential, AzureKeyCredential): - headers = kwargs.pop("headers", {}) - if "api-key" not in headers: - headers["api-key"] = credential.key - kwargs["headers"] = headers - - super().__init__(endpoint, credential, **kwargs) - - @overload - async def complete( - self, - *, - messages: List[_models.ChatRequestMessage], - stream: Literal[False] = False, - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.ChatCompletions: ... 
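(Illustrative only, not part of the patch: a sketch of the asynchronous ChatCompletionsClient with client-level defaults, which the removed __init__ stores and applies to later complete() calls unless overridden. SystemMessage and UserMessage are assumed from the package's models module; endpoint and key are placeholders.)

import asyncio
from azure.ai.inference.aio import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
from azure.core.credentials import AzureKeyCredential

async def main():
    async with ChatCompletionsClient(
        endpoint="https://<your-endpoint>.models.ai.azure.com",  # placeholder
        credential=AzureKeyCredential("<your-api-key>"),         # placeholder
        temperature=0.7,   # client-level defaults, used when complete() does not override them
        max_tokens=512,
    ) as client:
        response = await client.complete(
            messages=[
                SystemMessage(content="You are a helpful assistant."),
                UserMessage(content="How many feet are in a mile?"),
            ]
        )
        print(response.choices[0].message.content)

asyncio.run(main())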
- - @overload - async def complete( - self, - *, - messages: List[_models.ChatRequestMessage], - stream: Literal[True], - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> AsyncIterable[_models.StreamingChatCompletionsUpdate]: ... - - @overload - async def complete( - self, - *, - messages: List[_models.ChatRequestMessage], - stream: Optional[bool] = None, - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> Union[AsyncIterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]: - # pylint: disable=line-too-long - """Gets chat completions for the provided chat messages. - Completions support a wide variety of tasks and generate text that continues from or - "completes" provided prompt data. The method makes a REST API call to the `/chat/completions` route - on the given endpoint. - When using this method with `stream=True`, the response is streamed - back to the client. Iterate over the resulting StreamingChatCompletions - object to get content updates as they arrive. By default, the response is a ChatCompletions object - (non-streaming). - - :keyword messages: The collection of context messages associated with this chat completions - request. - Typical usage begins with a chat message for the System role that provides instructions for - the behavior of the assistant, followed by alternating messages between the User and - Assistant roles. Required. - :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage] - :keyword stream: A value indicating whether chat completions should be streamed for this request. - Default value is False. If streaming is enabled, the response will be a StreamingChatCompletions. - Otherwise the response will be a ChatCompletions. - :paramtype stream: bool - :keyword frequency_penalty: A value that influences the probability of generated tokens - appearing based on their cumulative frequency in generated text. - Positive values will make tokens less likely to appear as their frequency increases and - decrease the likelihood of the model repeating the same statements verbatim. - Supported range is [-2, 2]. - Default value is None. - :paramtype frequency_penalty: float - :keyword presence_penalty: A value that influences the probability of generated tokens - appearing based on their existing - presence in generated text. 
- Positive values will make tokens less likely to appear when they already exist and increase - the model's likelihood to output new topics. - Supported range is [-2, 2]. - Default value is None. - :paramtype presence_penalty: float - :keyword temperature: The sampling temperature to use that controls the apparent creativity of - generated completions. - Higher values will make output more random while lower values will make results more focused - and deterministic. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype temperature: float - :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value - causes the - model to consider the results of tokens with the provided probability mass. As an example, a - value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be - considered. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype top_p: float - :keyword max_tokens: The maximum number of tokens to generate. Default value is None. - :paramtype max_tokens: int - :keyword response_format: The format that the AI model must output. AI chat completions models typically output - unformatted text by default. This is equivalent to setting "text" as the response_format. - To output JSON format, without adhering to any schema, set to "json_object". - To output JSON format adhering to a provided schema, set this to an object of the class - ~azure.ai.inference.models.JsonSchemaFormat. Default value is None. - :paramtype response_format: Union[Literal['text', 'json_object'], ~azure.ai.inference.models.JsonSchemaFormat] - :keyword stop: A collection of textual sequences that will end completions generation. Default - value is None. - :paramtype stop: list[str] - :keyword tools: The available tool definitions that the chat completions request can use, - including caller-defined functions. Default value is None. - :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] - :keyword tool_choice: If specified, the model will configure which of the provided tools it can - use for the chat completions response. Is either a Union[str, - "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type. - Default value is None. - :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or - ~azure.ai.inference.models.ChatCompletionsNamedToolChoice - :keyword seed: If specified, the system will make a best effort to sample deterministically - such that repeated requests with the - same seed and parameters should return the same result. Determinism is not guaranteed. - Default value is None. - :paramtype seed: int - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. 
- :paramtype model_extras: dict[str, Any] - :return: ChatCompletions for non-streaming, or AsyncIterable[StreamingChatCompletionsUpdate] for streaming. - :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.AsyncStreamingChatCompletions - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - async def complete( - self, - body: JSON, - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> Union[AsyncIterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]: - # pylint: disable=line-too-long - """Gets chat completions for the provided chat messages. - Completions support a wide variety of tasks and generate text that continues from or - "completes" provided prompt data. - - :param body: An object of type MutableMapping[str, Any], such as a dictionary, that - specifies the full request payload. Required. - :type body: JSON - :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. - Default value is "application/json". - :paramtype content_type: str - :return: ChatCompletions for non-streaming, or AsyncIterable[StreamingChatCompletionsUpdate] for streaming. - :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.AsyncStreamingChatCompletions - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - async def complete( - self, - body: IO[bytes], - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> Union[AsyncIterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]: - # pylint: disable=line-too-long - """Gets chat completions for the provided chat messages. - Completions support a wide variety of tasks and generate text that continues from or - "completes" provided prompt data. - - :param body: Specifies the full request payload. Required. - :type body: IO[bytes] - :keyword content_type: Body Parameter content-type. Content type parameter for binary body. - Default value is "application/json". - :paramtype content_type: str - :return: ChatCompletions for non-streaming, or AsyncIterable[StreamingChatCompletionsUpdate] for streaming. - :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.AsyncStreamingChatCompletions - :raises ~azure.core.exceptions.HttpResponseError: - """ - - # pylint:disable=client-method-missing-tracing-decorator-async - async def complete( - self, - body: Union[JSON, IO[bytes]] = _Unset, - *, - messages: List[_models.ChatRequestMessage] = _Unset, - stream: Optional[bool] = None, - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> Union[AsyncIterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]: - # pylint: disable=line-too-long - # pylint: disable=too-many-locals - """Gets chat completions for the provided chat messages. 
- Completions support a wide variety of tasks and generate text that continues from or - "completes" provided prompt data. When using this method with `stream=True`, the response is streamed - back to the client. Iterate over the resulting :class:`~azure.ai.inference.models.StreamingChatCompletions` - object to get content updates as they arrive. - - :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type - that specifies the full request payload. Required. - :type body: JSON or IO[bytes] - :keyword messages: The collection of context messages associated with this chat completions - request. - Typical usage begins with a chat message for the System role that provides instructions for - the behavior of the assistant, followed by alternating messages between the User and - Assistant roles. Required. - :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage] - :keyword stream: A value indicating whether chat completions should be streamed for this request. - Default value is False. If streaming is enabled, the response will be a StreamingChatCompletions. - Otherwise the response will be a ChatCompletions. - :paramtype stream: bool - :keyword frequency_penalty: A value that influences the probability of generated tokens - appearing based on their cumulative frequency in generated text. - Positive values will make tokens less likely to appear as their frequency increases and - decrease the likelihood of the model repeating the same statements verbatim. - Supported range is [-2, 2]. - Default value is None. - :paramtype frequency_penalty: float - :keyword presence_penalty: A value that influences the probability of generated tokens - appearing based on their existing - presence in generated text. - Positive values will make tokens less likely to appear when they already exist and increase - the model's likelihood to output new topics. - Supported range is [-2, 2]. - Default value is None. - :paramtype presence_penalty: float - :keyword temperature: The sampling temperature to use that controls the apparent creativity of - generated completions. - Higher values will make output more random while lower values will make results more focused - and deterministic. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype temperature: float - :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value - causes the - model to consider the results of tokens with the provided probability mass. As an example, a - value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be - considered. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype top_p: float - :keyword max_tokens: The maximum number of tokens to generate. Default value is None. - :paramtype max_tokens: int - :keyword response_format: The format that the AI model must output. AI chat completions models typically output - unformatted text by default. This is equivalent to setting "text" as the response_format. - To output JSON format, without adhering to any schema, set to "json_object". 
- To output JSON format adhering to a provided schema, set this to an object of the class - ~azure.ai.inference.models.JsonSchemaFormat. Default value is None. - :paramtype response_format: Union[Literal['text', 'json_object'], ~azure.ai.inference.models.JsonSchemaFormat] - :keyword stop: A collection of textual sequences that will end completions generation. Default - value is None. - :paramtype stop: list[str] - :keyword tools: The available tool definitions that the chat completions request can use, - including caller-defined functions. Default value is None. - :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] - :keyword tool_choice: If specified, the model will configure which of the provided tools it can - use for the chat completions response. Is either a Union[str, - "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type. - Default value is None. - :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or - ~azure.ai.inference.models.ChatCompletionsNamedToolChoice - :keyword seed: If specified, the system will make a best effort to sample deterministically - such that repeated requests with the - same seed and parameters should return the same result. Determinism is not guaranteed. - Default value is None. - :paramtype seed: int - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: ChatCompletions for non-streaming, or AsyncIterable[StreamingChatCompletionsUpdate] for streaming. 
- :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.AsyncStreamingChatCompletions - :raises ~azure.core.exceptions.HttpResponseError: - """ - error_map = { - 401: ClientAuthenticationError, - 404: ResourceNotFoundError, - 409: ResourceExistsError, - 304: ResourceNotModifiedError, - } - error_map.update(kwargs.pop("error_map", {}) or {}) - - _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) - _params = kwargs.pop("params", {}) or {} - _extra_parameters: Union[_models._enums.ExtraParameters, None] = None - - content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) - - internal_response_format = _get_internal_response_format(response_format) - - if body is _Unset: - if messages is _Unset: - raise TypeError("missing required argument: messages") - body = { - "messages": messages, - "stream": stream, - "frequency_penalty": frequency_penalty if frequency_penalty is not None else self._frequency_penalty, - "max_tokens": max_tokens if max_tokens is not None else self._max_tokens, - "model": model if model is not None else self._model, - "presence_penalty": presence_penalty if presence_penalty is not None else self._presence_penalty, - "response_format": ( - internal_response_format if internal_response_format is not None else self._internal_response_format - ), - "seed": seed if seed is not None else self._seed, - "stop": stop if stop is not None else self._stop, - "temperature": temperature if temperature is not None else self._temperature, - "tool_choice": tool_choice if tool_choice is not None else self._tool_choice, - "tools": tools if tools is not None else self._tools, - "top_p": top_p if top_p is not None else self._top_p, - } - if model_extras is not None and bool(model_extras): - body.update(model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - elif self._model_extras is not None and bool(self._model_extras): - body.update(self._model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - body = {k: v for k, v in body.items() if v is not None} - elif isinstance(body, dict) and "stream" in body and isinstance(body["stream"], bool): - stream = body["stream"] - content_type = content_type or "application/json" - _content = None - if isinstance(body, (IOBase, bytes)): - _content = body - else: - _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore - - _request = build_chat_completions_complete_request( - extra_params=_extra_parameters, - content_type=content_type, - api_version=self._config.api_version, - content=_content, - headers=_headers, - params=_params, - ) - path_format_arguments = { - "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), - } - _request.url = self._client.format_url(_request.url, **path_format_arguments) - - _stream = stream or False - pipeline_response: PipelineResponse = await self._client._pipeline.run( # type: ignore # pylint: disable=protected-access - _request, stream=_stream, **kwargs - ) - - response = pipeline_response.http_response - - if response.status_code not in [200]: - if _stream: - await response.read() # Load the body in memory and close the socket - map_error(status_code=response.status_code, response=response, error_map=error_map) - raise HttpResponseError(response=response) - - if _stream: - return _models.AsyncStreamingChatCompletions(response) - - return 
_deserialize(_models._patch.ChatCompletions, response.json()) # pylint: disable=protected-access - - @distributed_trace_async - async def get_model_info(self, **kwargs: Any) -> _models.ModelInfo: - # pylint: disable=line-too-long - """Returns information about the AI model. - The method makes a REST API call to the ``/info`` route on the given endpoint. - This method will only work when using Serverless API or Managed Compute endpoint. - It will not work for GitHub Models endpoint or Azure OpenAI endpoint. - - :return: ModelInfo. The ModelInfo is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.ModelInfo - :raises ~azure.core.exceptions.HttpResponseError: - """ - if not self._model_info: - self._model_info = await self._get_model_info(**kwargs) # pylint: disable=attribute-defined-outside-init - return self._model_info - - def __str__(self) -> str: - # pylint: disable=client-method-name-no-double-underscore - return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__() - - -class EmbeddingsClient(EmbeddingsClientGenerated): - """EmbeddingsClient. - - :param endpoint: Service host. Required. - :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is either a - AzureKeyCredential type or a AsyncTokenCredential type. Required. - :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials_async.AsyncTokenCredential - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :keyword api_version: The API version to use for this operation. Default value is - "2024-05-01-preview". Note that overriding this default value may result in unsupported - behavior. - :paramtype api_version: str - """ - - def __init__( - self, - endpoint: str, - credential: Union[AzureKeyCredential, "AsyncTokenCredential"], - *, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> None: - - self._model_info: Optional[_models.ModelInfo] = None - - # Store default embeddings settings, to be applied in all future service calls - # unless overridden by arguments in the `embed` method. 
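(Illustrative only, not part of the patch: a streaming sketch for the asynchronous complete() method shown above. With stream=True the call returns an async iterable of StreamingChatCompletionsUpdate objects instead of a single ChatCompletions; endpoint and key are placeholders.)

import asyncio
from azure.ai.inference.aio import ChatCompletionsClient
from azure.ai.inference.models import UserMessage
from azure.core.credentials import AzureKeyCredential

async def main():
    async with ChatCompletionsClient(
        endpoint="https://<your-endpoint>.models.ai.azure.com",  # placeholder
        credential=AzureKeyCredential("<your-api-key>"),         # placeholder
    ) as client:
        response = await client.complete(
            stream=True,
            messages=[UserMessage(content="Tell me a short story.")],
        )
        # Iterate over content deltas as they arrive; some updates carry no choices.
        async for update in response:
            if update.choices:
                print(update.choices[0].delta.content or "", end="")

asyncio.run(main())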
- self._dimensions = dimensions - self._encoding_format = encoding_format - self._input_type = input_type - self._model = model - self._model_extras = model_extras - - # For Key auth, we need to send these two auth HTTP request headers simultaneously: - # 1. "Authorization: Bearer " - # 2. "api-key: " - # This is because Serverless API, Managed Compute and GitHub endpoints support the first header, - # and Azure OpenAI and the new Unified Inference endpoints support the second header. - # The first header will be taken care of by auto-generated code. - # The second one is added here. - if isinstance(credential, AzureKeyCredential): - headers = kwargs.pop("headers", {}) - if "api-key" not in headers: - headers["api-key"] = credential.key - kwargs["headers"] = headers - - super().__init__(endpoint, credential, **kwargs) - - @overload - async def embed( - self, - *, - input: List[str], - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given text prompts. - The method makes a REST API call to the `/embeddings` route on the given endpoint. - - :keyword input: Input text to embed, encoded as a string or array of tokens. - To embed multiple inputs in a single request, pass an array - of strings or array of token arrays. Required. - :paramtype input: list[str] - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - async def embed( - self, - body: JSON, - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given text prompts. - The method makes a REST API call to the `/embeddings` route on the given endpoint. - - :param body: An object of type MutableMapping[str, Any], such as a dictionary, that - specifies the full request payload. Required. - :type body: JSON - :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. - Default value is "application/json". 
- :paramtype content_type: str - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - async def embed( - self, - body: IO[bytes], - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given text prompts. - The method makes a REST API call to the `/embeddings` route on the given endpoint. - - :param body: Specifies the full request payload. Required. - :type body: IO[bytes] - :keyword content_type: Body Parameter content-type. Content type parameter for binary body. - Default value is "application/json". - :paramtype content_type: str - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @distributed_trace_async - async def embed( - self, - body: Union[JSON, IO[bytes]] = _Unset, - *, - input: List[str] = _Unset, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.EmbeddingsResult: - # pylint: disable=line-too-long - """Return the embedding vectors for given text prompts. - The method makes a REST API call to the `/embeddings` route on the given endpoint. - - :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type - that specifies the full request payload. Required. - :type body: JSON or IO[bytes] - :keyword input: Input text to embed, encoded as a string or array of tokens. - To embed multiple inputs in a single request, pass an array - of strings or array of token arrays. Required. - :paramtype input: list[str] - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: EmbeddingsResult. 
The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - error_map: MutableMapping[int, Type[HttpResponseError]] = { - 401: ClientAuthenticationError, - 404: ResourceNotFoundError, - 409: ResourceExistsError, - 304: ResourceNotModifiedError, - } - error_map.update(kwargs.pop("error_map", {}) or {}) - - _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) - _params = kwargs.pop("params", {}) or {} - _extra_parameters: Union[_models._enums.ExtraParameters, None] = None - - content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) - - if body is _Unset: - if input is _Unset: - raise TypeError("missing required argument: input") - body = { - "input": input, - "dimensions": dimensions if dimensions is not None else self._dimensions, - "encoding_format": encoding_format if encoding_format is not None else self._encoding_format, - "input_type": input_type if input_type is not None else self._input_type, - "model": model if model is not None else self._model, - } - if model_extras is not None and bool(model_extras): - body.update(model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - elif self._model_extras is not None and bool(self._model_extras): - body.update(self._model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - body = {k: v for k, v in body.items() if v is not None} - content_type = content_type or "application/json" - _content = None - if isinstance(body, (IOBase, bytes)): - _content = body - else: - _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore - - _request = build_embeddings_embed_request( - extra_params=_extra_parameters, - content_type=content_type, - api_version=self._config.api_version, - content=_content, - headers=_headers, - params=_params, - ) - path_format_arguments = { - "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), - } - _request.url = self._client.format_url(_request.url, **path_format_arguments) - - _stream = kwargs.pop("stream", False) - pipeline_response: PipelineResponse = await self._client._pipeline.run( # type: ignore # pylint: disable=protected-access - _request, stream=_stream, **kwargs - ) - - response = pipeline_response.http_response - - if response.status_code not in [200]: - if _stream: - await response.read() # Load the body in memory and close the socket - map_error(status_code=response.status_code, response=response, error_map=error_map) - raise HttpResponseError(response=response) - - if _stream: - deserialized = response.iter_bytes() - else: - deserialized = _deserialize( - _models._patch.EmbeddingsResult, response.json() # pylint: disable=protected-access - ) - - return deserialized # type: ignore - - @distributed_trace_async - async def get_model_info(self, **kwargs: Any) -> _models.ModelInfo: - # pylint: disable=line-too-long - """Returns information about the AI model. - The method makes a REST API call to the ``/info`` route on the given endpoint. - This method will only work when using Serverless API or Managed Compute endpoint. - It will not work for GitHub Models endpoint or Azure OpenAI endpoint. - - :return: ModelInfo. 
The ModelInfo is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.ModelInfo - :raises ~azure.core.exceptions.HttpResponseError: - """ - if not self._model_info: - self._model_info = await self._get_model_info(**kwargs) # pylint: disable=attribute-defined-outside-init - return self._model_info - - def __str__(self) -> str: - # pylint: disable=client-method-name-no-double-underscore - return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__() - - -class ImageEmbeddingsClient(ImageEmbeddingsClientGenerated): - """ImageEmbeddingsClient. - - :param endpoint: Service host. Required. - :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is either a - AzureKeyCredential type or a AsyncTokenCredential type. Required. - :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials_async.AsyncTokenCredential - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :keyword api_version: The API version to use for this operation. Default value is - "2024-05-01-preview". Note that overriding this default value may result in unsupported - behavior. - :paramtype api_version: str - """ - - def __init__( - self, - endpoint: str, - credential: Union[AzureKeyCredential, "AsyncTokenCredential"], - *, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> None: - - self._model_info: Optional[_models.ModelInfo] = None - - # Store default embeddings settings, to be applied in all future service calls - # unless overridden by arguments in the `embed` method. - self._dimensions = dimensions - self._encoding_format = encoding_format - self._input_type = input_type - self._model = model - self._model_extras = model_extras - - # For Key auth, we need to send these two auth HTTP request headers simultaneously: - # 1. "Authorization: Bearer " - # 2. "api-key: " - # This is because Serverless API, Managed Compute and GitHub endpoints support the first header, - # and Azure OpenAI and the new Unified Inference endpoints support the second header. - # The first header will be taken care of by auto-generated code. - # The second one is added here. 
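The comment above explains the dual-header scheme used for key authentication: the generated pipeline sends ``Authorization: Bearer <key>``, and the client additionally mirrors the key into an ``api-key`` header so that endpoints expecting that header (such as Azure OpenAI) also accept the request. A standalone sketch of that header-injection step, mirroring the code that follows; the key value is a placeholder:

from typing import Any, Dict

from azure.core.credentials import AzureKeyCredential


def inject_api_key_header(credential: Any, **kwargs: Any) -> Dict[str, Any]:
    # Only key credentials need the extra header; token credentials are
    # handled entirely by the generated authentication policy.
    if isinstance(credential, AzureKeyCredential):
        headers = kwargs.pop("headers", {})
        if "api-key" not in headers:
            headers["api-key"] = credential.key
        kwargs["headers"] = headers
    return kwargs


kwargs = inject_api_key_header(AzureKeyCredential("<your-api-key>"))
print(kwargs["headers"])  # {'api-key': '<your-api-key>'}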
- if isinstance(credential, AzureKeyCredential): - headers = kwargs.pop("headers", {}) - if "api-key" not in headers: - headers["api-key"] = credential.key - kwargs["headers"] = headers - - super().__init__(endpoint, credential, **kwargs) - - @overload - async def embed( - self, - *, - input: List[_models.ImageEmbeddingInput], - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given images. - The method makes a REST API call to the `/images/embeddings` route on the given endpoint. - - :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an - array. - The input must not exceed the max input tokens for the model. Required. - :paramtype input: list[~azure.ai.inference.models.ImageEmbeddingInput] - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - async def embed( - self, - body: JSON, - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given images. - The method makes a REST API call to the `/images/embeddings` route on the given endpoint. - - :param body: An object of type MutableMapping[str, Any], such as a dictionary, that - specifies the full request payload. Required. - :type body: JSON - :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. - Default value is "application/json". - :paramtype content_type: str - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - async def embed( - self, - body: IO[bytes], - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given images. - The method makes a REST API call to the `/images/embeddings` route on the given endpoint. - - :param body: Specifies the full request payload. Required. 
- :type body: IO[bytes] - :keyword content_type: Body Parameter content-type. Content type parameter for binary body. - Default value is "application/json". - :paramtype content_type: str - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @distributed_trace_async - async def embed( - self, - body: Union[JSON, IO[bytes]] = _Unset, - *, - input: List[_models.ImageEmbeddingInput] = _Unset, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.EmbeddingsResult: - # pylint: disable=line-too-long - """Return the embedding vectors for given images. - The method makes a REST API call to the `/images/embeddings` route on the given endpoint. - - :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type - that specifies the full request payload. Required. - :type body: JSON or IO[bytes] - :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an - array. - The input must not exceed the max input tokens for the model. Required. - :paramtype input: list[~azure.ai.inference.models.ImageEmbeddingInput] - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: EmbeddingsResult. 
The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - error_map: MutableMapping[int, Type[HttpResponseError]] = { - 401: ClientAuthenticationError, - 404: ResourceNotFoundError, - 409: ResourceExistsError, - 304: ResourceNotModifiedError, - } - error_map.update(kwargs.pop("error_map", {}) or {}) - - _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) - _params = kwargs.pop("params", {}) or {} - _extra_parameters: Union[_models._enums.ExtraParameters, None] = None - - content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) - - if body is _Unset: - if input is _Unset: - raise TypeError("missing required argument: input") - body = { - "input": input, - "dimensions": dimensions if dimensions is not None else self._dimensions, - "encoding_format": encoding_format if encoding_format is not None else self._encoding_format, - "input_type": input_type if input_type is not None else self._input_type, - "model": model if model is not None else self._model, - } - if model_extras is not None and bool(model_extras): - body.update(model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - elif self._model_extras is not None and bool(self._model_extras): - body.update(self._model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - body = {k: v for k, v in body.items() if v is not None} - content_type = content_type or "application/json" - _content = None - if isinstance(body, (IOBase, bytes)): - _content = body - else: - _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore - - _request = build_image_embeddings_embed_request( - extra_params=_extra_parameters, - content_type=content_type, - api_version=self._config.api_version, - content=_content, - headers=_headers, - params=_params, - ) - path_format_arguments = { - "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), - } - _request.url = self._client.format_url(_request.url, **path_format_arguments) - - _stream = kwargs.pop("stream", False) - pipeline_response: PipelineResponse = await self._client._pipeline.run( # type: ignore # pylint: disable=protected-access - _request, stream=_stream, **kwargs - ) - - response = pipeline_response.http_response - - if response.status_code not in [200]: - if _stream: - await response.read() # Load the body in memory and close the socket - map_error(status_code=response.status_code, response=response, error_map=error_map) - raise HttpResponseError(response=response) - - if _stream: - deserialized = response.iter_bytes() - else: - deserialized = _deserialize( - _models._patch.EmbeddingsResult, response.json() # pylint: disable=protected-access - ) - - return deserialized # type: ignore - - @distributed_trace_async - async def get_model_info(self, **kwargs: Any) -> _models.ModelInfo: - # pylint: disable=line-too-long - """Returns information about the AI model. - The method makes a REST API call to the ``/info`` route on the given endpoint. - This method will only work when using Serverless API or Managed Compute endpoint. - It will not work for GitHub Models endpoint or Azure OpenAI endpoint. - - :return: ModelInfo. 
The ModelInfo is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.ModelInfo - :raises ~azure.core.exceptions.HttpResponseError: - """ - if not self._model_info: - self._model_info = await self._get_model_info(**kwargs) # pylint: disable=attribute-defined-outside-init - return self._model_info - - def __str__(self) -> str: - # pylint: disable=client-method-name-no-double-underscore - return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__() - - -__all__: List[str] = [ - "load_client", - "ChatCompletionsClient", - "EmbeddingsClient", - "ImageEmbeddingsClient", -] # Add all objects you want publicly available to users at this package level +__all__: List[str] = [] # Add all objects you want publicly available to users at this package level def patch_sdk(): diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/models/__init__.py b/sdk/ai/azure-ai-inference/azure/ai/inference/models/__init__.py index 8c21439455ca..413f4c7b6e47 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/models/__init__.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/models/__init__.py @@ -14,12 +14,18 @@ from ._models import ( # type: ignore + AssistantMessage, ChatChoice, ChatCompletions, ChatCompletionsNamedToolChoice, ChatCompletionsNamedToolChoiceFunction, + ChatCompletionsResponseFormat, + ChatCompletionsResponseFormatJsonObject, + ChatCompletionsResponseFormatJsonSchema, + ChatCompletionsResponseFormatText, ChatCompletionsToolCall, ChatCompletionsToolDefinition, + ChatRequestMessage, ChatResponseMessage, CompletionsUsage, ContentItem, @@ -37,7 +43,10 @@ StreamingChatCompletionsUpdate, StreamingChatResponseMessageUpdate, StreamingChatResponseToolCallUpdate, + SystemMessage, TextContentItem, + ToolMessage, + UserMessage, ) from ._enums import ( # type: ignore @@ -54,12 +63,18 @@ from ._patch import patch_sdk as _patch_sdk __all__ = [ + "AssistantMessage", "ChatChoice", "ChatCompletions", "ChatCompletionsNamedToolChoice", "ChatCompletionsNamedToolChoiceFunction", + "ChatCompletionsResponseFormat", + "ChatCompletionsResponseFormatJsonObject", + "ChatCompletionsResponseFormatJsonSchema", + "ChatCompletionsResponseFormatText", "ChatCompletionsToolCall", "ChatCompletionsToolDefinition", + "ChatRequestMessage", "ChatResponseMessage", "CompletionsUsage", "ContentItem", @@ -77,7 +92,10 @@ "StreamingChatCompletionsUpdate", "StreamingChatResponseMessageUpdate", "StreamingChatResponseToolCallUpdate", + "SystemMessage", "TextContentItem", + "ToolMessage", + "UserMessage", "ChatCompletionsToolChoicePreset", "ChatRole", "CompletionsFinishReason", diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/models/_models.py b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_models.py index dd9c123b8ab9..f1b97e0c1747 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/models/_models.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_models.py @@ -19,6 +19,88 @@ from .. import models as _models +class ChatRequestMessage(_model_base.Model): + """An abstract representation of a chat message as provided in a request. + + You probably want to use the sub-classes and not this class directly. Known sub-classes are: + AssistantMessage, SystemMessage, ToolMessage, UserMessage + + All required parameters must be populated in order to send to server. + + :ivar role: The chat role associated with this message. Required. Known values are: "system", + "user", "assistant", and "tool". 
+ :vartype role: str or ~azure.ai.inference.models.ChatRole + """ + + __mapping__: Dict[str, _model_base.Model] = {} + role: str = rest_discriminator(name="role") + """The chat role associated with this message. Required. Known values are: \"system\", \"user\", + \"assistant\", and \"tool\".""" + + @overload + def __init__( + self, + *, + role: str, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + +class AssistantMessage(ChatRequestMessage, discriminator="assistant"): + """A request chat message representing response or action from the assistant. + + All required parameters must be populated in order to send to server. + + :ivar role: The chat role associated with this message, which is always 'assistant' for + assistant messages. Required. The role that provides responses to system-instructed, + user-prompted input. + :vartype role: str or ~azure.ai.inference.models.ASSISTANT + :ivar content: The content of the message. + :vartype content: str + :ivar tool_calls: The tool calls that must be resolved and have their outputs appended to + subsequent input messages for the chat + completions request to resolve as configured. + :vartype tool_calls: list[~azure.ai.inference.models.ChatCompletionsToolCall] + """ + + role: Literal[ChatRole.ASSISTANT] = rest_discriminator(name="role") # type: ignore + """The chat role associated with this message, which is always 'assistant' for assistant messages. + Required. The role that provides responses to system-instructed, user-prompted input.""" + content: Optional[str] = rest_field() + """The content of the message.""" + tool_calls: Optional[List["_models.ChatCompletionsToolCall"]] = rest_field() + """The tool calls that must be resolved and have their outputs appended to subsequent input + messages for the chat + completions request to resolve as configured.""" + + @overload + def __init__( + self, + *, + content: Optional[str] = None, + tool_calls: Optional[List["_models.ChatCompletionsToolCall"]] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, role=ChatRole.ASSISTANT, **kwargs) + + class ChatChoice(_model_base.Model): """The representation of a single prompt completion as part of an overall chat completions request. @@ -78,13 +160,13 @@ class ChatCompletions(_model_base.Model): :vartype created: ~datetime.datetime :ivar model: The model used for the chat completion. Required. :vartype model: str + :ivar usage: Usage information for tokens processed and generated as part of this completions + operation. Required. + :vartype usage: ~azure.ai.inference.models.CompletionsUsage :ivar choices: The collection of completions choices associated with this completions response. Generally, ``n`` choices are generated per provided prompt with a default value of 1. Token limits and other settings may limit the number of choices generated. Required. :vartype choices: list[~azure.ai.inference.models.ChatChoice] - :ivar usage: Usage information for tokens processed and generated as part of this completions - operation. Required. 
- :vartype usage: ~azure.ai.inference.models.CompletionsUsage """ id: str = rest_field() @@ -94,13 +176,13 @@ class ChatCompletions(_model_base.Model): represented as seconds since the beginning of the Unix epoch of 00:00 on 1 Jan 1970. Required.""" model: str = rest_field() """The model used for the chat completion. Required.""" + usage: "_models.CompletionsUsage" = rest_field() + """Usage information for tokens processed and generated as part of this completions operation. + Required.""" choices: List["_models.ChatChoice"] = rest_field() """The collection of completions choices associated with this completions response. Generally, ``n`` choices are generated per provided prompt with a default value of 1. Token limits and other settings may limit the number of choices generated. Required.""" - usage: "_models.CompletionsUsage" = rest_field() - """Usage information for tokens processed and generated as part of this completions operation. - Required.""" @overload def __init__( @@ -109,8 +191,8 @@ def __init__( id: str, # pylint: disable=redefined-builtin created: datetime.datetime, model: str, - choices: List["_models.ChatChoice"], usage: "_models.CompletionsUsage", + choices: List["_models.ChatChoice"], ) -> None: ... @overload @@ -130,6 +212,8 @@ class ChatCompletionsNamedToolChoice(_model_base.Model): Readonly variables are only populated by the server, and will be ignored when sending a request. + All required parameters must be populated in order to send to server. + :ivar type: The type of the tool. Currently, only ``function`` is supported. Required. Default value is "function". :vartype type: str @@ -166,6 +250,8 @@ class ChatCompletionsNamedToolChoiceFunction(_model_base.Model): """A tool selection of a specific, named function tool that will limit chat completions to using the named function. + All required parameters must be populated in order to send to server. + :ivar name: The name of the function that should be called. Required. :vartype name: str """ @@ -202,6 +288,8 @@ class ChatCompletionsResponseFormat(_model_base.Model): ChatCompletionsResponseFormatJsonObject, ChatCompletionsResponseFormatJsonSchema, ChatCompletionsResponseFormatText + All required parameters must be populated in order to send to server. + :ivar type: The response format type to use for chat completions. Required. Default value is None. :vartype type: str @@ -235,6 +323,8 @@ class ChatCompletionsResponseFormatJsonObject(ChatCompletionsResponseFormat, dis produce JSON via a system or user message. + All required parameters must be populated in order to send to server. + :ivar type: Response format type: always 'json_object' for this object. Required. Default value is "json_object". :vartype type: str @@ -265,6 +355,8 @@ class ChatCompletionsResponseFormatJsonSchema(ChatCompletionsResponseFormat, dis with a JSON schema specified by the caller. + All required parameters must be populated in order to send to server. + :ivar type: The type of response format being defined: ``json_schema``. Required. Default value is "json_schema". :vartype type: str @@ -301,6 +393,8 @@ class ChatCompletionsResponseFormatText(ChatCompletionsResponseFormat, discrimin """A response format for Chat Completions that emits text responses. This is the default response format. + All required parameters must be populated in order to send to server. + :ivar type: Response format type: always 'text' for this object. Required. Default value is "text". 
:vartype type: str @@ -373,6 +467,8 @@ class ChatCompletionsToolDefinition(_model_base.Model): Readonly variables are only populated by the server, and will be ignored when sending a request. + All required parameters must be populated in order to send to server. + :ivar type: The type of the tool. Currently, only ``function`` is supported. Required. Default value is "function". :vartype type: str @@ -405,197 +501,6 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self.type: Literal["function"] = "function" -class ChatRequestMessage(_model_base.Model): - """An abstract representation of a chat message as provided in a request. - - You probably want to use the sub-classes and not this class directly. Known sub-classes are: - ChatRequestAssistantMessage, ChatRequestSystemMessage, ChatRequestToolMessage, - ChatRequestUserMessage - - :ivar role: The chat role associated with this message. Required. Known values are: "system", - "user", "assistant", and "tool". - :vartype role: str or ~azure.ai.inference.models.ChatRole - """ - - __mapping__: Dict[str, _model_base.Model] = {} - role: str = rest_discriminator(name="role") - """The chat role associated with this message. Required. Known values are: \"system\", \"user\", - \"assistant\", and \"tool\".""" - - @overload - def __init__( - self, - *, - role: str, - ) -> None: ... - - @overload - def __init__(self, mapping: Mapping[str, Any]) -> None: - """ - :param mapping: raw JSON to initialize the model. - :type mapping: Mapping[str, Any] - """ - - def __init__(self, *args: Any, **kwargs: Any) -> None: - super().__init__(*args, **kwargs) - - -class ChatRequestAssistantMessage(ChatRequestMessage, discriminator="assistant"): - """A request chat message representing response or action from the assistant. - - :ivar role: The chat role associated with this message, which is always 'assistant' for - assistant messages. Required. The role that provides responses to system-instructed, - user-prompted input. - :vartype role: str or ~azure.ai.inference.models.ASSISTANT - :ivar content: The content of the message. - :vartype content: str - :ivar tool_calls: The tool calls that must be resolved and have their outputs appended to - subsequent input messages for the chat - completions request to resolve as configured. - :vartype tool_calls: list[~azure.ai.inference.models.ChatCompletionsToolCall] - """ - - role: Literal[ChatRole.ASSISTANT] = rest_discriminator(name="role") # type: ignore - """The chat role associated with this message, which is always 'assistant' for assistant messages. - Required. The role that provides responses to system-instructed, user-prompted input.""" - content: Optional[str] = rest_field() - """The content of the message.""" - tool_calls: Optional[List["_models.ChatCompletionsToolCall"]] = rest_field() - """The tool calls that must be resolved and have their outputs appended to subsequent input - messages for the chat - completions request to resolve as configured.""" - - @overload - def __init__( - self, - *, - content: Optional[str] = None, - tool_calls: Optional[List["_models.ChatCompletionsToolCall"]] = None, - ) -> None: ... - - @overload - def __init__(self, mapping: Mapping[str, Any]) -> None: - """ - :param mapping: raw JSON to initialize the model. 
- :type mapping: Mapping[str, Any] - """ - - def __init__(self, *args: Any, **kwargs: Any) -> None: - super().__init__(*args, role=ChatRole.ASSISTANT, **kwargs) - - -class ChatRequestSystemMessage(ChatRequestMessage, discriminator="system"): - """A request chat message containing system instructions that influence how the model will - generate a chat completions - response. - - :ivar role: The chat role associated with this message, which is always 'system' for system - messages. Required. The role that instructs or sets the behavior of the assistant. - :vartype role: str or ~azure.ai.inference.models.SYSTEM - :ivar content: The contents of the system message. Required. - :vartype content: str - """ - - role: Literal[ChatRole.SYSTEM] = rest_discriminator(name="role") # type: ignore - """The chat role associated with this message, which is always 'system' for system messages. - Required. The role that instructs or sets the behavior of the assistant.""" - content: str = rest_field() - """The contents of the system message. Required.""" - - @overload - def __init__( - self, - *, - content: str, - ) -> None: ... - - @overload - def __init__(self, mapping: Mapping[str, Any]) -> None: - """ - :param mapping: raw JSON to initialize the model. - :type mapping: Mapping[str, Any] - """ - - def __init__(self, *args: Any, **kwargs: Any) -> None: - super().__init__(*args, role=ChatRole.SYSTEM, **kwargs) - - -class ChatRequestToolMessage(ChatRequestMessage, discriminator="tool"): - """A request chat message representing requested output from a configured tool. - - :ivar role: The chat role associated with this message, which is always 'tool' for tool - messages. Required. The role that represents extension tool activity within a chat completions - operation. - :vartype role: str or ~azure.ai.inference.models.TOOL - :ivar content: The content of the message. - :vartype content: str - :ivar tool_call_id: The ID of the tool call resolved by the provided content. Required. - :vartype tool_call_id: str - """ - - role: Literal[ChatRole.TOOL] = rest_discriminator(name="role") # type: ignore - """The chat role associated with this message, which is always 'tool' for tool messages. Required. - The role that represents extension tool activity within a chat completions operation.""" - content: Optional[str] = rest_field() - """The content of the message.""" - tool_call_id: str = rest_field() - """The ID of the tool call resolved by the provided content. Required.""" - - @overload - def __init__( - self, - *, - tool_call_id: str, - content: Optional[str] = None, - ) -> None: ... - - @overload - def __init__(self, mapping: Mapping[str, Any]) -> None: - """ - :param mapping: raw JSON to initialize the model. - :type mapping: Mapping[str, Any] - """ - - def __init__(self, *args: Any, **kwargs: Any) -> None: - super().__init__(*args, role=ChatRole.TOOL, **kwargs) - - -class ChatRequestUserMessage(ChatRequestMessage, discriminator="user"): - """A request chat message representing user input to the assistant. - - :ivar role: The chat role associated with this message, which is always 'user' for user - messages. Required. The role that provides input for chat completions. - :vartype role: str or ~azure.ai.inference.models.USER - :ivar content: The contents of the user message, with available input types varying by selected - model. Required. Is either a str type or a [ContentItem] type. 
- :vartype content: str or list[~azure.ai.inference.models.ContentItem] - """ - - role: Literal[ChatRole.USER] = rest_discriminator(name="role") # type: ignore - """The chat role associated with this message, which is always 'user' for user messages. Required. - The role that provides input for chat completions.""" - content: Union["str", List["_models.ContentItem"]] = rest_field() - """The contents of the user message, with available input types varying by selected model. - Required. Is either a str type or a [ContentItem] type.""" - - @overload - def __init__( - self, - *, - content: Union[str, List["_models.ContentItem"]], - ) -> None: ... - - @overload - def __init__(self, mapping: Mapping[str, Any]) -> None: - """ - :param mapping: raw JSON to initialize the model. - :type mapping: Mapping[str, Any] - """ - - def __init__(self, *args: Any, **kwargs: Any) -> None: - super().__init__(*args, role=ChatRole.USER, **kwargs) - - class ChatResponseMessage(_model_base.Model): """A representation of a chat message as received in a response. @@ -691,6 +596,8 @@ class ContentItem(_model_base.Model): You probably want to use the sub-classes and not this class directly. Known sub-classes are: ImageContentItem, TextContentItem + All required parameters must be populated in order to send to server. + :ivar type: The discriminated object type. Required. Default value is None. :vartype type: str """ @@ -730,7 +637,7 @@ class EmbeddingItem(_model_base.Model): :vartype index: int """ - embedding: Union["str", List[float]] = rest_field() + embedding: Union[str, List[float]] = rest_field() """List of embedding values for the input prompt. These represent a measurement of the vector-based relatedness of the provided input. Or a base64 encoded string of the embedding vector. Required. Is either a str type or a [float] type.""" @@ -885,6 +792,8 @@ class FunctionDefinition(_model_base.Model): """The definition of a caller-specified function that chat completions may invoke in response to matching user input. + All required parameters must be populated in order to send to server. + :ivar name: The name of the function to be called. Required. :vartype name: str :ivar description: A description of what the function does. The model will use this description @@ -927,6 +836,8 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: class ImageContentItem(ContentItem, discriminator="image_url"): """A structured chat content item containing an image reference. + All required parameters must be populated in order to send to server. + :ivar type: The discriminated object type: always 'image_url' for this type. Required. Default value is "image_url". :vartype type: str @@ -963,6 +874,8 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: class ImageEmbeddingInput(_model_base.Model): """Represents an image with optional text. + All required parameters must be populated in order to send to server. + :ivar image: The input image encoded in base64 string as a data URL. Example: ``data:image/{format};base64,{data}``. Required. :vartype image: str @@ -1000,6 +913,8 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: class ImageUrl(_model_base.Model): """An internet location from which the model may retrieve an image. + All required parameters must be populated in order to send to server. + :ivar url: The URL of the image. Required. 
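The ``image`` and ``url`` fields in the two models above expect a base64 data URL of the form ``data:image/{format};base64,{data}``. A minimal sketch of producing such a value from a local file, mirroring the ``ImageUrl.load`` and ``ImageEmbeddingInput.load`` helpers removed from ``models/_patch.py`` further down in this patch; the file name and format are placeholders:

import base64


def to_image_data_url(image_file: str, image_format: str) -> str:
    # Read the local image and encode it as the data URL expected by
    # ImageUrl.url and ImageEmbeddingInput.image.
    with open(image_file, "rb") as f:
        image_data = base64.b64encode(f.read()).decode("utf-8")
    return f"data:image/{image_format};base64,{image_data}"


url = to_image_data_url("sample.png", "png")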
:vartype url: str :ivar detail: The evaluation quality setting to use, which controls relative prioritization of @@ -1035,8 +950,10 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: class JsonSchemaFormat(_model_base.Model): - """Defines the response format for chat completions as JSON with a given schema. - The AI model will need to adhere to this schema when generating completions. + """Defines the response format for chat completions as JSON with a given schema. The AI model + will need to adhere to this schema when generating completions. + + All required parameters must be populated in order to send to server. :ivar name: A name that labels this JSON schema. Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64. Required. @@ -1044,7 +961,8 @@ class JsonSchemaFormat(_model_base.Model): :ivar schema: The definition of the JSON schema. See https://json-schema.org/overview/what-is-jsonschema. Note that AI models usually only support a subset of the keywords defined by JSON schema. - Consult your AI model documentation to determine what is supported. Required. + Consult your AI model documentation + to determine what is supported. Required. :vartype schema: dict[str, any] :ivar description: A description of the response format, used by the AI model to determine how to generate responses in this format. @@ -1053,8 +971,9 @@ class JsonSchemaFormat(_model_base.Model): keywords not supported by the AI model. An example of such keyword may be ``maxLength`` for JSON type ``string``. - If false, and the provided JSON schema contains keywords not supported by the AI model, - the AI model will not error out. Instead it will ignore the unsupported keywords. + If false, and the provided JSON schema contains keywords not supported + by the AI model, the AI model will not error out. Instead it will ignore the unsupported + keywords. :vartype strict: bool """ @@ -1064,7 +983,8 @@ class JsonSchemaFormat(_model_base.Model): schema: Dict[str, Any] = rest_field() """The definition of the JSON schema. See https://json-schema.org/overview/what-is-jsonschema. Note that AI models usually only support a subset of the keywords defined by JSON schema. - Consult your AI model documentation to determine what is supported. Required.""" + Consult your AI model documentation + to determine what is supported. Required.""" description: Optional[str] = rest_field() """A description of the response format, used by the AI model to determine how to generate responses in this format.""" @@ -1072,8 +992,9 @@ class JsonSchemaFormat(_model_base.Model): """If set to true, the service will error out if the provided JSON schema contains keywords not supported by the AI model. An example of such keyword may be ``maxLength`` for JSON type ``string``. - If false, and the provided JSON schema contains keywords not supported by the AI model, - the AI model will not error out. Instead it will ignore the unsupported keywords.""" + If false, and the provided JSON schema contains keywords not supported + by the AI model, the AI model will not error out. Instead it will ignore the unsupported + keywords.""" @overload def __init__( @@ -1201,14 +1122,14 @@ class StreamingChatCompletionsUpdate(_model_base.Model): :vartype created: ~datetime.datetime :ivar model: The model used for the chat completion. Required. :vartype model: str + :ivar usage: Usage information for tokens processed and generated as part of this completions + operation. Required. 
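Stepping back to the ``JsonSchemaFormat`` model documented above: it carries the JSON schema the AI model should adhere to when generating structured output. A small construction sketch, assuming ``JsonSchemaFormat`` is exported from ``azure.ai.inference.models`` like the other models, and using a made-up schema:

from azure.ai.inference.models import JsonSchemaFormat

# A made-up schema; real schemas should only use keywords your model supports.
weather_format = JsonSchemaFormat(
    name="weather_report",
    schema={
        "type": "object",
        "properties": {
            "city": {"type": "string"},
            "temperature_c": {"type": "number"},
        },
        "required": ["city", "temperature_c"],
    },
    description="A structured weather report.",
    strict=True,
)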
+ :vartype usage: ~azure.ai.inference.models.CompletionsUsage :ivar choices: An update to the collection of completion choices associated with this completions response. Generally, ``n`` choices are generated per provided prompt with a default value of 1. Token limits and other settings may limit the number of choices generated. Required. :vartype choices: list[~azure.ai.inference.models.StreamingChatChoiceUpdate] - :ivar usage: Usage information for tokens processed and generated as part of this completions - operation. - :vartype usage: ~azure.ai.inference.models.CompletionsUsage """ id: str = rest_field() @@ -1218,12 +1139,13 @@ class StreamingChatCompletionsUpdate(_model_base.Model): represented as seconds since the beginning of the Unix epoch of 00:00 on 1 Jan 1970. Required.""" model: str = rest_field() """The model used for the chat completion. Required.""" + usage: "_models.CompletionsUsage" = rest_field() + """Usage information for tokens processed and generated as part of this completions operation. + Required.""" choices: List["_models.StreamingChatChoiceUpdate"] = rest_field() """An update to the collection of completion choices associated with this completions response. Generally, ``n`` choices are generated per provided prompt with a default value of 1. Token limits and other settings may limit the number of choices generated. Required.""" - usage: Optional["_models.CompletionsUsage"] = rest_field() - """Usage information for tokens processed and generated as part of this completions operation.""" @overload def __init__( @@ -1232,8 +1154,8 @@ def __init__( id: str, # pylint: disable=redefined-builtin created: datetime.datetime, model: str, + usage: "_models.CompletionsUsage", choices: List["_models.StreamingChatChoiceUpdate"], - usage: Optional["_models.CompletionsUsage"] = None, ) -> None: ... @overload @@ -1325,9 +1247,49 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) +class SystemMessage(ChatRequestMessage, discriminator="system"): + """A request chat message containing system instructions that influence how the model will + generate a chat completions + response. + + All required parameters must be populated in order to send to server. + + :ivar role: The chat role associated with this message, which is always 'system' for system + messages. Required. The role that instructs or sets the behavior of the assistant. + :vartype role: str or ~azure.ai.inference.models.SYSTEM + :ivar content: The contents of the system message. Required. + :vartype content: str + """ + + role: Literal[ChatRole.SYSTEM] = rest_discriminator(name="role") # type: ignore + """The chat role associated with this message, which is always 'system' for system messages. + Required. The role that instructs or sets the behavior of the assistant.""" + content: str = rest_field() + """The contents of the system message. Required.""" + + @overload + def __init__( + self, + *, + content: str, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, role=ChatRole.SYSTEM, **kwargs) + + class TextContentItem(ContentItem, discriminator="text"): """A structured chat content item containing plain text. + All required parameters must be populated in order to send to server. + :ivar type: The discriminated object type: always 'text' for this type. Required. 
Default value is "text". :vartype type: str @@ -1357,3 +1319,83 @@ def __init__(self, mapping: Mapping[str, Any]) -> None: def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, type="text", **kwargs) + + +class ToolMessage(ChatRequestMessage, discriminator="tool"): + """A request chat message representing requested output from a configured tool. + + All required parameters must be populated in order to send to server. + + :ivar role: The chat role associated with this message, which is always 'tool' for tool + messages. Required. The role that represents extension tool activity within a chat completions + operation. + :vartype role: str or ~azure.ai.inference.models.TOOL + :ivar content: The content of the message. + :vartype content: str + :ivar tool_call_id: The ID of the tool call resolved by the provided content. Required. + :vartype tool_call_id: str + """ + + role: Literal[ChatRole.TOOL] = rest_discriminator(name="role") # type: ignore + """The chat role associated with this message, which is always 'tool' for tool messages. Required. + The role that represents extension tool activity within a chat completions operation.""" + content: Optional[str] = rest_field() + """The content of the message.""" + tool_call_id: str = rest_field() + """The ID of the tool call resolved by the provided content. Required.""" + + @overload + def __init__( + self, + *, + tool_call_id: str, + content: Optional[str] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, role=ChatRole.TOOL, **kwargs) + + +class UserMessage(ChatRequestMessage, discriminator="user"): + """A request chat message representing user input to the assistant. + + All required parameters must be populated in order to send to server. + + :ivar role: The chat role associated with this message, which is always 'user' for user + messages. Required. The role that provides input for chat completions. + :vartype role: str or ~azure.ai.inference.models.USER + :ivar content: The contents of the user message, with available input types varying by selected + model. Required. Is either a str type or a [ContentItem] type. + :vartype content: str or list[~azure.ai.inference.models.ContentItem] + """ + + role: Literal[ChatRole.USER] = rest_discriminator(name="role") # type: ignore + """The chat role associated with this message, which is always 'user' for user messages. Required. + The role that provides input for chat completions.""" + content: Union[str, List["_models.ContentItem"]] = rest_field() + """The contents of the user message, with available input types varying by selected model. + Required. Is either a str type or a [ContentItem] type.""" + + @overload + def __init__( + self, + *, + content: Union[str, List["_models.ContentItem"]], + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. 
+ :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, role=ChatRole.USER, **kwargs) diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/models/_patch.py b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_patch.py index d44b32e99019..f7dd32510333 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/models/_patch.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_patch.py @@ -6,483 +6,9 @@ Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize """ -import asyncio -import base64 -import json -import logging -import queue -import re -import sys +from typing import List -from typing import Mapping, Literal, Any, List, AsyncIterator, Iterator, Optional, Union, overload -from azure.core.rest import HttpResponse, AsyncHttpResponse -from ._enums import ChatRole -from .._model_base import rest_discriminator, rest_field -from ._models import ChatRequestMessage -from ._models import ImageUrl as ImageUrlGenerated -from ._models import ChatCompletions as ChatCompletionsGenerated -from ._models import EmbeddingsResult as EmbeddingsResultGenerated -from ._models import ImageEmbeddingInput as EmbeddingInputGenerated -from .. import models as _models - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self - -logger = logging.getLogger(__name__) - - -class UserMessage(ChatRequestMessage, discriminator="user"): - """A request chat message representing user input to the assistant. - - :ivar role: The chat role associated with this message, which is always 'user' for user - messages. Required. The role that provides input for chat completions. - :vartype role: str or ~azure.ai.inference.models.USER - :ivar content: The contents of the user message, with available input types varying by selected - model. Required. Is either a str type or a [ContentItem] type. - :vartype content: str or list[~azure.ai.inference.models.ContentItem] - """ - - role: Literal[ChatRole.USER] = rest_discriminator(name="role") # type: ignore - """The chat role associated with this message, which is always 'user' for user messages. Required. - The role that provides input for chat completions.""" - content: Union["str", List["_models.ContentItem"]] = rest_field() - """The contents of the user message, with available input types varying by selected model. - Required. Is either a str type or a [ContentItem] type.""" - - @overload - def __init__( - self, - content: Union[str, List["_models.ContentItem"]], - ) -> None: ... - - @overload - def __init__(self, mapping: Mapping[str, Any]) -> None: - """ - :param mapping: raw JSON to initialize the model. - :type mapping: Mapping[str, Any] - """ - - def __init__(self, *args: Any, **kwargs: Any) -> None: - if len(args) == 1 and isinstance(args[0], (List, str)): - if kwargs.get("content") is not None: - raise ValueError("content cannot be provided as positional and keyword arguments") - kwargs["content"] = args[0] - args = tuple() - super().__init__(*args, role=ChatRole.USER, **kwargs) - - -class SystemMessage(ChatRequestMessage, discriminator="system"): - """A request chat message containing system instructions that influence how the model will - generate a chat completions - response. - - :ivar role: The chat role associated with this message, which is always 'system' for system - messages. Required. The role that instructs or sets the behavior of the assistant. 
- :vartype role: str or ~azure.ai.inference.models.SYSTEM - :ivar content: The contents of the system message. Required. - :vartype content: str - """ - - role: Literal[ChatRole.SYSTEM] = rest_discriminator(name="role") # type: ignore - """The chat role associated with this message, which is always 'system' for system messages. - Required. The role that instructs or sets the behavior of the assistant.""" - content: str = rest_field() - """The contents of the system message. Required.""" - - @overload - def __init__( - self, - content: str, - ) -> None: ... - - @overload - def __init__(self, mapping: Mapping[str, Any]) -> None: - """ - :param mapping: raw JSON to initialize the model. - :type mapping: Mapping[str, Any] - """ - - def __init__(self, *args: Any, **kwargs: Any) -> None: - if len(args) == 1 and isinstance(args[0], str): - if kwargs.get("content") is not None: - raise ValueError("content cannot be provided as positional and keyword arguments") - kwargs["content"] = args[0] - args = tuple() - super().__init__(*args, role=ChatRole.SYSTEM, **kwargs) - - -class AssistantMessage(ChatRequestMessage, discriminator="assistant"): - """A request chat message representing response or action from the assistant. - - :ivar role: The chat role associated with this message, which is always 'assistant' for - assistant messages. Required. The role that provides responses to system-instructed, - user-prompted input. - :vartype role: str or ~azure.ai.inference.models.ASSISTANT - :ivar content: The content of the message. - :vartype content: str - :ivar tool_calls: The tool calls that must be resolved and have their outputs appended to - subsequent input messages for the chat - completions request to resolve as configured. - :vartype tool_calls: list[~azure.ai.inference.models.ChatCompletionsToolCall] - """ - - role: Literal[ChatRole.ASSISTANT] = rest_discriminator(name="role") # type: ignore - """The chat role associated with this message, which is always 'assistant' for assistant messages. - Required. The role that provides responses to system-instructed, user-prompted input.""" - content: Optional[str] = rest_field() - """The content of the message.""" - tool_calls: Optional[List["_models.ChatCompletionsToolCall"]] = rest_field() - """The tool calls that must be resolved and have their outputs appended to subsequent input - messages for the chat - completions request to resolve as configured.""" - - @overload - def __init__( - self, - content: Optional[str] = None, - *, - tool_calls: Optional[List["_models.ChatCompletionsToolCall"]] = None, - ) -> None: ... - - @overload - def __init__(self, mapping: Mapping[str, Any]) -> None: - """ - :param mapping: raw JSON to initialize the model. - :type mapping: Mapping[str, Any] - """ - - def __init__(self, *args: Any, **kwargs: Any) -> None: - if len(args) == 1 and isinstance(args[0], str): - if kwargs.get("content") is not None: - raise ValueError("content cannot be provided as positional and keyword arguments") - kwargs["content"] = args[0] - args = tuple() - super().__init__(*args, role=ChatRole.ASSISTANT, **kwargs) - - -class ToolMessage(ChatRequestMessage, discriminator="tool"): - """A request chat message representing requested output from a configured tool. - - :ivar role: The chat role associated with this message, which is always 'tool' for tool - messages. Required. The role that represents extension tool activity within a chat completions - operation. 
- :vartype role: str or ~azure.ai.inference.models.TOOL - :ivar content: The content of the message. - :vartype content: str - :ivar tool_call_id: The ID of the tool call resolved by the provided content. Required. - :vartype tool_call_id: str - """ - - role: Literal[ChatRole.TOOL] = rest_discriminator(name="role") # type: ignore - """The chat role associated with this message, which is always 'tool' for tool messages. Required. - The role that represents extension tool activity within a chat completions operation.""" - content: Optional[str] = rest_field() - """The content of the message.""" - tool_call_id: str = rest_field() - """The ID of the tool call resolved by the provided content. Required.""" - - @overload - def __init__( - self, - content: Optional[str] = None, - *, - tool_call_id: str, - ) -> None: ... - - @overload - def __init__(self, mapping: Mapping[str, Any]) -> None: - """ - :param mapping: raw JSON to initialize the model. - :type mapping: Mapping[str, Any] - """ - - def __init__(self, *args: Any, **kwargs: Any) -> None: - if len(args) == 1 and isinstance(args[0], str): - if kwargs.get("content") is not None: - raise ValueError("content cannot be provided as positional and keyword arguments") - kwargs["content"] = args[0] - args = tuple() - super().__init__(*args, role=ChatRole.TOOL, **kwargs) - - -class ChatCompletions(ChatCompletionsGenerated): - """Representation of the response data from a chat completions request. - Completions support a wide variety of tasks and generate text that continues from or - "completes" - provided prompt data. - - - :ivar id: A unique identifier associated with this chat completions response. Required. - :vartype id: str - :ivar created: The first timestamp associated with generation activity for this completions - response, - represented as seconds since the beginning of the Unix epoch of 00:00 on 1 Jan 1970. Required. - :vartype created: ~datetime.datetime - :ivar model: The model used for the chat completion. Required. - :vartype model: str - :ivar usage: Usage information for tokens processed and generated as part of this completions - operation. Required. - :vartype usage: ~azure.ai.inference.models.CompletionsUsage - :ivar choices: The collection of completions choices associated with this completions response. - Generally, ``n`` choices are generated per provided prompt with a default value of 1. - Token limits and other settings may limit the number of choices generated. Required. - :vartype choices: list[~azure.ai.inference.models.ChatChoice] - """ - - def __str__(self) -> str: - # pylint: disable=client-method-name-no-double-underscore - return json.dumps(self.as_dict(), indent=2) - - -class EmbeddingsResult(EmbeddingsResultGenerated): - """Representation of the response data from an embeddings request. - Embeddings measure the relatedness of text strings and are commonly used for search, - clustering, - recommendations, and other similar scenarios. - - - :ivar data: Embedding values for the prompts submitted in the request. Required. - :vartype data: list[~azure.ai.inference.models.EmbeddingItem] - :ivar usage: Usage counts for tokens input using the embeddings API. Required. - :vartype usage: ~azure.ai.inference.models.EmbeddingsUsage - :ivar model: The model ID used to generate this result. Required. 
- :vartype model: str - """ - - def __str__(self) -> str: - # pylint: disable=client-method-name-no-double-underscore - return json.dumps(self.as_dict(), indent=2) - - -class ImageUrl(ImageUrlGenerated): - - @classmethod - def load( - cls, *, image_file: str, image_format: str, detail: Optional[Union[str, "_models.ImageDetailLevel"]] = None - ) -> Self: - """ - Create an ImageUrl object from a local image file. The method reads the image - file and encodes it as a base64 string, which together with the image format - is then used to format the JSON `url` value passed in the request payload. - - :keyword image_file: The name of the local image file to load. Required. - :paramtype image_file: str - :keyword image_format: The MIME type format of the image. For example: "jpeg", "png". Required. - :paramtype image_format: str - :keyword detail: The evaluation quality setting to use, which controls relative prioritization of - speed, token consumption, and accuracy. Known values are: "auto", "low", and "high". - :paramtype detail: str or ~azure.ai.inference.models.ImageDetailLevel - :return: An ImageUrl object with the image data encoded as a base64 string. - :rtype: ~azure.ai.inference.models.ImageUrl - :raises FileNotFoundError: when the image file could not be opened. - """ - with open(image_file, "rb") as f: - image_data = base64.b64encode(f.read()).decode("utf-8") - url = f"data:image/{image_format};base64,{image_data}" - return cls(url=url, detail=detail) - - -class ImageEmbeddingInput(EmbeddingInputGenerated): - - @classmethod - def load(cls, *, image_file: str, image_format: str, text: Optional[str] = None) -> Self: - """ - Create an ImageEmbeddingInput object from a local image file. The method reads the image - file and encodes it as a base64 string, which together with the image format - is then used to format the JSON `url` value passed in the request payload. - - :keyword image_file: The name of the local image file to load. Required. - :paramtype image_file: str - :keyword image_format: The MIME type format of the image. For example: "jpeg", "png". Required. - :paramtype image_format: str - :keyword text: Optional. The text input to feed into the model (like DINO, CLIP). - Returns a 422 error if the model doesn't support the value or parameter. - :paramtype text: str - :return: An ImageEmbeddingInput object with the image data encoded as a base64 string. - :rtype: ~azure.ai.inference.models.EmbeddingsInput - :raises FileNotFoundError: when the image file could not be opened. - """ - with open(image_file, "rb") as f: - image_data = base64.b64encode(f.read()).decode("utf-8") - image_uri = f"data:image/{image_format};base64,{image_data}" - return cls(image=image_uri, text=text) - - -class BaseStreamingChatCompletions: - """A base class for the sync and async streaming chat completions responses, holding any common code - to deserializes the Server Sent Events (SSE) response stream into chat completions updates, each one - represented by a StreamingChatCompletionsUpdate object. - """ - - # Enable detailed logs of SSE parsing. For development only, should be `False` by default. 
- _ENABLE_CLASS_LOGS = False - - # The prefix of each line in the SSE stream that contains a JSON string - # to deserialize into a StreamingChatCompletionsUpdate object - _SSE_DATA_EVENT_PREFIX = "data: " - - # The line indicating the end of the SSE stream - _SSE_DATA_EVENT_DONE = "data: [DONE]" - - def __init__(self): - self._queue: "queue.Queue[_models.StreamingChatCompletionsUpdate]" = queue.Queue() - self._incomplete_json = "" - self._done = False # Will be set to True when reading 'data: [DONE]' line - - def _deserialize_and_add_to_queue(self, element: bytes) -> bool: - - # Clear the queue of StreamingChatCompletionsUpdate before processing the next block - self._queue.queue.clear() - - # Convert `bytes` to string and split the string by newline, while keeping the new line char. - # the last may be a partial "line" that does not contain a newline char at the end. - line_list: List[str] = re.split(r"(?<=\n)", element.decode("utf-8")) - for index, line in enumerate(line_list): - - if self._ENABLE_CLASS_LOGS: - logger.debug("[Original line] %s", repr(line)) - - if index == 0: - line = self._incomplete_json + line - self._incomplete_json = "" - - if index == len(line_list) - 1 and not line.endswith("\n"): - self._incomplete_json = line - return False - - if self._ENABLE_CLASS_LOGS: - logger.debug("[Modified line] %s", repr(line)) - - if line == "\n": # Empty line, indicating flush output to client - continue - - if not line.startswith(self._SSE_DATA_EVENT_PREFIX): - raise ValueError(f"SSE event not supported (line `{line}`)") - - if line.startswith(self._SSE_DATA_EVENT_DONE): - if self._ENABLE_CLASS_LOGS: - logger.debug("[Done]") - return True - - # If you reached here, the line should contain `data: {...}\n` - # where the curly braces contain a valid JSON object. - # Deserialize it into a StreamingChatCompletionsUpdate object - # and add it to the queue. - # pylint: disable=W0212 # Access to a protected member _deserialize of a client class - update = _models.StreamingChatCompletionsUpdate._deserialize( - json.loads(line[len(self._SSE_DATA_EVENT_PREFIX) : -1]), [] - ) - - # We skip any update that has a None or empty choices list - # (this is what OpenAI Python SDK does) - if update.choices: - - # We update all empty content strings to None - # (this is what OpenAI Python SDK does) - # for choice in update.choices: - # if not choice.delta.content: - # choice.delta.content = None - - self._queue.put(update) - - if self._ENABLE_CLASS_LOGS: - logger.debug("[Added to queue]") - - return False - - -class StreamingChatCompletions(BaseStreamingChatCompletions): - """Represents an interator over StreamingChatCompletionsUpdate objects. It can be used for either synchronous or - asynchronous iterations. The class deserializes the Server Sent Events (SSE) response stream - into chat completions updates, each one represented by a StreamingChatCompletionsUpdate object. 
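A minimal consumption sketch for the synchronous iterator above. It assumes the chat client exposes a complete(..., stream=True) call that returns a StreamingChatCompletions instance and that key-based authentication is used; "<endpoint>" and "<key>" are placeholders.

from azure.core.credentials import AzureKeyCredential
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import UserMessage

client = ChatCompletionsClient(endpoint="<endpoint>", credential=AzureKeyCredential("<key>"))
# stream=True is assumed to return a StreamingChatCompletions wrapping the SSE response
response = client.complete(messages=[UserMessage(content="How many feet are in a mile?")], stream=True)
for update in response:  # each item is a StreamingChatCompletionsUpdate
    if update.choices and update.choices[0].delta.content:
        print(update.choices[0].delta.content, end="")
response.close()  # closes the underlying HTTP response stream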
- """ - - def __init__(self, response: HttpResponse): - super().__init__() - self._response = response - self._bytes_iterator: Iterator[bytes] = response.iter_bytes() - - def __iter__(self) -> Any: - return self - - def __next__(self) -> "_models.StreamingChatCompletionsUpdate": - while self._queue.empty() and not self._done: - self._done = self._read_next_block() - if self._queue.empty(): - raise StopIteration - return self._queue.get() - - def _read_next_block(self) -> bool: - if self._ENABLE_CLASS_LOGS: - logger.debug("[Reading next block]") - try: - element = self._bytes_iterator.__next__() - except StopIteration: - self.close() - return True - return self._deserialize_and_add_to_queue(element) - - def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: # type: ignore - self.close() - - def close(self) -> None: - self._response.close() - - -class AsyncStreamingChatCompletions(BaseStreamingChatCompletions): - """Represents an async interator over StreamingChatCompletionsUpdate objects. - It can be used for either synchronous or asynchronous iterations. The class - deserializes the Server Sent Events (SSE) response stream into chat - completions updates, each one represented by a StreamingChatCompletionsUpdate object. - """ - - def __init__(self, response: AsyncHttpResponse): - super().__init__() - self._response = response - self._bytes_iterator: AsyncIterator[bytes] = response.iter_bytes() - - def __aiter__(self) -> Any: - return self - - async def __anext__(self) -> "_models.StreamingChatCompletionsUpdate": - while self._queue.empty() and not self._done: - self._done = await self._read_next_block_async() - if self._queue.empty(): - raise StopAsyncIteration - return self._queue.get() - - async def _read_next_block_async(self) -> bool: - if self._ENABLE_CLASS_LOGS: - logger.debug("[Reading next block]") - try: - element = await self._bytes_iterator.__anext__() - except StopAsyncIteration: - await self.aclose() - return True - return self._deserialize_and_add_to_queue(element) - - def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: # type: ignore - asyncio.run(self.aclose()) - - async def aclose(self) -> None: - await self._response.close() - - -__all__: List[str] = [ - "AssistantMessage", - "AsyncStreamingChatCompletions", - "ChatCompletions", - "ChatRequestMessage", - "EmbeddingsResult", - "ImageEmbeddingInput", - "ImageUrl", - "StreamingChatCompletions", - "SystemMessage", - "ToolMessage", - "UserMessage", -] # Add all objects you want publicly available to users at this package level +__all__: List[str] = [] # Add all objects you want publicly available to users at this package level def patch_sdk(): diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/__init__.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/__init__.py deleted file mode 100644 index 2e11b31cb6a4..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
-# ------------------------------------ -# pylint: disable=unused-import -from ._patch import patch_sdk as _patch_sdk, PromptTemplate - -_patch_sdk() diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_core.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_core.py deleted file mode 100644 index ec6702995149..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_core.py +++ /dev/null @@ -1,312 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. -# ------------------------------------ -# mypy: disable-error-code="assignment,attr-defined,index,arg-type" -# pylint: disable=line-too-long,R,consider-iterating-dictionary,raise-missing-from,dangerous-default-value -from __future__ import annotations -import os -from dataclasses import dataclass, field, asdict -from pathlib import Path -from typing import Any, AsyncIterator, Dict, Iterator, List, Literal, Union -from ._tracer import Tracer, to_dict -from ._utils import load_json - - -@dataclass -class ToolCall: - id: str - name: str - arguments: str - - -@dataclass -class PropertySettings: - """PropertySettings class to define the properties of the model - - Attributes - ---------- - type : str - The type of the property - default : Any - The default value of the property - description : str - The description of the property - """ - - type: Literal["string", "number", "array", "object", "boolean"] - default: Union[str, int, float, List, Dict, bool, None] = field(default=None) - description: str = field(default="") - - -@dataclass -class ModelSettings: - """ModelSettings class to define the model of the prompty - - Attributes - ---------- - api : str - The api of the model - configuration : Dict - The configuration of the model - parameters : Dict - The parameters of the model - response : Dict - The response of the model - """ - - api: str = field(default="") - configuration: Dict = field(default_factory=dict) - parameters: Dict = field(default_factory=dict) - response: Dict = field(default_factory=dict) - - -@dataclass -class TemplateSettings: - """TemplateSettings class to define the template of the prompty - - Attributes - ---------- - type : str - The type of the template - parser : str - The parser of the template - """ - - type: str = field(default="mustache") - parser: str = field(default="") - - -@dataclass -class Prompty: - """Prompty class to define the prompty - - Attributes - ---------- - name : str - The name of the prompty - description : str - The description of the prompty - authors : List[str] - The authors of the prompty - tags : List[str] - The tags of the prompty - version : str - The version of the prompty - base : str - The base of the prompty - basePrompty : Prompty - The base prompty - model : ModelSettings - The model of the prompty - sample : Dict - The sample of the prompty - inputs : Dict[str, PropertySettings] - The inputs of the prompty - outputs : Dict[str, PropertySettings] - The outputs of the prompty - template : TemplateSettings - The template of the prompty - file : FilePath - The file of the prompty - content : Union[str, List[str], Dict] - The content of the prompty - """ - - # metadata - name: str = field(default="") - description: str = field(default="") - authors: List[str] = field(default_factory=list) - tags: List[str] = field(default_factory=list) - version: str = field(default="") - base: str = field(default="") - basePrompty: Union[Prompty, None] = field(default=None) - # model - model: 
ModelSettings = field(default_factory=ModelSettings) - - # sample - sample: Dict = field(default_factory=dict) - - # input / output - inputs: Dict[str, PropertySettings] = field(default_factory=dict) - outputs: Dict[str, PropertySettings] = field(default_factory=dict) - - # template - template: TemplateSettings = field(default_factory=TemplateSettings) - - file: Union[Path, str] = field(default="") - content: Union[str, List[str], Dict] = field(default="") - - def to_safe_dict(self) -> Dict[str, Any]: - d = {} - if self.model: - d["model"] = asdict(self.model) - _mask_secrets(d, ["model", "configuration"]) - if self.template: - d["template"] = asdict(self.template) - if self.inputs: - d["inputs"] = {k: asdict(v) for k, v in self.inputs.items()} - if self.outputs: - d["outputs"] = {k: asdict(v) for k, v in self.outputs.items()} - if self.file: - d["file"] = str(self.file.as_posix()) if isinstance(self.file, Path) else self.file - return d - - @staticmethod - def hoist_base_prompty(top: Prompty, base: Prompty) -> Prompty: - top.name = base.name if top.name == "" else top.name - top.description = base.description if top.description == "" else top.description - top.authors = list(set(base.authors + top.authors)) - top.tags = list(set(base.tags + top.tags)) - top.version = base.version if top.version == "" else top.version - - top.model.api = base.model.api if top.model.api == "" else top.model.api - top.model.configuration = param_hoisting(top.model.configuration, base.model.configuration) - top.model.parameters = param_hoisting(top.model.parameters, base.model.parameters) - top.model.response = param_hoisting(top.model.response, base.model.response) - - top.sample = param_hoisting(top.sample, base.sample) - - top.basePrompty = base - - return top - - @staticmethod - def _process_file(file: str, parent: Path) -> Any: - file_path = Path(parent / Path(file)).resolve().absolute() - if file_path.exists(): - items = load_json(file_path) - if isinstance(items, list): - return [Prompty.normalize(value, parent) for value in items] - elif isinstance(items, Dict): - return {key: Prompty.normalize(value, parent) for key, value in items.items()} - else: - return items - else: - raise FileNotFoundError(f"File {file} not found") - - @staticmethod - def _process_env(variable: str, env_error=True, default: Union[str, None] = None) -> Any: - if variable in os.environ.keys(): - return os.environ[variable] - else: - if default: - return default - if env_error: - raise ValueError(f"Variable {variable} not found in environment") - - return "" - - @staticmethod - def normalize(attribute: Any, parent: Path, env_error=True) -> Any: - if isinstance(attribute, str): - attribute = attribute.strip() - if attribute.startswith("${") and attribute.endswith("}"): - # check if env or file - variable = attribute[2:-1].split(":") - if variable[0] == "env" and len(variable) > 1: - return Prompty._process_env( - variable[1], - env_error, - variable[2] if len(variable) > 2 else None, - ) - elif variable[0] == "file" and len(variable) > 1: - return Prompty._process_file(variable[1], parent) - else: - raise ValueError(f"Invalid attribute format ({attribute})") - else: - return attribute - elif isinstance(attribute, list): - return [Prompty.normalize(value, parent) for value in attribute] - elif isinstance(attribute, Dict): - return {key: Prompty.normalize(value, parent) for key, value in attribute.items()} - else: - return attribute - - -def param_hoisting(top: Dict[str, Any], bottom: Dict[str, Any], top_key: Union[str, None] = 
None) -> Dict[str, Any]: - if top_key: - new_dict = {**top[top_key]} if top_key in top else {} - else: - new_dict = {**top} - for key, value in bottom.items(): - if not key in new_dict: - new_dict[key] = value - return new_dict - - -class PromptyStream(Iterator): - """PromptyStream class to iterate over LLM stream. - Necessary for Prompty to handle streaming data when tracing.""" - - def __init__(self, name: str, iterator: Iterator): - self.name = name - self.iterator = iterator - self.items: List[Any] = [] - self.__name__ = "PromptyStream" - - def __iter__(self): - return self - - def __next__(self): - try: - # enumerate but add to list - o = self.iterator.__next__() - self.items.append(o) - return o - - except StopIteration: - # StopIteration is raised - # contents are exhausted - if len(self.items) > 0: - with Tracer.start("PromptyStream") as trace: - trace("signature", f"{self.name}.PromptyStream") - trace("inputs", "None") - trace("result", [to_dict(s) for s in self.items]) - - raise StopIteration - - -class AsyncPromptyStream(AsyncIterator): - """AsyncPromptyStream class to iterate over LLM stream. - Necessary for Prompty to handle streaming data when tracing.""" - - def __init__(self, name: str, iterator: AsyncIterator): - self.name = name - self.iterator = iterator - self.items: List[Any] = [] - self.__name__ = "AsyncPromptyStream" - - def __aiter__(self): - return self - - async def __anext__(self): - try: - # enumerate but add to list - o = await self.iterator.__anext__() - self.items.append(o) - return o - - except StopAsyncIteration: - # StopIteration is raised - # contents are exhausted - if len(self.items) > 0: - with Tracer.start("AsyncPromptyStream") as trace: - trace("signature", f"{self.name}.AsyncPromptyStream") - trace("inputs", "None") - trace("result", [to_dict(s) for s in self.items]) - - raise StopAsyncIteration - - -def _mask_secrets(d: Dict[str, Any], path: list[str], patterns: list[str] = ["key", "secret"]) -> bool: - sub_d = d - for key in path: - if key not in sub_d: - return False - sub_d = sub_d[key] - - for k, v in sub_d.items(): - if any([pattern in k.lower() for pattern in patterns]): - sub_d[k] = "*" * len(v) - return True diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_invoker.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_invoker.py deleted file mode 100644 index d682662e7b01..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_invoker.py +++ /dev/null @@ -1,295 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
-# ------------------------------------ -# mypy: disable-error-code="return-value,operator" -# pylint: disable=line-too-long,R,docstring-missing-param,docstring-missing-return,docstring-missing-rtype,unnecessary-pass -import abc -from typing import Any, Callable, Dict, Literal -from ._tracer import trace -from ._core import Prompty - - -class Invoker(abc.ABC): - """Abstract class for Invoker - - Attributes - ---------- - prompty : Prompty - The prompty object - name : str - The name of the invoker - - """ - - def __init__(self, prompty: Prompty) -> None: - self.prompty = prompty - self.name = self.__class__.__name__ - - @abc.abstractmethod - def invoke(self, data: Any) -> Any: - """Abstract method to invoke the invoker - - Parameters - ---------- - data : Any - The data to be invoked - - Returns - ------- - Any - The invoked - """ - pass - - @abc.abstractmethod - async def invoke_async(self, data: Any) -> Any: - """Abstract method to invoke the invoker asynchronously - - Parameters - ---------- - data : Any - The data to be invoked - - Returns - ------- - Any - The invoked - """ - pass - - @trace - def run(self, data: Any) -> Any: - """Method to run the invoker - - Parameters - ---------- - data : Any - The data to be invoked - - Returns - ------- - Any - The invoked - """ - return self.invoke(data) - - @trace - async def run_async(self, data: Any) -> Any: - """Method to run the invoker asynchronously - - Parameters - ---------- - data : Any - The data to be invoked - - Returns - ------- - Any - The invoked - """ - return await self.invoke_async(data) - - -class InvokerFactory: - """Factory class for Invoker""" - - _renderers: Dict[str, Invoker] = {} - _parsers: Dict[str, Invoker] = {} - _executors: Dict[str, Invoker] = {} - _processors: Dict[str, Invoker] = {} - - @classmethod - def add_renderer(cls, name: str, invoker: Invoker) -> None: - cls._renderers[name] = invoker - - @classmethod - def add_parser(cls, name: str, invoker: Invoker) -> None: - cls._parsers[name] = invoker - - @classmethod - def add_executor(cls, name: str, invoker: Invoker) -> None: - cls._executors[name] = invoker - - @classmethod - def add_processor(cls, name: str, invoker: Invoker) -> None: - cls._processors[name] = invoker - - @classmethod - def register_renderer(cls, name: str) -> Callable: - def inner_wrapper(wrapped_class: Invoker) -> Callable: - cls._renderers[name] = wrapped_class - return wrapped_class # type: ignore - - return inner_wrapper - - @classmethod - def register_parser(cls, name: str) -> Callable: - def inner_wrapper(wrapped_class: Invoker) -> Callable: - cls._parsers[name] = wrapped_class - return wrapped_class # type: ignore - - return inner_wrapper - - @classmethod - def register_executor(cls, name: str) -> Callable: - def inner_wrapper(wrapped_class: Invoker) -> Callable: - cls._executors[name] = wrapped_class - return wrapped_class # type: ignore - - return inner_wrapper - - @classmethod - def register_processor(cls, name: str) -> Callable: - def inner_wrapper(wrapped_class: Invoker) -> Callable: - cls._processors[name] = wrapped_class - return wrapped_class # type: ignore - - return inner_wrapper - - @classmethod - def _get_name( - cls, - type: Literal["renderer", "parser", "executor", "processor"], - prompty: Prompty, - ) -> str: - if type == "renderer": - return prompty.template.type - elif type == "parser": - return f"{prompty.template.parser}.{prompty.model.api}" - elif type == "executor": - return prompty.model.configuration["type"] - elif type == "processor": - return 
prompty.model.configuration["type"] - else: - raise ValueError(f"Type {type} not found") - - @classmethod - def _get_invoker( - cls, - type: Literal["renderer", "parser", "executor", "processor"], - prompty: Prompty, - ) -> Invoker: - if type == "renderer": - name = prompty.template.type - if name not in cls._renderers: - raise ValueError(f"Renderer {name} not found") - - return cls._renderers[name](prompty) # type: ignore - - elif type == "parser": - name = f"{prompty.template.parser}.{prompty.model.api}" - if name not in cls._parsers: - raise ValueError(f"Parser {name} not found") - - return cls._parsers[name](prompty) # type: ignore - - elif type == "executor": - name = prompty.model.configuration["type"] - if name not in cls._executors: - raise ValueError(f"Executor {name} not found") - - return cls._executors[name](prompty) # type: ignore - - elif type == "processor": - name = prompty.model.configuration["type"] - if name not in cls._processors: - raise ValueError(f"Processor {name} not found") - - return cls._processors[name](prompty) # type: ignore - - else: - raise ValueError(f"Type {type} not found") - - @classmethod - def run( - cls, - type: Literal["renderer", "parser", "executor", "processor"], - prompty: Prompty, - data: Any, - default: Any = None, - ): - name = cls._get_name(type, prompty) - if name.startswith("NOOP") and default is not None: - return default - elif name.startswith("NOOP"): - return data - - invoker = cls._get_invoker(type, prompty) - value = invoker.run(data) - return value - - @classmethod - async def run_async( - cls, - type: Literal["renderer", "parser", "executor", "processor"], - prompty: Prompty, - data: Any, - default: Any = None, - ): - name = cls._get_name(type, prompty) - if name.startswith("NOOP") and default is not None: - return default - elif name.startswith("NOOP"): - return data - invoker = cls._get_invoker(type, prompty) - value = await invoker.run_async(data) - return value - - @classmethod - def run_renderer(cls, prompty: Prompty, data: Any, default: Any = None) -> Any: - return cls.run("renderer", prompty, data, default) - - @classmethod - async def run_renderer_async(cls, prompty: Prompty, data: Any, default: Any = None) -> Any: - return await cls.run_async("renderer", prompty, data, default) - - @classmethod - def run_parser(cls, prompty: Prompty, data: Any, default: Any = None) -> Any: - return cls.run("parser", prompty, data, default) - - @classmethod - async def run_parser_async(cls, prompty: Prompty, data: Any, default: Any = None) -> Any: - return await cls.run_async("parser", prompty, data, default) - - @classmethod - def run_executor(cls, prompty: Prompty, data: Any, default: Any = None) -> Any: - return cls.run("executor", prompty, data, default) - - @classmethod - async def run_executor_async(cls, prompty: Prompty, data: Any, default: Any = None) -> Any: - return await cls.run_async("executor", prompty, data, default) - - @classmethod - def run_processor(cls, prompty: Prompty, data: Any, default: Any = None) -> Any: - return cls.run("processor", prompty, data, default) - - @classmethod - async def run_processor_async(cls, prompty: Prompty, data: Any, default: Any = None) -> Any: - return await cls.run_async("processor", prompty, data, default) - - -class InvokerException(Exception): - """Exception class for Invoker""" - - def __init__(self, message: str, type: str) -> None: - super().__init__(message) - self.type = type - - def __str__(self) -> str: - return f"{super().__str__()}. 
Make sure to pip install any necessary package extras (i.e. could be something like `pip install prompty[{self.type}]`) for {self.type} as well as import the appropriate invokers (i.e. could be something like `import prompty.{self.type}`)." - - -@InvokerFactory.register_renderer("NOOP") -@InvokerFactory.register_parser("NOOP") -@InvokerFactory.register_executor("NOOP") -@InvokerFactory.register_processor("NOOP") -@InvokerFactory.register_parser("prompty.embedding") -@InvokerFactory.register_parser("prompty.image") -@InvokerFactory.register_parser("prompty.completion") -class NoOp(Invoker): - def invoke(self, data: Any) -> Any: - return data - - async def invoke_async(self, data: str) -> Any: - return self.invoke(data) diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_mustache.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_mustache.py deleted file mode 100644 index f7a0c21d8bb8..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_mustache.py +++ /dev/null @@ -1,671 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. -# ------------------------------------ -# pylint: disable=line-too-long,R,consider-using-dict-items,docstring-missing-return,docstring-missing-rtype,docstring-missing-param,global-statement,unused-argument,global-variable-not-assigned,protected-access,logging-fstring-interpolation,deprecated-method -from __future__ import annotations -import logging -from collections.abc import Iterator, Sequence -from types import MappingProxyType -from typing import ( - Any, - Dict, - List, - Literal, - Mapping, - Optional, - Union, - cast, -) -from typing_extensions import TypeAlias - -logger = logging.getLogger(__name__) - - -Scopes: TypeAlias = List[Union[Literal[False, 0], Mapping[str, Any]]] - - -# Globals -_CURRENT_LINE = 1 -_LAST_TAG_LINE = None - - -class ChevronError(SyntaxError): - """Custom exception for Chevron errors.""" - - -# -# Helper functions -# - - -def grab_literal(template: str, l_del: str) -> tuple[str, str]: - """Parse a literal from the template. - - Args: - template: The template to parse. - l_del: The left delimiter. - - Returns: - Tuple[str, str]: The literal and the template. - """ - - global _CURRENT_LINE - - try: - # Look for the next tag and move the template to it - literal, template = template.split(l_del, 1) - _CURRENT_LINE += literal.count("\n") - return (literal, template) - - # There are no more tags in the template? - except ValueError: - # Then the rest of the template is a literal - return (template, "") - - -def l_sa_check(template: str, literal: str, is_standalone: bool) -> bool: - """Do a preliminary check to see if a tag could be a standalone. - - Args: - template: The template. (Not used.) - literal: The literal. - is_standalone: Whether the tag is standalone. - - Returns: - bool: Whether the tag could be a standalone. - """ - - # If there is a newline, or the previous tag was a standalone - if literal.find("\n") != -1 or is_standalone: - padding = literal.split("\n")[-1] - - # If all the characters since the last newline are spaces - # Then the next tag could be a standalone - # Otherwise it can't be - return padding.isspace() or padding == "" - else: - return False - - -def r_sa_check(template: str, tag_type: str, is_standalone: bool) -> bool: - """Do a final check to see if a tag could be a standalone. - - Args: - template: The template. - tag_type: The type of the tag. - is_standalone: Whether the tag is standalone. 
- - Returns: - bool: Whether the tag could be a standalone. - """ - - # Check right side if we might be a standalone - if is_standalone and tag_type not in ["variable", "no escape"]: - on_newline = template.split("\n", 1) - - # If the stuff to the right of us are spaces we're a standalone - return on_newline[0].isspace() or not on_newline[0] - - # If we're a tag can't be a standalone - else: - return False - - -def parse_tag(template: str, l_del: str, r_del: str) -> tuple[tuple[str, str], str]: - """Parse a tag from a template. - - Args: - template: The template. - l_del: The left delimiter. - r_del: The right delimiter. - - Returns: - Tuple[Tuple[str, str], str]: The tag and the template. - - Raises: - ChevronError: If the tag is unclosed. - ChevronError: If the set delimiter tag is unclosed. - """ - global _CURRENT_LINE - global _LAST_TAG_LINE - - tag_types = { - "!": "comment", - "#": "section", - "^": "inverted section", - "/": "end", - ">": "partial", - "=": "set delimiter?", - "{": "no escape?", - "&": "no escape", - } - - # Get the tag - try: - tag, template = template.split(r_del, 1) - except ValueError as e: - msg = "unclosed tag " f"at line {_CURRENT_LINE}" - raise ChevronError(msg) from e - - # Find the type meaning of the first character - tag_type = tag_types.get(tag[0], "variable") - - # If the type is not a variable - if tag_type != "variable": - # Then that first character is not needed - tag = tag[1:] - - # If we might be a set delimiter tag - if tag_type == "set delimiter?": - # Double check to make sure we are - if tag.endswith("="): - tag_type = "set delimiter" - # Remove the equal sign - tag = tag[:-1] - - # Otherwise we should complain - else: - msg = "unclosed set delimiter tag\n" f"at line {_CURRENT_LINE}" - raise ChevronError(msg) - - elif ( - # If we might be a no html escape tag - tag_type == "no escape?" - # And we have a third curly brace - # (And are using curly braces as delimiters) - and l_del == "{{" - and r_del == "}}" - and template.startswith("}") - ): - # Then we are a no html escape tag - template = template[1:] - tag_type = "no escape" - - # Strip the whitespace off the key and return - return ((tag_type, tag.strip()), template) - - -# -# The main tokenizing function -# - - -def tokenize(template: str, def_ldel: str = "{{", def_rdel: str = "}}") -> Iterator[tuple[str, str]]: - """Tokenize a mustache template. - - Tokenizes a mustache template in a generator fashion, - using file-like objects. It also accepts a string containing - the template. - - - Arguments: - - template -- a file-like object, or a string of a mustache template - - def_ldel -- The default left delimiter - ("{{" by default, as in spec compliant mustache) - - def_rdel -- The default right delimiter - ("}}" by default, as in spec compliant mustache) - - - Returns: - - A generator of mustache tags in the form of a tuple - - -- (tag_type, tag_key) - - Where tag_type is one of: - * literal - * section - * inverted section - * end - * partial - * no escape - - And tag_key is either the key or in the case of a literal tag, - the literal itself. 
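For concreteness, the token stream the generator above yields for a small template, assuming the default "{{" / "}}" delimiters; the import path refers to the module as it existed before this deletion.

from azure.ai.inference.prompts._mustache import tokenize

tokens = list(tokenize("Hello, {{name}}! {{#admin}}You are an admin.{{/admin}}"))
# tokens == [("literal", "Hello, "), ("variable", "name"),
#            ("literal", "! "), ("section", "admin"),
#            ("literal", "You are an admin."), ("end", "admin")]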
- """ - - global _CURRENT_LINE, _LAST_TAG_LINE - _CURRENT_LINE = 1 - _LAST_TAG_LINE = None - - is_standalone = True - open_sections = [] - l_del = def_ldel - r_del = def_rdel - - while template: - literal, template = grab_literal(template, l_del) - - # If the template is completed - if not template: - # Then yield the literal and leave - yield ("literal", literal) - break - - # Do the first check to see if we could be a standalone - is_standalone = l_sa_check(template, literal, is_standalone) - - # Parse the tag - tag, template = parse_tag(template, l_del, r_del) - tag_type, tag_key = tag - - # Special tag logic - - # If we are a set delimiter tag - if tag_type == "set delimiter": - # Then get and set the delimiters - dels = tag_key.strip().split(" ") - l_del, r_del = dels[0], dels[-1] - - # If we are a section tag - elif tag_type in ["section", "inverted section"]: - # Then open a new section - open_sections.append(tag_key) - _LAST_TAG_LINE = _CURRENT_LINE - - # If we are an end tag - elif tag_type == "end": - # Then check to see if the last opened section - # is the same as us - try: - last_section = open_sections.pop() - except IndexError as e: - msg = f'Trying to close tag "{tag_key}"\n' "Looks like it was not opened.\n" f"line {_CURRENT_LINE + 1}" - raise ChevronError(msg) from e - if tag_key != last_section: - # Otherwise we need to complain - msg = ( - f'Trying to close tag "{tag_key}"\n' - f'last open tag is "{last_section}"\n' - f"line {_CURRENT_LINE + 1}" - ) - raise ChevronError(msg) - - # Do the second check to see if we're a standalone - is_standalone = r_sa_check(template, tag_type, is_standalone) - - # Which if we are - if is_standalone: - # Remove the stuff before the newline - template = template.split("\n", 1)[-1] - - # Partials need to keep the spaces on their left - if tag_type != "partial": - # But other tags don't - literal = literal.rstrip(" ") - - # Start yielding - # Ignore literals that are empty - if literal != "": - yield ("literal", literal) - - # Ignore comments and set delimiters - if tag_type not in ["comment", "set delimiter?"]: - yield (tag_type, tag_key) - - # If there are any open sections when we're done - if open_sections: - # Then we need to complain - msg = ( - "Unexpected EOF\n" - f'the tag "{open_sections[-1]}" was never closed\n' - f"was opened at line {_LAST_TAG_LINE}" - ) - raise ChevronError(msg) - - -# -# Helper functions -# - - -def _html_escape(string: str) -> str: - """HTML escape all of these " & < >""" - - html_codes = { - '"': """, - "<": "<", - ">": ">", - } - - # & must be handled first - string = string.replace("&", "&") - for char in html_codes: - string = string.replace(char, html_codes[char]) - return string - - -def _get_key( - key: str, - scopes: Scopes, - warn: bool, - keep: bool, - def_ldel: str, - def_rdel: str, -) -> Any: - """Get a key from the current scope""" - - # If the key is a dot - if key == ".": - # Then just return the current scope - return scopes[0] - - # Loop through the scopes - for scope in scopes: - try: - # Return an empty string if falsy, with two exceptions - # 0 should return 0, and False should return False - if scope in (0, False): - return scope - - # For every dot separated key - for child in key.split("."): - # Return an empty string if falsy, with two exceptions - # 0 should return 0, and False should return False - if scope in (0, False): - return scope - # Move into the scope - try: - # Try subscripting (Normal dictionaries) - scope = cast(Dict[str, Any], scope)[child] - except (TypeError, 
AttributeError): - try: - scope = getattr(scope, child) - except (TypeError, AttributeError): - # Try as a list - scope = scope[int(child)] # type: ignore - - try: - # This allows for custom falsy data types - # https://github.com/noahmorrison/chevron/issues/35 - if scope._CHEVRON_return_scope_when_falsy: # type: ignore - return scope - except AttributeError: - if scope in (0, False): - return scope - return scope or "" - except (AttributeError, KeyError, IndexError, ValueError): - # We couldn't find the key in the current scope - # We'll try again on the next pass - pass - - # We couldn't find the key in any of the scopes - - if warn: - logger.warn(f"Could not find key '{key}'") - - if keep: - return f"{def_ldel} {key} {def_rdel}" - - return "" - - -def _get_partial(name: str, partials_dict: Mapping[str, str]) -> str: - """Load a partial""" - try: - # Maybe the partial is in the dictionary - return partials_dict[name] - except KeyError: - return "" - - -# -# The main rendering function -# -g_token_cache: Dict[str, List[tuple[str, str]]] = {} - -EMPTY_DICT: MappingProxyType[str, str] = MappingProxyType({}) - - -def render( - template: Union[str, List[tuple[str, str]]] = "", - data: Mapping[str, Any] = EMPTY_DICT, - partials_dict: Mapping[str, str] = EMPTY_DICT, - padding: str = "", - def_ldel: str = "{{", - def_rdel: str = "}}", - scopes: Optional[Scopes] = None, - warn: bool = False, - keep: bool = False, -) -> str: - """Render a mustache template. - - Renders a mustache template with a data scope and inline partial capability. - - Arguments: - - template -- A file-like object or a string containing the template. - - data -- A python dictionary with your data scope. - - partials_path -- The path to where your partials are stored. - If set to None, then partials won't be loaded from the file system - (defaults to '.'). - - partials_ext -- The extension that you want the parser to look for - (defaults to 'mustache'). - - partials_dict -- A python dictionary which will be search for partials - before the filesystem is. {'include': 'foo'} is the same - as a file called include.mustache - (defaults to {}). - - padding -- This is for padding partials, and shouldn't be used - (but can be if you really want to). - - def_ldel -- The default left delimiter - ("{{" by default, as in spec compliant mustache). - - def_rdel -- The default right delimiter - ("}}" by default, as in spec compliant mustache). - - scopes -- The list of scopes that get_key will look through. - - warn -- Log a warning when a template substitution isn't found in the data - - keep -- Keep unreplaced tags when a substitution isn't found in the data. - - - Returns: - - A string containing the rendered template. 
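A minimal rendering sketch for the function described above, again assuming the pre-deletion import path; sections iterate over list scopes, and "{{.}}" refers to the current item.

from azure.ai.inference.prompts._mustache import render

text = render(
    "Hello, {{name}}! {{#items}}{{.}} {{/items}}",
    {"name": "World", "items": ["a", "b"]},
)
# text == "Hello, World! a b "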
- """ - - # If the template is a sequence but not derived from a string - if isinstance(template, Sequence) and not isinstance(template, str): - # Then we don't need to tokenize it - # But it does need to be a generator - tokens: Iterator[tuple[str, str]] = (token for token in template) - else: - if template in g_token_cache: - tokens = (token for token in g_token_cache[template]) - else: - # Otherwise make a generator - tokens = tokenize(template, def_ldel, def_rdel) - - output = "" - - if scopes is None: - scopes = [data] - - # Run through the tokens - for tag, key in tokens: - # Set the current scope - current_scope = scopes[0] - - # If we're an end tag - if tag == "end": - # Pop out of the latest scope - del scopes[0] - - # If the current scope is falsy and not the only scope - elif not current_scope and len(scopes) != 1: - if tag in ["section", "inverted section"]: - # Set the most recent scope to a falsy value - scopes.insert(0, False) - - # If we're a literal tag - elif tag == "literal": - # Add padding to the key and add it to the output - output += key.replace("\n", "\n" + padding) - - # If we're a variable tag - elif tag == "variable": - # Add the html escaped key to the output - thing = _get_key(key, scopes, warn=warn, keep=keep, def_ldel=def_ldel, def_rdel=def_rdel) - if thing is True and key == ".": - # if we've coerced into a boolean by accident - # (inverted tags do this) - # then get the un-coerced object (next in the stack) - thing = scopes[1] - if not isinstance(thing, str): - thing = str(thing) - output += _html_escape(thing) - - # If we're a no html escape tag - elif tag == "no escape": - # Just lookup the key and add it - thing = _get_key(key, scopes, warn=warn, keep=keep, def_ldel=def_ldel, def_rdel=def_rdel) - if not isinstance(thing, str): - thing = str(thing) - output += thing - - # If we're a section tag - elif tag == "section": - # Get the sections scope - scope = _get_key(key, scopes, warn=warn, keep=keep, def_ldel=def_ldel, def_rdel=def_rdel) - - # If the scope is a callable (as described in - # https://mustache.github.io/mustache.5.html) - if callable(scope): - # Generate template text from tags - text = "" - tags: List[tuple[str, str]] = [] - for token in tokens: - if token == ("end", key): - break - - tags.append(token) - tag_type, tag_key = token - if tag_type == "literal": - text += tag_key - elif tag_type == "no escape": - text += f"{def_ldel}& {tag_key} {def_rdel}" - else: - text += "{}{} {}{}".format( - def_ldel, - { - "comment": "!", - "section": "#", - "inverted section": "^", - "end": "/", - "partial": ">", - "set delimiter": "=", - "no escape": "&", - "variable": "", - }[tag_type], - tag_key, - def_rdel, - ) - - g_token_cache[text] = tags - - rend = scope( - text, - lambda template, data=None: render( - template, - data={}, - partials_dict=partials_dict, - padding=padding, - def_ldel=def_ldel, - def_rdel=def_rdel, - scopes=data and [data] + scopes or scopes, - warn=warn, - keep=keep, - ), - ) - - output += rend # type: ignore[reportOperatorIssue] - - # If the scope is a sequence, an iterator or generator but not - # derived from a string - elif isinstance(scope, (Sequence, Iterator)) and not isinstance(scope, str): - # Then we need to do some looping - - # Gather up all the tags inside the section - # (And don't be tricked by nested end tags with the same key) - # TODO: This feels like it still has edge cases, no? 
- tags = [] - tags_with_same_key = 0 - for token in tokens: - if token == ("section", key): - tags_with_same_key += 1 - if token == ("end", key): - tags_with_same_key -= 1 - if tags_with_same_key < 0: - break - tags.append(token) - - # For every item in the scope - for thing in scope: - # Append it as the most recent scope and render - new_scope = [thing] + scopes - rend = render( - template=tags, - scopes=new_scope, - padding=padding, - partials_dict=partials_dict, - def_ldel=def_ldel, - def_rdel=def_rdel, - warn=warn, - keep=keep, - ) - - output += rend - - else: - # Otherwise we're just a scope section - scopes.insert(0, scope) # type: ignore[reportArgumentType] - - # If we're an inverted section - elif tag == "inverted section": - # Add the flipped scope to the scopes - scope = _get_key(key, scopes, warn=warn, keep=keep, def_ldel=def_ldel, def_rdel=def_rdel) - scopes.insert(0, cast(Literal[False], not scope)) - - # If we're a partial - elif tag == "partial": - # Load the partial - partial = _get_partial(key, partials_dict) - - # Find what to pad the partial with - left = output.rpartition("\n")[2] - part_padding = padding - if left.isspace(): - part_padding += left - - # Render the partial - part_out = render( - template=partial, - partials_dict=partials_dict, - def_ldel=def_ldel, - def_rdel=def_rdel, - padding=part_padding, - scopes=scopes, - warn=warn, - keep=keep, - ) - - # If the partial was indented - if left.isspace(): - # then remove the spaces from the end - part_out = part_out.rstrip(" \t") - - # Add the partials output to the output - output += part_out - - return output diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_parsers.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_parsers.py deleted file mode 100644 index de3c570e5c89..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_parsers.py +++ /dev/null @@ -1,156 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
-# ------------------------------------ -# mypy: disable-error-code="union-attr,return-value" -# pylint: disable=line-too-long,R,consider-using-enumerate,docstring-missing-param,docstring-missing-return,docstring-missing-rtype -import re -import base64 -from pathlib import Path -from typing import Any, Union -from ._core import Prompty -from ._invoker import Invoker, InvokerFactory - - -ROLES = ["assistant", "function", "system", "user"] - - -@InvokerFactory.register_parser("prompty.chat") -class PromptyChatParser(Invoker): - """Prompty Chat Parser""" - - def __init__(self, prompty: Prompty) -> None: - super().__init__(prompty) - self.path = Path(self.prompty.file).parent - - def invoke(self, data: str) -> Any: - return invoke_parser(self.path, data) - - async def invoke_async(self, data: str) -> Any: - """Invoke the Prompty Chat Parser (Async) - - Parameters - ---------- - data : str - The data to parse - - Returns - ------- - str - The parsed data - """ - return self.invoke(data) - - -def _inline_image(path: Union[Path, None], image_item: str) -> str: - """Inline Image - - Parameters - ---------- - image_item : str - The image item to inline - - Returns - ------- - str - The inlined image - """ - # pass through if it's a url or base64 encoded or the path is None - if image_item.startswith("http") or image_item.startswith("data") or path is None: - return image_item - # otherwise, it's a local file - need to base64 encode it - else: - image_path = (path if path is not None else Path(".")) / image_item - with open(image_path, "rb") as f: - base64_image = base64.b64encode(f.read()).decode("utf-8") - - if image_path.suffix == ".png": - return f"data:image/png;base64,{base64_image}" - elif image_path.suffix == ".jpg": - return f"data:image/jpeg;base64,{base64_image}" - elif image_path.suffix == ".jpeg": - return f"data:image/jpeg;base64,{base64_image}" - else: - raise ValueError( - f"Invalid image format {image_path.suffix} - currently only .png and .jpg / .jpeg are supported." 
- ) - - -def _parse_content(path: Union[Path, None], content: str): - """for parsing inline images - - Parameters - ---------- - content : str - The content to parse - - Returns - ------- - any - The parsed content - """ - # regular expression to parse markdown images - image = r"(?P!\[[^\]]*\])\((?P.*?)(?=\"|\))\)" - matches = re.findall(image, content, flags=re.MULTILINE) - if len(matches) > 0: - content_items = [] - content_chunks = re.split(image, content, flags=re.MULTILINE) - current_chunk = 0 - for i in range(len(content_chunks)): - # image entry - if current_chunk < len(matches) and content_chunks[i] == matches[current_chunk][0]: - content_items.append( - { - "type": "image_url", - "image_url": {"url": _inline_image(path, matches[current_chunk][1].split(" ")[0].strip())}, - } - ) - # second part of image entry - elif current_chunk < len(matches) and content_chunks[i] == matches[current_chunk][1]: - current_chunk += 1 - # text entry - else: - if len(content_chunks[i].strip()) > 0: - content_items.append({"type": "text", "text": content_chunks[i].strip()}) - return content_items - else: - return content - - -def invoke_parser(path: Union[Path, None], data: str) -> Any: - """Invoke the Prompty Chat Parser - - Parameters - ---------- - data : str - The data to parse - - Returns - ------- - str - The parsed data - """ - messages = [] - separator = r"(?i)^\s*#?\s*(" + "|".join(ROLES) + r")\s*:\s*\n" - - # get valid chunks - remove empty items - chunks = [item for item in re.split(separator, data, flags=re.MULTILINE) if len(item.strip()) > 0] - - # if no starter role, then inject system role - if not chunks[0].strip().lower() in ROLES: - chunks.insert(0, "system") - - # if last chunk is role entry, then remove (no content?) - if chunks[-1].strip().lower() in ROLES: - chunks.pop() - - if len(chunks) % 2 != 0: - raise ValueError("Invalid prompt format") - - # create messages - for i in range(0, len(chunks), 2): - role = chunks[i].strip().lower() - content = chunks[i + 1].strip() - messages.append({"role": role, "content": _parse_content(path, content)}) - - return messages diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_patch.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_patch.py deleted file mode 100644 index 14ad4f62b4c1..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_patch.py +++ /dev/null @@ -1,124 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. -# ------------------------------------ -# pylint: disable=line-too-long,R -"""Customize generated code here. - -Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize -""" - -import traceback -from pathlib import Path -from typing import Any, Dict, List, Optional -from typing_extensions import Self -from ._core import Prompty -from ._mustache import render -from ._parsers import invoke_parser -from ._prompty_utils import load, prepare -from ._utils import remove_leading_empty_space - - -class PromptTemplate: - """The helper class which takes variant of inputs, e.g. Prompty format or string, and returns the parsed prompt in an array.""" - - @classmethod - def from_prompty(cls, file_path: str) -> Self: - """Initialize a PromptTemplate object from a prompty file. - - :param file_path: The path to the prompty file. - :type file_path: str - :return: The PromptTemplate object. 
- :rtype: PromptTemplate - """ - if not file_path: - raise ValueError("Please provide file_path") - - # Get the absolute path of the file by `traceback.extract_stack()`, it's "-2" because: - # In the stack, the last function is the current function. - # The second last function is the caller function, which is the root of the file_path. - stack = traceback.extract_stack() - caller = Path(stack[-2].filename) - abs_file_path = Path(caller.parent / Path(file_path)).resolve().absolute() - - prompty = load(str(abs_file_path)) - return cls(prompty=prompty) - - @classmethod - def from_string(cls, prompt_template: str, api: str = "chat", model_name: Optional[str] = None) -> Self: - """Initialize a PromptTemplate object from a message template. - - :param prompt_template: The prompt template string. - :type prompt_template: str - :param api: The API type, e.g. "chat" or "completion". - :type api: str - :param model_name: The model name, e.g. "gpt-4o-mini". - :type model_name: str - :return: The PromptTemplate object. - :rtype: PromptTemplate - """ - return cls( - api=api, - prompt_template=prompt_template, - model_name=model_name, - prompty=None, - ) - - def __init__( - self, - *, - api: str = "chat", - prompty: Optional[Prompty] = None, - prompt_template: Optional[str] = None, - model_name: Optional[str] = None, - ) -> None: - self.prompty = prompty - if self.prompty is not None: - self.model_name = ( - self.prompty.model.configuration["azure_deployment"] - if "azure_deployment" in self.prompty.model.configuration - else None - ) - self.parameters = self.prompty.model.parameters - self._config = {} - elif prompt_template is not None: - self.model_name = model_name - self.parameters = {} - # _config is a dict to hold the internal configuration - self._config = { - "api": api if api is not None else "chat", - "prompt_template": prompt_template, - } - else: - raise ValueError("Please pass valid arguments for PromptTemplate") - - def create_messages(self, data: Optional[Dict[str, Any]] = None, **kwargs) -> List[Dict[str, Any]]: - """Render the prompt template with the given data. - - :param data: The data to render the prompt template with. - :type data: Optional[Dict[str, Any]] - :return: The rendered prompt template. - :rtype: List[Dict[str, Any]] - """ - if data is None: - data = kwargs - - if self.prompty is not None: - parsed = prepare(self.prompty, data) - return parsed - elif "prompt_template" in self._config: - prompt_template = remove_leading_empty_space(self._config["prompt_template"]) - system_prompt_str = render(prompt_template, data) - parsed = invoke_parser(None, system_prompt_str) - return parsed - else: - raise ValueError("Please provide valid prompt template") - - -def patch_sdk(): - """Do not remove from this file. - - `patch_sdk` is a last resort escape hatch that allows you to do customizations - you can't accomplish using the techniques described in - https://aka.ms/azsdk/python/dpcodegen/python/customize - """ diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_prompty_utils.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_prompty_utils.py deleted file mode 100644 index 5ea38bda6229..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_prompty_utils.py +++ /dev/null @@ -1,415 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
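For context on the public surface this patch removes, a small usage sketch of the PromptTemplate helper defined above, assuming the pre-deletion import path azure.ai.inference.prompts and a simple role-delimited template string.

from azure.ai.inference.prompts import PromptTemplate

template = PromptTemplate.from_string(
    "system:\nYou are a helpful assistant.\nuser:\nWhat can you tell me about {{city}}?"
)
messages = template.create_messages(city="Seattle")
# messages is a list of chat messages ready to send to a chat model, e.g.:
# [{"role": "system", "content": "You are a helpful assistant."},
#  {"role": "user", "content": "What can you tell me about Seattle?"}]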
-# ------------------------------------ -# mypy: disable-error-code="assignment" -# pylint: disable=R,docstring-missing-param,docstring-missing-return,docstring-missing-rtype,dangerous-default-value,redefined-outer-name,unused-wildcard-import,wildcard-import,raise-missing-from -import traceback -from pathlib import Path -from typing import Any, Dict, List, Union -from ._tracer import trace -from ._invoker import InvokerFactory -from ._core import ( - ModelSettings, - Prompty, - PropertySettings, - TemplateSettings, - param_hoisting, -) -from ._utils import ( - load_global_config, - load_prompty, -) - -from ._renderers import * -from ._parsers import * - - -@trace(description="Create a headless prompty object for programmatic use.") -def headless( - api: str, - content: Union[str, List[str], dict], - configuration: Dict[str, Any] = {}, - parameters: Dict[str, Any] = {}, - connection: str = "default", -) -> Prompty: - """Create a headless prompty object for programmatic use. - - Parameters - ---------- - api : str - The API to use for the model - content : Union[str, List[str], dict] - The content to process - configuration : Dict[str, Any], optional - The configuration to use, by default {} - parameters : Dict[str, Any], optional - The parameters to use, by default {} - connection : str, optional - The connection to use, by default "default" - - Returns - ------- - Prompty - The headless prompty object - - Example - ------- - >>> import prompty - >>> p = prompty.headless( - api="embedding", - configuration={"type": "azure", "azure_deployment": "text-embedding-ada-002"}, - content="hello world", - ) - >>> emb = prompty.execute(p) - - """ - - # get caller's path (to get relative path for prompty.json) - caller = Path(traceback.extract_stack()[-2].filename) - templateSettings = TemplateSettings(type="NOOP", parser="NOOP") - modelSettings = ModelSettings( - api=api, - configuration=Prompty.normalize( - param_hoisting(configuration, load_global_config(caller.parent, connection)), - caller.parent, - ), - parameters=parameters, - ) - - return Prompty(model=modelSettings, template=templateSettings, content=content) - - -def _load_raw_prompty(attributes: dict, content: str, p: Path, global_config: dict): - if "model" not in attributes: - attributes["model"] = {} - - if "configuration" not in attributes["model"]: - attributes["model"]["configuration"] = global_config - else: - attributes["model"]["configuration"] = param_hoisting( - attributes["model"]["configuration"], - global_config, - ) - - # pull model settings out of attributes - try: - model = ModelSettings(**attributes.pop("model")) - except Exception as e: - raise ValueError(f"Error in model settings: {e}") - - # pull template settings - try: - if "template" in attributes: - t = attributes.pop("template") - if isinstance(t, dict): - template = TemplateSettings(**t) - # has to be a string denoting the type - else: - template = TemplateSettings(type=t, parser="prompty") - else: - template = TemplateSettings(type="mustache", parser="prompty") - except Exception as e: - raise ValueError(f"Error in template loader: {e}") - - # formalize inputs and outputs - if "inputs" in attributes: - try: - inputs = {k: PropertySettings(**v) for (k, v) in attributes.pop("inputs").items()} - except Exception as e: - raise ValueError(f"Error in inputs: {e}") - else: - inputs = {} - if "outputs" in attributes: - try: - outputs = {k: PropertySettings(**v) for (k, v) in attributes.pop("outputs").items()} - except Exception as e: - raise ValueError(f"Error in 
outputs: {e}") - else: - outputs = {} - - prompty = Prompty( - **attributes, - model=model, - inputs=inputs, - outputs=outputs, - template=template, - content=content, - file=p, - ) - - return prompty - - -@trace(description="Load a prompty file.") -def load(prompty_file: Union[str, Path], configuration: str = "default") -> Prompty: - """Load a prompty file. - - Parameters - ---------- - prompty_file : Union[str, Path] - The path to the prompty file - configuration : str, optional - The configuration to use, by default "default" - - Returns - ------- - Prompty - The loaded prompty object - - Example - ------- - >>> import prompty - >>> p = prompty.load("prompts/basic.prompty") - >>> print(p) - """ - - p = Path(prompty_file) - if not p.is_absolute(): - # get caller's path (take into account trace frame) - caller = Path(traceback.extract_stack()[-3].filename) - p = Path(caller.parent / p).resolve().absolute() - - # load dictionary from prompty file - matter = load_prompty(p) - - attributes = matter["attributes"] - content = matter["body"] - - # normalize attribute dictionary resolve keys and files - attributes = Prompty.normalize(attributes, p.parent) - - # load global configuration - global_config = Prompty.normalize(load_global_config(p.parent, configuration), p.parent) - - prompty = _load_raw_prompty(attributes, content, p, global_config) - - # recursive loading of base prompty - if "base" in attributes: - # load the base prompty from the same directory as the current prompty - base = load(p.parent / attributes["base"]) - prompty = Prompty.hoist_base_prompty(prompty, base) - - return prompty - - -@trace(description="Prepare the inputs for the prompt.") -def prepare( - prompt: Prompty, - inputs: Dict[str, Any] = {}, -): - """Prepare the inputs for the prompt. - - Parameters - ---------- - prompt : Prompty - The prompty object - inputs : Dict[str, Any], optional - The inputs to the prompt, by default {} - - Returns - ------- - dict - The prepared and hidrated template shaped to the LLM model - - Example - ------- - >>> import prompty - >>> p = prompty.load("prompts/basic.prompty") - >>> inputs = {"name": "John Doe"} - >>> content = prompty.prepare(p, inputs) - """ - inputs = param_hoisting(inputs, prompt.sample) - - render = InvokerFactory.run_renderer(prompt, inputs, prompt.content) - result = InvokerFactory.run_parser(prompt, render) - - return result - - -@trace(description="Prepare the inputs for the prompt.") -async def prepare_async( - prompt: Prompty, - inputs: Dict[str, Any] = {}, -): - """Prepare the inputs for the prompt. - - Parameters - ---------- - prompt : Prompty - The prompty object - inputs : Dict[str, Any], optional - The inputs to the prompt, by default {} - - Returns - ------- - dict - The prepared and hidrated template shaped to the LLM model - - Example - ------- - >>> import prompty - >>> p = prompty.load("prompts/basic.prompty") - >>> inputs = {"name": "John Doe"} - >>> content = await prompty.prepare_async(p, inputs) - """ - inputs = param_hoisting(inputs, prompt.sample) - - render = await InvokerFactory.run_renderer_async(prompt, inputs, prompt.content) - result = await InvokerFactory.run_parser_async(prompt, render) - - return result - - -@trace(description="Run the prepared Prompty content against the model.") -def run( - prompt: Prompty, - content: Union[dict, list, str], - configuration: Dict[str, Any] = {}, - parameters: Dict[str, Any] = {}, - raw: bool = False, -): - """Run the prepared Prompty content. 
- - Parameters - ---------- - prompt : Prompty - The prompty object - content : Union[dict, list, str] - The content to process - configuration : Dict[str, Any], optional - The configuration to use, by default {} - parameters : Dict[str, Any], optional - The parameters to use, by default {} - raw : bool, optional - Whether to skip processing, by default False - - Returns - ------- - Any - The result of the prompt - - Example - ------- - >>> import prompty - >>> p = prompty.load("prompts/basic.prompty") - >>> inputs = {"name": "John Doe"} - >>> content = prompty.prepare(p, inputs) - >>> result = prompty.run(p, content) - """ - - if configuration != {}: - prompt.model.configuration = param_hoisting(configuration, prompt.model.configuration) - - if parameters != {}: - prompt.model.parameters = param_hoisting(parameters, prompt.model.parameters) - - result = InvokerFactory.run_executor(prompt, content) - if not raw: - result = InvokerFactory.run_processor(prompt, result) - - return result - - -@trace(description="Run the prepared Prompty content against the model.") -async def run_async( - prompt: Prompty, - content: Union[dict, list, str], - configuration: Dict[str, Any] = {}, - parameters: Dict[str, Any] = {}, - raw: bool = False, -): - """Run the prepared Prompty content. - - Parameters - ---------- - prompt : Prompty - The prompty object - content : Union[dict, list, str] - The content to process - configuration : Dict[str, Any], optional - The configuration to use, by default {} - parameters : Dict[str, Any], optional - The parameters to use, by default {} - raw : bool, optional - Whether to skip processing, by default False - - Returns - ------- - Any - The result of the prompt - - Example - ------- - >>> import prompty - >>> p = prompty.load("prompts/basic.prompty") - >>> inputs = {"name": "John Doe"} - >>> content = await prompty.prepare_async(p, inputs) - >>> result = await prompty.run_async(p, content) - """ - - if configuration != {}: - prompt.model.configuration = param_hoisting(configuration, prompt.model.configuration) - - if parameters != {}: - prompt.model.parameters = param_hoisting(parameters, prompt.model.parameters) - - result = await InvokerFactory.run_executor_async(prompt, content) - if not raw: - result = await InvokerFactory.run_processor_async(prompt, result) - - return result - - -@trace(description="Execute a prompty") -def execute( - prompt: Union[str, Prompty], - configuration: Dict[str, Any] = {}, - parameters: Dict[str, Any] = {}, - inputs: Dict[str, Any] = {}, - raw: bool = False, - config_name: str = "default", -): - """Execute a prompty. 
-
-    Parameters
-    ----------
-    prompt : Union[str, Prompty]
-        The prompty object or path to the prompty file
-    configuration : Dict[str, Any], optional
-        The configuration to use, by default {}
-    parameters : Dict[str, Any], optional
-        The parameters to use, by default {}
-    inputs : Dict[str, Any], optional
-        The inputs to the prompt, by default {}
-    raw : bool, optional
-        Whether to skip processing, by default False
-    connection : str, optional
-        The connection to use, by default "default"
-
-    Returns
-    -------
-    Any
-        The result of the prompt
-
-    Example
-    -------
-    >>> import prompty
-    >>> inputs = {"name": "John Doe"}
-    >>> result = prompty.execute("prompts/basic.prompty", inputs=inputs)
-    """
-    if isinstance(prompt, str):
-        path = Path(prompt)
-        if not path.is_absolute():
-            # get caller's path (take into account trace frame)
-            caller = Path(traceback.extract_stack()[-3].filename)
-            path = Path(caller.parent / path).resolve().absolute()
-        prompt = load(path, config_name)
-
-    # prepare content
-    content = prepare(prompt, inputs)
-
-    # run LLM model
-    result = run(prompt, content, configuration, parameters, raw)
-
-    return result
diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_renderers.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_renderers.py
deleted file mode 100644
index 0d682a7fe151..000000000000
--- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_renderers.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# ------------------------------------
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-# ------------------------------------
-# mypy: disable-error-code="union-attr,assignment,arg-type"
-from pathlib import Path
-from ._core import Prompty
-from ._invoker import Invoker, InvokerFactory
-from ._mustache import render
-
-
-@InvokerFactory.register_renderer("mustache")
-class MustacheRenderer(Invoker):
-    """Render a mustache template."""
-
-    def __init__(self, prompty: Prompty) -> None:
-        super().__init__(prompty)
-        self.templates = {}
-        cur_prompt = self.prompty
-        while cur_prompt:
-            self.templates[Path(cur_prompt.file).name] = cur_prompt.content
-            cur_prompt = cur_prompt.basePrompty
-        self.name = Path(self.prompty.file).name
-
-    def invoke(self, data: str) -> str:
-        generated = render(self.prompty.content, data)  # type: ignore
-        return generated
-
-    async def invoke_async(self, data: str) -> str:
-        return self.invoke(data)
diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_tracer.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_tracer.py
deleted file mode 100644
index 24f800b465f4..000000000000
--- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_tracer.py
+++ /dev/null
@@ -1,316 +0,0 @@
-# ------------------------------------
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-# ------------------------------------ -# mypy: disable-error-code="union-attr,arg-type,misc,return-value,assignment,func-returns-value" -# pylint: disable=R,redefined-outer-name,bare-except,unspecified-encoding -import os -import json -import inspect -import traceback -import importlib -import contextlib -from pathlib import Path -from numbers import Number -from datetime import datetime -from functools import wraps, partial -from typing import Any, Callable, Dict, Iterator, List, Union - - -# clean up key value pairs for sensitive values -def sanitize(key: str, value: Any) -> Any: - if isinstance(value, str) and any([s in key.lower() for s in ["key", "token", "secret", "password", "credential"]]): - return len(str(value)) * "*" - - if isinstance(value, dict): - return {k: sanitize(k, v) for k, v in value.items()} - - return value - - -class Tracer: - _tracers: Dict[str, Callable[[str], Iterator[Callable[[str, Any], None]]]] = {} - - @classmethod - def add(cls, name: str, tracer: Callable[[str], Iterator[Callable[[str, Any], None]]]) -> None: - cls._tracers[name] = tracer - - @classmethod - def clear(cls) -> None: - cls._tracers = {} - - @classmethod - @contextlib.contextmanager - def start(cls, name: str) -> Iterator[Callable[[str, Any], None]]: - with contextlib.ExitStack() as stack: - traces: List[Any] = [stack.enter_context(tracer(name)) for tracer in cls._tracers.values()] # type: ignore - yield lambda key, value: [ # type: ignore - # normalize and sanitize any trace values - trace(key, sanitize(key, to_dict(value))) - for trace in traces - ] - - -def to_dict(obj: Any) -> Union[Dict[str, Any], List[Dict[str, Any]], str, Number, bool]: - # simple json types - if isinstance(obj, str) or isinstance(obj, Number) or isinstance(obj, bool): - return obj - - # datetime - if isinstance(obj, datetime): - return obj.isoformat() - - # safe Prompty obj serialization - if type(obj).__name__ == "Prompty": - return obj.to_safe_dict() - - # safe PromptyStream obj serialization - if type(obj).__name__ == "PromptyStream": - return "PromptyStream" - - if type(obj).__name__ == "AsyncPromptyStream": - return "AsyncPromptyStream" - - # recursive list and dict - if isinstance(obj, List): - return [to_dict(item) for item in obj] # type: ignore - - if isinstance(obj, Dict): - return {k: v if isinstance(v, str) else to_dict(v) for k, v in obj.items()} - - if isinstance(obj, Path): - return str(obj) - - # cast to string otherwise... 
- return str(obj) - - -def _name(func: Callable, args): - if hasattr(func, "__qualname__"): - signature = f"{func.__module__}.{func.__qualname__}" - else: - signature = f"{func.__module__}.{func.__name__}" - - # core invoker gets special treatment prompty.invoker.Invoker - core_invoker = signature.startswith("prompty.invoker.Invoker.run") - if core_invoker: - name = type(args[0]).__name__ - if signature.endswith("async"): - signature = f"{args[0].__module__}.{args[0].__class__.__name__}.invoke_async" - else: - signature = f"{args[0].__module__}.{args[0].__class__.__name__}.invoke" - else: - name = func.__name__ - - return name, signature - - -def _inputs(func: Callable, args, kwargs) -> dict: - ba = inspect.signature(func).bind(*args, **kwargs) - ba.apply_defaults() - - inputs = {k: to_dict(v) for k, v in ba.arguments.items() if k != "self"} - - return inputs - - -def _results(result: Any) -> Union[Dict, List[Dict], str, Number, bool]: - return to_dict(result) if result is not None else "None" - - -def _trace_sync(func: Union[Callable, None] = None, **okwargs: Any) -> Callable: - - @wraps(func) # type: ignore - def wrapper(*args, **kwargs): - name, signature = _name(func, args) # type: ignore - with Tracer.start(name) as trace: - trace("signature", signature) - - # support arbitrary keyword - # arguments for trace decorator - for k, v in okwargs.items(): - trace(k, to_dict(v)) - - inputs = _inputs(func, args, kwargs) # type: ignore - trace("inputs", inputs) - - try: - result = func(*args, **kwargs) # type: ignore - trace("result", _results(result)) - except Exception as e: - trace( - "result", - { - "exception": { - "type": type(e), - "traceback": (traceback.format_tb(tb=e.__traceback__) if e.__traceback__ else None), - "message": str(e), - "args": to_dict(e.args), - } - }, - ) - raise e - - return result - - return wrapper - - -def _trace_async(func: Union[Callable, None] = None, **okwargs: Any) -> Callable: - - @wraps(func) # type: ignore - async def wrapper(*args, **kwargs): - name, signature = _name(func, args) # type: ignore - with Tracer.start(name) as trace: - trace("signature", signature) - - # support arbitrary keyword - # arguments for trace decorator - for k, v in okwargs.items(): - trace(k, to_dict(v)) - - inputs = _inputs(func, args, kwargs) # type: ignore - trace("inputs", inputs) - try: - result = await func(*args, **kwargs) # type: ignore - trace("result", _results(result)) - except Exception as e: - trace( - "result", - { - "exception": { - "type": type(e), - "traceback": (traceback.format_tb(tb=e.__traceback__) if e.__traceback__ else None), - "message": str(e), - "args": to_dict(e.args), - } - }, - ) - raise e - - return result - - return wrapper - - -def trace(func: Union[Callable, None] = None, **kwargs: Any) -> Callable: - if func is None: - return partial(trace, **kwargs) - wrapped_method = _trace_async if inspect.iscoroutinefunction(func) else _trace_sync - return wrapped_method(func, **kwargs) - - -class PromptyTracer: - def __init__(self, output_dir: Union[str, None] = None) -> None: - if output_dir: - self.output = Path(output_dir).resolve().absolute() - else: - self.output = Path(Path(os.getcwd()) / ".runs").resolve().absolute() - - if not self.output.exists(): - self.output.mkdir(parents=True, exist_ok=True) - - self.stack: List[Dict[str, Any]] = [] - - @contextlib.contextmanager - def tracer(self, name: str) -> Iterator[Callable[[str, Any], None]]: - try: - self.stack.append({"name": name}) - frame = self.stack[-1] - frame["__time"] = { - "start": 
datetime.now(), - } - - def add(key: str, value: Any) -> None: - if key not in frame: - frame[key] = value - # multiple values creates list - else: - if isinstance(frame[key], list): - frame[key].append(value) - else: - frame[key] = [frame[key], value] - - yield add - finally: - frame = self.stack.pop() - start: datetime = frame["__time"]["start"] - end: datetime = datetime.now() - - # add duration to frame - frame["__time"] = { - "start": start.strftime("%Y-%m-%dT%H:%M:%S.%f"), - "end": end.strftime("%Y-%m-%dT%H:%M:%S.%f"), - "duration": int((end - start).total_seconds() * 1000), - } - - # hoist usage to parent frame - if "result" in frame and isinstance(frame["result"], dict): - if "usage" in frame["result"]: - frame["__usage"] = self.hoist_item( - frame["result"]["usage"], - frame["__usage"] if "__usage" in frame else {}, - ) - - # streamed results may have usage as well - if "result" in frame and isinstance(frame["result"], list): - for result in frame["result"]: - if isinstance(result, dict) and "usage" in result and isinstance(result["usage"], dict): - frame["__usage"] = self.hoist_item( - result["usage"], - frame["__usage"] if "__usage" in frame else {}, - ) - - # add any usage frames from below - if "__frames" in frame: - for child in frame["__frames"]: - if "__usage" in child: - frame["__usage"] = self.hoist_item( - child["__usage"], - frame["__usage"] if "__usage" in frame else {}, - ) - - # if stack is empty, dump the frame - if len(self.stack) == 0: - self.write_trace(frame) - # otherwise, append the frame to the parent - else: - if "__frames" not in self.stack[-1]: - self.stack[-1]["__frames"] = [] - self.stack[-1]["__frames"].append(frame) - - def hoist_item(self, src: Dict[str, Any], cur: Dict[str, Any]) -> Dict[str, Any]: - for key, value in src.items(): - if value is None or isinstance(value, list) or isinstance(value, dict): - continue - try: - if key not in cur: - cur[key] = value - else: - cur[key] += value - except: - continue - - return cur - - def write_trace(self, frame: Dict[str, Any]) -> None: - trace_file = self.output / f"{frame['name']}.{datetime.now().strftime('%Y%m%d.%H%M%S')}.tracy" - - v = importlib.metadata.version("prompty") # type: ignore - enriched_frame = { - "runtime": "python", - "version": v, - "trace": frame, - } - - with open(trace_file, "w") as f: - json.dump(enriched_frame, f, indent=4) - - -@contextlib.contextmanager -def console_tracer(name: str) -> Iterator[Callable[[str, Any], None]]: - try: - print(f"Starting {name}") - yield lambda key, value: print(f"{key}:\n{json.dumps(to_dict(value), indent=4)}") - finally: - print(f"Ending {name}") diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_utils.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_utils.py deleted file mode 100644 index 22f284180ee1..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_utils.py +++ /dev/null @@ -1,100 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
-# ------------------------------------ -# mypy: disable-error-code="import-untyped,return-value" -# pylint: disable=line-too-long,R,wrong-import-order,global-variable-not-assigned) -import json -import os -import re -import sys -from typing import Any, Dict -from pathlib import Path - - -_yaml_regex = re.compile( - r"^\s*" + r"(?:---|\+\+\+)" + r"(.*?)" + r"(?:---|\+\+\+)" + r"\s*(.+)$", - re.S | re.M, -) - - -def load_text(file_path, encoding="utf-8"): - with open(file_path, "r", encoding=encoding) as file: - return file.read() - - -def load_json(file_path, encoding="utf-8"): - return json.loads(load_text(file_path, encoding=encoding)) - - -def load_global_config(prompty_path: Path = Path.cwd(), configuration: str = "default") -> Dict[str, Any]: - prompty_config_path = prompty_path.joinpath("prompty.json") - if os.path.exists(prompty_config_path): - c = load_json(prompty_config_path) - if configuration in c: - return c[configuration] - else: - raise ValueError(f'Item "{configuration}" not found in "{prompty_config_path}"') - else: - return {} - - -def load_prompty(file_path, encoding="utf-8") -> Dict[str, Any]: - contents = load_text(file_path, encoding=encoding) - return parse(contents) - - -def parse(contents): - try: - import yaml # type: ignore - except ImportError as exc: - raise ImportError("Please install pyyaml to use this function. Run `pip install pyyaml`.") from exc - - global _yaml_regex - - fmatter = "" - body = "" - result = _yaml_regex.search(contents) - - if result: - fmatter = result.group(1) - body = result.group(2) - return { - "attributes": yaml.load(fmatter, Loader=yaml.SafeLoader), - "body": body, - "frontmatter": fmatter, - } - - -def remove_leading_empty_space(multiline_str: str) -> str: - """ - Processes a multiline string by: - 1. Removing empty lines - 2. Finding the minimum leading spaces - 3. Indenting all lines to the minimum level - - :param multiline_str: The input multiline string. - :type multiline_str: str - :return: The processed multiline string. - :rtype: str - """ - lines = multiline_str.splitlines() - start_index = 0 - while start_index < len(lines) and lines[start_index].strip() == "": - start_index += 1 - - # Find the minimum number of leading spaces - min_spaces = sys.maxsize - for line in lines[start_index:]: - if len(line.strip()) == 0: - continue - spaces = len(line) - len(line.lstrip()) - spaces += line.lstrip().count("\t") * 2 # Count tabs as 2 spaces - min_spaces = min(min_spaces, spaces) - - # Remove leading spaces and indent to the minimum level - processed_lines = [] - for line in lines[start_index:]: - processed_lines.append(line[min_spaces:]) - - return "\n".join(processed_lines) diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/tracing.py b/sdk/ai/azure-ai-inference/azure/ai/inference/tracing.py deleted file mode 100644 index f7937a99074a..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/tracing.py +++ /dev/null @@ -1,850 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. -# ------------------------------------ -import copy -from enum import Enum -import functools -import json -import importlib -import logging -import os -from time import time_ns -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union -from urllib.parse import urlparse - -# pylint: disable = no-name-in-module -from azure.core import CaseInsensitiveEnumMeta # type: ignore -from azure.core.settings import settings -from . 
import models as _models - -try: - # pylint: disable = no-name-in-module - from azure.core.tracing import AbstractSpan, SpanKind # type: ignore - from opentelemetry.trace import StatusCode, Span - - _tracing_library_available = True -except ModuleNotFoundError: - - _tracing_library_available = False - - -__all__ = [ - "AIInferenceInstrumentor", -] - - -_inference_traces_enabled: bool = False -_trace_inference_content: bool = False -_INFERENCE_GEN_AI_SYSTEM_NAME = "az.ai.inference" - - -class TraceType(str, Enum, metaclass=CaseInsensitiveEnumMeta): # pylint: disable=C4747 - """An enumeration class to represent different types of traces.""" - - INFERENCE = "Inference" - - -class AIInferenceInstrumentor: - """ - A class for managing the trace instrumentation of AI Inference. - - This class allows enabling or disabling tracing for AI Inference. - and provides functionality to check whether instrumentation is active. - - """ - - def __init__(self): - if not _tracing_library_available: - raise ModuleNotFoundError( - "Azure Core Tracing Opentelemetry is not installed. " - "Please install it using 'pip install azure-core-tracing-opentelemetry'" - ) - # In the future we could support different versions from the same library - # and have a parameter that specifies the version to use. - self._impl = _AIInferenceInstrumentorPreview() - - def instrument(self, enable_content_recording: Optional[bool] = None) -> None: - """ - Enable trace instrumentation for AI Inference. - - :param enable_content_recording: Whether content recording is enabled as part - of the traces or not. Content in this context refers to chat message content - and function call tool related function names, function parameter names and - values. True will enable content recording, False will disable it. If no value - s provided, then the value read from environment variable - AZURE_TRACING_GEN_AI_CONTENT_RECORDING_ENABLED is used. If the environment variable - is not found, then the value will default to False. Please note that successive calls - to instrument will always apply the content recording value provided with the most - recent call to instrument (including applying the environment variable if no value is - provided and defaulting to false if the environment variable is not found), even if - instrument was already previously called without uninstrument being called in between - the instrument calls. - - :type enable_content_recording: bool, optional - """ - self._impl.instrument(enable_content_recording=enable_content_recording) - - def uninstrument(self) -> None: - """ - Disable trace instrumentation for AI Inference. - - Raises: - RuntimeError: If instrumentation is not currently enabled. - - This method removes any active instrumentation, stopping the tracing - of AI Inference. - """ - self._impl.uninstrument() - - def is_instrumented(self) -> bool: - """ - Check if trace instrumentation for AI Inference is currently enabled. - - :return: True if instrumentation is active, False otherwise. - :rtype: bool - """ - return self._impl.is_instrumented() - - def is_content_recording_enabled(self) -> bool: - """ - This function gets the content recording value. - - :return: A bool value indicating whether content recording is enabled. - :rtype: bool - """ - return self._impl.is_content_recording_enabled() - - -class _AIInferenceInstrumentorPreview: - """ - A class for managing the trace instrumentation of AI Inference. - - This class allows enabling or disabling tracing for AI Inference. 
- and provides functionality to check whether instrumentation is active. - """ - - def _str_to_bool(self, s): - if s is None: - return False - return str(s).lower() == "true" - - def instrument(self, enable_content_recording: Optional[bool] = None): - """ - Enable trace instrumentation for AI Inference. - - :param enable_content_recording: Whether content recording is enabled as part - of the traces or not. Content in this context refers to chat message content - and function call tool related function names, function parameter names and - values. True will enable content recording, False will disable it. If no value - is provided, then the value read from environment variable - AZURE_TRACING_GEN_AI_CONTENT_RECORDING_ENABLED is used. If the environment variable - is not found, then the value will default to False. - - :type enable_content_recording: bool, optional - """ - if enable_content_recording is None: - var_value = os.environ.get("AZURE_TRACING_GEN_AI_CONTENT_RECORDING_ENABLED") - enable_content_recording = self._str_to_bool(var_value) - if not self.is_instrumented(): - self._instrument_inference(enable_content_recording) - else: - self._set_content_recording_enabled(enable_content_recording=enable_content_recording) - - def uninstrument(self): - """ - Disable trace instrumentation for AI Inference. - - This method removes any active instrumentation, stopping the tracing - of AI Inference. - """ - if self.is_instrumented(): - self._uninstrument_inference() - - def is_instrumented(self): - """ - Check if trace instrumentation for AI Inference is currently enabled. - - :return: True if instrumentation is active, False otherwise. - :rtype: bool - """ - return self._is_instrumented() - - def set_content_recording_enabled(self, enable_content_recording: bool = False) -> None: - """This function sets the content recording value. - - :param enable_content_recording: Indicates whether tracing of message content should be enabled. - This also controls whether function call tool function names, - parameter names and parameter values are traced. - :type enable_content_recording: bool - """ - self._set_content_recording_enabled(enable_content_recording=enable_content_recording) - - def is_content_recording_enabled(self) -> bool: - """This function gets the content recording value. - - :return: A bool value indicating whether content tracing is enabled. 
- :rtype bool - """ - return self._is_content_recording_enabled() - - def _set_attributes(self, span: "AbstractSpan", *attrs: Tuple[str, Any]) -> None: - for attr in attrs: - key, value = attr - if value is not None: - span.add_attribute(key, value) - - def _add_request_chat_message_events(self, span: "AbstractSpan", **kwargs: Any) -> int: - timestamp = 0 - for message in kwargs.get("messages", []): - try: - message = message.as_dict() - except AttributeError: - pass - - if message.get("role"): - timestamp = self._record_event( - span, - f"gen_ai.{message.get('role')}.message", - { - "gen_ai.system": _INFERENCE_GEN_AI_SYSTEM_NAME, - "gen_ai.event.content": json.dumps(message), - }, - timestamp, - ) - - return timestamp - - def _parse_url(self, url): - parsed = urlparse(url) - server_address = parsed.hostname - port = parsed.port - return server_address, port - - def _add_request_chat_attributes(self, span: "AbstractSpan", *args: Any, **kwargs: Any) -> None: - client = args[0] - endpoint = client._config.endpoint # pylint: disable=protected-access - server_address, port = self._parse_url(endpoint) - model = "chat" - if kwargs.get("model") is not None: - model_value = kwargs.get("model") - if model_value is not None: - model = model_value - - self._set_attributes( - span, - ("gen_ai.operation.name", "chat"), - ("gen_ai.system", _INFERENCE_GEN_AI_SYSTEM_NAME), - ("gen_ai.request.model", model), - ("gen_ai.request.max_tokens", kwargs.get("max_tokens")), - ("gen_ai.request.temperature", kwargs.get("temperature")), - ("gen_ai.request.top_p", kwargs.get("top_p")), - ("server.address", server_address), - ) - if port is not None and port != 443: - span.add_attribute("server.port", port) - - def _remove_function_call_names_and_arguments(self, tool_calls: list) -> list: - tool_calls_copy = copy.deepcopy(tool_calls) - for tool_call in tool_calls_copy: - if "function" in tool_call: - if "name" in tool_call["function"]: - del tool_call["function"]["name"] - if "arguments" in tool_call["function"]: - del tool_call["function"]["arguments"] - if not tool_call["function"]: - del tool_call["function"] - return tool_calls_copy - - def _get_finish_reasons(self, result) -> Optional[List[str]]: - if hasattr(result, "choices") and result.choices: - finish_reasons: List[str] = [] - for choice in result.choices: - finish_reason = getattr(choice, "finish_reason", None) - - if finish_reason is None: - # If finish_reason is None, default to "none" - finish_reasons.append("none") - elif hasattr(finish_reason, "value"): - # If finish_reason has a 'value' attribute (i.e., it's an enum), use it - finish_reasons.append(finish_reason.value) - elif isinstance(finish_reason, str): - # If finish_reason is a string, use it directly - finish_reasons.append(finish_reason) - else: - # Default to "none" - finish_reasons.append("none") - - return finish_reasons - return None - - def _get_finish_reason_for_choice(self, choice): - finish_reason = getattr(choice, "finish_reason", None) - if finish_reason is not None: - return finish_reason.value - - return "none" - - def _add_response_chat_message_events( - self, span: "AbstractSpan", result: _models.ChatCompletions, last_event_timestamp_ns: int - ) -> None: - for choice in result.choices: - attributes = {} - if _trace_inference_content: - full_response: Dict[str, Any] = { - "message": {"content": choice.message.content}, - "finish_reason": self._get_finish_reason_for_choice(choice), - "index": choice.index, - } - if choice.message.tool_calls: - full_response["message"]["tool_calls"] = 
[tool.as_dict() for tool in choice.message.tool_calls] - attributes = { - "gen_ai.system": _INFERENCE_GEN_AI_SYSTEM_NAME, - "gen_ai.event.content": json.dumps(full_response), - } - else: - response: Dict[str, Any] = { - "finish_reason": self._get_finish_reason_for_choice(choice), - "index": choice.index, - } - if choice.message.tool_calls: - response["message"] = {} - tool_calls_function_names_and_arguments_removed = self._remove_function_call_names_and_arguments( - choice.message.tool_calls - ) - response["message"]["tool_calls"] = [ - tool.as_dict() for tool in tool_calls_function_names_and_arguments_removed - ] - - attributes = { - "gen_ai.system": _INFERENCE_GEN_AI_SYSTEM_NAME, - "gen_ai.event.content": json.dumps(response), - } - last_event_timestamp_ns = self._record_event(span, "gen_ai.choice", attributes, last_event_timestamp_ns) - - def _add_response_chat_attributes( - self, - span: "AbstractSpan", - result: Union[_models.ChatCompletions, _models.StreamingChatCompletionsUpdate], - ) -> None: - self._set_attributes( - span, - ("gen_ai.response.id", result.id), - ("gen_ai.response.model", result.model), - ( - "gen_ai.usage.input_tokens", - (result.usage.prompt_tokens if hasattr(result, "usage") and result.usage else None), - ), - ( - "gen_ai.usage.output_tokens", - (result.usage.completion_tokens if hasattr(result, "usage") and result.usage else None), - ), - ) - finish_reasons = self._get_finish_reasons(result) - if not finish_reasons is None: - span.add_attribute("gen_ai.response.finish_reasons", finish_reasons) # type: ignore - - def _add_request_details(self, span: "AbstractSpan", args: Any, kwargs: Any) -> int: - self._add_request_chat_attributes(span, *args, **kwargs) - if _trace_inference_content: - return self._add_request_chat_message_events(span, **kwargs) - return 0 - - def _add_response_details(self, span: "AbstractSpan", result: object, last_event_timestamp_ns: int) -> None: - if isinstance(result, _models.ChatCompletions): - self._add_response_chat_attributes(span, result) - self._add_response_chat_message_events(span, result, last_event_timestamp_ns) - # TODO add more models here - - def _accumulate_response(self, item, accumulate: Dict[str, Any]) -> None: - if item.finish_reason: - accumulate["finish_reason"] = item.finish_reason - if item.index: - accumulate["index"] = item.index - if item.delta.content: - accumulate.setdefault("message", {}) - accumulate["message"].setdefault("content", "") - accumulate["message"]["content"] += item.delta.content - if item.delta.tool_calls: - accumulate.setdefault("message", {}) - accumulate["message"].setdefault("tool_calls", []) - if item.delta.tool_calls is not None: - for tool_call in item.delta.tool_calls: - if tool_call.id: - accumulate["message"]["tool_calls"].append( - { - "id": tool_call.id, - "type": "", - "function": {"name": "", "arguments": ""}, - } - ) - if tool_call.function: - accumulate["message"]["tool_calls"][-1]["type"] = "function" - if tool_call.function and tool_call.function.name: - accumulate["message"]["tool_calls"][-1]["function"]["name"] = tool_call.function.name - if tool_call.function and tool_call.function.arguments: - accumulate["message"]["tool_calls"][-1]["function"]["arguments"] += tool_call.function.arguments - - def _accumulate_async_streaming_response(self, item, accumulate: Dict[str, Any]) -> None: - if not "choices" in item: - return - if "finish_reason" in item["choices"][0] and item["choices"][0]["finish_reason"]: - accumulate["finish_reason"] = item["choices"][0]["finish_reason"] - if 
"index" in item["choices"][0] and item["choices"][0]["index"]: - accumulate["index"] = item["choices"][0]["index"] - if not "delta" in item["choices"][0]: - return - if "content" in item["choices"][0]["delta"] and item["choices"][0]["delta"]["content"]: - accumulate.setdefault("message", {}) - accumulate["message"].setdefault("content", "") - accumulate["message"]["content"] += item["choices"][0]["delta"]["content"] - if "tool_calls" in item["choices"][0]["delta"] and item["choices"][0]["delta"]["tool_calls"]: - accumulate.setdefault("message", {}) - accumulate["message"].setdefault("tool_calls", []) - if item["choices"][0]["delta"]["tool_calls"] is not None: - for tool_call in item["choices"][0]["delta"]["tool_calls"]: - if tool_call.id: - accumulate["message"]["tool_calls"].append( - { - "id": tool_call.id, - "type": "", - "function": {"name": "", "arguments": ""}, - } - ) - if tool_call.function: - accumulate["message"]["tool_calls"][-1]["type"] = "function" - if tool_call.function and tool_call.function.name: - accumulate["message"]["tool_calls"][-1]["function"]["name"] = tool_call.function.name - if tool_call.function and tool_call.function.arguments: - accumulate["message"]["tool_calls"][-1]["function"]["arguments"] += tool_call.function.arguments - - def _wrapped_stream( - self, stream_obj: _models.StreamingChatCompletions, span: "AbstractSpan", previous_event_timestamp: int - ) -> _models.StreamingChatCompletions: - class StreamWrapper(_models.StreamingChatCompletions): - def __init__(self, stream_obj, instrumentor): - super().__init__(stream_obj._response) - self._instrumentor = instrumentor - - def __iter__( # pyright: ignore [reportIncompatibleMethodOverride] - self, - ) -> Iterator[_models.StreamingChatCompletionsUpdate]: - accumulate: Dict[str, Any] = {} - try: - chunk = None - for chunk in stream_obj: - for item in chunk.choices: - self._instrumentor._accumulate_response(item, accumulate) - yield chunk - - if chunk is not None: - self._instrumentor._add_response_chat_attributes(span, chunk) - - except Exception as exc: - # Set the span status to error - if isinstance(span.span_instance, Span): # pyright: ignore [reportPossiblyUnboundVariable] - span.span_instance.set_status( - StatusCode.ERROR, # pyright: ignore [reportPossiblyUnboundVariable] - description=str(exc), - ) - module = exc.__module__ if hasattr(exc, "__module__") and exc.__module__ != "builtins" else "" - error_type = f"{module}.{type(exc).__name__}" if module else type(exc).__name__ - self._instrumentor._set_attributes(span, ("error.type", error_type)) - raise - - finally: - if stream_obj._done is False: - if accumulate.get("finish_reason") is None: - accumulate["finish_reason"] = "error" - else: - # Only one choice expected with streaming - accumulate["index"] = 0 - # Delete message if content tracing is not enabled - if not _trace_inference_content: - if "message" in accumulate: - if "content" in accumulate["message"]: - del accumulate["message"]["content"] - if not accumulate["message"]: - del accumulate["message"] - if "message" in accumulate: - if "tool_calls" in accumulate["message"]: - tool_calls_function_names_and_arguments_removed = ( - self._instrumentor._remove_function_call_names_and_arguments( - accumulate["message"]["tool_calls"] - ) - ) - accumulate["message"]["tool_calls"] = list( - tool_calls_function_names_and_arguments_removed - ) - attributes = { - "gen_ai.system": _INFERENCE_GEN_AI_SYSTEM_NAME, - "gen_ai.event.content": json.dumps(accumulate), - } - self._instrumentor._record_event(span, 
"gen_ai.choice", attributes, previous_event_timestamp) - span.finish() - - return StreamWrapper(stream_obj, self) - - def _async_wrapped_stream( - self, stream_obj: _models.AsyncStreamingChatCompletions, span: "AbstractSpan", last_event_timestamp_ns: int - ) -> _models.AsyncStreamingChatCompletions: - class AsyncStreamWrapper(_models.AsyncStreamingChatCompletions): - def __init__(self, stream_obj, instrumentor, span, last_event_timestamp_ns): - super().__init__(stream_obj._response) - self._instrumentor = instrumentor - self._accumulate: Dict[str, Any] = {} - self._stream_obj = stream_obj - self.span = span - self._last_result = None - self._last_event_timestamp_ns = last_event_timestamp_ns - - async def __anext__(self) -> "_models.StreamingChatCompletionsUpdate": - try: - result = await super().__anext__() - self._instrumentor._accumulate_async_streaming_response( # pylint: disable=protected-access, line-too-long # pyright: ignore [reportFunctionMemberAccess] - result, self._accumulate - ) - self._last_result = result - except StopAsyncIteration as exc: - self._trace_stream_content() - raise exc - return result - - def _trace_stream_content(self) -> None: - if self._last_result: - self._instrumentor._add_response_chat_attributes( # pylint: disable=protected-access, line-too-long # pyright: ignore [reportFunctionMemberAccess] - span, self._last_result - ) - # Only one choice expected with streaming - self._accumulate["index"] = 0 - # Delete message if content tracing is not enabled - if not _trace_inference_content: - if "message" in self._accumulate: - if "content" in self._accumulate["message"]: - del self._accumulate["message"]["content"] - if not self._accumulate["message"]: - del self._accumulate["message"] - if "message" in self._accumulate: - if "tool_calls" in self._accumulate["message"]: - tools_no_recording = self._instrumentor._remove_function_call_names_and_arguments( # pylint: disable=protected-access, line-too-long # pyright: ignore [reportFunctionMemberAccess] - self._accumulate["message"]["tool_calls"] - ) - self._accumulate["message"]["tool_calls"] = list(tools_no_recording) - attributes = { - "gen_ai.system": _INFERENCE_GEN_AI_SYSTEM_NAME, - "gen_ai.event.content": json.dumps(self._accumulate), - } - self._last_event_timestamp_ns = self._instrumentor._record_event( # pylint: disable=protected-access, line-too-long # pyright: ignore [reportFunctionMemberAccess] - span, "gen_ai.choice", attributes, self._last_event_timestamp_ns - ) - span.finish() - - async_stream_wrapper = AsyncStreamWrapper(stream_obj, self, span, last_event_timestamp_ns) - return async_stream_wrapper - - def _record_event( - self, span: "AbstractSpan", name: str, attributes: Dict[str, Any], last_event_timestamp_ns: int - ) -> int: - timestamp = time_ns() - - # we're recording multiple events, some of them are emitted within (hundreds of) nanoseconds of each other. - # time.time_ns resolution is not high enough on windows to guarantee unique timestamps for each message. - # Also Azure Monitor truncates resolution to microseconds and some other backends truncate to milliseconds. - # - # But we need to give users a way to restore event order, so we're incrementing the timestamp - # by 1 microsecond for each message. 
- # - # This is a workaround, we'll find a generic and better solution - see - # https://github.com/open-telemetry/semantic-conventions/issues/1701 - if last_event_timestamp_ns > 0 and timestamp <= (last_event_timestamp_ns + 1000): - timestamp = last_event_timestamp_ns + 1000 - - span.span_instance.add_event(name=name, attributes=attributes, timestamp=timestamp) - - return timestamp - - def _trace_sync_function( - self, - function: Callable, - *, - _args_to_ignore: Optional[List[str]] = None, - _trace_type=TraceType.INFERENCE, - _name: Optional[str] = None, - ) -> Callable: - """ - Decorator that adds tracing to a synchronous function. - - :param function: The function to be traced. - :type function: Callable - :param args_to_ignore: A list of argument names to be ignored in the trace. - Defaults to None. - :type: args_to_ignore: [List[str]], optional - :param trace_type: The type of the trace. Defaults to TraceType.INFERENCE. - :type trace_type: TraceType, optional - :param name: The name of the trace, will set to func name if not provided. - :type name: str, optional - :return: The traced function. - :rtype: Callable - """ - - @functools.wraps(function) - def inner(*args, **kwargs): - - span_impl_type = settings.tracing_implementation() - if span_impl_type is None: - return function(*args, **kwargs) - - class_function_name = function.__qualname__ - - if class_function_name.startswith("ChatCompletionsClient.complete"): - if kwargs.get("model") is None: - span_name = "chat" - else: - model = kwargs.get("model") - span_name = f"chat {model}" - - span = span_impl_type( - name=span_name, - kind=SpanKind.CLIENT, # pyright: ignore [reportPossiblyUnboundVariable] - ) - - try: - # tracing events not supported in azure-core-tracing-opentelemetry - # so need to access the span instance directly - with span_impl_type.change_context(span.span_instance): - last_event_timestamp_ns = self._add_request_details(span, args, kwargs) - result = function(*args, **kwargs) - if kwargs.get("stream") is True: - return self._wrapped_stream(result, span, last_event_timestamp_ns) - self._add_response_details(span, result, last_event_timestamp_ns) - except Exception as exc: - # Set the span status to error - if isinstance(span.span_instance, Span): # pyright: ignore [reportPossiblyUnboundVariable] - span.span_instance.set_status( - StatusCode.ERROR, # pyright: ignore [reportPossiblyUnboundVariable] - description=str(exc), - ) - module = getattr(exc, "__module__", "") - module = module if module != "builtins" else "" - error_type = f"{module}.{type(exc).__name__}" if module else type(exc).__name__ - self._set_attributes(span, ("error.type", error_type)) - span.finish() - raise - - span.finish() - return result - - # Handle the default case (if the function name does not match) - return None # Ensure all paths return - - return inner - - def _trace_async_function( - self, - function: Callable, - *, - _args_to_ignore: Optional[List[str]] = None, - _trace_type=TraceType.INFERENCE, - _name: Optional[str] = None, - ) -> Callable: - """ - Decorator that adds tracing to an asynchronous function. - - :param function: The function to be traced. - :type function: Callable - :param args_to_ignore: A list of argument names to be ignored in the trace. - Defaults to None. - :type: args_to_ignore: [List[str]], optional - :param trace_type: The type of the trace. Defaults to TraceType.INFERENCE. - :type trace_type: TraceType, optional - :param name: The name of the trace, will set to func name if not provided. 
- :type name: str, optional - :return: The traced function. - :rtype: Callable - """ - - @functools.wraps(function) - async def inner(*args, **kwargs): - span_impl_type = settings.tracing_implementation() - if span_impl_type is None: - return await function(*args, **kwargs) - - class_function_name = function.__qualname__ - - if class_function_name.startswith("ChatCompletionsClient.complete"): - if kwargs.get("model") is None: - span_name = "chat" - else: - model = kwargs.get("model") - span_name = f"chat {model}" - - span = span_impl_type( - name=span_name, - kind=SpanKind.CLIENT, # pyright: ignore [reportPossiblyUnboundVariable] - ) - try: - # tracing events not supported in azure-core-tracing-opentelemetry - # so need to access the span instance directly - with span_impl_type.change_context(span.span_instance): - last_event_timestamp_ns = self._add_request_details(span, args, kwargs) - result = await function(*args, **kwargs) - if kwargs.get("stream") is True: - return self._async_wrapped_stream(result, span, last_event_timestamp_ns) - self._add_response_details(span, result, last_event_timestamp_ns) - - except Exception as exc: - # Set the span status to error - if isinstance(span.span_instance, Span): # pyright: ignore [reportPossiblyUnboundVariable] - span.span_instance.set_status( - StatusCode.ERROR, # pyright: ignore [reportPossiblyUnboundVariable] - description=str(exc), - ) - module = getattr(exc, "__module__", "") - module = module if module != "builtins" else "" - error_type = f"{module}.{type(exc).__name__}" if module else type(exc).__name__ - self._set_attributes(span, ("error.type", error_type)) - span.finish() - raise - - span.finish() - return result - - # Handle the default case (if the function name does not match) - return None # Ensure all paths return - - return inner - - def _inject_async(self, f, _trace_type, _name): - wrapper_fun = self._trace_async_function(f) - wrapper_fun._original = f # pylint: disable=protected-access # pyright: ignore [reportFunctionMemberAccess] - return wrapper_fun - - def _inject_sync(self, f, _trace_type, _name): - wrapper_fun = self._trace_sync_function(f) - wrapper_fun._original = f # pylint: disable=protected-access # pyright: ignore [reportFunctionMemberAccess] - return wrapper_fun - - def _inference_apis(self): - sync_apis = ( - ( - "azure.ai.inference", - "ChatCompletionsClient", - "complete", - TraceType.INFERENCE, - "inference_chat_completions_complete", - ), - ) - async_apis = ( - ( - "azure.ai.inference.aio", - "ChatCompletionsClient", - "complete", - TraceType.INFERENCE, - "inference_chat_completions_complete", - ), - ) - return sync_apis, async_apis - - def _inference_api_list(self): - sync_apis, async_apis = self._inference_apis() - yield sync_apis, self._inject_sync - yield async_apis, self._inject_async - - def _generate_api_and_injector(self, apis): - for api, injector in apis: - for module_name, class_name, method_name, trace_type, name in api: - try: - module = importlib.import_module(module_name) - api = getattr(module, class_name) - if hasattr(api, method_name): - yield api, method_name, trace_type, injector, name - except AttributeError as e: - # Log the attribute exception with the missing class information - logging.warning( - "AttributeError: The module '%s' does not have the class '%s'. 
%s", - module_name, - class_name, - str(e), - ) - except Exception as e: # pylint: disable=broad-except - # Log other exceptions as a warning, as we're not sure what they might be - logging.warning("An unexpected error occurred: '%s'", str(e)) - - def _available_inference_apis_and_injectors(self): - """ - Generates a sequence of tuples containing Inference API classes, method names, and - corresponding injector functions. - - :return: A generator yielding tuples. - :rtype: tuple - """ - yield from self._generate_api_and_injector(self._inference_api_list()) - - def _instrument_inference(self, enable_content_tracing: bool = False): - """This function modifies the methods of the Inference API classes to - inject logic before calling the original methods. - The original methods are stored as _original attributes of the methods. - - :param enable_content_tracing: Indicates whether tracing of message content should be enabled. - This also controls whether function call tool function names, - parameter names and parameter values are traced. - :type enable_content_tracing: bool - """ - # pylint: disable=W0603 - global _inference_traces_enabled - global _trace_inference_content - if _inference_traces_enabled: - raise RuntimeError("Traces already started for azure.ai.inference") - _inference_traces_enabled = True - _trace_inference_content = enable_content_tracing - for ( - api, - method, - trace_type, - injector, - name, - ) in self._available_inference_apis_and_injectors(): - # Check if the method of the api class has already been modified - if not hasattr(getattr(api, method), "_original"): - setattr(api, method, injector(getattr(api, method), trace_type, name)) - - def _uninstrument_inference(self): - """This function restores the original methods of the Inference API classes - by assigning them back from the _original attributes of the modified methods. - """ - # pylint: disable=W0603 - global _inference_traces_enabled - global _trace_inference_content - _trace_inference_content = False - for api, method, _, _, _ in self._available_inference_apis_and_injectors(): - if hasattr(getattr(api, method), "_original"): - setattr(api, method, getattr(getattr(api, method), "_original")) - _inference_traces_enabled = False - - def _is_instrumented(self): - """This function returns True if Inference libary has already been instrumented - for tracing and False if it has not been instrumented. - - :return: A value indicating whether the Inference library is currently instrumented or not. - :rtype: bool - """ - return _inference_traces_enabled - - def _set_content_recording_enabled(self, enable_content_recording: bool = False) -> None: - """This function sets the content recording value. - - :param enable_content_recording: Indicates whether tracing of message content should be enabled. - This also controls whether function call tool function names, - parameter names and parameter values are traced. - :type enable_content_recording: bool - """ - global _trace_inference_content # pylint: disable=W0603 - _trace_inference_content = enable_content_recording - - def _is_content_recording_enabled(self) -> bool: - """This function gets the content recording value. - - :return: A bool value indicating whether content tracing is enabled. 
-        :rtype bool
-        """
-        return _trace_inference_content
diff --git a/sdk/ai/azure-ai-inference/sdk_packaging.toml b/sdk/ai/azure-ai-inference/sdk_packaging.toml
new file mode 100644
index 000000000000..e7687fdae93b
--- /dev/null
+++ b/sdk/ai/azure-ai-inference/sdk_packaging.toml
@@ -0,0 +1,2 @@
+[packaging]
+auto_update = false
\ No newline at end of file
diff --git a/sdk/ai/azure-ai-inference/setup.py b/sdk/ai/azure-ai-inference/setup.py
index 3de5f549efe0..c7b5395a3f9f 100644
--- a/sdk/ai/azure-ai-inference/setup.py
+++ b/sdk/ai/azure-ai-inference/setup.py
@@ -13,7 +13,7 @@
 PACKAGE_NAME = "azure-ai-inference"
-PACKAGE_PPRINT_NAME = "Azure AI Inference"
+PACKAGE_PPRINT_NAME = "Azure Ai Inference"
 # a-b-c => a/b/c
 package_folder_path = PACKAGE_NAME.replace("-", "/")
@@ -35,7 +35,7 @@
     license="MIT License",
     author="Microsoft Corporation",
     author_email="azpysdkhelp@microsoft.com",
-    url="https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/ai/azure-ai-inference",
+    url="https://github.com/Azure/azure-sdk-for-python/tree/main/sdk",
     keywords="azure, azure sdk",
     classifiers=[
         "Development Status :: 4 - Beta",
@@ -68,8 +68,4 @@
         "typing-extensions>=4.6.0",
     ],
     python_requires=">=3.8",
-    extras_require={
-        "opentelemetry": ["azure-core-tracing-opentelemetry"],
-        "prompts": ["pyyaml"],
-    },
 )
diff --git a/sdk/ai/azure-ai-inference/tsp-location.yaml b/sdk/ai/azure-ai-inference/tsp-location.yaml
index fffb1db5b101..586442152432 100644
--- a/sdk/ai/azure-ai-inference/tsp-location.yaml
+++ b/sdk/ai/azure-ai-inference/tsp-location.yaml
@@ -1,4 +1,4 @@
 directory: specification/ai/ModelClient
-commit: 1f152afbf84c6febe1f4447d284750f03c7f188f
+commit: 7dca60dfb7fda9a5e4aaeb4494db564b992c5c43
 repo: Azure/azure-rest-api-specs
-additionalDirectories:
+additionalDirectories: