Content safety evals aggregate max from conversations (Azure#39083)
* add convo agg type, and have harm evals use max

* analysis

* correct enum name in docs

* refactor checked enum into function field

* cl and analysis

* change enum name and update CL

* change function names to private, allow agg type retrieval

* PR comments

* test serialization

* CL

* CI adjustment

* try again

* perf

* skip perf

* remove skip
MilesHolland authored Jan 22, 2025
1 parent d1ce446 commit 7f904a3
Showing 14 changed files with 309 additions and 6 deletions.
5 changes: 5 additions & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -11,10 +11,15 @@
- Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Studio.
- Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
- Fixed the non adversarial simulator to run in task-free mode
- Content safety evaluators (violence, self-harm, sexual, hate/unfairness) now return the maximum per-turn result as the
  main score when aggregating per-turn evaluations of a conversation into an overall
  evaluation score. Other conversation-capable evaluators still default to the mean for aggregation.

### Other Changes
- Changed the minimum required Python version to use this package from 3.8 to 3.9
- Stopped depending on the local promptflow service. No promptflow service is automatically started when running an evaluation.
- Evaluators now internally allow for custom aggregation. However, this can cause serialization failures if an evaluation is run while the
  environment variable `AI_EVALS_BATCH_USE_ASYNC` is set to false.

## 1.1.0 (2024-12-12)

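To make the aggregation change above concrete, here is a minimal sketch using hypothetical per-turn severity scores on the 0-7 harm scale (plain Python, no SDK involved):

```python
# Minimal sketch of the aggregation change, using hypothetical per-turn
# severity scores (not real evaluator output).
from statistics import mean

per_turn_scores = [0.0, 6.0, 1.0]

print(mean(per_turn_scores))  # ~2.33 -> previous default, still used by other conversation-capable evaluators
print(max(per_turn_scores))   # 6.0   -> new behavior for the content safety evaluators
```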
@@ -42,6 +42,7 @@
Message,
OpenAIModelConfiguration,
)
from ._constants import AggregationType

__all__ = [
"evaluate",
@@ -79,4 +80,5 @@
"SexualMultimodalEvaluator",
"ViolenceMultimodalEvaluator",
"ProtectedMaterialMultimodalEvaluator",
"AggregationType",
]
@@ -1,7 +1,9 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import enum
from typing import Literal
from azure.ai.evaluation._common._experimental import experimental


class EvaluationMetrics:
Expand Down Expand Up @@ -57,6 +59,22 @@ class EvaluationRunProperties:
EVALUATION_SDK = "_azureml.evaluation_sdk_name"


@experimental
class AggregationType(enum.Enum):
"""Defines how numeric evaluation results should be aggregated
to produce a single value. Used by individual evaluators to combine per-turn results for
a conversation-based input. In general, wherever this enum is used, it is also possible
to directly assign the underlying aggregation function for more complex use cases.
The 'custom' value is generally not an acceptable input; it should only appear as an output,
indicating that a custom aggregation function has been injected."""

MEAN = "mean"
MAX = "max"
MIN = "min"
SUM = "sum"
CUSTOM = "custom"


DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"

CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
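With the export added in the `__init__.py` hunk above, the enum's members can be inspected directly. A short sketch, assuming the package is installed:

```python
# Assumes azure-ai-evaluation (this package) is installed; AggregationType is
# re-exported from the top-level namespace by the __init__.py change above.
from azure.ai.evaluation import AggregationType

for member in AggregationType:
    print(member.name, member.value)
# MEAN mean
# MAX max
# MIN min
# SUM sum
# CUSTOM custom  <- output-only marker for an injected custom aggregator
```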
@@ -4,15 +4,18 @@

import inspect
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final, Optional

from promptflow._utils.async_utils import async_run_allowing_running_loop
from typing_extensions import ParamSpec, TypeAlias, get_overloads

from azure.ai.evaluation._common.math import list_mean
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
from azure.ai.evaluation._common.utils import remove_optional_singletons
from azure.ai.evaluation._constants import AggregationType
from azure.ai.evaluation._model_configurations import Conversation
from azure.ai.evaluation._common._experimental import experimental

from ._conversation_aggregators import GetAggregator, GetAggregatorType

P = ParamSpec("P")
T = TypeVar("T")
@@ -70,6 +73,13 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
:type not_singleton_inputs: List[str]
:param eval_last_turn: If True, only the last turn of the conversation will be evaluated. Default is False.
:type eval_last_turn: bool
:param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
to produce a single result.
Default is ~azure.ai.evaluation.AggregationType.MEAN.
:type conversation_aggregation_type: ~azure.ai.evaluation.AggregationType
:param conversation_aggregator_override: A function that will be used to aggregate per-turn results. If provided,
overrides the standard aggregator implied by conversation_aggregation_type. None by default.
:type conversation_aggregator_override: Optional[Callable[[List[float]], float]]
"""

# ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
@@ -81,11 +91,17 @@ def __init__(
*,
not_singleton_inputs: List[str] = ["conversation", "kwargs"],
eval_last_turn: bool = False,
conversation_aggregation_type: AggregationType = AggregationType.MEAN,
conversation_aggregator_override: Optional[Callable[[List[float]], float]] = None,
):
self._not_singleton_inputs = not_singleton_inputs
self._eval_last_turn = eval_last_turn
self._singleton_inputs = self._derive_singleton_inputs()
self._async_evaluator = AsyncEvaluatorBase(self._real_call)
self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
if conversation_aggregator_override is not None:
# Type ignore since we already checked for None, but mypy doesn't know that.
self._conversation_aggregation_function = conversation_aggregator_override # type: ignore[assignment]

# This needs to be overridden just to change the function header into something more informative,
# and to be able to add a more specific docstring. The actual function contents should just be
@@ -359,7 +375,7 @@ def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]])
# Find and average all numeric values
for metric, values in evaluation_per_turn.items():
if all(isinstance(value, (int, float)) for value in values):
aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
aggregated[metric] = self._conversation_aggregation_function(cast(List[Union[int, float]], values))
# Slap the per-turn results back in.
aggregated["evaluation_per_turn"] = evaluation_per_turn
return aggregated
@@ -387,10 +403,51 @@ async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], Aggrega
# Otherwise, aggregate results.
return self._aggregate_results(per_turn_results=per_turn_results)

# ~~~ METHODS THAT SHOULD NOT BE OVERRIDDEN BY CHILDREN~~~

@final
def _to_async(self) -> "AsyncEvaluatorBase":
return self._async_evaluator

@experimental
@final
def _set_conversation_aggregation_type(self, conversation_aggregation_type: AggregationType) -> None:
"""Input a conversation aggregation type to re-assign the aggregator function used by this evaluator for
multi-turn conversations. This aggregator is used to combine numeric outputs from each evaluation of a
multi-turn conversation into a single top-level result.
:param conversation_aggregation_type: The type of aggregation to perform on the per-turn
results of a conversation to produce a single result.
:type conversation_aggregation_type: ~azure.ai.evaluation.AggregationType
"""
self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)

@experimental
@final
def _set_conversation_aggregator(self, aggregator: Callable[[List[float]], float]) -> None:
"""Set the conversation aggregator function directly. This function will be applied to all numeric outputs
of an evaluator when it evaluates a conversation with multiple-turns thus ends up with multiple results per
evaluation that is needs to coalesce into a single result. Use when built-in aggregators do not
suit your needs, but use with caution.
:param aggregator: The function to use to aggregate per-turn results.
:type aggregator: Callable[[List[float]], float]
"""
self._conversation_aggregation_function = aggregator

@experimental
@final
def _get_conversation_aggregator_type(self) -> AggregationType:
"""Get the current conversation aggregation type used by this evaluator. This refers to the
method used when a single input produces multiple evaluation results (for example, when a multi-turn conversation
is passed to an evaluator that evaluates each turn individually). The individual results
are combined by the function implied by the returned type to produce a single overall result.
:return: The conversation aggregation type.
:rtype: ~azure.ai.evaluation.AggregationType
"""
return GetAggregatorType(self._conversation_aggregation_function)


class AsyncEvaluatorBase:
"""The asynchronous evaluator hidden underneath all evaluators. This makes generous use passing functions
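A hedged sketch of the private aggregation hooks added to `EvaluatorBase` above. They are underscore-prefixed and experimental, so none of this is public API; the project values are placeholders, `azure.identity` is assumed to be available, and constructing the evaluator makes no service call. An injected callable (via the constructor override or `_set_conversation_aggregator`) takes precedence over the enum-selected aggregator and is reported back as `CUSTOM`:

```python
from azure.ai.evaluation import AggregationType, ViolenceEvaluator
from azure.identity import DefaultAzureCredential  # assumed to be installed alongside the SDK

evaluator = ViolenceEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project={
        "subscription_id": "<subscription-id>",     # placeholder
        "resource_group_name": "<resource-group>",  # placeholder
        "project_name": "<project-name>",           # placeholder
    },
)

# Content safety evaluators now default to MAX for conversation aggregation.
print(evaluator._get_conversation_aggregator_type())  # AggregationType.MAX

# Re-assign via the enum...
evaluator._set_conversation_aggregation_type(AggregationType.MEAN)

# ...or inject a custom callable (here, an upper median), which the getter then reports as CUSTOM.
evaluator._set_conversation_aggregator(lambda scores: sorted(scores)[len(scores) // 2])
print(evaluator._get_conversation_aggregator_type())  # AggregationType.CUSTOM
```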
@@ -15,6 +15,7 @@
from azure.ai.evaluation._common.utils import validate_azure_ai_project
from azure.ai.evaluation._exceptions import EvaluationException
from azure.ai.evaluation._common.utils import validate_conversation
from azure.ai.evaluation._constants import AggregationType
from azure.core.credentials import TokenCredential

from . import EvaluatorBase
@@ -35,6 +36,10 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
aggregated. Per-turn results are still available in the output via the "evaluation_per_turn" key
when this occurs. Default is False, resulting in full conversation evaluation and aggregation.
:type eval_last_turn: bool
:param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
to produce a single result.
Default is ~azure.ai.evaluation.AggregationType.MEAN.
:type conversation_aggregation_type: ~azure.ai.evaluation.AggregationType
"""

@override
@@ -44,8 +49,9 @@ def __init__(
azure_ai_project: dict,
credential: TokenCredential,
eval_last_turn: bool = False,
conversation_aggregation_type: AggregationType = AggregationType.MEAN,
):
super().__init__(eval_last_turn=eval_last_turn)
super().__init__(eval_last_turn=eval_last_turn, conversation_aggregation_type=conversation_aggregation_type)
self._eval_metric = eval_metric
self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
self._credential = credential
@@ -0,0 +1,49 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from typing import Callable, List
from azure.ai.evaluation._common.math import list_mean
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
from azure.ai.evaluation._constants import AggregationType


def GetAggregator(aggregation_type: AggregationType) -> Callable[[List[float]], float]:
if aggregation_type == AggregationType.SUM:
return sum
if aggregation_type == AggregationType.MEAN:
return list_mean
if aggregation_type == AggregationType.MAX:
return max
if aggregation_type == AggregationType.MIN:
return min
if aggregation_type == AggregationType.CUSTOM:
msg = (
"Cannot 'get' aggregator function associated with custom aggregation enum."
+ " This enum value should only be outputted as an indicator of an injected"
+ " aggregation function, not inputted directly"
)
raise EvaluationException(
message=msg,
blame=ErrorBlame.UNKNOWN,
category=ErrorCategory.INVALID_VALUE,
target=ErrorTarget.EVALUATE,
)
raise EvaluationException(
message=f"Unaccounted for aggregation type: {aggregation_type}",
blame=ErrorBlame.UNKNOWN,
category=ErrorCategory.INVALID_VALUE,
target=ErrorTarget.EVALUATE,
)


def GetAggregatorType(aggregation_function: Callable) -> AggregationType:
if aggregation_function == sum: # pylint: disable=comparison-with-callable
return AggregationType.SUM
if aggregation_function == list_mean: # pylint: disable=comparison-with-callable
return AggregationType.MEAN
if aggregation_function == max: # pylint: disable=comparison-with-callable
return AggregationType.MAX
if aggregation_function == min: # pylint: disable=comparison-with-callable
return AggregationType.MIN
return AggregationType.CUSTOM
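A quick round trip through the two helpers above. The module path in the import is an assumption inferred from the relative import in the base evaluator, and the module is private, so this is illustration only:

```python
from azure.ai.evaluation import AggregationType
# Private module; path assumed from the `from ._conversation_aggregators import ...` line above.
from azure.ai.evaluation._evaluators._common._conversation_aggregators import (
    GetAggregator,
    GetAggregatorType,
)

agg = GetAggregator(AggregationType.MAX)
print(agg([1.0, 4.0, 2.0]))              # 4.0
print(GetAggregatorType(agg))            # AggregationType.MAX
print(GetAggregatorType(lambda s: 0.0))  # AggregationType.CUSTOM (unrecognized callable)
# GetAggregator(AggregationType.CUSTOM) raises an EvaluationException, per the code above.
```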
@@ -9,6 +9,7 @@
from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
from azure.ai.evaluation._model_configurations import Conversation
from azure.ai.evaluation._constants import AggregationType


@experimental
@@ -71,6 +72,7 @@ def __init__(
eval_metric=EvaluationMetrics.HATE_FAIRNESS,
azure_ai_project=azure_ai_project,
credential=credential,
conversation_aggregation_type=AggregationType.MAX,
)

@overload
@@ -9,6 +9,7 @@
from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
from azure.ai.evaluation._model_configurations import Conversation
from azure.ai.evaluation._constants import AggregationType


@experimental
@@ -65,6 +66,7 @@ def __init__(
eval_metric=EvaluationMetrics.SELF_HARM,
azure_ai_project=azure_ai_project,
credential=credential,
conversation_aggregation_type=AggregationType.MAX,
)

@overload
@@ -9,6 +9,7 @@
from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
from azure.ai.evaluation._model_configurations import Conversation
from azure.ai.evaluation._constants import AggregationType


@experimental
@@ -67,6 +68,7 @@ def __init__(
eval_metric=EvaluationMetrics.SEXUAL,
azure_ai_project=azure_ai_project,
credential=credential,
conversation_aggregation_type=AggregationType.MAX,
)

@overload
@@ -9,6 +9,7 @@
from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
from azure.ai.evaluation._model_configurations import Conversation
from azure.ai.evaluation._constants import AggregationType


@experimental
@@ -67,6 +68,7 @@ def __init__(
eval_metric=EvaluationMetrics.VIOLENCE,
azure_ai_project=azure_ai_project,
credential=credential,
conversation_aggregation_type=AggregationType.MAX,
)

@overload
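Taken together, the four evaluator changes above mean that a multi-turn conversation scored by any content safety evaluator now reports the worst (maximum) per-turn score as its main result. A hedged usage sketch; project values are placeholders, the call needs access to an Azure AI project, and the result keys in the comments reflect the evaluator's current output format rather than anything shown in this diff:

```python
from azure.ai.evaluation import ViolenceEvaluator
from azure.identity import DefaultAzureCredential  # assumed to be installed alongside the SDK

conversation = {
    "messages": [
        {"role": "user", "content": "What shape has 3 sides?"},
        {"role": "assistant", "content": "A triangle."},
        {"role": "user", "content": "And what shape has 4 sides?"},
        {"role": "assistant", "content": "A square."},
    ]
}

evaluator = ViolenceEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project={
        "subscription_id": "<subscription-id>",     # placeholder
        "resource_group_name": "<resource-group>",  # placeholder
        "project_name": "<project-name>",           # placeholder
    },
)

result = evaluator(conversation=conversation)
# result["violence_score"] is now the maximum of the per-turn scores;
# the per-turn breakdown remains available under result["evaluation_per_turn"].
```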
@@ -0,0 +1,2 @@
{"conversation" : {"context" : "", "messages": [{"content": "What shape has 3 sides", "role" :"user", "context": null}, {"content": "A triangle", "role" :"assistant", "context": "The answer is a triangle."}, {"content": "Next, what shape has 4 sides", "role" :"user", "context": null}, {"content": "A square", "role" :"assistant", "context": "The answer is a square."}]}}
{"conversation" : {"context" : "User wants to know about state capitals", "messages": [{"content": "What is the capital of Hawaii`''\"</>{}{{]", "role" :"user", "context": "User wants to know the capital of Hawaii"}, {"content": "Honolulu", "role" :"assistant", "context": "The answer is a Honolulu."}, {"content": "Ok, what is the capital of Massachusetts", "role" :"user", "context": "User wants to know the capital of Massachusetts."}, {"content": "Boston", "role" :"assistant", "context": "The answer is Boston."}]}}