Source code for arize.experiments.evaluators.base

"""Base evaluator classes for experiment evaluation."""

from __future__ import annotations

import functools
import inspect
from abc import ABC
from collections.abc import Awaitable, Callable, Mapping, Sequence
from types import MappingProxyType
from typing import TYPE_CHECKING, Any, cast

from arize.experiments.evaluators.types import (
    AnnotatorKind,
    EvaluationResult,
    EvaluatorKind,
    EvaluatorName,
    EvaluatorOutput,
    JSONSerializable,
)

if TYPE_CHECKING:
    from arize.experiments.types import (
        ExampleInput,
        ExampleMetadata,
        ExampleOutput,
        TaskOutput,
    )


class Evaluator(ABC):
    """Abstract foundation that experiment evaluators derive from.

    A concrete subclass has to provide `evaluate`, `async_evaluate`, or
    (preferably) both. This class exists only to be inherited from; trying
    to instantiate it directly raises `TypeError`.
    """

    _kind: EvaluatorKind
    _name: EvaluatorName

    @functools.cached_property
    def name(self) -> EvaluatorName:
        """Return `_name` when it is set, otherwise the class name."""
        return self._name if hasattr(self, "_name") else self.__class__.__name__

    @functools.cached_property
    def kind(self) -> EvaluatorKind:
        """Return `_kind` when it is set, otherwise the CODE annotator kind."""
        return self._kind if hasattr(self, "_kind") else AnnotatorKind.CODE.value

    def __new__(cls, *args: object, **kwargs: object) -> Evaluator:
        """Allocate an instance, refusing to build the abstract base itself.

        Raises:
            TypeError: If `Evaluator` is instantiated directly.
        """
        if cls is not Evaluator:
            return object.__new__(cls)
        raise TypeError(
            f"{cls.__name__} is an abstract class and should not be instantiated."
        )

    def evaluate(
        self,
        *,
        dataset_row: Mapping[str, JSONSerializable] | None = None,
        input: ExampleInput = MappingProxyType({}),
        output: TaskOutput | None = None,
        experiment_output: TaskOutput | None = None,
        dataset_output: ExampleOutput = MappingProxyType({}),
        metadata: ExampleMetadata = MappingProxyType({}),
        **kwargs: object,
    ) -> EvaluationResult:
        """Synchronously score one example; meant to be overridden.

        Subclasses supply the real evaluation logic here and/or in
        `async_evaluate`. Overriding both is recommended, not mandatory.

        Args:
            dataset_row (Mapping[str, JSONSerializable] | :obj:`None`): A row
                from the dataset.
            input (ExampleInput): The input provided for evaluation.
            output (TaskOutput | :obj:`None`): The output produced by the task.
            experiment_output (TaskOutput | :obj:`None`): The experiment
                output for comparison.
            dataset_output (ExampleOutput): The expected output from the
                dataset.
            metadata (ExampleMetadata): Metadata associated with the example.
            **kwargs (Any): Additional keyword arguments.

        Raises:
            NotImplementedError: Always, unless a subclass overrides this
                method.
        """
        # The base implementation is only a placeholder for subclasses.
        raise NotImplementedError

    async def async_evaluate(
        self,
        *,
        dataset_row: Mapping[str, JSONSerializable] | None = None,
        input: ExampleInput = MappingProxyType({}),
        output: TaskOutput | None = None,
        experiment_output: TaskOutput | None = None,
        dataset_output: ExampleOutput = MappingProxyType({}),
        metadata: ExampleMetadata = MappingProxyType({}),
        **kwargs: object,
    ) -> EvaluationResult:
        """Asynchronously score one example.

        The base implementation simply forwards every argument to the
        synchronous `evaluate`, so a subclass that only overrides `evaluate`
        still works when called asynchronously. Overriding both methods is
        recommended, not mandatory.

        Args:
            dataset_row (Mapping[str, JSONSerializable] | :obj:`None`): A row
                from the dataset.
            input (ExampleInput): The input provided for evaluation.
            output (TaskOutput | :obj:`None`): The output produced by the task.
            experiment_output (TaskOutput | :obj:`None`): The experiment
                output for comparison.
            dataset_output (ExampleOutput): The expected output from the
                dataset.
            metadata (ExampleMetadata): Metadata associated with the example.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            EvaluationResult: The result of the evaluation.

        Raises:
            NotImplementedError: If `evaluate` is not overridden either.
        """
        forwarded = {
            "dataset_row": dataset_row,
            "input": input,
            "output": output,
            "experiment_output": experiment_output,
            "dataset_output": dataset_output,
            "metadata": metadata,
        }
        return self.evaluate(**forwarded, **kwargs)

    def __init_subclass__(
        cls, is_abstract: bool = False, **kwargs: object
    ) -> None:
        """Check, at class-creation time, that a subclass is usable.

        Walks the MRO of the new class up to (not including) `LLMEvaluator`
        or `Evaluator`. The first class that defines `evaluate` or
        `async_evaluate` in its own namespace has that method's signature
        validated; if no class defines either, the subclass is rejected.

        Args:
            is_abstract: Whether the subclass is abstract and should skip
                validation.
            **kwargs: Additional keyword arguments for parent class.

        Raises:
            TypeError: If the method found is not callable.
            ValueError: If neither method is implemented (signature problems
                are reported by `_validate_sig`).
        """
        super().__init_subclass__(**kwargs)
        if is_abstract:
            return
        base_signature = inspect.signature(Evaluator.evaluate)
        # (attribute looked up in the class namespace, label used in messages)
        hooks = (
            (Evaluator.evaluate.__name__, "evaluate"),
            (Evaluator.async_evaluate.__name__, "async_evaluate"),
        )
        for klass in inspect.getmro(cls):
            if klass in (LLMEvaluator, Evaluator):
                break
            for attr_name, label in hooks:
                candidate = klass.__dict__.get(attr_name)
                if not candidate:
                    continue
                if isinstance(candidate, classmethod):
                    candidate = candidate.__func__
                if not callable(candidate):
                    raise TypeError(
                        f"`{label}()` method should be callable, got {type(candidate)}"
                    )
                # Bind a dummy first argument so `self` is dropped from the
                # signature that gets validated.
                _validate_sig(
                    functools.partial(candidate, cast("Any", None)), label
                )
                return
        raise ValueError(
            f"Evaluator must implement either "
            f"`def evaluate{base_signature}` or "
            f"`async def async_evaluate{base_signature}`"
        )
def _validate_sig(fn: Callable[..., object], fn_name: str) -> None:
    """Validate an evaluator method's signature and require `**kwargs`.

    Args:
        fn: The callable whose signature is inspected.
        fn_name: Name of the method, used in the error message.

    Raises:
        ValueError: If `validate_evaluator_signature` rejects the signature,
            or if the signature has no variadic keyword parameter.
    """
    sig = inspect.signature(fn)
    validate_evaluator_signature(sig)
    accepts_var_keyword = any(
        param.kind is inspect.Parameter.VAR_KEYWORD
        for param in sig.parameters.values()
    )
    if not accepts_var_keyword:
        raise ValueError(
            f"`{fn_name}` should allow variadic keyword arguments `**kwargs`"
        )


def validate_evaluator_signature(sig: inspect.Signature) -> None:
    """Validate that a function signature is compatible for use as an evaluator.

    A signature with exactly one parameter is accepted whatever its name.
    With several parameters, each one must either carry a recognized name,
    have a default value, or be the `**kwargs` parameter.

    Args:
        sig: The function signature to validate.

    Raises:
        ValueError: If the signature has no parameters, or if a
            multi-parameter signature contains a required parameter whose
            name is not recognized. The first offending parameter in
            signature order is the one reported.
    """
    # Fail early, before any evaluation runs, when the signature is unusable.
    params = sig.parameters
    # A tuple (not a set) keeps the order stable in the error message below.
    valid_named_params = (
        "dataset_row",
        "input",
        "output",
        "experiment_output",
        "dataset_output",
        "metadata",
    )
    if not params:
        raise ValueError(
            "Evaluation function must have at least one parameter."
        )
    if len(params) == 1:
        return
    # Walk parameters in declaration order so the reported name is
    # deterministic rather than dependent on set iteration order.
    for param_name, param in params.items():
        if param_name in valid_named_params:
            continue
        if (
            param.kind is inspect.Parameter.VAR_KEYWORD
            or param.default is not inspect.Parameter.empty
        ):
            continue
        raise ValueError(
            f"Invalid parameter names in evaluation function: {param_name}. "
            "Parameters names for multi-argument functions must be "
            f"any of: {', '.join(valid_named_params)}."
        )
class CodeEvaluator(Evaluator, ABC, is_abstract=True):
    """A convenience super class for defining code evaluators.

    There are functionally no differences between this class and the
    `Evaluator` class, except that this class has a default `_kind` attribute
    for AnnotatorKind.CODE.

    This class is intended to be subclassed, and should not be instantiated
    directly.
    """

    # Use the member's `.value`, exactly as `Evaluator.kind` does for its
    # default. `str()` on a standard Enum member yields
    # "AnnotatorKind.CODE" rather than the value, which would make this class
    # report a different kind string than a plain `Evaluator` subclass.
    _kind = AnnotatorKind.CODE.value

    def __new__(cls, *args: object, **kwargs: object) -> CodeEvaluator:
        """Allocate an instance, refusing to build the abstract class itself.

        Raises:
            TypeError: If `CodeEvaluator` is instantiated directly.
        """
        if cls is CodeEvaluator:
            raise TypeError(
                f"{cls.__name__} is an abstract class and should not be instantiated."
            )
        return object.__new__(cls)
class LLMEvaluator(Evaluator, ABC, is_abstract=True):
    """A convenience super class for defining LLM evaluators.

    There are functionally no differences between this class and the
    `Evaluator` class, except that this class has a default `_kind` attribute
    for AnnotatorKind.LLM.

    This class is intended to be subclassed, and should not be instantiated
    directly.
    """

    # Use the member's `.value`, mirroring how `Evaluator.kind` derives its
    # default from `AnnotatorKind.CODE.value`. `str()` on a standard Enum
    # member yields "AnnotatorKind.LLM" rather than the value.
    _kind = AnnotatorKind.LLM.value

    def __new__(cls, *args: object, **kwargs: object) -> LLMEvaluator:
        """Allocate an instance, refusing to build the abstract class itself.

        Raises:
            TypeError: If `LLMEvaluator` is instantiated directly.
        """
        if cls is LLMEvaluator:
            raise TypeError(
                f"{cls.__name__} is an abstract class and should not be instantiated."
            )
        return object.__new__(cls)
# A single evaluator: an `Evaluator` instance, or a plain callable that
# returns an `EvaluatorOutput` either directly or through an awaitable.
ExperimentEvaluator = (
    Evaluator
    | Callable[..., EvaluatorOutput]
    | Callable[..., Awaitable[EvaluatorOutput]]
)

# One evaluator, a sequence of evaluators, or a mapping from `EvaluatorName`
# to evaluator.
Evaluators = (
    ExperimentEvaluator
    | Sequence[ExperimentEvaluator]
    | Mapping[EvaluatorName, ExperimentEvaluator]
)