| """Interfaces to be implemented by general evaluators.""" | |
| from __future__ import annotations | |
| import asyncio | |
| import logging | |
| from abc import ABC, abstractmethod | |
| from enum import Enum | |
| from functools import partial | |
| from typing import Any, Optional, Sequence, Tuple, Union | |
| from warnings import warn | |
| from langchain_core.agents import AgentAction | |
| from langchain_core.language_models import BaseLanguageModel | |
| from langchain.chains.base import Chain | |
| logger = logging.getLogger(__name__) | |


class EvaluatorType(str, Enum):
    """The types of the evaluators."""

    QA = "qa"
    """Question answering evaluator, which grades answers to questions
    directly using an LLM."""
    COT_QA = "cot_qa"
    """Chain of thought question answering evaluator, which grades
    answers to questions using chain of thought 'reasoning'."""
    CONTEXT_QA = "context_qa"
    """Question answering evaluator that incorporates 'context' in the response."""
    PAIRWISE_STRING = "pairwise_string"
    """The pairwise string evaluator, which predicts the preferred prediction from
    between two models."""
    SCORE_STRING = "score_string"
    """The scored string evaluator, which gives a score between 1 and 10
    to a prediction."""
    LABELED_PAIRWISE_STRING = "labeled_pairwise_string"
    """The labeled pairwise string evaluator, which predicts the preferred prediction
    from between two models based on a ground truth reference label."""
    LABELED_SCORE_STRING = "labeled_score_string"
    """The labeled scored string evaluator, which gives a score between 1 and 10
    to a prediction based on a ground truth reference label."""
    AGENT_TRAJECTORY = "trajectory"
    """The agent trajectory evaluator, which grades the agent's intermediate steps."""
    CRITERIA = "criteria"
    """The criteria evaluator, which evaluates a model based on a
    custom set of criteria without any reference labels."""
    LABELED_CRITERIA = "labeled_criteria"
    """The labeled criteria evaluator, which evaluates a model based on a
    custom set of criteria, with a reference label."""
    STRING_DISTANCE = "string_distance"
    """Compare predictions to a reference answer using string edit distances."""
    EXACT_MATCH = "exact_match"
    """Compare predictions to a reference answer using exact matching."""
    REGEX_MATCH = "regex_match"
    """Compare predictions to a reference answer using regular expressions."""
    PAIRWISE_STRING_DISTANCE = "pairwise_string_distance"
    """Compare predictions based on string edit distances."""
    EMBEDDING_DISTANCE = "embedding_distance"
    """Compare a prediction to a reference label using embedding distance."""
    PAIRWISE_EMBEDDING_DISTANCE = "pairwise_embedding_distance"
    """Compare two predictions using embedding distance."""
    JSON_VALIDITY = "json_validity"
    """Check if a prediction is valid JSON."""
    JSON_EQUALITY = "json_equality"
    """Check if a prediction is equal to a reference JSON."""
    JSON_EDIT_DISTANCE = "json_edit_distance"
    """Compute the edit distance between two JSON strings after canonicalization."""
    JSON_SCHEMA_VALIDATION = "json_schema_validation"
    """Check if a prediction is valid JSON according to a JSON schema."""


class LLMEvalChain(Chain):
    """A base class for evaluators that use an LLM."""

    @classmethod
    @abstractmethod
    def from_llm(cls, llm: BaseLanguageModel, **kwargs: Any) -> LLMEvalChain:
        """Create a new evaluator from an LLM."""


class _EvalArgsMixin:
    """Mixin for checking evaluation arguments."""

    @property
    def requires_reference(self) -> bool:
        """Whether this evaluator requires a reference label."""
        return False

    @property
    def requires_input(self) -> bool:
        """Whether this evaluator requires an input string."""
        return False

    @property
    def _skip_input_warning(self) -> str:
        """Warning to show when input is ignored."""
        return f"Ignoring input in {self.__class__.__name__}, as it is not expected."

    @property
    def _skip_reference_warning(self) -> str:
        """Warning to show when reference is ignored."""
        return (
            f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
        )

    def _check_evaluation_args(
        self,
        reference: Optional[str] = None,
        input: Optional[str] = None,
    ) -> None:
        """Check if the evaluation arguments are valid.

        Args:
            reference (Optional[str], optional): The reference label.
            input (Optional[str], optional): The input string.

        Raises:
            ValueError: If the evaluator requires an input string but none is provided,
                or if the evaluator requires a reference label but none is provided.
        """
        if self.requires_input and input is None:
            raise ValueError(f"{self.__class__.__name__} requires an input string.")
        elif input is not None and not self.requires_input:
            warn(self._skip_input_warning)
        if self.requires_reference and reference is None:
            raise ValueError(f"{self.__class__.__name__} requires a reference string.")
        elif reference is not None and not self.requires_reference:
            warn(self._skip_reference_warning)
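

# --- Illustrative sketch (not part of the library source) ---
# The mixin's contract in action: a hypothetical evaluator that declares it
# requires an input. A missing input raises ValueError, while an unexpected
# reference merely triggers a warning.


class _NeedsInputExample(_EvalArgsMixin):
    """Hypothetical class used only to demonstrate argument checking."""

    @property
    def requires_input(self) -> bool:
        return True


# _NeedsInputExample()._check_evaluation_args()  # raises ValueError
# _NeedsInputExample()._check_evaluation_args(input="Q?", reference="r")  # warns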


class StringEvaluator(_EvalArgsMixin, ABC):
    """Grade, tag, or otherwise evaluate predictions relative to their inputs
    and/or reference labels."""

    @property
    def evaluation_name(self) -> str:
        """The name of the evaluation."""
        return self.__class__.__name__

    @property
    def requires_reference(self) -> bool:
        """Whether this evaluator requires a reference label."""
        return False

    @abstractmethod
    def _evaluate_strings(
        self,
        *,
        prediction: Union[str, Any],
        reference: Optional[Union[str, Any]] = None,
        input: Optional[Union[str, Any]] = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate Chain or LLM output, based on optional input and label.

        Args:
            prediction (str): The LLM or chain prediction to evaluate.
            reference (Optional[str], optional): The reference label to evaluate against.
            input (Optional[str], optional): The input to consider during evaluation.
            **kwargs: Additional keyword arguments, including callbacks, tags, etc.

        Returns:
            dict: The evaluation results containing the score or value.
                It is recommended that the dictionary contain the following keys:
                - score: the score of the evaluation, if applicable.
                - value: the string value of the evaluation, if applicable.
                - reasoning: the reasoning for the evaluation, if applicable.
        """  # noqa: E501

    async def _aevaluate_strings(
        self,
        *,
        prediction: Union[str, Any],
        reference: Optional[Union[str, Any]] = None,
        input: Optional[Union[str, Any]] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate Chain or LLM output, based on optional input and label.

        Args:
            prediction (str): The LLM or chain prediction to evaluate.
            reference (Optional[str], optional): The reference label to evaluate against.
            input (Optional[str], optional): The input to consider during evaluation.
            **kwargs: Additional keyword arguments, including callbacks, tags, etc.

        Returns:
            dict: The evaluation results containing the score or value.
                It is recommended that the dictionary contain the following keys:
                - score: the score of the evaluation, if applicable.
                - value: the string value of the evaluation, if applicable.
                - reasoning: the reasoning for the evaluation, if applicable.
        """  # noqa: E501
        return await asyncio.get_running_loop().run_in_executor(
            None,
            partial(
                self._evaluate_strings,
                prediction=prediction,
                reference=reference,
                input=input,
                **kwargs,
            ),
        )

    def evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate Chain or LLM output, based on optional input and label.

        Args:
            prediction (str): The LLM or chain prediction to evaluate.
            reference (Optional[str], optional): The reference label to evaluate against.
            input (Optional[str], optional): The input to consider during evaluation.
            **kwargs: Additional keyword arguments, including callbacks, tags, etc.

        Returns:
            dict: The evaluation results containing the score or value.
        """  # noqa: E501
        self._check_evaluation_args(reference=reference, input=input)
        return self._evaluate_strings(
            prediction=prediction, reference=reference, input=input, **kwargs
        )

    async def aevaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate Chain or LLM output, based on optional input and label.

        Args:
            prediction (str): The LLM or chain prediction to evaluate.
            reference (Optional[str], optional): The reference label to evaluate against.
            input (Optional[str], optional): The input to consider during evaluation.
            **kwargs: Additional keyword arguments, including callbacks, tags, etc.

        Returns:
            dict: The evaluation results containing the score or value.
        """  # noqa: E501
        self._check_evaluation_args(reference=reference, input=input)
        return await self._aevaluate_strings(
            prediction=prediction, reference=reference, input=input, **kwargs
        )
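

# --- Illustrative sketch (not part of the library source) ---
# A minimal custom string evaluator built on the interface above: it declares
# that a reference is required and scores 1 on a case-insensitive exact match.
# The class name and scoring scheme are invented for this example.


class CaseInsensitiveMatchEvaluator(StringEvaluator):
    """Hypothetical evaluator: exact match, ignoring case and outer whitespace."""

    @property
    def requires_reference(self) -> bool:
        return True

    def _evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        match = prediction.strip().lower() == (reference or "").strip().lower()
        return {"score": int(match), "value": "CORRECT" if match else "INCORRECT"}


# CaseInsensitiveMatchEvaluator().evaluate_strings(
#     prediction="Paris", reference="  paris "
# )  # -> {"score": 1, "value": "CORRECT"}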


class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
    """Compare the output of two models (or two outputs of the same model)."""

    @abstractmethod
    def _evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate the output string pairs.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            reference (Optional[str], optional): The expected output / reference string.
            input (Optional[str], optional): The input string.
            **kwargs: Additional keyword arguments, such as callbacks and optional reference strings.

        Returns:
            dict: A dictionary containing the preference, scores, and/or other information.
        """  # noqa: E501

    async def _aevaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate the output string pairs.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            reference (Optional[str], optional): The expected output / reference string.
            input (Optional[str], optional): The input string.
            **kwargs: Additional keyword arguments, such as callbacks and optional reference strings.

        Returns:
            dict: A dictionary containing the preference, scores, and/or other information.
        """  # noqa: E501
        return await asyncio.get_running_loop().run_in_executor(
            None,
            partial(
                self._evaluate_string_pairs,
                prediction=prediction,
                prediction_b=prediction_b,
                reference=reference,
                input=input,
                **kwargs,
            ),
        )

    def evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate the output string pairs.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            reference (Optional[str], optional): The expected output / reference string.
            input (Optional[str], optional): The input string.
            **kwargs: Additional keyword arguments, such as callbacks and optional reference strings.

        Returns:
            dict: A dictionary containing the preference, scores, and/or other information.
        """  # noqa: E501
        self._check_evaluation_args(reference=reference, input=input)
        return self._evaluate_string_pairs(
            prediction=prediction,
            prediction_b=prediction_b,
            reference=reference,
            input=input,
            **kwargs,
        )

    async def aevaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate the output string pairs.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            reference (Optional[str], optional): The expected output / reference string.
            input (Optional[str], optional): The input string.
            **kwargs: Additional keyword arguments, such as callbacks and optional reference strings.

        Returns:
            dict: A dictionary containing the preference, scores, and/or other information.
        """  # noqa: E501
        self._check_evaluation_args(reference=reference, input=input)
        return await self._aevaluate_string_pairs(
            prediction=prediction,
            prediction_b=prediction_b,
            reference=reference,
            input=input,
            **kwargs,
        )
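

# --- Illustrative sketch (not part of the library source) ---
# A toy pairwise evaluator built on the interface above: it prefers the
# shorter of the two predictions, standing in for a real preference model.
# The class name and preference rule are invented for this example.


class PreferShorterEvaluator(PairwiseStringEvaluator):
    """Hypothetical evaluator that prefers the more concise prediction."""

    def _evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        preferred = "A" if len(prediction) <= len(prediction_b) else "B"
        return {"value": preferred, "reasoning": "The shorter answer is preferred."}


# PreferShorterEvaluator().evaluate_string_pairs(
#     prediction="Yes.", prediction_b="Yes, absolutely, without a doubt."
# )  # -> {"value": "A", ...}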


class AgentTrajectoryEvaluator(_EvalArgsMixin, ABC):
    """Interface for evaluating agent trajectories."""

    @property
    def requires_input(self) -> bool:
        """Whether this evaluator requires an input string."""
        return True

    @abstractmethod
    def _evaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        input: str,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate a trajectory.

        Args:
            prediction (str): The final predicted response.
            agent_trajectory (Sequence[Tuple[AgentAction, str]]):
                The intermediate steps forming the agent trajectory.
            input (str): The input to the agent.
            reference (Optional[str]): The reference answer.

        Returns:
            dict: The evaluation result.
        """

    async def _aevaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        input: str,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate a trajectory.

        Args:
            prediction (str): The final predicted response.
            agent_trajectory (Sequence[Tuple[AgentAction, str]]):
                The intermediate steps forming the agent trajectory.
            input (str): The input to the agent.
            reference (Optional[str]): The reference answer.

        Returns:
            dict: The evaluation result.
        """
        return await asyncio.get_running_loop().run_in_executor(
            None,
            partial(
                self._evaluate_agent_trajectory,
                prediction=prediction,
                agent_trajectory=agent_trajectory,
                reference=reference,
                input=input,
                **kwargs,
            ),
        )

    def evaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        input: str,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate a trajectory.

        Args:
            prediction (str): The final predicted response.
            agent_trajectory (Sequence[Tuple[AgentAction, str]]):
                The intermediate steps forming the agent trajectory.
            input (str): The input to the agent.
            reference (Optional[str]): The reference answer.

        Returns:
            dict: The evaluation result.
        """
        self._check_evaluation_args(reference=reference, input=input)
        return self._evaluate_agent_trajectory(
            prediction=prediction,
            input=input,
            agent_trajectory=agent_trajectory,
            reference=reference,
            **kwargs,
        )

    async def aevaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        input: str,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate a trajectory.

        Args:
            prediction (str): The final predicted response.
            agent_trajectory (Sequence[Tuple[AgentAction, str]]):
                The intermediate steps forming the agent trajectory.
            input (str): The input to the agent.
            reference (Optional[str]): The reference answer.

        Returns:
            dict: The evaluation result.
        """
        self._check_evaluation_args(reference=reference, input=input)
        return await self._aevaluate_agent_trajectory(
            prediction=prediction,
            input=input,
            agent_trajectory=agent_trajectory,
            reference=reference,
            **kwargs,
        )
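

# --- Illustrative sketch (not part of the library source) ---
# A toy trajectory evaluator built on the interface above: it scores a run by
# the inverse of the number of intermediate steps, rewarding short
# trajectories. The class name and scoring rule are invented for this example.


class StepCountEvaluator(AgentTrajectoryEvaluator):
    """Hypothetical evaluator that rewards short agent trajectories."""

    def _evaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        input: str,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        steps = len(agent_trajectory)
        return {"score": 1.0 / (1 + steps), "reasoning": f"{steps} step(s) taken"}


# StepCountEvaluator().evaluate_agent_trajectory(
#     prediction="42",
#     input="What is 6 * 7?",
#     agent_trajectory=[
#         (AgentAction(tool="calculator", tool_input="6 * 7", log=""), "42"),
#     ],
# )  # -> {"score": 0.5, ...}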