Spaces:

wandb
/

guardrails-genie

Running

File size: 3,978 Bytes

b077b7d
3ad3f59
78a1bf0
b077b7d
 
 
 
2b2ab5b
b207b4c
 
 
 
 
 
 
 
 
 
b077b7d
 
 
2b2ab5b
b207b4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
af688eb
2b2ab5b
 
 
 
 
 
b077b7d
2946856
 
 
3ad3f59
 
 
 
 
 
af688eb
2b2ab5b
 
 
b207b4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b2ab5b

import weave
from pydantic import BaseModel
from rich.progress import track

from .base import Guardrail


class GuardrailManager(weave.Model):
    """
    GuardrailManager is responsible for managing and executing a series of guardrails
    on a given prompt. It utilizes the `weave` framework to define operations that
    can be applied to the guardrails.

    Attributes:
        guardrails (list[Guardrail]): A list of Guardrail objects that define the
            rules and checks to be applied to the input prompt.
    """

    guardrails: list[Guardrail]

    @weave.op()
    def guard(self, prompt: str, progress_bar: bool = True, **kwargs) -> dict:
        """
        Execute a series of guardrails on a given prompt and return the results.

        This method iterates over a list of Guardrail objects, applying each guardrail's
        `guard` method to the provided prompt. It collects responses from each guardrail
        and compiles them into a summary report. The function also determines the overall
        safety of the prompt based on the responses from the guardrails.

        Args:
            prompt (str): The input prompt to be evaluated by the guardrails.
            progress_bar (bool, optional): If True, displays a progress bar while
                processing the guardrails. Defaults to True.
            **kwargs: Additional keyword arguments to be passed to each guardrail's
                `guard` method.

        Returns:
            dict: A dictionary containing:
                - "safe" (bool): Indicates whether the prompt is considered safe
                  based on the guardrails' evaluations.
                - "alerts" (list): A list of dictionaries, each containing the name
                  of the guardrail and its response.
                - "summary" (str): A formatted string summarizing the results of
                  each guardrail's evaluation.
        """
        alerts, summaries, safe = [], "", True
        iterable = (
            track(self.guardrails, description="Running guardrails")
            if progress_bar
            else self.guardrails
        )
        for guardrail in iterable:
            response = guardrail.guard(prompt, **kwargs)
            alerts.append(
                {"guardrail_name": guardrail.__class__.__name__, "response": response}
            )
            if isinstance(response, BaseModel):
                safe = safe and response.safe
                summaries += f"**{guardrail.__class__.__name__}**: {response.explanation}\n\n---\n\n"
            else:
                safe = safe and response["safe"]
                summaries += f"**{guardrail.__class__.__name__}**: {response['summary']}\n\n---\n\n"
        return {"safe": safe, "alerts": alerts, "summary": summaries}

    @weave.op()
    def predict(self, prompt: str, **kwargs) -> dict:
        """
        Predicts the safety and potential issues of a given input prompt using the guardrails.

        This function serves as a wrapper around the `guard` method, providing a simplified
        interface for evaluating the input prompt without displaying a progress bar. It
        applies a series of guardrails to the prompt and returns a detailed assessment.

        Args:
            prompt (str): The input prompt to be evaluated by the guardrails.
            **kwargs: Additional keyword arguments to be passed to each guardrail's
                `guard` method.

        Returns:
            dict: A dictionary containing:
                - "safe" (bool): Indicates whether the prompt is considered safe
                  based on the guardrails' evaluations.
                - "alerts" (list): A list of dictionaries, each containing the name
                  of the guardrail and its response.
                - "summary" (str): A formatted string summarizing the results of
                  each guardrail's evaluation.
        """
        return self.guard(prompt, progress_bar=False, **kwargs)