File size: 3,978 Bytes
b077b7d
3ad3f59
78a1bf0
b077b7d
 
 
 
2b2ab5b
b207b4c
 
 
 
 
 
 
 
 
 
b077b7d
 
 
2b2ab5b
b207b4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
af688eb
2b2ab5b
 
 
 
 
 
b077b7d
2946856
 
 
3ad3f59
 
 
 
 
 
af688eb
2b2ab5b
 
 
b207b4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b2ab5b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import weave
from pydantic import BaseModel
from rich.progress import track

from .base import Guardrail


class GuardrailManager(weave.Model):
    """
    GuardrailManager is responsible for managing and executing a series of guardrails
    on a given prompt. It utilizes the `weave` framework to define operations that
    can be applied to the guardrails.

    Attributes:
        guardrails (list[Guardrail]): A list of Guardrail objects that define the
            rules and checks to be applied to the input prompt.
    """

    guardrails: list[Guardrail]

    @weave.op()
    def guard(self, prompt: str, progress_bar: bool = True, **kwargs) -> dict:
        """
        Execute a series of guardrails on a given prompt and return the results.

        This method iterates over a list of Guardrail objects, applying each guardrail's
        `guard` method to the provided prompt. It collects responses from each guardrail
        and compiles them into a summary report. The function also determines the overall
        safety of the prompt based on the responses from the guardrails.

        Args:
            prompt (str): The input prompt to be evaluated by the guardrails.
            progress_bar (bool, optional): If True, displays a progress bar while
                processing the guardrails. Defaults to True.
            **kwargs: Additional keyword arguments to be passed to each guardrail's
                `guard` method.

        Returns:
            dict: A dictionary containing:
                - "safe" (bool): Indicates whether the prompt is considered safe
                  based on the guardrails' evaluations.
                - "alerts" (list): A list of dictionaries, each containing the name
                  of the guardrail and its response.
                - "summary" (str): A formatted string summarizing the results of
                  each guardrail's evaluation.
        """
        alerts, summaries, safe = [], "", True
        iterable = (
            track(self.guardrails, description="Running guardrails")
            if progress_bar
            else self.guardrails
        )
        for guardrail in iterable:
            response = guardrail.guard(prompt, **kwargs)
            alerts.append(
                {"guardrail_name": guardrail.__class__.__name__, "response": response}
            )
            if isinstance(response, BaseModel):
                safe = safe and response.safe
                summaries += f"**{guardrail.__class__.__name__}**: {response.explanation}\n\n---\n\n"
            else:
                safe = safe and response["safe"]
                summaries += f"**{guardrail.__class__.__name__}**: {response['summary']}\n\n---\n\n"
        return {"safe": safe, "alerts": alerts, "summary": summaries}

    @weave.op()
    def predict(self, prompt: str, **kwargs) -> dict:
        """
        Predicts the safety and potential issues of a given input prompt using the guardrails.

        This function serves as a wrapper around the `guard` method, providing a simplified
        interface for evaluating the input prompt without displaying a progress bar. It
        applies a series of guardrails to the prompt and returns a detailed assessment.

        Args:
            prompt (str): The input prompt to be evaluated by the guardrails.
            **kwargs: Additional keyword arguments to be passed to each guardrail's
                `guard` method.

        Returns:
            dict: A dictionary containing:
                - "safe" (bool): Indicates whether the prompt is considered safe
                  based on the guardrails' evaluations.
                - "alerts" (list): A list of dictionaries, each containing the name
                  of the guardrail and its response.
                - "summary" (str): A formatted string summarizing the results of
                  each guardrail's evaluation.
        """
        return self.guard(prompt, progress_bar=False, **kwargs)