import os

import pandas as pd
import pymupdf4llm
import weave
from firerequests import FireRequests


@weave.op()
def get_markdown_from_pdf_url(url: str) -> str:
    """Download the PDF at `url` and convert its contents to markdown text."""
    FireRequests().download(url, "temp.pdf", show_progress=False)
    try:
        markdown = pymupdf4llm.to_markdown("temp.pdf", show_progress=False)
    finally:
        # Remove the temporary file even if the markdown conversion fails.
        os.remove("temp.pdf")
    return markdown

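# Example usage (a minimal sketch; the URL below is only a placeholder):
#
#     markdown_text = get_markdown_from_pdf_url("https://example.com/paper.pdf")
#     print(markdown_text[:500])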

class EvaluationCallManager:
    """
    Manages the evaluation calls for a specific project and entity in Weave.

    This class is responsible for initializing and managing evaluation calls associated with a
    specific project and entity. It provides functionality to collect guardrail guard calls
    from evaluation predictions and scores, and render these calls into a structured format
    suitable for display in Streamlit.

    Args:
        entity (str): The Weave entity name.
        project (str): The Weave project name.
        call_id (str): The ID of the base evaluation call whose child calls are inspected.
        max_count (int): The maximum number of guardrail guard calls to collect from the evaluation.
    """

    def __init__(self, entity: str, project: str, call_id: str, max_count: int = 10):
        self.base_call = weave.init(f"{entity}/{project}").get_call(call_id=call_id)
        self.max_count = max_count
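        # Set to True when collection stops early at `max_count`, so the app
        # can warn that not all evaluation calls were rendered.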
        self.show_warning_in_app = False
        self.call_list = []

    def collect_guardrail_guard_calls_from_eval(self):
        """
        Collects guardrail guard calls from evaluation predictions and scores.

        This function iterates through the children calls of the base evaluation call,
        extracting relevant guardrail guard calls and their associated scores. It stops
        collecting calls if it encounters an "Evaluation.summarize" operation or if the
        maximum count of guardrail guard calls is reached. The collected calls are stored
        in a list of dictionaries, each containing the input prompt, outputs, and score.

        Returns:
            list: A list of dictionaries, each containing:
                - input_prompt (str): The input prompt for the guard call.
                - outputs (dict): The outputs of the guard call.
                - score (dict): The score of the guard call.
        """
        guard_calls, count = [], 0
        for eval_predict_and_score_call in self.base_call.children():
            if "Evaluation.summarize" in eval_predict_and_score_call._op_name:
                break
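            # Assumes each predict-and-score call has the guardrail's predict
            # call as its first child (whose own first child is the guard call)
            # and the scorer call as its second child.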
            guardrail_predict_call = eval_predict_and_score_call.children()[0]
            guard_call = guardrail_predict_call.children()[0]
            score_call = eval_predict_and_score_call.children()[1]
            guard_calls.append(
                {
                    "input_prompt": str(guard_call.inputs["prompt"]),
                    "outputs": dict(guard_call.output),
                    "score": dict(score_call.output),
                }
            )
            count += 1
            if count >= self.max_count:
                self.show_warning_in_app = True
                break
        return guard_calls

    def render_calls_to_streamlit(self):
        """
        Renders the collected guardrail guard calls into a pandas DataFrame suitable for
        display in Streamlit.

        This function processes the collected guardrail guard calls stored in `self.call_list` and
        organizes them into a dictionary format that can be easily converted into a pandas DataFrame.
        The DataFrame contains columns for the input prompts, the safety status of the outputs, and
        the correctness of the predictions for each guardrail.

        The structure of the DataFrame is as follows:
        - The first column contains the input prompts.
        - Subsequent columns contain the safety status and prediction correctness for each guardrail.

        Returns:
            pd.DataFrame: A DataFrame containing the input prompts, safety status, and prediction
                          correctness for each guardrail.
        """
        if not self.call_list:
            return pd.DataFrame()
        dataframe = {
            "input_prompt": [
                call["input_prompt"] for call in self.call_list[0]["calls"]
            ]
        }
        for guardrail_call in self.call_list:
            dataframe[guardrail_call["guardrail_name"] + ".safe"] = [
                call["outputs"]["safe"] for call in guardrail_call["calls"]
            ]
            dataframe[guardrail_call["guardrail_name"] + ".prediction_correctness"] = [
                call["score"]["correct"] for call in guardrail_call["calls"]
            ]
        return pd.DataFrame(dataframe)
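

# Example usage inside a Streamlit app (a minimal sketch; the entity, project,
# call id, and guardrail name below are placeholders, and `import streamlit as st`
# is assumed):
#
#     manager = EvaluationCallManager(
#         entity="my-entity", project="my-project", call_id="<evaluation-call-id>"
#     )
#     manager.call_list.append(
#         {
#             "guardrail_name": "MyGuardrail",
#             "calls": manager.collect_guardrail_guard_calls_from_eval(),
#         }
#     )
#     st.dataframe(manager.render_calls_to_streamlit())
#     if manager.show_warning_in_app:
#         st.warning(f"Only the first {manager.max_count} calls were rendered.")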