File size: 7,126 Bytes
4fe4082
 
ae6a8c0
4fe4082
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from dataclasses import dataclass

import gradio as gr

from database import PostgreSQL, Entry
from models import ALL_SUPPORTED_MODELS
from utils import *

db = PostgreSQL()


@dataclass
class State:
    dataset: str = "mmlu"
    topic: str = DEFAULT_TOPIC
    model: str = random.choice(MODELS)

    card_path: str = None
    card: Card = None
    summarizer_model: str = DEFAULT_SUMMARIZER
    card_summary: str = None

    qa: str = None
    model_answer: str = None
    ground_truth: bool = None  # if the model correctly answers the question

    submitted: bool = False

    def __post_init__(self):
        if any([self.card_path is None, self.card is None, self.card_summary is None]):
            assert all([self.card_path is None, self.card is None, self.card_summary is None])
            self.card, self.card_path = sample_random_card(self.dataset, self.topic, self.model)
        # init qa
        if any([self.qa is None, self.model_answer is None, self.ground_truth is None]):
            assert all([self.qa is None, self.model_answer is None, self.ground_truth is None])
            self.qa, self.model_answer, self.ground_truth = sample_random_qa(self.dataset, self.topic, self.model)
        if self.card_summary is None:
            self.card_summary = summarize_card(self.summarizer_model, self.card, self.qa)
            # self.card_summary = ""


def submit_guess(guess: str, reasoning: str, confidence: int, state: State):
    """submit_button.click()
    This function mutates the state.
    """
    if guess is None:
        return "Please make a guess and then submit!", "", state
    if state.submitted:
        return "You have already submitted your guess!", state.model_answer, state
    guess = True if guess == "Correct" else False
    result = "You are right!" if (state.ground_truth == guess) else "You are wrong!"

    # need to store: topic, model, card, question, guess, reasoning, confidence
    entry = Entry(state.model, state.card_path, state.topic, state.qa, guess, state.ground_truth, reasoning, confidence)
    db.insert(entry)

    state.submitted = True
    return result, state.model_answer, state


def next_guess(state: State):
    """next_button.click()
    This function mutates the state.
    """
    state.qa, state.model_answer, state.ground_truth = sample_random_qa(state.dataset, state.topic, state.model)
    state.card_summary = summarize_card(state.summarizer_model, state.card, state.qa)
    state.submitted = False
    return state.qa, state.card_summary, "", "", state


def re_summarize(state: State):
    """re_summarize_button.click()
    This function mutates the state.
    """
    state.card_summary = summarize_card(state.summarizer_model, state.card, state.qa)
    return state.card_summary, state


def switch_card(state: State):
    """switch_card_button.click()
    This function mutates the state.
    """
    old_model = state.model
    state.model = random.choice(MODELS)
    state.card, state.card_path = sample_random_card(state.dataset, state.topic, state.model)
    next_guess(state)
    return old_model, state.qa, "", state.card.get_markdown_str(), state.card_summary, "", state


def init_app():
    theme = gr.themes.Default(
        primary_hue="orange",
        secondary_hue="blue",
        neutral_hue="gray",
        text_size=gr.themes.Size(
            name="text_custom",
            xxs="10px",
            xs="12px",
            sm="14px",
            md="16px",
            lg="20px",
            xl="24px",
            xxl="28px",
        ),
    )

    with gr.Blocks(theme=theme) as app:
        gr_state = gr.State(State())  # this state if only for the current user
        s = gr_state.value
        s: State
        with gr.Row():  # header
            with gr.Column(scale=1):
                with gr.Group():
                    gr_dataset = gr.Dropdown(label="Select a Dataset", choices=DATASETS, value=s.dataset)
                    gr_topic = gr.Dropdown(label="Select a Topic", choices=TOPICS[s.dataset], value=s.topic)
                    gr_switch_card_button = gr.Button("Switch Evaluation Card")
                    gr_previous_model = gr.Textbox(label="Previous Model (A model may have multiple cards.)", value="", interactive=False)
            with gr.Column(scale=2):
                gr_instruction = gr.Markdown(value=read_all("prompts/instructions.md"))
        with gr.Row():
            with gr.Column(scale=1):  # question/guess column
                gr_question = gr.Textbox(label="Question", value=s.qa, interactive=False, show_copy_button=True)
                with gr.Group():
                    gr_reasoning = gr.Textbox(label="Explanation for Your Guess", lines=1, placeholder="Reason your decision (optional)")
                    gr_guess = gr.Radio(label="I believe the model will answer this question", choices=["Correct", "Incorrect"])
                    gr_confidence = gr.Slider(label="Confidence", minimum=1, maximum=5, step=1, value=3)
                    gr_guess_result = gr.Textbox(label="Result", value="",
                                                 placeholder="We will show the result once you submit your guess! :>", interactive=False)
                gr_submit_button = gr.Button("Submit")
                gr_next_button = gr.Button("Next Entry (will not change the full card)")
            with gr.Column(scale=2):  # card column
                with gr.Accordion(label="Full Evaluation Card", open=False):
                    gr_full_card = gr.Markdown(label="Full Evaluation Card", value=s.card.get_markdown_str())
                with gr.Group():
                    with gr.Row():
                        gr_summarizer = gr.Dropdown(label="Select a Model as the Summarizer", choices=ALL_SUPPORTED_MODELS, value=DEFAULT_SUMMARIZER, scale=2, interactive=True)
                        gr_re_summarize_button = gr.Button("Re-generate Summary", scale=1)
                    with gr.Accordion(label="Evaluation Card Summary", open=True):
                        gr_relevant_card = gr.Markdown(value=s.card_summary)
                gr_model_answer = gr.Textbox(label="Model's Answer", value="", interactive=False, show_copy_button=True)

        gr_submit_button.click(fn=submit_guess,
                               inputs=[gr_guess, gr_reasoning, gr_confidence, gr_state],
                               outputs=[gr_guess_result, gr_model_answer, gr_state])
        gr_next_button.click(fn=next_guess,
                             inputs=[gr_state],
                             outputs=[gr_question, gr_relevant_card, gr_model_answer, gr_guess_result, gr_state])
        gr_re_summarize_button.click(fn=re_summarize,
                                     inputs=[gr_state],
                                     outputs=[gr_relevant_card, gr_state])
        gr_switch_card_button.click(fn=switch_card,
                                    inputs=[gr_state],
                                    outputs=[gr_previous_model, gr_question, gr_guess_result, gr_full_card, gr_relevant_card, gr_model_answer, gr_state])

    app.launch()


if __name__ == "__main__":
    init_app()