Spaces: Running on Zero
Added Context preload and inference interruption (#2)
- Added Context preload and inference interruption (189d8573833ade641554ffcb21e5f5e30412ea75)
Co-authored-by: Kai <kai-aizip@users.noreply.huggingface.co>
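
The core of the change is an interruptible, time-boxed inference step: a shared threading.Event (generation_interrupt in app.py) is set whenever the user asks for a new question, the worker thread that runs model inference checks it before publishing results, and the caller polls with a timeout instead of blocking. Below is a minimal, self-contained sketch of that pattern, not the Space's actual code; slow_generate is a hypothetical stand-in for the real generate_summaries call.

import threading
import time

# Shared flag, mirroring generation_interrupt in app.py.
generation_interrupt = threading.Event()

def slow_generate():
    """Hypothetical stand-in for model inference."""
    time.sleep(5)
    return "summary"

def generate_with_timeout(timeout=30):
    """Run slow_generate in a daemon thread; poll until done, interrupted, or timed out."""
    result = {"summary": None, "completed": False}

    def run():
        out = slow_generate()
        if not generation_interrupt.is_set():  # discard results if a new request arrived
            result["summary"] = out
            result["completed"] = True

    worker = threading.Thread(target=run, daemon=True)
    worker.start()

    start = time.time()
    while time.time() - start < timeout:
        if generation_interrupt.is_set() or not worker.is_alive() or result["completed"]:
            break
        time.sleep(0.1)  # keep the calling thread responsive
    return result

if __name__ == "__main__":
    generation_interrupt.clear()             # reset before starting a request
    print(generate_with_timeout(timeout=2))  # times out: {'summary': None, 'completed': False}
    generation_interrupt.set()               # e.g. the user clicked "Get Random Question"

Note that the worker thread is never killed; the event only tells the caller to ignore whatever the thread eventually produces, which is also how the diff below handles interruption.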
app.py CHANGED
@@ -2,17 +2,17 @@ import gradio as gr
 import random
 import pandas as pd
 import os
+import threading
+from threading import Event
 from utils.data_loader import get_random_example
 from utils.models import generate_summaries, model_names
 from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
 from utils.leaderboard import load_leaderboard_data, save_leaderboard_data

-#
-
-with open(css_path, 'r') as f:
-    css_content = f.read()
+# Global interrupt mechanism for model generation
+generation_interrupt = Event()

-# Feedback options
+# Feedback options for different voting outcomes
 feedback_options = {
     "left": ["Model A: More complete", "Model A: More accurate", "Model A: More relevant", "Model A: Better written", "Model A: Better refusal (if applicable)"],
     "right": ["Model B: More complete", "Model B: More accurate", "Model B: More relevant", "Model B: Better written", "Model B: Better refusal (if applicable)"],
@@ -20,15 +20,11 @@ feedback_options = {
     "neither": ["Both incomplete", "Both hallucinate", "Both irrelevant", "Both incorrectly refuse (if applicable)", "A is bad", "B is bad"]
 }

-def
-    """
-
-    agg_results = load_leaderboard_data()
-
+def load_context():
+    """Load a new question and context (fast operation)"""
+    generation_interrupt.clear()
     example = get_random_example()
-
-    s_a, s_b = generate_summaries(example, m_a_name, m_b_name)
-
+
     context_desc = example.get('processed_context_desc', '')
     if context_desc:
         context_desc = f"<div class='context-topic'><span class='topic-label'>The question and context are about:</span> {context_desc}</div>"
@@ -37,36 +33,98 @@ def load_new_question_improved(agg_results=None, show_full=False):
     context_html = get_context_html(example, show_full=show_full)

     return [
-        example,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        example,
+        gr.update(value=example['question']),
+        gr.update(value=context_desc, visible=bool(context_desc)),
+        gr.update(value=context_html),
+        gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]),
+        show_full
+    ]
+
+def generate_model_summaries_with_timeout(example, timeout=30):
+    """Run model inference in a separate thread with timeout for interruptibility"""
+    import threading
+    import time
+
+    result = {
+        "model_a": "",
+        "model_b": "",
+        "summary_a": "",
+        "summary_b": "",
+        "completed": False
+    }
+
+    if generation_interrupt.is_set():
+        return result
+
+    def run_generation():
+        try:
+            m_a_name, m_b_name = random.sample(model_names, 2)
+            s_a, s_b = generate_summaries(example, m_a_name, m_b_name)
+
+            if not generation_interrupt.is_set():
+                result["model_a"] = m_a_name
+                result["model_b"] = m_b_name
+                result["summary_a"] = s_a
+                result["summary_b"] = s_b
+                result["completed"] = True
+        except Exception as e:
+            print(f"Error in generation thread: {e}")
+
+    generation_thread = threading.Thread(target=run_generation)
+    generation_thread.daemon = True
+    generation_thread.start()
+
+    start_time = time.time()
+    while time.time() - start_time < timeout:
+        if generation_interrupt.is_set() or not generation_thread.is_alive() or result["completed"]:
+            break
+        time.sleep(0.1)
+
+    return result
+
+def process_generation_result(result):
+    """Process the results from the threaded generation function"""
+    if not result["completed"]:
+        # Generation was interrupted or failed
+        return [
+            "", "", "", "", None, [], False, load_leaderboard_data(),
+            gr.update(value="Generation was interrupted or timed out. Please try again."),
+            gr.update(value="Generation was interrupted or timed out. Please try again."),
+            gr.update(interactive=True, elem_classes=["vote-button"]),
+            gr.update(interactive=True, elem_classes=["vote-button"]),
+            gr.update(interactive=True, elem_classes=["vote-button"]),
+            gr.update(interactive=True, elem_classes=["vote-button", "vote-button-neither"]),
+            gr.update(choices=[], value=[], interactive=False, visible=False),
+            gr.update(visible=False),
+            gr.update(interactive=False, visible=True),
+            gr.update(visible=False),
+            gr.update(interactive=True),
+            gr.update(elem_classes=[])
+        ]
+
+    # Generation completed successfully
+    agg_results = load_leaderboard_data()
+    return [
+        result["model_a"], result["model_b"],
+        result["summary_a"], result["summary_b"],
+        None, [], False, agg_results,
+        gr.update(value=result["summary_a"]),
+        gr.update(value=result["summary_b"]),
+        gr.update(interactive=True, elem_classes=["vote-button"]),
+        gr.update(interactive=True, elem_classes=["vote-button"]),
+        gr.update(interactive=True, elem_classes=["vote-button"]),
+        gr.update(interactive=True, elem_classes=["vote-button", "vote-button-neither"]),
+        gr.update(choices=[], value=[], interactive=False, visible=False),
+        gr.update(visible=False),
+        gr.update(interactive=False, visible=True),
+        gr.update(visible=False),
+        gr.update(interactive=True),
+        gr.update(elem_classes=[])
     ]

 def select_vote_improved(winner_choice):
-    """
+    """Updates UI based on vote selection"""
     feedback_choices = feedback_options.get(winner_choice, [])

     btn_a_classes = ["vote-button"]
@@ -84,18 +142,18 @@ def select_vote_improved(winner_choice):
         btn_neither_classes.append("selected")

     return [
-        winner_choice,
-        gr.update(choices=feedback_choices, value=[], interactive=True, visible=True),
-        gr.update(visible=True),
-        gr.update(interactive=True),
-        gr.update(elem_classes=btn_a_classes),
-        gr.update(elem_classes=btn_b_classes),
-        gr.update(elem_classes=btn_tie_classes),
-        gr.update(elem_classes=btn_neither_classes)
+        winner_choice,
+        gr.update(choices=feedback_choices, value=[], interactive=True, visible=True),
+        gr.update(visible=True),
+        gr.update(interactive=True),
+        gr.update(elem_classes=btn_a_classes),
+        gr.update(elem_classes=btn_b_classes),
+        gr.update(elem_classes=btn_tie_classes),
+        gr.update(elem_classes=btn_neither_classes)
     ]

 def submit_vote_fixed(m_a, m_b, winner, feedback, current_results):
-    """Processes vote
+    """Processes vote and updates leaderboard"""
     if winner is None:
         print("Warning: Submit called without a winner selected.")
         return {}
@@ -123,7 +181,9 @@ def submit_vote_fixed(m_a, m_b, winner, feedback, current_results):

     # Prepare Results Table
     results_list = []
-    all_models = list(set(list(updated_results["wins"].keys()) +
+    all_models = list(set(list(updated_results["wins"].keys()) +
+                          list(updated_results["losses"].keys()) +
+                          list(updated_results["ties"].keys())))

     for model in sorted(all_models):
         wins = updated_results["wins"].get(model, 0)
@@ -146,34 +206,26 @@ def submit_vote_fixed(m_a, m_b, winner, feedback, current_results):
     results_df = results_df.sort_values(by='Win Rate Value', ascending=False).drop(columns=['Win Rate Value'])

     return [
-        True,
-
-        gr.update(interactive=False),
-        gr.update(interactive=False),
-        gr.update(
-        gr.update(interactive=False),
-        gr.update(
-        gr.update(
-        gr.update(visible=False), # submit_button
-        gr.update(visible=True), # results_reveal_area
-        gr.update(interactive=False), # random_question_btn
-        gr.update(value=results_df, visible=True), # results_table_display
-        gr.update(elem_classes=["results-revealed"]), # main_interface_area
-        gr.update(interactive=True), # context_toggle_btn
-        gr.update(value=m_a), # model_a_reveal
-        gr.update(value=m_b) # model_b_reveal
+        True, updated_results,
+        gr.update(interactive=False), gr.update(interactive=False),
+        gr.update(interactive=False), gr.update(interactive=False),
+        gr.update(interactive=False), gr.update(visible=True),
+        gr.update(visible=False), gr.update(visible=True),
+        gr.update(interactive=False), gr.update(value=results_df, visible=True),
+        gr.update(elem_classes=["results-revealed"]),
+        gr.update(interactive=True), gr.update(value=m_a), gr.update(value=m_b)
     ]

-# Create embedded CSS
-css_html = f"<style>{css_content}</style>"
-
 # Create Gradio interface
 with gr.Blocks(theme=gr.themes.Default(
     primary_hue=gr.themes.colors.orange,
     secondary_hue=gr.themes.colors.slate
 )) as demo:
-    #
-
+    # Load CSS
+    css_path = os.path.join(os.getcwd(), 'static', 'styles.css')
+    with open(css_path, 'r') as f:
+        css_content = f.read()
+    gr.HTML(f"<style>{css_content}</style>")

     # State Variables
     current_example = gr.State({})
@@ -191,7 +243,6 @@ with gr.Blocks(theme=gr.themes.Default(
     with gr.Tabs() as tabs:
         # Main Arena Tab
         with gr.TabItem("Arena", id="arena-tab"):
-            # Main title and description
             gr.Markdown("# RAG Summarizer Arena")
             gr.Markdown("Compare summaries generated by different models based on the provided context and query. Select the better summary, or choose 'Tie' or 'Neither'. Your feedback helps evaluate model performance.")

@@ -206,10 +257,9 @@ with gr.Blocks(theme=gr.themes.Default(
             query_display = gr.Markdown(value="Loading question...", elem_classes="query-text")
             random_question_btn = gr.Button("🔄 Get Random Question", elem_classes="query-button")

-            # Context description
+            # Context description and display
             context_description = gr.Markdown("", elem_classes="context-description")
-
-            # Context section
+
             with gr.Row(elem_id="context-header-row"):
                 gr.Markdown("### Context Provided", elem_classes="context-title")
                 context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"])
@@ -236,11 +286,9 @@ with gr.Blocks(theme=gr.themes.Default(
                 vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"])
                 vote_button_neither = gr.Button("❌ Neither is Adequate", elem_classes=["vote-button", "vote-button-neither"])

-            # Feedback
+            # Feedback and Submit sections
            with gr.Group(elem_classes=["feedback-section"], visible=False) as feedback_section:
                 feedback_checkboxes = gr.CheckboxGroup(label="Feedback (optional)", choices=[], interactive=False)
-
-            # Submit button
             submit_button = gr.Button("Submit Vote", variant="primary", interactive=False, elem_id="submit-button")

             # Results area
@@ -258,7 +306,7 @@ with gr.Blocks(theme=gr.themes.Default(
                 model_b_reveal = gr.Markdown("", elem_classes="model-reveal model-b-reveal")

                 gr.HTML("<div style='height: 10px;'></div>")
-
+
                 # Try another button
                 with gr.Row(elem_classes=["control-buttons"]):
                     try_another_btn = gr.Button("🔄 Try Another Question", elem_id="try-another-btn")
@@ -269,99 +317,93 @@ with gr.Blocks(theme=gr.themes.Default(
             gr.Markdown("View aggregate performance statistics for all models. The table below shows win rates, wins, losses, and ties for each model based on all evaluations.")
             results_table_display = gr.DataFrame(label="Model Performance", interactive=False, wrap=True)

-    #
+    # Generic function to handle starting a new example
+    def handle_new_example_click():
+        generation_interrupt.set()  # Interrupt any ongoing generation
+        return load_context()[0]
+
+    def update_ui_for_new_context(example):
+        return [
+            gr.update(value=example['question']),
+            gr.update(value=example.get('processed_context_desc', ''), visible=bool(example.get('processed_context_desc', ''))),
+            gr.update(value=get_context_html(example, False)),
+            gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]),
+            False
+        ]
+
+    # Event handling
+    # Toggle context display
     context_toggle_btn.click(
         fn=toggle_context_display,
         inputs=[current_example, show_full_context],
         outputs=[show_full_context, context_display, context_toggle_btn]
     )

+    # Initial loading - context first, then summaries
     demo.load(
-        fn=
+        fn=load_context,
         inputs=[],
-        outputs=[
-
-
-
-
-
-
-
-
-
-
-    random_question_btn.click(
-        fn=load_new_question_improved,
-        inputs=[],
-        outputs=[
-            current_example, model_a_name, model_b_name, summary_a_text, summary_b_text,
-            selected_winner, feedback_list, show_results_state, results_agg, show_full_context,
-            query_display, context_description, context_display, context_toggle_btn,
-            summary_a_display, summary_b_display,
-            vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
-            feedback_checkboxes, feedback_section, submit_button, results_reveal_area, random_question_btn,
-            main_interface_area
-        ]
-    )
-
-    vote_button_a.click(
-        fn=lambda: select_vote_improved('left'),
-        inputs=None,
-        outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
-    )
-    vote_button_b.click(
-        fn=lambda: select_vote_improved('right'),
-        inputs=None,
-        outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
-    )
-    vote_button_tie.click(
-        fn=lambda: select_vote_improved('tie'),
-        inputs=None,
-        outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
-    )
-    vote_button_neither.click(
-        fn=lambda: select_vote_improved('neither'),
-        inputs=None,
-        outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
+        outputs=[current_example, query_display, context_description, context_display,
+                 context_toggle_btn, show_full_context]
+    ).then(
+        fn=lambda example: process_generation_result(generate_model_summaries_with_timeout(example)),
+        inputs=[current_example],
+        outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
+                 selected_winner, feedback_list, show_results_state, results_agg,
+                 summary_a_display, summary_b_display, vote_button_a, vote_button_b,
+                 vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section,
+                 submit_button, results_reveal_area, random_question_btn, main_interface_area]
     )

+    # Random Question and Try Another buttons with interruption
+    for btn in [random_question_btn, try_another_btn]:
+        btn.click(
+            fn=handle_new_example_click,
+            inputs=[],
+            outputs=[current_example]
+        ).then(
+            fn=update_ui_for_new_context,
+            inputs=[current_example],
+            outputs=[query_display, context_description, context_display,
+                     context_toggle_btn, show_full_context]
+        ).then(
+            fn=lambda example: process_generation_result(generate_model_summaries_with_timeout(example)),
+            inputs=[current_example],
+            outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
+                     selected_winner, feedback_list, show_results_state, results_agg,
+                     summary_a_display, summary_b_display, vote_button_a, vote_button_b,
+                     vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section,
+                     submit_button, results_reveal_area, random_question_btn, main_interface_area]
+        )
+
+    # Vote button handlers
+    for btn, choice in zip(
+        [vote_button_a, vote_button_b, vote_button_tie, vote_button_neither],
+        ['left', 'right', 'tie', 'neither']
+    ):
+        btn.click(
+            fn=lambda choice=choice: select_vote_improved(choice),
+            inputs=None,
+            outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button,
+                     vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
+        )
+
+    # Update feedback when checkboxes change
     feedback_checkboxes.change(
         fn=update_feedback,
         inputs=[feedback_checkboxes],
         outputs=[feedback_list]
     )

+    # Process vote submission and reveal results
     submit_button.click(
         fn=submit_vote_fixed,
         inputs=[model_a_name, model_b_name, selected_winner, feedback_list, results_agg],
-        outputs=[
-
-
-
-
-            submit_button,
-            results_reveal_area,
-            random_question_btn,
-            results_table_display,
-            main_interface_area,
-            context_toggle_btn,
-            model_a_reveal,
-            model_b_reveal
-        ]
-    )
-
-    try_another_btn.click(
-        fn=load_new_question_improved,
-        inputs=[],
-        outputs=[
-            current_example, model_a_name, model_b_name, summary_a_text, summary_b_text,
-            selected_winner, feedback_list, show_results_state, results_agg, show_full_context,
-            query_display, context_description, context_display, context_toggle_btn,
-            summary_a_display, summary_b_display,
-            vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
-            feedback_checkboxes, feedback_section, submit_button, results_reveal_area, random_question_btn,
-            main_interface_area
-        ]
+        outputs=[show_results_state, results_agg, vote_button_a, vote_button_b,
+                 vote_button_tie, vote_button_neither, feedback_checkboxes,
+                 feedback_section, submit_button, results_reveal_area,
+                 random_question_btn, results_table_display, main_interface_area,
+                 context_toggle_btn, model_a_reveal, model_b_reveal]
     )

 if __name__ == "__main__":
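
The event wiring in this diff leans on Gradio's .then() chaining: demo.load() and the new-question buttons first run the cheap context loader, and a follow-up step runs the threaded summary generation, so the question and context render before the summaries arrive. A minimal sketch of that two-step chaining, assuming hypothetical load_question/generate_summary stand-ins rather than the Space's own functions:

import time
import gradio as gr

def load_question():
    """Fast step: return the question text immediately (hypothetical stand-in)."""
    return "What changed in the Q3 report?"

def generate_summary(question):
    """Slow step: stands in for threaded model inference."""
    time.sleep(2)
    return f"(summary for: {question})"

with gr.Blocks() as demo:
    question_box = gr.Textbox(label="Question")
    summary_box = gr.Textbox(label="Summary")

    # Fast load first, slow generation afterwards, mirroring demo.load(...).then(...) in app.py.
    demo.load(fn=load_question, inputs=[], outputs=[question_box]).then(
        fn=generate_summary, inputs=[question_box], outputs=[summary_box]
    )

if __name__ == "__main__":
    demo.launch()

Chaining keeps the UI responsive: each step's outputs reach the browser as soon as that step finishes instead of waiting for the whole pipeline.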