aizip-dev committed on
Commit
7198068
·
verified ·
1 Parent(s): 31bbdd9

update instruction

Browse files
Files changed (1) hide show
  1. app.py +36 -89
app.py CHANGED
@@ -9,35 +9,25 @@ from utils.models import generate_summaries, model_names
9
  from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
10
  from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html
11
  from utils.vote_logger import save_vote_details
12
- from utils.shared import generation_interrupt # Import from shared module
13
 
14
- # Feedback options for different voting outcomes
15
  feedback_options = {
16
  "left": ["Model A: More complete", "Model A: More accurate", "Model A: More relevant", "Model A: Better written", "Model A: Better refusal (if applicable)"],
17
  "right": ["Model B: More complete", "Model B: More accurate", "Model B: More relevant", "Model B: Better written", "Model B: Better refusal (if applicable)"],
18
- "tie": ["Model A: More complete", "Model A: More accurate", "Model A: More relevant", "Model A: Better written", "Model A: Better refusal (if applicable)",
19
- "Model B: More complete", "Model B: More accurate", "Model B: More relevant", "Model B: Better written", "Model B: Better refusal (if applicable)"],
20
  "neither": ["Model A: Incomplete", "Model A: Hallucinate", "Model A: Irrelevant", "Model A: Incorrect refusal (if applicable)",
21
  "Model B: Incomplete", "Model B: Hallucinate", "Model B: Irrelevant", "Model B: Incorrect refusal (if applicable)"]
22
  }
23
 
24
  def load_context(set_interrupt=False):
25
- """
26
- Load a new question and context
27
-
28
- Parameters:
29
- - set_interrupt: If True, will interrupt any ongoing inference before loading
30
- """
31
  if set_interrupt:
32
- # Interrupt any ongoing inference
33
  generation_interrupt.set()
34
- time.sleep(0.2) # Short delay to allow threads to detect interrupt
35
 
36
- # Always clear the flag before starting new work
37
  generation_interrupt.clear()
38
  example = get_random_example()
39
 
40
- # Format the context description
41
  context_desc = example.get('processed_context_desc', '')
42
  if context_desc:
43
  context_desc = f"<div class='context-topic'><span class='topic-label'>The question and context are about:</span> {context_desc}</div>"
@@ -55,14 +45,11 @@ def load_context(set_interrupt=False):
55
  ]
56
 
57
  def load_leaderboard():
58
- """Loads and displays the leaderboard data"""
59
  results = load_leaderboard_data()
60
  leaderboard_html = generate_leaderboard_html(results)
61
  return leaderboard_html
62
 
63
  def generate_model_summaries(example):
64
- """Run model inference"""
65
-
66
  result = {
67
  "model_a": "",
68
  "model_b": "",
@@ -77,7 +64,6 @@ def generate_model_summaries(example):
77
  try:
78
  m_a_name, m_b_name = random.sample(model_names, 2)
79
 
80
- # Track the partial completion state
81
  result["model_a"] = m_a_name
82
  result["model_b"] = m_b_name
83
 
@@ -86,16 +72,14 @@ def generate_model_summaries(example):
86
  if not generation_interrupt.is_set():
87
  result["summary_a"] = s_a
88
  result["summary_b"] = s_b
89
- result["completed"] = bool(s_a and s_b) # Only mark complete if both have content
90
  except Exception as e:
91
  print(f"Error in generation: {e}")
92
 
93
  return result
94
 
95
  def process_generation_result(result):
96
- """Process the results from the generation function"""
97
  if not result["completed"] or not result["summary_a"] or not result["summary_b"]:
98
- # Either generation was interrupted or both summaries aren't ready
99
  return [
100
  result.get("model_a", ""),
101
  result.get("model_b", ""),
@@ -104,7 +88,7 @@ def process_generation_result(result):
104
  None, [], False, load_leaderboard_data(),
105
  gr.update(value=result.get("summary_a", "Generation was interrupted or failed.")),
106
  gr.update(value=result.get("summary_b", "Generation was interrupted or failed.")),
107
- gr.update(interactive=False, elem_classes=["vote-button"]), # Explicitly disable
108
  gr.update(interactive=False, elem_classes=["vote-button"]),
109
  gr.update(interactive=False, elem_classes=["vote-button"]),
110
  gr.update(interactive=False, elem_classes=["vote-button", "vote-button-neither"]),
@@ -116,10 +100,8 @@ def process_generation_result(result):
116
  gr.update(elem_classes=[])
117
  ]
118
 
119
- # Only enable voting when both summaries are complete and non-empty
120
  buttons_interactive = bool(result["summary_a"] and result["summary_b"])
121
 
122
- # Generation completed successfully
123
  agg_results = load_leaderboard_data()
124
  return [
125
  result["model_a"], result["model_b"],
@@ -144,7 +126,6 @@ def process_example(example):
144
  return process_generation_result(result)
145
 
146
  def select_vote_improved(winner_choice):
147
- """Updates UI based on vote selection"""
148
  feedback_choices = feedback_options.get(winner_choice, [])
149
 
150
  btn_a_classes = ["vote-button"]
@@ -173,36 +154,31 @@ def select_vote_improved(winner_choice):
173
  ]
174
 
175
  def handle_vote_submission(example, m_a, m_b, winner, feedback, summary_a, summary_b, current_results):
176
- """Handle vote submission - logs details and updates leaderboard"""
177
  if winner is None:
178
  print("Warning: Submit called without a winner selected.")
179
  return {}
180
 
181
- # Save detailed vote information
182
  save_vote_details(example, m_a, m_b, winner, feedback, summary_a, summary_b)
183
-
184
- # Update Elo ratings and get UI updates
185
  return submit_vote_with_elo(m_a, m_b, winner, feedback, current_results)
186
 
187
  def show_loading_state():
188
- """Show loading state while fetching new content"""
189
  return [
190
  gr.update(value="Loading new question and summaries...", interactive=False),
191
  gr.update(value="Loading new question and summaries...", interactive=False),
192
- gr.update(interactive=False), # For vote_button_a
193
- gr.update(interactive=False), # For vote_button_b
194
- gr.update(interactive=False), # For vote_button_tie
195
- gr.update(interactive=False) # For vote_button_neither
 
 
 
 
196
  ]
197
 
198
  def handle_new_example_click():
199
- """Handle clicking 'Get new example' button"""
200
- # Use the centralized approach - set_interrupt=True tells load_context to handle interruption
201
  return load_context(set_interrupt=True)[0]
202
 
203
  def update_ui_for_new_context(example):
204
- """Update UI with new context information"""
205
- # Format the context description
206
  context_desc = example.get('processed_context_desc', '')
207
  if context_desc:
208
  context_desc = f"<div class='context-topic'><span class='topic-label'>The question and context are about:</span> {context_desc}</div>"
@@ -215,42 +191,30 @@ def update_ui_for_new_context(example):
215
  False
216
  ]
217
 
218
- # Resource cleanup function for unload event
219
  def cleanup_on_disconnect():
220
- """Clean up resources when browser disconnects"""
221
  print(f"Browser disconnected. Cleaning up resources...")
222
  generation_interrupt.set()
223
- # No need for time.sleep here as this is just setting the flag
224
- # Threads will detect it on their next check
225
 
226
- # Create Gradio interface
227
  with gr.Blocks(theme=gr.themes.Default(
228
  primary_hue=gr.themes.colors.orange,
229
  secondary_hue=gr.themes.colors.slate
230
  )) as demo:
231
- # Load CSS
232
  css_path = os.path.join(os.getcwd(), 'static', 'styles.css')
233
 
234
- # Load the CSS file
235
  with open(css_path, 'r') as f:
236
  css_content = f.read()
237
 
238
- # Create HTML components with CSS
239
  gr.HTML(f"<style>{css_content}</style>")
240
 
241
- # Add JavaScript to handle browser unload events
242
  unload_js = """
243
  <script>
244
- // This runs when the page is about to be closed or refreshed
245
  window.addEventListener('beforeunload', function(e) {
246
- // Send a synchronous request to the server
247
  navigator.sendBeacon('/cleanup?session_id=' + window.gradioClientState.session_hash);
248
  });
249
  </script>
250
  """
251
  gr.HTML(unload_js)
252
 
253
- # State Variables
254
  current_example = gr.State({})
255
  model_a_name = gr.State("")
256
  model_b_name = gr.State("")
@@ -262,44 +226,45 @@ with gr.Blocks(theme=gr.themes.Default(
262
  results_agg = gr.State(load_leaderboard_data())
263
  show_full_context = gr.State(False)
264
 
265
- # Create Tabs
266
  with gr.Tabs() as tabs:
267
- # Main Arena Tab
268
  with gr.TabItem("Arena", id="arena-tab"):
269
- gr.Markdown("# RAG SLM Summarizer/Generator Arena")
270
  gr.Markdown("""
271
- 1️⃣ Review the query and examine the highlighted context (✨ highlights contain key information! )\n
272
- 2️⃣ Compare answers generated by two different models side-by-side\n
273
- 3️⃣ Vote for the better response or select 'Tie/Neither' if appropriate""")
 
 
 
 
 
 
 
274
 
275
  gr.HTML("<hr>")
276
 
277
- # Main container
278
  with gr.Column(elem_id="main-interface-area") as main_interface_area:
279
- # Query section
280
  with gr.Row(elem_id="query-title-row"):
281
- gr.Markdown("### 💬 Query (What Users Want to Ask About the Doc)", elem_classes="section-heading")
282
 
283
  with gr.Row(elem_id="query-container"):
284
  with gr.Row(elem_classes="query-box-row"):
285
  query_display = gr.Markdown(value="Loading question...", elem_classes="query-text", elem_id="query-section")
286
  random_question_btn = gr.Button("🔄 Try a New Question", elem_classes="query-button")
287
 
288
- # Context description and display
289
  context_description = gr.Markdown("", elem_classes="context-description")
290
 
291
  gr.HTML("<hr>")
292
 
293
  with gr.Row(elem_id="context-header-row"):
294
- gr.Markdown("### 📋 Context (Relevant Information We Got from the Database)", elem_classes="context-title")
295
  context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"])
296
 
297
  context_display = gr.HTML(value="Loading context...", label="Context Chunks")
298
 
299
  gr.Markdown("---")
300
- gr.Markdown("### 🔍 Compare Answers from Models", elem_classes="section-heading")
301
 
302
- # Model summaries - Add ID for JavaScript to target and disable autoscroll
303
  with gr.Row(elem_id="summary-containers"):
304
  with gr.Column(scale=1):
305
  with gr.Group(elem_classes=["summary-card", "summary-card-a"]):
@@ -308,7 +273,7 @@ with gr.Blocks(theme=gr.themes.Default(
308
  lines=10,
309
  interactive=False,
310
  show_copy_button=True,
311
- autoscroll=False, # Disable auto-scrolling
312
  elem_id="summary-a-display"
313
  )
314
  with gr.Column(scale=1):
@@ -318,13 +283,12 @@ with gr.Blocks(theme=gr.themes.Default(
318
  lines=10,
319
  interactive=False,
320
  show_copy_button=True,
321
- autoscroll=False, # Disable auto-scrolling
322
  elem_id="summary-b-display"
323
  )
324
 
325
  gr.HTML("<hr>")
326
 
327
- # Voting section
328
  gr.Markdown("### 🏅 Cast Your Vote", elem_classes="section-heading")
329
  with gr.Row():
330
  vote_button_a = gr.Button("⬅️ Summary A is Better", elem_classes=["vote-button"], interactive=False)
@@ -332,17 +296,14 @@ with gr.Blocks(theme=gr.themes.Default(
332
  vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"], interactive=False)
333
  vote_button_neither = gr.Button("❌ Neither is Good", elem_classes=["vote-button", "vote-button-neither"], interactive=False)
334
 
335
- # Feedback and Submit sections
336
  with gr.Group(elem_classes=["feedback-section"], visible=False) as feedback_section:
337
  feedback_checkboxes = gr.CheckboxGroup(label="Feedback (optional)", choices=[], interactive=False)
338
  submit_button = gr.Button("Submit Your Vote", variant="primary", interactive=False, elem_id="submit-button")
339
 
340
- # Results area
341
  with gr.Column(visible=False) as results_reveal_area:
342
  gr.Markdown("---")
343
  gr.Markdown("### ✅ Vote Submitted!", elem_classes="section-heading")
344
 
345
- # Model reveal section
346
  with gr.Row():
347
  with gr.Column(scale=1):
348
  gr.Markdown("### Model A was:", elem_classes="section-heading")
@@ -353,11 +314,9 @@ with gr.Blocks(theme=gr.themes.Default(
353
 
354
  gr.HTML("<hr>")
355
 
356
- # Try another button
357
  with gr.Row(elem_classes=["control-buttons"]):
358
  try_another_btn = gr.Button("🔄 Try Another Question", elem_id="try-another-btn")
359
 
360
- # Leaderboard Tab
361
  with gr.TabItem("Leaderboard", id="leaderboard-tab"):
362
  gr.Markdown("# RAG SLM Summarizer/Generator Leaderboard", elem_classes="orange-title")
363
  gr.Markdown("View performance statistics for all models ranked by Elo rating.")
@@ -374,18 +333,14 @@ The Elo rating system provides a more accurate ranking than simple win rates:
374
 
375
  results_table_display = gr.HTML(label="Model Performance")
376
 
377
- # Event handling
378
- # Toggle context display
379
  context_toggle_btn.click(
380
  fn=toggle_context_display,
381
  inputs=[current_example, show_full_context],
382
  outputs=[show_full_context, context_display, context_toggle_btn]
383
  )
384
 
385
- # Initial loading - context first, then summaries
386
- # Uses load_context without interruption since it's the first load
387
  demo.load(
388
- fn=load_context, # Default is set_interrupt=False
389
  inputs=[],
390
  outputs=[current_example, query_display, context_description, context_display,
391
  context_toggle_btn, show_full_context]
@@ -399,34 +354,31 @@ The Elo rating system provides a more accurate ranking than simple win rates:
399
  submit_button, results_reveal_area, random_question_btn, main_interface_area]
400
  )
401
 
402
- # Load leaderboard content on app start
403
  demo.load(
404
  fn=load_leaderboard,
405
  inputs=[],
406
  outputs=[results_table_display]
407
  )
408
 
409
- # Use a single event chain for each button, structured to update UI first, then run inference
410
  for btn in [random_question_btn, try_another_btn]:
411
  btn.click(
412
- # Step 1: Show loading state immediately
413
  fn=show_loading_state,
414
  inputs=[],
415
- outputs=[summary_a_display, summary_b_display, vote_button_a,
416
- vote_button_b, vote_button_tie, vote_button_neither]
 
 
 
417
  ).then(
418
- # Step 2: Get new example
419
  fn=handle_new_example_click,
420
  inputs=[],
421
  outputs=[current_example]
422
  ).then(
423
- # Step 3: Update context UI immediately
424
  fn=update_ui_for_new_context,
425
  inputs=[current_example],
426
  outputs=[query_display, context_description, context_display,
427
  context_toggle_btn, show_full_context]
428
  ).then(
429
- # Step 4: Then process example for model outputs
430
  fn=process_example,
431
  inputs=[current_example],
432
  outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
@@ -436,7 +388,6 @@ The Elo rating system provides a more accurate ranking than simple win rates:
436
  submit_button, results_reveal_area, random_question_btn, main_interface_area]
437
  )
438
 
439
- # Vote button handlers
440
  for btn, choice in zip(
441
  [vote_button_a, vote_button_b, vote_button_tie, vote_button_neither],
442
  ['left', 'right', 'tie', 'neither']
@@ -448,14 +399,12 @@ The Elo rating system provides a more accurate ranking than simple win rates:
448
  vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
449
  )
450
 
451
- # Update feedback when checkboxes change
452
  feedback_checkboxes.change(
453
  fn=update_feedback,
454
  inputs=[feedback_checkboxes],
455
  outputs=[feedback_list]
456
  )
457
 
458
- # Process vote submission and reveal results
459
  submit_button.click(
460
  fn=handle_vote_submission,
461
  inputs=[current_example, model_a_name, model_b_name, selected_winner, feedback_list, summary_a_text, summary_b_text, results_agg],
@@ -466,7 +415,6 @@ The Elo rating system provides a more accurate ranking than simple win rates:
466
  context_toggle_btn, model_a_reveal, model_b_reveal]
467
  )
468
 
469
- # Refresh leaderboard when switching to the leaderboard tab
470
  tabs.select(
471
  fn=load_leaderboard,
472
  inputs=[],
@@ -474,7 +422,6 @@ The Elo rating system provides a more accurate ranking than simple win rates:
474
  api_name="refresh_leaderboard"
475
  )
476
 
477
- # Register unload event for browser disconnections
478
  demo.unload(cleanup_on_disconnect)
479
 
480
  if __name__ == "__main__":
 
9
  from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
10
  from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html
11
  from utils.vote_logger import save_vote_details
12
+ from utils.shared import generation_interrupt
13
 
 
14
  feedback_options = {
15
  "left": ["Model A: More complete", "Model A: More accurate", "Model A: More relevant", "Model A: Better written", "Model A: Better refusal (if applicable)"],
16
  "right": ["Model B: More complete", "Model B: More accurate", "Model B: More relevant", "Model B: Better written", "Model B: Better refusal (if applicable)"],
17
+ "tie": ["Model A: Complete", "Model A: Accurate", "Model A: Relevant", "Model A: Well written", "Model A: Correct refusal (if applicable)",
18
+ "Model B: Complete", "Model B: Accurate", "Model B: Relevant", "Model B: Well written", "Model B: Corrent refusal (if applicable)"],
19
  "neither": ["Model A: Incomplete", "Model A: Hallucinate", "Model A: Irrelevant", "Model A: Incorrect refusal (if applicable)",
20
  "Model B: Incomplete", "Model B: Hallucinate", "Model B: Irrelevant", "Model B: Incorrect refusal (if applicable)"]
21
  }
22
 
23
  def load_context(set_interrupt=False):
 
 
 
 
 
 
24
  if set_interrupt:
 
25
  generation_interrupt.set()
26
+ time.sleep(0.2)
27
 
 
28
  generation_interrupt.clear()
29
  example = get_random_example()
30
 
 
31
  context_desc = example.get('processed_context_desc', '')
32
  if context_desc:
33
  context_desc = f"<div class='context-topic'><span class='topic-label'>The question and context are about:</span> {context_desc}</div>"
 
45
  ]
46
 
47
  def load_leaderboard():
 
48
  results = load_leaderboard_data()
49
  leaderboard_html = generate_leaderboard_html(results)
50
  return leaderboard_html
51
 
52
  def generate_model_summaries(example):
 
 
53
  result = {
54
  "model_a": "",
55
  "model_b": "",
 
64
  try:
65
  m_a_name, m_b_name = random.sample(model_names, 2)
66
 
 
67
  result["model_a"] = m_a_name
68
  result["model_b"] = m_b_name
69
 
 
72
  if not generation_interrupt.is_set():
73
  result["summary_a"] = s_a
74
  result["summary_b"] = s_b
75
+ result["completed"] = bool(s_a and s_b)
76
  except Exception as e:
77
  print(f"Error in generation: {e}")
78
 
79
  return result
80
 
81
  def process_generation_result(result):
 
82
  if not result["completed"] or not result["summary_a"] or not result["summary_b"]:
 
83
  return [
84
  result.get("model_a", ""),
85
  result.get("model_b", ""),
 
88
  None, [], False, load_leaderboard_data(),
89
  gr.update(value=result.get("summary_a", "Generation was interrupted or failed.")),
90
  gr.update(value=result.get("summary_b", "Generation was interrupted or failed.")),
91
+ gr.update(interactive=False, elem_classes=["vote-button"]),
92
  gr.update(interactive=False, elem_classes=["vote-button"]),
93
  gr.update(interactive=False, elem_classes=["vote-button"]),
94
  gr.update(interactive=False, elem_classes=["vote-button", "vote-button-neither"]),
 
100
  gr.update(elem_classes=[])
101
  ]
102
 
 
103
  buttons_interactive = bool(result["summary_a"] and result["summary_b"])
104
 
 
105
  agg_results = load_leaderboard_data()
106
  return [
107
  result["model_a"], result["model_b"],
 
126
  return process_generation_result(result)
127
 
128
  def select_vote_improved(winner_choice):
 
129
  feedback_choices = feedback_options.get(winner_choice, [])
130
 
131
  btn_a_classes = ["vote-button"]
 
154
  ]
155
 
156
  def handle_vote_submission(example, m_a, m_b, winner, feedback, summary_a, summary_b, current_results):
 
157
  if winner is None:
158
  print("Warning: Submit called without a winner selected.")
159
  return {}
160
 
 
161
  save_vote_details(example, m_a, m_b, winner, feedback, summary_a, summary_b)
 
 
162
  return submit_vote_with_elo(m_a, m_b, winner, feedback, current_results)
163
 
164
  def show_loading_state():
 
165
  return [
166
  gr.update(value="Loading new question and summaries...", interactive=False),
167
  gr.update(value="Loading new question and summaries...", interactive=False),
168
+ gr.update(interactive=False),
169
+ gr.update(interactive=False),
170
+ gr.update(interactive=False),
171
+ gr.update(interactive=False),
172
+ gr.update(visible=False),
173
+ gr.update(interactive=False),
174
+ gr.update(visible=False),
175
+ gr.update(interactive=False)
176
  ]
177
 
178
  def handle_new_example_click():
 
 
179
  return load_context(set_interrupt=True)[0]
180
 
181
  def update_ui_for_new_context(example):
 
 
182
  context_desc = example.get('processed_context_desc', '')
183
  if context_desc:
184
  context_desc = f"<div class='context-topic'><span class='topic-label'>The question and context are about:</span> {context_desc}</div>"
 
191
  False
192
  ]
193
 
 
194
  def cleanup_on_disconnect():
 
195
  print(f"Browser disconnected. Cleaning up resources...")
196
  generation_interrupt.set()
 
 
197
 
 
198
  with gr.Blocks(theme=gr.themes.Default(
199
  primary_hue=gr.themes.colors.orange,
200
  secondary_hue=gr.themes.colors.slate
201
  )) as demo:
 
202
  css_path = os.path.join(os.getcwd(), 'static', 'styles.css')
203
 
 
204
  with open(css_path, 'r') as f:
205
  css_content = f.read()
206
 
 
207
  gr.HTML(f"<style>{css_content}</style>")
208
 
 
209
  unload_js = """
210
  <script>
 
211
  window.addEventListener('beforeunload', function(e) {
 
212
  navigator.sendBeacon('/cleanup?session_id=' + window.gradioClientState.session_hash);
213
  });
214
  </script>
215
  """
216
  gr.HTML(unload_js)
217
 
 
218
  current_example = gr.State({})
219
  model_a_name = gr.State("")
220
  model_b_name = gr.State("")
 
226
  results_agg = gr.State(load_leaderboard_data())
227
  show_full_context = gr.State(False)
228
 
 
229
  with gr.Tabs() as tabs:
 
230
  with gr.TabItem("Arena", id="arena-tab"):
231
+ gr.Markdown("# SLM RAG Summarization/Generation Arena")
232
  gr.Markdown("""
233
+ 🏟️ This arena evaluates small language models on document QA tasks with retrieved context. Models should provide **grounded, comprehensive** answers or **properly decline** with clarification when information is insufficient.
234
+
235
+ 1️⃣ **Review the query and context** - ✨Highlighted text✨ contains key information that should be included in good answers
236
+
237
+ 2️⃣ **Compare answers** generated by two different models working with the same query and context
238
+
239
+ 3️⃣ **Vote for the better response** or select 'Tie/Neither' if appropriate
240
+
241
+ > **Note:** Highlights are abbreviated contexts based on ground truth (via GPT-4o). Full Context shows the actual text provided to the models.
242
+ """)
243
 
244
  gr.HTML("<hr>")
245
 
 
246
  with gr.Column(elem_id="main-interface-area") as main_interface_area:
 
247
  with gr.Row(elem_id="query-title-row"):
248
+ gr.Markdown("### 💬 Query - Question About Document Content", elem_classes="section-heading")
249
 
250
  with gr.Row(elem_id="query-container"):
251
  with gr.Row(elem_classes="query-box-row"):
252
  query_display = gr.Markdown(value="Loading question...", elem_classes="query-text", elem_id="query-section")
253
  random_question_btn = gr.Button("🔄 Try a New Question", elem_classes="query-button")
254
 
 
255
  context_description = gr.Markdown("", elem_classes="context-description")
256
 
257
  gr.HTML("<hr>")
258
 
259
  with gr.Row(elem_id="context-header-row"):
260
+ gr.Markdown("### 📋 Context - Retrieved Content from the Document", elem_classes="context-title")
261
  context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"])
262
 
263
  context_display = gr.HTML(value="Loading context...", label="Context Chunks")
264
 
265
  gr.Markdown("---")
266
+ gr.Markdown("### 🔍 Compare Models - Are these Grouded, Complete Answers or Correct Rejections?", elem_classes="section-heading")
267
 
 
268
  with gr.Row(elem_id="summary-containers"):
269
  with gr.Column(scale=1):
270
  with gr.Group(elem_classes=["summary-card", "summary-card-a"]):
 
273
  lines=10,
274
  interactive=False,
275
  show_copy_button=True,
276
+ autoscroll=False,
277
  elem_id="summary-a-display"
278
  )
279
  with gr.Column(scale=1):
 
283
  lines=10,
284
  interactive=False,
285
  show_copy_button=True,
286
+ autoscroll=False,
287
  elem_id="summary-b-display"
288
  )
289
 
290
  gr.HTML("<hr>")
291
 
 
292
  gr.Markdown("### 🏅 Cast Your Vote", elem_classes="section-heading")
293
  with gr.Row():
294
  vote_button_a = gr.Button("⬅️ Summary A is Better", elem_classes=["vote-button"], interactive=False)
 
296
  vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"], interactive=False)
297
  vote_button_neither = gr.Button("❌ Neither is Good", elem_classes=["vote-button", "vote-button-neither"], interactive=False)
298
 
 
299
  with gr.Group(elem_classes=["feedback-section"], visible=False) as feedback_section:
300
  feedback_checkboxes = gr.CheckboxGroup(label="Feedback (optional)", choices=[], interactive=False)
301
  submit_button = gr.Button("Submit Your Vote", variant="primary", interactive=False, elem_id="submit-button")
302
 
 
303
  with gr.Column(visible=False) as results_reveal_area:
304
  gr.Markdown("---")
305
  gr.Markdown("### ✅ Vote Submitted!", elem_classes="section-heading")
306
 
 
307
  with gr.Row():
308
  with gr.Column(scale=1):
309
  gr.Markdown("### Model A was:", elem_classes="section-heading")
 
314
 
315
  gr.HTML("<hr>")
316
 
 
317
  with gr.Row(elem_classes=["control-buttons"]):
318
  try_another_btn = gr.Button("🔄 Try Another Question", elem_id="try-another-btn")
319
 
 
320
  with gr.TabItem("Leaderboard", id="leaderboard-tab"):
321
  gr.Markdown("# RAG SLM Summarizer/Generator Leaderboard", elem_classes="orange-title")
322
  gr.Markdown("View performance statistics for all models ranked by Elo rating.")
 
333
 
334
  results_table_display = gr.HTML(label="Model Performance")
335
 
 
 
336
  context_toggle_btn.click(
337
  fn=toggle_context_display,
338
  inputs=[current_example, show_full_context],
339
  outputs=[show_full_context, context_display, context_toggle_btn]
340
  )
341
 
 
 
342
  demo.load(
343
+ fn=load_context,
344
  inputs=[],
345
  outputs=[current_example, query_display, context_description, context_display,
346
  context_toggle_btn, show_full_context]
 
354
  submit_button, results_reveal_area, random_question_btn, main_interface_area]
355
  )
356
 
 
357
  demo.load(
358
  fn=load_leaderboard,
359
  inputs=[],
360
  outputs=[results_table_display]
361
  )
362
 
 
363
  for btn in [random_question_btn, try_another_btn]:
364
  btn.click(
 
365
  fn=show_loading_state,
366
  inputs=[],
367
+ outputs=[
368
+ summary_a_display, summary_b_display,
369
+ vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
370
+ feedback_section, submit_button, results_reveal_area, random_question_btn
371
+ ]
372
  ).then(
 
373
  fn=handle_new_example_click,
374
  inputs=[],
375
  outputs=[current_example]
376
  ).then(
 
377
  fn=update_ui_for_new_context,
378
  inputs=[current_example],
379
  outputs=[query_display, context_description, context_display,
380
  context_toggle_btn, show_full_context]
381
  ).then(
 
382
  fn=process_example,
383
  inputs=[current_example],
384
  outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
 
388
  submit_button, results_reveal_area, random_question_btn, main_interface_area]
389
  )
390
 
 
391
  for btn, choice in zip(
392
  [vote_button_a, vote_button_b, vote_button_tie, vote_button_neither],
393
  ['left', 'right', 'tie', 'neither']
 
399
  vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
400
  )
401
 
 
402
  feedback_checkboxes.change(
403
  fn=update_feedback,
404
  inputs=[feedback_checkboxes],
405
  outputs=[feedback_list]
406
  )
407
 
 
408
  submit_button.click(
409
  fn=handle_vote_submission,
410
  inputs=[current_example, model_a_name, model_b_name, selected_winner, feedback_list, summary_a_text, summary_b_text, results_agg],
 
415
  context_toggle_btn, model_a_reveal, model_b_reveal]
416
  )
417
 
 
418
  tabs.select(
419
  fn=load_leaderboard,
420
  inputs=[],
 
422
  api_name="refresh_leaderboard"
423
  )
424
 
 
425
  demo.unload(cleanup_on_disconnect)
426
 
427
  if __name__ == "__main__":