justinxzhao committed
Commit e893baa
1 Parent(s): 707a231

Add descriptions and switch order of tabs

Files changed (1)
  1. app.py +67 -14
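The "switch order of tabs" part of this commit relies on st.tabs returning one container per label, in the same order as the label list, so reordering the labels (and the indices used below) is all that is needed to change which tab shows first. A minimal sketch of that pattern, with placeholder tab content that is not taken from app.py:

import streamlit as st

# st.tabs returns one container per label, in the order the labels are given,
# so listing "Data explorer" first makes it the default (leftmost) tab.
outer_tabs = st.tabs(["Data explorer", "Length bias explorer"])

with outer_tabs[0]:
    st.write("Data explorer content goes here.")  # placeholder

with outer_tabs[1]:
    st.write("Length bias explorer content goes here.")  # placeholder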
app.py CHANGED
@@ -9,6 +9,18 @@ import random
 st.set_page_config(layout="wide")
 
 
+# Custom CSS to center title and header
+center_css = """
+<style>
+h1, h2, h3, h6{
+    text-align: center;
+}
+</style>
+"""
+
+st.markdown(center_css, unsafe_allow_html=True)
+
+
 def prep_rankings_table(df, y_column):
     # Create a copy of the dataframe.
     df_copy = df.copy()
@@ -121,9 +133,50 @@ def app():
         st.session_state.instruction_options
     )
 
-    st.title("AlpacaEval Visualizations")
+    st.title("🦙 AlpacaEval Explorer 🦙")
+
+    st.markdown(
+        "### An interactive tool to analyze and explore the data behind the [AlpacaEval Leaderboard](https://tatsu-lab.github.io/alpaca_eval/) in more depth"
+    )
+
+    st.markdown(
+        "###### Created and maintained by [Justin Zhao](https://x.com/justinxzhao)"
+    )
+
+    col1, col2, col3 = st.columns(3)
+
+    with col1:
+        with st.expander("About AlpacaEval"):
+            st.markdown(
+                """- [AlpacaEval](https://github.com/tatsu-lab/alpaca_eval) is an evaluation benchmark to assess the performance of large language models (LLMs).
+- It has high correlation with Chatbot Arena, and is a fast and affordable benchmark for chat LLMs that uses LLMs (specifically GPT-4) to estimate response quality.
+- LLM responses are assessed in a pairwise fashion (arena), where each model's responses are compared to a reference model's responses.
+- The reference model is GPT-4-1106. The LLM Judge is also GPT-4-1106.
 
-    outer_tabs = st.tabs(["Length bias in overall win rate", "Data explorer"])
+"""
+            )
+
+    with col2:
+        with st.expander("About this tool"):
+            st.markdown(
+                """- There are 2 main tabs: **Data explorer** and **Length bias explorer**.
+- Use the Data explorer to look at individual pairwise battles between models.
+- Use the Length bias explorer to look at how response lengths affect win rates.
+"""
+            )
+
+    with col3:
+        with st.expander("Motivation"):
+            st.markdown(
+                """
+- Several arena-based benchmarks (ours included) have demonstrated that a clear ranking among LLMs can be established, but there is a general dearth of analysis and understanding as to why the rankings are the way they are. For example, it's hard to discern how factors like feel and style
+are weighed against correctness.
+- I created this tool to provide a more interactive and intuitive way to explore the data behind the AlpacaEval leaderboard. It allows users to easily compare responses between models, look at individual battles, and analyze how response lengths affect win rates.
+- If you have any feedback on the tool, please reach out on [Twitter](https://twitter.com/justinxzhao)!
+"""
+            )
+
+    outer_tabs = st.tabs(["Data explorer", "Length bias explorer"])
 
     # Load the data
     df = pd.read_json("data/model_win_rates.jsonl", lines=True, orient="records")
@@ -135,7 +188,7 @@ def app():
     # Prepare the model selector options
     model_options = df_response_judging["generator_2"].unique().tolist()
 
-    with outer_tabs[0]:
+    with outer_tabs[1]:
         # Define the preset groups
         presets = {
             "gpt": df[df["model_name"].str.contains("openai|gpt", case=False)][
@@ -159,8 +212,6 @@ def app():
             options=["custom", "gpt", "claude", "moa", "llama"],
         )
 
-        st.divider()
-
         # Add multiselect for custom model selection
        if preset_selection == "custom":
            selected_models = st.multiselect(
@@ -169,6 +220,8 @@ def app():
         else:
             selected_models = presets[preset_selection]
 
+        st.divider()
+
         def create_scatter_plot(df, y_column, selected_models, title):
             fig = go.Figure()
 
@@ -266,7 +319,7 @@ def app():
 
             return fig, r_squared_words, r_squared_tokens
 
-        st.markdown("## Overall win rate")
+        st.markdown("#### Overall win rate")
         y_column1 = "length_controlled_winrate"
         y_column2 = "win_rate"
         y_column3 = "discrete_win_rate"
@@ -326,7 +379,7 @@ def app():
             f"- R² (Words vs {y_column3}): {r_squared_words_3:.2f}\n- R² (Tokens vs {y_column3}): {r_squared_tokens_3:.2f}"
         )
 
-        st.markdown("## Length bias in battles")
+        st.markdown("#### Length bias in battles")
 
         df_response_judging_copy = df_response_judging.copy()
         if not selected_models:
@@ -406,9 +459,9 @@ def app():
         st.dataframe(df)
 
     # Data explorer
-    with outer_tabs[1]:
+    with outer_tabs[0]:
         # Add randomize button at the top of the app
-        st.markdown("## Choose example")
+        st.markdown("#### Choose example")
         st.button(
             ":game_die: Randomize!",
             on_click=randomize_selection,
@@ -450,12 +503,12 @@ def app():
 
         st.divider()
 
-        st.markdown(f"## Selected instruction")
+        st.markdown(f"#### Selected instruction")
         st.info(st.session_state.selected_instruction)
 
         st.divider()
 
-        st.markdown(f"## Overall Battles")
+        st.markdown(f"#### Overall Battles")
         all_models_judgings_details["output_1_num_words"] = all_models_judgings_details[
             "output_1"
         ].apply(lambda x: len(x.split()))
@@ -517,7 +570,7 @@ def app():
             better_models["output_2_num_words"] > num_words_for_fixed_model
         ]
         col3.markdown(
-            f"### Models that are better than {fixed_model} ({num_words_for_fixed_model})"
+            f"##### Models that are better than {fixed_model} ({num_words_for_fixed_model})"
         )
         if shorter_models.size != 0:
             shorter_models_string = ""
@@ -539,7 +592,7 @@ def app():
             col3.write("None")
 
         # Judging details.
-        st.markdown(f"## Individual Battle Details")
+        st.markdown(f"#### Individual Battle Details")
         judging_details = df_response_judging[
             (df_response_judging["generator_1"] == fixed_model)
             & (df_response_judging["generator_2"] == st.session_state.selected_model)
@@ -577,7 +630,7 @@ def app():
         )
 
         # Create two columns for model selectors
-        st.markdown("## Responses")
+        st.markdown("#### Responses")
         col1, col2 = st.columns(2)
 
         with col1:
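For the styling side of the change, the first hunk injects a small CSS block through st.markdown with unsafe_allow_html=True so that the new title and section headers render centered. A self-contained sketch of that technique, using placeholder heading text rather than the app's real copy:

import streamlit as st

st.set_page_config(layout="wide")

# Center h1-h3 and h6 headings by injecting a <style> block; Streamlit only
# renders the raw HTML because unsafe_allow_html=True is passed.
center_css = """
<style>
h1, h2, h3, h6 {
    text-align: center;
}
</style>
"""
st.markdown(center_css, unsafe_allow_html=True)

st.title("Centered title (placeholder)")
st.markdown("#### Centered section header (placeholder)")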
 