felix committed on
Commit
5d6c941
1 Parent(s): 0f541ca
Files changed (1) hide show
  1. app.py +17 -5
app.py CHANGED
@@ -111,12 +111,14 @@ if compare_mode:
111
 
112
  hf_llm_diagrams = extract_images('hf_llm_diagram', imgs)
113
  bigcode_diagrams = extract_images('bigcode', imgs)
114
- mt_bench_diagrams = extract_images('mt_bench_leaderboard', imgs)
 
115
  opencompass_diagrams = extract_images('opencompass_leaderboard', imgs)
116
 
117
  compare_hf_llm_diagrams = extract_images('hf_llm_diagram', compare_imgs)
118
  compare_bigcode_diagrams = extract_images('bigcode', compare_imgs)
119
- compare_mt_bench_diagrams = extract_images('mt_bench_leaderboard', compare_imgs)
 
120
  compare_opencompass_diagrams = extract_images('opencompass_leaderboard', compare_imgs)
121
 
122
  # Display each category side by side
@@ -136,6 +138,9 @@ if compare_mode:
136
  # Displaying MT-Bench Models Leaderboard
137
  display_side_by_side(mt_bench_diagrams, compare_mt_bench_diagrams, "MT-Bench Models Leaderboard")
138
 
 
 
 
139
  # Displaying OpenCompass Models Leaderboard
140
  display_side_by_side(opencompass_diagrams, compare_opencompass_diagrams, "OpenCompass Models Leaderboard")
141
 
@@ -168,11 +173,12 @@ else:
168
  # Extracting images that start with "hf_llm_diagram"
169
  hf_llm_diagrams = [img for img in imgs if 'hf_llm_diagram' in os.path.basename(img)]
170
  bigcode_diagrams = [img for img in imgs if 'bigcode' in os.path.basename(img)]
171
- mt_bench_diagrams = [img for img in imgs if 'mt_bench_leaderboard' in os.path.basename(img)]
 
172
  opencompass_diagrams = [img for img in imgs if 'opencompass_leaderboard' in os.path.basename(img)]
173
 
174
  # Getting the remaining images
175
- remaining_imgs = list(set(imgs) - set(hf_llm_diagrams) - set(bigcode_diagrams) - set(mt_bench_diagrams) - set(opencompass_diagrams))
176
 
177
  st.subheader("HuggingFace Open LLM leaderboard by Model Size", divider=True)
178
  cols = st.columns(2)
@@ -213,6 +219,12 @@ else:
213
 
214
  print_model_list(mt_bench_diagrams[0],st,True)
215
 
 
 
 
 
 
 
216
  st.subheader("OpenCompass Models Leaderboard", divider=True)
217
  cols = st.columns(2)
218
  cols[0].image(opencompass_diagrams[0], use_column_width="auto")
@@ -238,7 +250,7 @@ st.write(
238
  <p>Leaderboards tracked:</p>
239
  <ul>
240
  <li><a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">Hugging Face Open LLM</a></li>
241
- <li><a href="https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard">MT-Bench</a> GPT4 judged evaluation of models</li>
242
  <li><a href="https://tatsu-lab.github.io/alpaca_eval/">AlpacaEval</a> GPT4 judged evaluation of models</li>
243
  <li><a href="https://www.mosaicml.com/llm-evaluation">MosaicML</a> Balanced set of static benchmarks</li>
244
  <li><a href="https://opencompass.org.cn/leaderboard-llm">OpenCompass</a> Balanced set of static benchmarks</li>
 
111
 
112
  hf_llm_diagrams = extract_images('hf_llm_diagram', imgs)
113
  bigcode_diagrams = extract_images('bigcode', imgs)
114
+ mt_bench_diagrams = extract_images('lmsys_leaderboard_mt_bench', imgs)
115
+ arena_diagrams = extract_images('lmsys_leaderboard_arena', imgs)
116
  opencompass_diagrams = extract_images('opencompass_leaderboard', imgs)
117
 
118
  compare_hf_llm_diagrams = extract_images('hf_llm_diagram', compare_imgs)
119
  compare_bigcode_diagrams = extract_images('bigcode', compare_imgs)
120
+ compare_mt_bench_diagrams = extract_images('lmsys_leaderboard_mt_bench', compare_imgs)
121
+ compare_arena_diagrams = extract_images('lmsys_leaderboard_arena', compare_imgs)
122
  compare_opencompass_diagrams = extract_images('opencompass_leaderboard', compare_imgs)
123
 
124
  # Display each category side by side
 
138
  # Displaying MT-Bench Models Leaderboard
139
  display_side_by_side(mt_bench_diagrams, compare_mt_bench_diagrams, "MT-Bench Models Leaderboard")
140
 
141
+ # Displaying Arena Models Leaderboard
142
+ display_side_by_side(arena_diagrams, compare_arena_diagrams, "LMSYS Arena Elo Models Leaderboard")
143
+
144
  # Displaying OpenCompass Models Leaderboard
145
  display_side_by_side(opencompass_diagrams, compare_opencompass_diagrams, "OpenCompass Models Leaderboard")
146
 
 
173
  # Extracting images that start with "hf_llm_diagram"
174
  hf_llm_diagrams = [img for img in imgs if 'hf_llm_diagram' in os.path.basename(img)]
175
  bigcode_diagrams = [img for img in imgs if 'bigcode' in os.path.basename(img)]
176
+ mt_bench_diagrams = [img for img in imgs if 'lmsys_leaderboard_mt_bench' in os.path.basename(img)]
177
+ arena_diagrams = [img for img in imgs if 'lmsys_leaderboard_arena' in os.path.basename(img)]
178
  opencompass_diagrams = [img for img in imgs if 'opencompass_leaderboard' in os.path.basename(img)]
179
 
180
  # Getting the remaining images
181
+ remaining_imgs = list(set(imgs) - set(hf_llm_diagrams) - set(bigcode_diagrams) - set(mt_bench_diagrams) - set(arena_diagrams) - set(opencompass_diagrams))
182
 
183
  st.subheader("HuggingFace Open LLM leaderboard by Model Size", divider=True)
184
  cols = st.columns(2)
 
219
 
220
  print_model_list(mt_bench_diagrams[0],st,True)
221
 
222
+ st.subheader("LMSYS Arena Elo Models Leaderboard", divider=True)
223
+ cols = st.columns(2)
224
+ cols[0].image(arena_diagrams[0], use_column_width="auto")
225
+
226
+ print_model_list(arena_diagrams[0],st,True)
227
+
228
  st.subheader("OpenCompass Models Leaderboard", divider=True)
229
  cols = st.columns(2)
230
  cols[0].image(opencompass_diagrams[0], use_column_width="auto")
 
250
  <p>Leaderboards tracked:</p>
251
  <ul>
252
  <li><a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">Hugging Face Open LLM</a></li>
253
+ <li><a href="https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard">MT-Bench and Arena Elo</a>MT-Bench is GPT4 judged evaluation of models, Arena Elo is users ranking outputs between models.</li>
254
  <li><a href="https://tatsu-lab.github.io/alpaca_eval/">AlpacaEval</a> GPT4 judged evaluation of models</li>
255
  <li><a href="https://www.mosaicml.com/llm-evaluation">MosaicML</a> Balanced set of static benchmarks</li>
256
  <li><a href="https://opencompass.org.cn/leaderboard-llm">OpenCompass</a> Balanced set of static benchmarks</li>