laiviet committed
Commit 5faacb0
1 Parent(s): 8c2ee0f

Fix intro and sort order

Files changed (2):
  1. app.py +3 -3
  2. content.py +11 -3
app.py CHANGED
@@ -99,12 +99,12 @@ def get_leaderboard_df(performance_dict, pretrained_models):
         if arc_perf * hellaswag_perf * mmlu_perf * truthfulqa_perf == 0:
             continue
         avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
-        notes = ' '.join([pretrained, lang_name, lang])
+        notes = ' '.join([pretrained, lang_name])
         row = [pretrained, lang_name, lang, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf, notes]
         df.append(row)
 
     df = pd.DataFrame.from_records(df, columns=COLS)
-    df = df.sort_values(by=[AVERAGE_COL], ascending=False)
+    df = df.sort_values(by=[LANG_COL, AVERAGE_COL], ascending=False)
     df = df[COLS]
 
     return df
@@ -140,7 +140,7 @@ with demo:
 
     with gr.Box():
         search_bar = gr.Textbox(
-            placeholder="Search models...", show_label=False, elem_id="search-bar"
+            placeholder="Search models and languages...", show_label=False, elem_id="search-bar"
         )
 
     leaderboard_table = gr.components.Dataframe(
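For context on the sort-order half of this commit: the old call ranked the entire table by AVERAGE_COL alone, while the new call groups rows by language and then ranks models within each language by average score. A minimal sketch of the new behavior, using made-up rows and hypothetical values for the LANG_COL and AVERAGE_COL constants (their definitions are not shown in this diff):

import pandas as pd

# Hypothetical stand-ins for the column constants defined elsewhere in app.py.
LANG_COL = "Language"
AVERAGE_COL = "Average"

# Toy leaderboard rows with made-up scores, for illustration only.
df = pd.DataFrame(
    [
        ["model-a", "Vietnamese", 38.9],
        ["model-b", "Vietnamese", 35.2],
        ["model-a", "French", 45.1],
    ],
    columns=["Model", LANG_COL, AVERAGE_COL],
)

# Sort by language first, then by average score; the scalar
# ascending=False applies to both keys, so models are ranked
# best-to-worst within each language block.
df = df.sort_values(by=[LANG_COL, AVERAGE_COL], ascending=False)
print(df)

Note that a scalar ascending=False also sorts the language key in reverse alphabetical order; passing ascending=[True, False] instead would keep languages alphabetical while still ranking scores high to low.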
content.py CHANGED
@@ -3,21 +3,29 @@ TITLE = '<h1 align="center" id="space-title">Open Multilingual LLM Evaluation Le
 INTRO_TEXT = f"""
 ## About
 
-This leaderboard shows the performance of pretrained models in 29 languages including Arabic, Armenian, Basque, Bengali, Catalan, Chinese, Croatian, Danish, Dutch, French, German, Gujarati, Hindi, Hungarian, Indonesian, Italian, Kannada, Malayalam, Marathi, Nepali, Portuguese, Romanian, Russian, Serbian, Slovak, Spanish, Swedish, Tamil, Telugu, Ukrainian, and Vietnameseon four benchmarks:
+This leaderboard tracks progress and ranks performance of large language models (LLMs) developed for different languages,
+emphasizing on non-English languages to democratize benefits of LLMs to broader society.
+Our current leaderboard provides evaluation data for 29 languages, i.e.,
+Arabic, Armenian, Basque, Bengali, Catalan, Chinese, Croatian, Danish, Dutch,
+French, German, Gujarati, Hindi, Hungarian, Indonesian, Italian, Kannada, Malayalam,
+Marathi, Nepali, Portuguese, Romanian, Russian, Serbian, Slovak, Spanish, Swedish,
+Tamil, Telugu, Ukrainian, and Vietnamese, that will be expanded along the way.
+Both multilingual and language-specific LLMs are welcome in this leaderboard.
+We currently evaluate models over four benchmarks:
 
 - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot)
 - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot)
 - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot)
 - <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot)
 
-The evaluation data was translated into 29 languages using ChatGPT.
+The evaluation data was translated into these languages using ChatGPT (gpt-35-turbo).
 
 """
 
 HOW_TO = f"""
 ## How to list your model performance on this leaderboard:
 
-Send an email with title [Open mLLM Loaderboard] to vietl@uoregon.edu with the huggingface's model name.
+Send an email with title [Open mLLM Loaderboard] to vietl@uoregon.edu with the Huggingface's model name.
 
 We will run your model on the four benchmarks and add it to the leaderboard.
 """