yuchenlin committed
Commit 9e6ac7d • 1 parent: 06f30ec

add the Open LLM bench

Files changed (3):
  1. app.py +66 -24
  2. open-llm-leaderboard.json +0 -0
  3. scrape-open-llm-leaderboard +1 -0
app.py CHANGED

@@ -5,6 +5,7 @@ import numpy as np
 import pandas as pd
 import gradio as gr
 import pandas as pd
+import json
 from constants import BANNER, INTRODUCTION_TEXT, CITATION_TEXT, METRICS_TAB_TEXT, DIR_OUTPUT_REQUESTS
 
 LAST_UPDATED = "Feb 28th 2024"
@@ -66,27 +67,35 @@ def make_clickable_model(model_name, model_info):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_info[model_name]["pretty_name"]}</a>'
 
 
-def build_demo(original_df, TYPES):
+def build_demo(original_df, full_df, TYPES):
     with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
         # gr.HTML(BANNER, elem_id="banner")
         gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
-        # with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        #     with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
-        leaderboard_table = gr.components.Dataframe(
-            value=original_df,
-            datatype=TYPES,
-            height=1000,
-            wrap=False,
-            elem_id="leaderboard-table",
-            interactive=False,
-            visible=True,
-            min_width=60,
-        )
-
-        # with gr.TabItem("📈 Metrics", elem_id="od-benchmark-tab-table", id=1):
-        #     gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")
-
+        with gr.Tabs(elem_classes="tab-buttons") as tabs:
+            with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
+                leaderboard_table = gr.components.Dataframe(
+                    value=original_df,
+                    datatype=TYPES,
+                    height=1000,
+                    wrap=False,
+                    elem_id="leaderboard-table",
+                    interactive=False,
+                    visible=True,
+                    min_width=60,
+                )
+
+            with gr.TabItem("👍 URIAL + 🤗 OpenLLM", elem_id="od-benchmark-tab-table", id=1):
+                leaderboard_table_full = gr.components.Dataframe(
+                    value=full_df,
+                    datatype=TYPES,
+                    height=1000,
+                    wrap=False,
+                    elem_id="leaderboard-table-full",
+                    interactive=False,
+                    visible=True,
+                    min_width=60,
+                )
 
         gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text-small")
 
@@ -108,9 +117,34 @@ if __name__ == "__main__":
     parser.add_argument("--result_file", help="Path to results table", default="leaderboard_data.jsonl")
     args = parser.parse_args()
 
-    bench_results = args.result_file
+    all_model_hf_ids = {v["hf_name"]: k for k, v in model_info.items()}
+
+    # Load Open LLM Leaderboard
+    with open("open-llm-leaderboard.json") as f:
+        open_llm_leaderbaord = json.load(f)
+    full_leaderboard = {}
+    for item in open_llm_leaderbaord:
+        if item["Model"] in all_model_hf_ids:
+            # print(item["Model"])
+            # print(item["Average \u2b06\ufe0f"])
+            full_bench_item = {}
+            # full_bench_item["hf_name"] = item["Model"]
+            full_bench_item["model_name"] = all_model_hf_ids[item["Model"]]
+            tasks = ["HellaSwag", "ARC", "Winogrande", "TruthfulQA", "MMLU", "GSM8K"]
+            for task in tasks:
+                full_bench_item[task] = item[task]
+            full_bench_item["HF_AVG"] = item["Average \u2b06\ufe0f"]
+            full_leaderboard[all_model_hf_ids[item["Model"]]] = full_bench_item
+    # Load URIAL Leaderboard
+    with open("leaderboard_data.jsonl") as f:
+        for line in f:
+            item = json.loads(line)
+            if item["model"] in full_leaderboard:
+                full_leaderboard[item["model"]]["URIAL_AVG"] = item["Overall"]
 
-    original_df = pd.read_json(bench_results, lines=True)
+
+    # Process the URIAL Benchmark Tab
+    original_df = pd.read_json(args.result_file, lines=True)
 
     print(original_df.columns)
 
@@ -119,21 +153,29 @@ if __name__ == "__main__":
             original_df[col] = original_df[col].apply(lambda x: x.replace(x, make_clickable_model(x, model_info)))
         else:
            original_df[col] = original_df[col].apply(formatter) # For numerical values
-
    # Define the first column explicitly, add 'Overall' as the second column, and then append the rest excluding 'Overall'
    new_order = [original_df.columns[0], 'Overall'] + [col for col in original_df.columns if col not in [original_df.columns[0], 'Overall']]
-
    # Reorder the DataFrame columns using the new order
    reordered_df = original_df[new_order]
-
    reordered_df.sort_values(by='Overall', inplace=True, ascending=False)
-
    reordered_df.rename(columns=column_names, inplace=True)
 
+    # Process the Full Benchmark Tab
+    full_df = pd.DataFrame(full_leaderboard).T
+    full_df = full_df.reset_index()
+    full_df.rename(columns={"index": "model"}, inplace=True)
+    full_df = full_df[["model", "URIAL_AVG", "HF_AVG", "HellaSwag", "ARC", "Winogrande", "TruthfulQA", "MMLU", "GSM8K"]]
+    full_df.sort_values(by='URIAL_AVG', inplace=True, ascending=False)
+    full_df["model"] = full_df["model"].apply(lambda x: make_clickable_model(x, model_info))
+    full_df.rename(columns=column_names, inplace=True)
+    # apply formatter to numerical columns
+    for col in full_df.columns:
+        if col not in ["Model"]:
+            full_df[col] = full_df[col].apply(formatter) # For numerical values
     # COLS = [c.name for c in fields(AutoEvalColumn)]
     # TYPES = [c.type for c in fields(AutoEvalColumn)]
 
     TYPES = ["markdown", "number"]
-    demo = build_demo(reordered_df, TYPES)
+    demo = build_demo(reordered_df, full_df, TYPES)
     demo.launch(share=args.share)
 
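For reference, the join this commit performs can be exercised on its own. The sketch below is a minimal reconstruction under the commit's apparent assumptions: model_info maps internal model names to dicts carrying an "hf_name" key, open-llm-leaderboard.json is a list of records keyed by "Model" plus per-task columns, and leaderboard_data.jsonl carries "model" and "Overall" fields. The sample model_info entry and file contents are hypothetical.

# Minimal sketch of the two-way leaderboard merge (hypothetical sample registry).
import json

import pandas as pd

# Hypothetical stand-in for the app's model_info registry.
model_info = {
    "Llama-2-7b": {"hf_name": "meta-llama/Llama-2-7b-hf", "pretty_name": "Llama-2-7B"},
}


def merge_leaderboards(open_llm_path, urial_path, model_info):
    """Join Open LLM Leaderboard scores with URIAL 'Overall' scores per model."""
    hf_to_name = {v["hf_name"]: k for k, v in model_info.items()}
    tasks = ["HellaSwag", "ARC", "Winogrande", "TruthfulQA", "MMLU", "GSM8K"]
    merged = {}
    with open(open_llm_path) as f:
        for item in json.load(f):
            name = hf_to_name.get(item["Model"])
            if name is None:
                continue  # keep only models this leaderboard tracks
            row = {task: item[task] for task in tasks}
            row["HF_AVG"] = item["Average \u2b06\ufe0f"]  # the "Average ⬆️" column
            merged[name] = row
    with open(urial_path) as f:
        for line in f:
            item = json.loads(line)
            if item["model"] in merged:
                merged[item["model"]]["URIAL_AVG"] = item["Overall"]
    # Same frame shape the commit builds before formatting.
    df = pd.DataFrame(merged).T.reset_index().rename(columns={"index": "model"})
    return df.sort_values(by="URIAL_AVG", ascending=False)

Note that the merge is inner-ish on the Open LLM side: models in the URIAL JSONL that are absent from open-llm-leaderboard.json are dropped, while models without a URIAL score keep a NaN in URIAL_AVG, which the sort places last.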
open-llm-leaderboard.json ADDED
The diff for this file is too large to render.
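Although the file is too large for the diff view, the fields app.py reads from it imply that each record looks roughly like the following. The field names are taken from the code above; the values are invented for illustration.

# Hypothetical shape of one open-llm-leaderboard.json record; values are made up.
example_record = {
    "Model": "meta-llama/Llama-2-7b-hf",
    "Average \u2b06\ufe0f": 50.1,  # rendered as "Average ⬆️" on the leaderboard
    "ARC": 53.1,
    "HellaSwag": 77.7,
    "MMLU": 43.8,
    "TruthfulQA": 39.0,
    "Winogrande": 72.1,
    "GSM8K": 14.5,
}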
scrape-open-llm-leaderboard ADDED

@@ -0,0 +1 @@
+Subproject commit ed2132d6989c336d684c3e3c4681aa458ace24ca
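Since scrape-open-llm-leaderboard is added only as a gitlink pinned to the commit above, a plain clone leaves that directory empty; assuming the upstream URL is recorded in .gitmodules, `git submodule update --init scrape-open-llm-leaderboard` will materialize it.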