Upload 11 files
- README.md +13 -14
- app.py +303 -0
- constants.py +67 -0
- file/AV-Odyssey_performance.csv +19 -0
- requirements.txt +70 -0
- src/__pycache__/utils_display.cpython-311.pyc +0 -0
- src/__pycache__/utils_display.cpython-38.pyc +0 -0
- src/auto_leaderboard/__pycache__/model_metadata_type.cpython-311.pyc +0 -0
- src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc +0 -0
- src/auto_leaderboard/model_metadata_type.py +30 -0
- src/utils_display.py +99 -0
README.md
CHANGED
```diff
@@ -1,14 +1,13 @@
----
-title:
-emoji:
-colorFrom:
-colorTo:
-sdk: gradio
-sdk_version:
-app_file: app.py
-pinned: false
-license:
----
-
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+---
+title: Av-Odyssey Bench Leaderboard
+emoji: 🏆
+colorFrom: gray
+colorTo: blue
+sdk: gradio
+sdk_version: 3.40.1
+app_file: app.py
+pinned: false
+license: cc-by-4.0
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
```
app.py
ADDED
@@ -0,0 +1,303 @@
```python
__all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
import gradio as gr
import pandas as pd
import json
import re
from constants import *
from src.auto_leaderboard.model_metadata_type import ModelType


global data_component, filter_component

def validate_model_size(s):
    # Accept sizes such as "7B" or the placeholder "-"; anything else falls back to "-".
    pattern = r'^\d+B$|^-$'
    if re.match(pattern, s):
        return s
    else:
        return '-'

def upload_file(files):
    file_paths = [file.name for file in files]
    return file_paths

def prediction_analyse(prediction_content):
    predictions = prediction_content.split("\n")

    # Read the ground-truth JSON file.
    # NOTE: this path is inherited from the SEED-Bench leaderboard code; the file is
    # not part of this upload.
    with open("./file/SEED-Bench-1.json", "r") as file:
        ground_truth_data = json.load(file)["questions"]

    # Convert the ground-truth data into a dict keyed by question_id.
    ground_truth = {item["question_id"]: item for item in ground_truth_data}

    # Initialize the per-type result counters.
    results = {i: {"correct": 0, "total": 0} for i in range(1, 13)}

    # Walk through the predictions, tallying correct and total counts per question_type_id.
    for prediction in predictions:
        prediction = prediction.strip()
        if not prediction:
            continue
        try:
            prediction = json.loads(prediction)
        except json.JSONDecodeError:
            print(f"Warning: Skipping invalid JSON data in line: {prediction}")
            continue
        question_id = prediction["question_id"]
        if question_id not in ground_truth:
            continue
        gt_item = ground_truth[question_id]
        question_type_id = gt_item["question_type_id"]

        if prediction["prediction"] == gt_item["answer"]:
            results[question_type_id]["correct"] += 1

        results[question_type_id]["total"] += 1

    return results


def add_new_eval(
    input_file,
    model_name_textbox: str,
    revision_name_textbox: str,
    model_link: str,
    # The parameters below are referenced in the body but were not wired to any UI
    # inputs in the uploaded code; defaults are added here so the function is runnable.
    model_size: str = '-',
    Evaluation_dimension: str = 'All',
    LLM_type: str = 'Other',
    LLM_name_textbox: str = '',
):
    if input_file is None:
        return "Error! Empty file!"
    else:
        model_size = validate_model_size(model_size)
        content = input_file.decode("utf-8")
        prediction = prediction_analyse(content)
        csv_data = pd.read_csv(CSV_DIR)

        Start_dimension, End_dimension = 1, 13
        if Evaluation_dimension == 'Image':
            End_dimension = 10
        elif Evaluation_dimension == 'Video':
            Start_dimension = 10
        each_task_accuracy = {i: round(prediction[i]["correct"] / prediction[i]["total"] * 100, 1) if i >= Start_dimension and i < End_dimension else 0 for i in range(1, 13)}

        # Averages for image, video, and all tasks.
        total_correct_image = sum(prediction[i]["correct"] for i in range(1, 10))
        total_correct_video = sum(prediction[i]["correct"] for i in range(10, 13))

        total_image = sum(prediction[i]["total"] for i in range(1, 10))
        total_video = sum(prediction[i]["total"] for i in range(10, 13))

        if Evaluation_dimension != 'Video':
            average_accuracy_image = round(total_correct_image / total_image * 100, 1)
        else:
            average_accuracy_image = 0

        if Evaluation_dimension != 'Image':
            average_accuracy_video = round(total_correct_video / total_video * 100, 1)
        else:
            average_accuracy_video = 0

        if Evaluation_dimension == 'All':
            overall_accuracy = round((total_correct_image + total_correct_video) / (total_image + total_video) * 100, 1)
        else:
            overall_accuracy = 0

        if LLM_type == 'Other':
            LLM_name = LLM_name_textbox
        else:
            LLM_name = LLM_type

        if revision_name_textbox == '':
            col = csv_data.shape[0]
            model_name = model_name_textbox
        else:
            model_name = revision_name_textbox
            model_name_list = csv_data['Model']
            name_list = [name.split(']')[0][1:] for name in model_name_list]
            if revision_name_textbox not in name_list:
                col = csv_data.shape[0]
            else:
                col = name_list.index(revision_name_textbox)

        if model_link == '':
            model_name = model_name  # no url
        else:
            model_name = '[' + model_name + '](' + model_link + ')'

        # Append (or overwrite) the row.
        # NOTE: this 18-value row layout follows the SEED-Bench schema and does not
        # match the 35-column AV-Odyssey CSV, so a submission would currently fail here.
        new_data = [
            model_name,
            LLM_name,
            model_size,
            overall_accuracy,
            average_accuracy_image,
            average_accuracy_video,
            each_task_accuracy[1],
            each_task_accuracy[2],
            each_task_accuracy[3],
            each_task_accuracy[4],
            each_task_accuracy[5],
            each_task_accuracy[6],
            each_task_accuracy[7],
            each_task_accuracy[8],
            each_task_accuracy[9],
            each_task_accuracy[10],
            each_task_accuracy[11],
            each_task_accuracy[12],
        ]
        csv_data.loc[col] = new_data
        csv_data.to_csv(CSV_DIR, index=False)

        # NOTE: the uploaded code also wrote to csv_task_data / CSV_TASK_DIR, neither
        # of which is defined anywhere in this upload; those lines are disabled.
        # csv_task_data.loc[col] = new_data
        # csv_task_data.to_csv(CSV_TASK_DIR, index=False)
        return 0

def get_baseline_df():
    df = pd.read_csv(CSV_DIR)
    df = df.sort_values(by="Avg. All", ascending=False)
    present_columns = MODEL_INFO + checkbox_group.value
    df = df[present_columns]
    return df

def get_all_df():
    df = pd.read_csv(CSV_DIR)
    df = df.sort_values(by="Avg. All", ascending=False)
    return df


def switch_version(version):
    return f"Current version: {version}"

block = gr.Blocks()


with block:
    gr.Markdown(
        LEADERBORAD_INTRODUCTION
    )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Results table
        with gr.TabItem("🏅 AV-Odyssey Benchmark", elem_id="av-odyssey-tab-table", id=1):
            with gr.Row():
                with gr.Accordion("Citation", open=False):
                    citation_button = gr.Textbox(
                        value=CITATION_BUTTON_TEXT,
                        label=CITATION_BUTTON_LABEL,
                        elem_id="citation-button",
                    ).style(show_copy_button=True)

            gr.Markdown(
                TABLE_INTRODUCTION
            )

            # Column selection:
            checkbox_group = gr.CheckboxGroup(
                choices=TASK_INFO,
                value=AVG_INFO,
                label="Evaluation Dimension",
                interactive=True,
            )

            baseline_value = get_baseline_df()
            baseline_header = MODEL_INFO + checkbox_group.value
            baseline_datatype = ['markdown'] * len(MODEL_INFO) + ['number'] * len(checkbox_group.value)
            # Create the dataframe component.
            data_component = gr.components.Dataframe(
                value=baseline_value,
                headers=baseline_header,
                type="pandas",
                datatype=baseline_datatype,
                interactive=False,
                visible=True,
            )

            def on_filter_model_size_method_change(selected_columns):
                updated_data = get_all_df()

                # Keep the selected columns in their canonical TASK_INFO order.
                selected_columns = [item for item in TASK_INFO if item in selected_columns]
                present_columns = MODEL_INFO + selected_columns
                updated_data = updated_data[present_columns]
                updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
                updated_headers = present_columns
                update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]

                filter_component = gr.components.Dataframe(
                    value=updated_data,
                    headers=updated_headers,
                    type="pandas",
                    datatype=update_datatype,
                    interactive=False,
                    visible=True,
                )

                return filter_component.value

            def on_average_type_change(average_type):
                return get_baseline_df()

            checkbox_group.change(fn=on_filter_model_size_method_change, inputs=[checkbox_group], outputs=data_component)

        # Tab 2
        with gr.TabItem("📝 About", elem_id="av-odyssey-tab-table", id=2):
            gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")

        # Tab 3
        with gr.TabItem("🚀 Submit here! ", elem_id="av-odyssey-tab-table", id=3):
            gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")

            with gr.Row():
                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(
                        label="Model name", placeholder="VideoLLaMA2"
                    )
                    revision_name_textbox = gr.Textbox(
                        label="Revision Model Name", placeholder="VideoLLaMA2"
                    )
                    model_link = gr.Textbox(
                        label="Model Link", placeholder="https://huggingface.co/DAMO-NLP-SG/VideoLLaMA2.1-7B-16F"
                    )

                with gr.Column():
                    input_file = gr.inputs.File(label="Click to Upload a json File", file_count="single", type='binary')
                    submit_button = gr.Button("Submit Eval")

                    submission_result = gr.Markdown()
                    submit_button.click(
                        add_new_eval,
                        inputs=[
                            input_file,
                            model_name_textbox,
                            revision_name_textbox,
                            model_link
                        ],
                    )


    def refresh_data():
        value1 = get_baseline_df()

        return value1

    with gr.Row():
        data_run = gr.Button("Refresh")
        data_run.click(
            refresh_data, outputs=data_component
        )


block.launch()
```
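For reference, the accuracy bookkeeping in `prediction_analyse()` reduces to a per-`question_type_id` correct/total tally. A minimal self-contained sketch of that logic, using toy ground truth and predictions rather than the benchmark's real data:

```python
import json

# Toy ground truth keyed by question_id (illustrative values only).
ground_truth = {
    "5_0": {"question_type_id": 1, "answer": "B"},
    "3_0": {"question_type_id": 1, "answer": "A"},
}
# Two JSONL prediction lines, in the same format the app expects.
predictions_jsonl = '{"question_id": "5_0", "prediction": "B"}\n' \
                    '{"question_id": "3_0", "prediction": "B"}'

results = {1: {"correct": 0, "total": 0}}
for line in predictions_jsonl.splitlines():
    pred = json.loads(line)
    gt = ground_truth[pred["question_id"]]
    tid = gt["question_type_id"]
    if pred["prediction"] == gt["answer"]:
        results[tid]["correct"] += 1
    results[tid]["total"] += 1

# Accuracy per question type, as a percentage rounded to one decimal.
print(round(results[1]["correct"] / results[1]["total"] * 100, 1))  # 50.0
```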
constants.py
ADDED
@@ -0,0 +1,67 @@
````python
# Constants for the leaderboard app.
MODEL_INFO = ["Model"]
TASK_INFO = [
    "Avg. All", "Avg. Timbre", "Avg. Tone", "Avg. Melody", "Avg. Space", "Avg. Time", "Avg. Hallucination", "Avg. Intricacy",
    "Instrument Recognition", "Singer Recognition", "Gunshot Recognition", "Bird Recognition", "Animal Recognition",
    "Transportation Recognition", "Material Recognition", "Scene Recognition", "Hazard Recognition", "Action Recognition",
    "Eating Sound Recognition", "Speech Sentiment Analysis", "Meme Understanding", "Music Sentiment Analysis", "Music Genre Classification",
    "Dance and Music Matching", "Film and Music Matching", "Music Score Matching", "Audio 3D Angle Estimation", "Audio Distance Estimation",
    "Audio Time Estimation", "Audio-Visual Synchronization", "Action Sequencing", "Hallucination Evaluation",
    "Action Prediction", "Action Tracing"]

AVG_INFO = ["Avg. All", "Avg. Timbre", "Avg. Tone", "Avg. Melody", "Avg. Space", "Avg. Time", "Avg. Hallucination", "Avg. Intricacy"]
DATA_TITILE_TYPE = ["markdown"] * len(MODEL_INFO) + ["number"] * len(TASK_INFO)
CSV_DIR = "./file/AV-Odyssey_performance.csv"

COLUMN_NAMES = MODEL_INFO + TASK_INFO

DATA_NUM = [200, 200, 200, 200, 200, 200, 200, 200, 108, 196, 200, 200, 20, 97, 200, 200, 200, 200, 20, 20, 200, 200, 200, 200, 199, 195]

LEADERBORAD_INTRODUCTION = """# AV-Odyssey Bench Leaderboard

Welcome to the leaderboard of the AV-Odyssey Bench! 🏆

AV-Odyssey Bench is a comprehensive audio-visual benchmark designed to assess whether MLLMs can truly understand audio-visual information. It encompasses 4,555 carefully crafted problems, each incorporating text, visual, and audio components. To successfully infer answers, models must effectively leverage clues from both visual and audio inputs.
Please refer to the [AV-Odyssey paper](https://arxiv.org/abs/2307.16125) for more details.
"""


SUBMIT_INTRODUCTION = """# Submitting to AV-Odyssey Bench

Note: each line of the submitted json file is a dict with two keys, question_id and prediction. For example:
```shell
{"question_id": "5_0", "prediction": "B"}
{"question_id": "3_0", "prediction": "B"}
```

## Submit Example

## If you have any questions, please contact [libohao1998@gmail.com](mailto:libohao1998@gmail.com).
"""

TABLE_INTRODUCTION = """The table below summarizes the per-task performance of all models.
We use accuracy (%) as the primary evaluation metric for each task.

The "All Average" performance type computes the overall accuracy by dividing the total number of correctly answered QA questions by the total number of QA questions.

If you have any questions, please feel free to contact us.
"""

LEADERBORAD_INFO = """
Recently, multimodal large language models (MLLMs), such as GPT-4o, Gemini 1.5 Pro, and Reka Core, have expanded their capabilities to include vision and audio modalities. While these models demonstrate impressive performance across a wide range of audio-visual applications, our proposed DeafTest reveals that MLLMs often struggle with simple tasks humans find trivial:
1) determining which of two sounds is louder, and
2) determining which of two sounds has a higher pitch.
Motivated by these observations, we introduce AV-Odyssey Bench. This benchmark encompasses 26 different tasks and 4,555 carefully crafted problems, each incorporating text, visual, and audio components. All data are newly collected and annotated by humans, not drawn from any existing audio-visual dataset. AV-Odyssey Bench demonstrates three major features:
1. Comprehensive Audio Attributes;
2. Extensive Domains;
3. Interleaved Text, Audio, and Visual components.
"""


CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""{

}"""
````
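To produce a file in the format `SUBMIT_INTRODUCTION` describes, it is enough to write one JSON object per line. A minimal sketch (the prediction values and the `submission.json` filename are placeholders):

```python
import json

# Placeholder predictions in the required {question_id, prediction} shape.
predictions = [
    {"question_id": "5_0", "prediction": "B"},
    {"question_id": "3_0", "prediction": "B"},
]

# One JSON object per line, as expected by prediction_analyse() in app.py.
with open("submission.json", "w") as f:
    for item in predictions:
        f.write(json.dumps(item) + "\n")
```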
file/AV-Odyssey_performance.csv
ADDED
@@ -0,0 +1,19 @@
```csv
Model,Avg. All,Avg. Timbre,Avg. Tone,Avg. Melody,Avg. Space,Avg. Time,Avg. Hallucination,Avg. Intricacy,Instrument Recognition,Singer Recognition,Gunshot Recognition,Bird Recognition,Animal Recognition,Transportation Recognition,Material Recognition,Scene Recognition,Hazard Recognition,Action Recognition,Eating Sound Recognition,Speech Sentiment Analysis,Meme Understanding,Music Sentiment Analysis,Music Genre Classification,Dance and Music Matching,Film and Music Matching,Music Score Matching,Audio 3D Angle Estimation,Audio Distance Estimation,Audio Time Estimation,Audio-Visual Synchronization,Action Sequencing,Hallucination Evaluation,Action Prediction,Action Tracing
[Unified-IO-2 L](https://unified-io-2.allenai.org/),26.0,23.8,24.1,28.8,15.0,26.8,30.0,30.4,20.5,22.5,25.5,18.5,27.0,26.5,23.0,28.0,21.3,20.9,26.5,24.5,20.0,27.9,31.0,27.5,32.5,24.5,15.0,15.0,28.0,25.5,27.0,30.0,27.1,33.8
[Unified-IO-2 XL](https://unified-io-2.allenai.org/),26.3,24.3,23.2,27.8,22.5,25.3,31.5,34.8,20.0,23.5,24.0,20.5,27.5,26.0,27.5,30.0,19.4,19.9,26.5,23.0,25.0,26.9,30.5,27.0,31.5,22.5,30.0,15.0,26.5,25.5,24.0,31.5,35.7,33.8
[Unified-IO-2 XXL](https://unified-io-2.allenai.org/),27.2,26.3,22.7,26.4,32.5,26.8,24.5,33.8,29.5,24.0,23.5,29.0,23.5,25.5,30.5,26.5,23.1,27.0,25.5,23.0,20.0,23.9,31.5,27.5,24.5,23.5,50.0,15.0,28.0,25.0,27.5,24.5,33.2,34.4
[OneLLM](https://github.com/csuhan/OneLLM),27.4,25.0,25.5,21.5,37.5,29.3,25.5,38.4,26.0,21.5,27.0,26.0,22.0,20.0,29.5,24.5,26.9,23.0,29.5,26.0,20.0,20.8,23.5,26.5,18.5,18.0,45.0,30.0,31.5,29.5,27.0,25.5,41.7,34.9
[PandaGPT](https://panda-gpt.github.io/),26.7,23.5,23.2,27.6,45.0,23.8,28.0,23.9,20.0,21.5,23.0,17.5,26.0,26.5,28.0,27.0,23.1,21.4,24.5,23.5,20.0,21.6,28.0,27.0,32.5,26.0,45.0,45.0,18.5,26.0,27.0,28.0,19.6,28.2
[Video-llama](https://github.com/DAMO-NLP-SG/Video-LLaMA),26.1,25.5,22.3,24.4,30.0,26.2,25.0,30.7,22.5,24.5,27.0,26.5,27.0,23.5,28.0,25.0,25.0,26.0,25.5,23.0,15.0,25.8,24.0,20.0,25.0,28.0,45.0,15.0,28.5,23.5,26.5,25.0,28.6,32.8
[VideoLLaMA2](https://github.com/DAMO-NLP-SG/VideoLLaMA2),26.8,24.1,25.5,26.4,30.0,27.2,33.0,34.5,22.5,24.0,27.0,17.0,23.5,27.5,26.5,26.5,19.4,23.0,25.5,26.0,20.0,26.8,29.0,25.5,30.5,20.5,45.0,15.0,28.5,26.5,26.5,33.0,28.6,40.5
[AnyGPT](https://junzhan2000.github.io/AnyGPT.github.io/),26.1,24.6,25.0,26.4,27.5,29.2,29.0,25.7,22.5,28.5,28.0,17.5,24.0,25.5,23.0,28.0,25.9,20.4,27.5,25.5,20.0,23.4,29.5,25.5,26.0,26.0,40.0,15.0,30.5,28.0,29.0,29.0,21.1,30.3
[NExT-GPT](https://next-gpt.github.io/),25.5,23.2,20.9,27.8,30.0,28.8,28.5,23.6,21.0,23.5,25.5,21.5,25.5,25.5,21.0,24.0,19.4,23.0,24.0,21.5,15.0,23.7,26.0,28.0,31.0,28.0,45.0,15.0,31.5,24.0,31.0,28.5,20.6,26.7
[VITA](https://vita-home.github.io/),26.4,24.1,26.4,27.8,22.5,26.3,31.0,36.8,22.0,20.5,24.5,21.5,27.5,25.0,23.5,28.5,21.3,19.4,29.5,24.5,45.0,26.9,26.0,27.5,33.5,24.5,25.0,20.0,26.5,25.5,27.0,31.0,34.2,39.5
[Gemini 1.5 Flash](https://storage.googleapis.com/deepmind-media/gemini/gemini_v1_5_report.pdf),27.8,27.2,25.0,28.8,30.0,25.3,28.5,31.2,24.5,24.0,23.5,17.0,32.5,26.0,22.5,29.5,34.3,48.0,21.5,23.5,40.0,21.3,31.0,27.5,32.5,28.0,30.0,30.0,27.5,23.5,25.0,28.5,27.6,34.9
[Gemini 1.5 Flash-8B](https://storage.googleapis.com/deepmind-media/gemini/gemini_v1_5_report.pdf),26.8,25.1,24.5,28.9,27.5,27.5,29.0,30.2,16.5,22.5,24.0,19.0,28.0,26.5,27.0,29.0,26.9,32.7,24.5,24.5,25.0,25.9,33.0,27.5,32.0,24.5,40.0,15.0,31.0,25.5,26.0,29.0,25.6,34.9
[Gemini 1.5 Pro](https://storage.googleapis.com/deepmind-media/gemini/gemini_v1_5_report.pdf),30.8,30.8,31.4,31.3,37.5,27.7,20.5,33.0,33.0,26.0,29.0,25.0,25.5,26.0,29.5,30.0,38.0,57.7,22.5,29.5,50.0,25.4,42.5,28.0,28.5,29.0,35.0,40.0,30.0,24.5,28.5,20.5,32.2,33.8
[Reka Core](https://arxiv.org/abs/2404.12387),26.9,26.7,27.7,26.4,22.5,26.5,24.0,34.3,32.5,20.0,26.5,25.0,24.0,27.0,30.0,27.0,25.0,34.2,21.5,28.5,20.0,22.8,24.5,27.5,30.0,25.5,25.0,20.0,30.0,25.5,24.0,24.0,33.7,34.9
[Reka Flash](https://arxiv.org/abs/2404.12387),26.3,25.5,24.1,27.2,30.0,27.5,31.5,24.1,20.0,22.5,26.5,26.0,28.5,26.5,26.5,29.0,28.7,22.4,25.0,24.5,20.0,30.5,29.5,27.5,25.5,24.5,45.0,15.0,30.0,25.5,27.0,31.5,19.1,29.2
[Reka Edge](https://arxiv.org/abs/2404.12387),25.0,23.8,20.5,26.3,22.5,25.5,22.5,36.8,21.5,24.0,30.5,20.0,19.5,22.5,20.5,25.5,25.9,23.5,29.0,20.5,20.0,24.9,24.5,27.5,30.0,24.0,30.0,15.0,30.0,25.5,21.0,22.5,38.2,35.4
[GPT-4o visual caption](https://openai.com/index/hello-gpt-4o/),32.3,37.4,28.6,32.3,27.5,25.5,23.0,28.9,33.0,30.5,24.0,26.5,43.0,42.0,32.5,39.0,49.1,67.3,30.5,26.0,55.0,24.4,48.0,27.0,34.5,23.5,25.0,30.0,21.5,22.5,32.5,23.0,32.2,25.6
[GPT-4o audio caption](https://openai.com/index/hello-gpt-4o/),34.5,38.6,31.8,33.6,32.5,27.5,25.0,26.1,40.0,38.0,27.5,26.5,45.0,42.0,27.0,41.0,42.6,62.2,35.5,28.0,70.0,24.4,56.5,27.5,32.5,22.5,30.0,35.0,23.5,25.5,33.5,25.0,30.2,22.0
```
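As a quick sanity check, the CSV loads directly with pandas and can be sorted the same way the app's `get_baseline_df()` sorts it:

```python
import pandas as pd

df = pd.read_csv("./file/AV-Odyssey_performance.csv")
# Sort by overall average, as the leaderboard does, and show the top entries.
print(df.sort_values(by="Avg. All", ascending=False)[["Model", "Avg. All"]].head())
```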
requirements.txt
ADDED
@@ -0,0 +1,70 @@
```text
aiofiles==23.1.0
aiohttp==3.8.4
aiosignal==1.3.1
altair==4.2.2
anyio==3.6.2
APScheduler==3.10.1
async-timeout==4.0.2
attrs==23.1.0
certifi==2022.12.7
charset-normalizer==3.1.0
click==8.1.3
contourpy==1.0.7
cycler==0.11.0
datasets==2.12.0
entrypoints==0.4
fastapi==0.95.1
ffmpy==0.3.0
filelock==3.11.0
fonttools==4.39.3
frozenlist==1.3.3
fsspec==2023.4.0
gradio==3.27.0
gradio_client==0.1.3
h11==0.14.0
httpcore==0.17.0
httpx==0.24.0
huggingface-hub==0.13.4
idna==3.4
Jinja2==3.1.2
jsonschema==4.17.3
kiwisolver==1.4.4
linkify-it-py==2.0.0
markdown-it-py==2.2.0
MarkupSafe==2.1.2
matplotlib==3.7.1
mdit-py-plugins==0.3.3
mdurl==0.1.2
multidict==6.0.4
numpy==1.24.2
orjson==3.8.10
packaging==23.1
pandas==2.0.0
Pillow==9.5.0
plotly==5.14.1
pyarrow==11.0.0
pydantic==1.10.7
pydub==0.25.1
pyparsing==3.0.9
pyrsistent==0.19.3
python-dateutil==2.8.2
python-multipart==0.0.6
pytz==2023.3
pytz-deprecation-shim==0.1.0.post0
PyYAML==6.0
requests==2.28.2
semantic-version==2.10.0
six==1.16.0
sniffio==1.3.0
starlette==0.26.1
toolz==0.12.0
tqdm==4.65.0
transformers==4.28.1
typing_extensions==4.5.0
tzdata==2023.3
tzlocal==4.3
uc-micro-py==1.0.1
urllib3==1.26.15
uvicorn==0.21.1
websockets==11.0.1
yarl==1.8.2
```
src/__pycache__/utils_display.cpython-311.pyc
ADDED
Binary file (6.25 kB).
src/__pycache__/utils_display.cpython-38.pyc
ADDED
Binary file (4.29 kB).
src/auto_leaderboard/__pycache__/model_metadata_type.cpython-311.pyc
ADDED
Binary file (1.72 kB).
src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc
ADDED
Binary file (1.25 kB).
src/auto_leaderboard/model_metadata_type.py
ADDED
@@ -0,0 +1,30 @@
```python
from dataclasses import dataclass
from enum import Enum

# (The uploaded file also imported glob, json, os, typing, and AutoEvalColumn;
# none of them are used, so they are dropped here.)

@dataclass
class ModelInfo:
    name: str
    symbol: str  # emoji

model_type_symbols = {
    "LLM": "🟢",
    "ImageLLM": "🔶",
    "VideoLLM": "⭕",
    "Other": "🟦",
}

class ModelType(Enum):
    PT = ModelInfo(name="LLM", symbol="🟢")
    FT = ModelInfo(name="ImageLLM", symbol="🔶")
    IFT = ModelInfo(name="VideoLLM", symbol="⭕")
    RL = ModelInfo(name="Other", symbol="🟦")

    def to_str(self, separator=" "):
        return f"{self.value.symbol}{separator}{self.value.name}"
```
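Usage is straightforward; a small sketch, assuming the script runs from the repo root so that `src` is importable as a package:

```python
from src.auto_leaderboard.model_metadata_type import ModelType

print(ModelType.PT.to_str())     # 🟢 LLM
print(ModelType.FT.to_str("/"))  # 🔶/ImageLLM
```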
src/utils_display.py
ADDED
@@ -0,0 +1,99 @@
```python
from dataclasses import dataclass

# These classes hold the user-facing column names, so that a change only has to be
# made here rather than everywhere the columns are used.
@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False

def fields(raw_class):
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

@dataclass(frozen=True)
class AutoEvalColumn:  # Auto evals column
    model_type_symbol = ColumnContent("T", "str", True)
    model = ColumnContent("Model", "markdown", True)
    average = ColumnContent("Average ⬆️", "number", True)
    arc = ColumnContent("ARC", "number", True)
    hellaswag = ColumnContent("HellaSwag", "number", True)
    mmlu = ColumnContent("MMLU", "number", True)
    truthfulqa = ColumnContent("TruthfulQA", "number", True)
    model_type = ColumnContent("Type", "str", False)
    precision = ColumnContent("Precision", "str", False, True)
    license = ColumnContent("Hub License", "str", False)
    params = ColumnContent("#Params (B)", "number", False)
    likes = ColumnContent("Hub ❤️", "number", False)
    revision = ColumnContent("Model sha", "str", False, False)
    dummy = ColumnContent("model_name_for_query", "str", True)  # dummy col to implement search bar (hidden by custom CSS)

@dataclass(frozen=True)
class EloEvalColumn:  # Elo evals column
    model = ColumnContent("Model", "markdown", True)
    gpt4 = ColumnContent("GPT-4 (all)", "number", True)
    human_all = ColumnContent("Human (all)", "number", True)
    human_instruct = ColumnContent("Human (instruct)", "number", True)
    human_code_instruct = ColumnContent("Human (code-instruct)", "number", True)


@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    precision = ColumnContent("precision", "bool", True)
    weight_type = ColumnContent("weight_type", "str", "Original")  # truthy string stands in for displayed_by_default
    status = ColumnContent("status", "str", True)

LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]


KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
MODEL_PAGE = "https://huggingface.co/models"
LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"  # NOTE: overrides the lmsys link above
ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"


def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def make_clickable_model(model_name):
    link = f"https://huggingface.co/{model_name}"

    if model_name in LLAMAS:
        link = LLAMA_LINK
        model_name = model_name.split("/")[1]
    elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
        link = VICUNA_LINK
        model_name = "stable-vicuna-13b"
    elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
        link = ALPACA_LINK
        model_name = "alpaca-13b"
    if model_name == "dolly-12b":
        link = DOLLY_LINK
    elif model_name == "vicuna-13b":
        link = VICUNA_LINK
    elif model_name == "koala-13b":
        link = KOALA_LINK
    elif model_name == "oasst-12b":
        link = OASST_LINK
    #else:
    #    link = MODEL_PAGE

    return model_hyperlink(link, model_name)

def styled_error(error):
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"

def styled_warning(warn):
    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"

def styled_message(message):
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
```
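For example, a model id that is not in any of the special-case lists simply links to its Hugging Face page; a sketch using the placeholder id from app.py, again assuming the repo root is the working directory:

```python
from src.utils_display import make_clickable_model

# Falls through every special case, so the link is https://huggingface.co/<model_name>.
print(make_clickable_model("DAMO-NLP-SG/VideoLLaMA2.1-7B-16F"))
```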