update
- app.py +11 -3
- constants.py +36 -32
- file/result.csv +7 -6
app.py
CHANGED
@@ -30,6 +30,7 @@ def add_new_eval(
    model_link: str,
    model_type: str,
    model_size: str,
+   notes: str,
):
    if input_file is None:
        return "Error! Empty file!"
@@ -94,6 +95,7 @@ def add_new_eval(
        input_data[22],
        input_data[23],
        input_data[24],
+       notes,
    ]
    print(len(new_data), col)
    print(csv_data.loc[col-1])
@@ -156,6 +158,8 @@ with block:
        updated_data = get_all_df()[present_columns]
        updated_data = updated_data.sort_values(by=present_columns[1], ascending=False)
        updated_headers = present_columns
+       print(updated_headers)
+       print([COLUMN_NAMES.index(x) for x in updated_headers])
        update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]

        filter_component = gr.components.Dataframe(
@@ -190,13 +194,13 @@ with block:
        with gr.Row():
            with gr.Column():
                model_name_textbox = gr.Textbox(
-                   label="Model name", placeholder="
+                   label="Model name", placeholder="Video-LLaVA"
                )
                revision_name_textbox = gr.Textbox(
-                   label="Revision Model Name", placeholder="
+                   label="Revision Model Name", placeholder="Video-LLaVA"
                )
                model_link = gr.Textbox(
-                   label="Model Link", placeholder="https://
+                   label="Model Link", placeholder="https://huggingface.co/LanguageBind/Video-LLaVA-7B"
                )
                model_type = gr.Dropdown(
                    choices=[
@@ -213,6 +217,9 @@ with block:
                model_size = gr.Textbox(
                    label="Model size", placeholder="7B(Input content format must be 'number+B' or '-', default is '-')"
                )
+               notes = gr.Textbox(
+                   label="Notes", placeholder="Other details of the model or evaluation, e.g., which answer prompt is used."
+               )

            with gr.Column():

@@ -229,6 +236,7 @@ with block:
                model_link,
                model_type,
                model_size,
+               notes,
            ],
            # outputs = submission_result,
        )
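In effect, the app.py change threads a new free-text `notes` argument from the submission form through `add_new_eval` and into the row appended to `file/result.csv`. A minimal sketch of that pattern, assuming a pandas-backed CSV like the one in this Space (the helper name and simplified signature are hypothetical, not the Space's actual handler):

```python
import pandas as pd

CSV_DIR = "./file/result.csv"  # same path constants.py points at

def append_submission(model_md: str, model_type: str, model_size: str,
                      scores: list, notes: str) -> None:
    """Append one leaderboard row; the free-text Notes value goes in the last column."""
    csv_data = pd.read_csv(CSV_DIR)
    new_row = [model_md, model_type, model_size, *scores, notes]
    # The row must line up with the header, which is why the diff also widens
    # COLUMN_NAMES / DATA_TITILE_TYPE in constants.py.
    assert len(new_row) == len(csv_data.columns), "row length must match the CSV header"
    csv_data.loc[len(csv_data)] = new_row
    csv_data.to_csv(CSV_DIR, index=False)
```

The real handler builds the score values from the uploaded `merged_result.json`; the sketch only shows where `notes` lands in the row.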
constants.py
CHANGED
@@ -6,16 +6,18 @@ TASK_INFO = ["Avg. All", "Avg. Multi-Choice", "Avg. Yes/No", "Avg. Caption Match
             "Direction. Multi-Choice", "Direction. Yes/No", "Direction. Caption Matching", "Direction. Caption Generation",
             "Speed. Multi-Choice", "Speed. Yes/No", "Speed. Caption Matching", "Speed. Caption Generation",
             "Event Order. Multi-Choice", "Event Order. Yes/No", "Event Order. Caption Matching", "Event Order. Caption Generation",
-            "Attribute Change. Multi-Choice", "Attribute Change. Yes/No", "Attribute Change. Caption Matching", "Attribute Change. Caption Generation"]
+            "Attribute Change. Multi-Choice", "Attribute Change. Yes/No", "Attribute Change. Caption Matching", "Attribute Change. Caption Generation",
+            "Notes"]

AVG_INFO = ["Avg. All", "Avg. Multi-Choice", "Avg. Yes/No", "Avg. Caption Matching", "Avg. Caption Generation"]
-DATA_TITILE_TYPE = ["markdown",
+DATA_TITILE_TYPE = ["markdown", "markdown", "markdown",
                    "number", "number", "number", "number", "number",
                    "number", "number", "number", "number",
                    "number", "number", "number", "number",
                    "number", "number", "number", "number",
-                   "number", "number", "number", "number"]
+                   "number", "number", "number", "number",
+                   "number", "number", "number", "number",
+                   "markdown"]
CSV_DIR = "./file/result.csv"

# COLUMN_NAMES = MODEL_INFO + TASK_INFO
@@ -30,49 +32,51 @@ TempCompass is a benchmark to evaluate the temporal perception ability of Video
SUBMIT_INTRODUCTION = """
# TempCompass Leaderboard

-Welcome to the leaderboard of the
+Welcome to the leaderboard of the TempCompass! 🏆

## Submit Instruction
-
-
-
-
-For `multi-choice`, `yes_no`, `caption_matching`, the evaluation result of each question contains five keys. A specific example is as follows:
-```python
-{
-"question": "What activity is the monkey engaged in?\\nA. swimming\\nB. running\\nC. climbing\\nD. fighting",
-"gt-answer": "D. fighting",
-"video-llm-prediction": "D",
-"match_success": true, # whether the video-llm-prediction can be assessed by rule-based matching
-"rating": 1
-}
-```
-
-For `captioning`, we prompt chatgpt to answer the multi-choice question, using the Video LLM generated caption as context. An example of evalution result is as follows:
-```python
-{
-"chatgpt-reasoning": "The video description specifically mentions that the man is dribbling a basketball, dunking a basketball, and passing a basketball.",
-"chatgpt-answer": "B. dribbling a basketball, C. passing a basketball",
-"video-llm-prediction": "The video showcases a man dribbling a basketball, dunking a basketball, and passing a basketball. The man is seen moving around the court while performing these actions. The video captures the man's movements and the sound of the ball bouncing on the court. The man's dribbling skills are impressive, and he seems to be in control of the ball at all times. The dunking and passing actions are also executed with precision, and the man's movements are fluid and graceful. Overall, the video is a great display of basketball skills and is sure to impress any basketball",
-"gt-answer": "A. dunking a basketball",
-"rating": 0
-}
-```
+
+1.Run inference and automatic evaluation according to our [github repository](https://github.com/llyx97/TempCompass?tab=readme-ov-file#-quick-start). You will obtain the JSON file `<task_type>.json`, where `<task_type>` correspond to one of the four categories: `multi-choice`, `yes_no`, `caption_matching` and `captioning`. (Example files can be found [here](https://huggingface.co/spaces/lyx97/TempCompass/tree/main/file/example_eval_results))
+
+2. Merge the four JSON files using [merge_eval_result.py](https://huggingface.co/spaces/lyx97/TempCompass/blob/main/merge_eval_result.py) and obtain `merged_result.json` (an example can be found [here](https://huggingface.co/spaces/lyx97/TempCompass/blob/main/file/example_eval_results/merged_result.json)).


### Submit Example
For example, if you want to submit Video-LLaVA's result in the leaderboard, you need to:
1. Fill in ‘Video-LLaVA’ in ‘Model Name’ if it is your first time to submit your result (You can leave ‘Revision Model Name’ blank).
2. Fill in ‘Video-LLaVA’ in ‘Revision Model Name’ if you want to update your result (You can leave ‘Model Name’ blank).
-3. Select ‘
+3. Select ‘VideoLLM’ in ‘Model Type’.
4. Fill in ‘https://github.com/x/x’ in ‘Model Link’.
5. Fill in ‘7B’ in ‘Model size’.
-6. Upload
+6. Upload `merged_result.json`.
7. Click the ‘Submit Eval’ button.
8. Click ‘Refresh’ to obtain the uploaded leaderboard.
-
"""

+# """
+# For `multi-choice`, `yes_no`, `caption_matching`, the evaluation result of each question contains five keys. A specific example is as follows:
+# ```python
+# {
+# "question": "What activity is the monkey engaged in?\\nA. swimming\\nB. running\\nC. climbing\\nD. fighting",
+# "gt-answer": "D. fighting",
+# "video-llm-prediction": "D",
+# "match_success": true, # whether the video-llm-prediction can be assessed by rule-based matching
+# "rating": 1
+# }
+# ```
+
+# For `captioning`, we prompt chatgpt to answer the multi-choice question, using the Video LLM generated caption as context. An example of evalution result is as follows:
+# ```python
+# {
+# "chatgpt-reasoning": "The video description specifically mentions that the man is dribbling a basketball, dunking a basketball, and passing a basketball.",
+# "chatgpt-answer": "B. dribbling a basketball, C. passing a basketball",
+# "video-llm-prediction": "The video showcases a man dribbling a basketball, dunking a basketball, and passing a basketball. The man is seen moving around the court while performing these actions. The video captures the man's movements and the sound of the ball bouncing on the court. The man's dribbling skills are impressive, and he seems to be in control of the ball at all times. The dunking and passing actions are also executed with precision, and the man's movements are fluid and graceful. Overall, the video is a great display of basketball skills and is sure to impress any basketball",
+# "gt-answer": "A. dunking a basketball",
+# "rating": 0
+# }
+# ```
+# """
+
TABLE_INTRODUCTION = """In the table below, we summarize each task performance of all the models.
    We use accurancy(%) as the primary evaluation metric for each tasks.
"""
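The constants.py change keeps `DATA_TITILE_TYPE` in lockstep with the widened column set: one entry per leaderboard column, `"markdown"` for the model link and the new Notes column, `"number"` for the score columns. A hedged sketch of how such a per-column datatype list is typically wired into a Gradio dataframe (an illustrative subset only; the real `COLUMN_NAMES` and component options live in this Space's app.py):

```python
import gradio as gr
import pandas as pd

# Abbreviated stand-ins for COLUMN_NAMES / DATA_TITILE_TYPE in constants.py (hypothetical subset).
COLUMN_NAMES = ["Model", "Avg. All", "Notes"]
DATA_TITILE_TYPE = ["markdown", "number", "markdown"]  # one datatype entry per column

data = pd.DataFrame(
    [["[Video-LLaVA](https://huggingface.co/LanguageBind/Video-LLaVA-7B)", 49.77,
      'answer prompt: "Please directly give the best option:"']],
    columns=COLUMN_NAMES,
)

with gr.Blocks() as demo:
    gr.components.Dataframe(
        value=data,
        headers=COLUMN_NAMES,
        datatype=DATA_TITILE_TYPE,  # "markdown" renders the link and notes text, "number" keeps scores sortable
        interactive=False,
    )

# demo.launch()  # uncomment to serve the table locally
```

Keeping the two lists the same length matters because app.py looks up `DATA_TITILE_TYPE[COLUMN_NAMES.index(x)]` for every header, which is exactly what the newly added `print` statements help verify.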
file/result.csv
CHANGED
@@ -1,6 +1,7 @@
-Model,Model Type,Model Size,Avg. All,Avg. Multi-Choice,Avg. Yes/No,Avg. Caption Matching,Avg. Caption Generation,Action. Multi-Choice,Action. Yes/No,Action. Caption Matching,Action. Caption Generation,Direction. Multi-Choice,Direction. Yes/No,Direction. Caption Matching,Direction. Caption Generation,Speed. Multi-Choice,Speed. Yes/No,Speed. Caption Matching,Speed. Caption Generation,Event Order. Multi-Choice,Event Order. Yes/No,Event Order. Caption Matching,Event Order. Caption Generation,Attribute Change. Multi-Choice,Attribute Change. Yes/No,Attribute Change. Caption Matching,Attribute Change. Caption Generation
-
-[
-
-
-
+Model,Model Type,Model Size,Avg. All,Avg. Multi-Choice,Avg. Yes/No,Avg. Caption Matching,Avg. Caption Generation,Action. Multi-Choice,Action. Yes/No,Action. Caption Matching,Action. Caption Generation,Direction. Multi-Choice,Direction. Yes/No,Direction. Caption Matching,Direction. Caption Generation,Speed. Multi-Choice,Speed. Yes/No,Speed. Caption Matching,Speed. Caption Generation,Event Order. Multi-Choice,Event Order. Yes/No,Event Order. Caption Matching,Event Order. Caption Generation,Attribute Change. Multi-Choice,Attribute Change. Yes/No,Attribute Change. Caption Matching,Attribute Change. Caption Generation,Notes
+[Video-LLaVA](https://huggingface.co/LanguageBind/Video-LLaVA-7B),VideoLLM,7B,49.77,45.57,56.38,63.34,34.83,76.04,74.32,87.88,50.76,35.65,50.28,58.42,23.2,35.22,51.82,53.82,28.67,37.75,49.21,59.0,38.25,40.97,51.12,58.33,33.59,"answer prompt: ""Please directly give the best option:"""
+[VideoChat2](https://github.com/OpenGVLab/Ask-Anything/tree/main/video_chat2),VideoLLM,7B,48.81,42.91,58.01,53.69,38.52,76.92,72.8,73.4,54.04,33.44,53.82,49.48,32.73,29.55,53.85,51.68,30.96,35.43,51.31,48.0,34.25,36.81,53.79,45.83,41.41,"answer prompt: ""Please directly give the best option:"""
+[LLaMA-VID](https://github.com/dvlab-research/LLaMA-VID),VideoLLM,7B,45.61,38.04,52.96,56.02,34.78,61.24,63.01,73.4,53.03,29.65,49.16,51.2,21.91,29.85,48.79,50.46,27.98,33.77,48.43,53.67,35.5,34.03,52.68,51.74,35.94,"answer prompt: ""Please directly give the best option:"""
+[mPLUG-Owl-video](https://huggingface.co/MAGAer13/mplug-owl-llama-7b-video),VideoLLM,7B,44.15,36.39,54.42,48.5,34.43,55.92,64.36,57.24,46.46,31.55,51.21,43.64,30.41,32.24,50.61,41.9,28.21,27.48,51.31,48.67,31.25,32.99,52.01,51.74,36.46,"answer prompt: ""Please directly give the best option:"""
+[PandaGPT](https://huggingface.co/openllmplayground/pandagpt_13b_max_len_400),VideoLLM,13B,41.64,34.37,51.81,51.56,27.5,36.69,53.04,49.16,23.74,33.75,50.84,49.48,26.03,34.03,49.6,55.96,25.69,32.12,53.66,54.33,29.75,35.07,52.23,48.26,32.55,"answer prompt: ""Please directly give the best option:"""
+[Valley2-7B](https://huggingface.co/luoruipu1/Valley2-7b),VideoLLM,7B,37.49,29.56,53.49,34.6,26.35,43.49,58.11,36.7,24.75,24.92,52.51,36.77,21.91,29.55,52.02,33.64,20.41,18.54,50.26,36.0,35.75,29.86,52.9,29.86,29.43,"answer prompt: ""Please directly give the best option:"""
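The result.csv change appends the `Notes` field as a final, double-quoted column on every row, so the file should be read with a proper CSV parser rather than a plain `split(',')`. A small sanity check, assuming the path and column names shown above:

```python
import pandas as pd

# Quick check on the updated leaderboard file.
df = pd.read_csv("./file/result.csv")  # the quoted Notes field (it contains commas) is parsed correctly
print(df.shape)                        # expected (6, 29) after this commit: 6 models, 28 earlier columns plus Notes
print(df[["Model", "Avg. All", "Notes"]]
      .sort_values(by="Avg. All", ascending=False)
      .to_string(index=False))
```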