lyx97 committed
Commit ce8627b
1 Parent(s): 6de388e
Files changed (3)
  1. app.py +11 -3
  2. constants.py +36 -32
  3. file/result.csv +7 -6
app.py CHANGED

@@ -30,6 +30,7 @@ def add_new_eval(
     model_link: str,
     model_type: str,
     model_size: str,
+    notes: str,
 ):
     if input_file is None:
         return "Error! Empty file!"
@@ -94,6 +95,7 @@ def add_new_eval(
         input_data[22],
         input_data[23],
         input_data[24],
+        notes,
     ]
     print(len(new_data), col)
     print(csv_data.loc[col-1])
@@ -156,6 +158,8 @@ with block:
         updated_data = get_all_df()[present_columns]
         updated_data = updated_data.sort_values(by=present_columns[1], ascending=False)
         updated_headers = present_columns
+        print(updated_headers)
+        print([COLUMN_NAMES.index(x) for x in updated_headers])
         update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]

         filter_component = gr.components.Dataframe(
@@ -190,13 +194,13 @@ with block:
     with gr.Row():
         with gr.Column():
             model_name_textbox = gr.Textbox(
-                label="Model name", placeholder="Chat-UniVi-7B"
+                label="Model name", placeholder="Video-LLaVA"
             )
             revision_name_textbox = gr.Textbox(
-                label="Revision Model Name", placeholder="Chat-UniVi-7B"
+                label="Revision Model Name", placeholder="Video-LLaVA"
             )
             model_link = gr.Textbox(
-                label="Model Link", placeholder="https://github.com/PKU-YuanGroup/Chat-UniVi"
+                label="Model Link", placeholder="https://huggingface.co/LanguageBind/Video-LLaVA-7B"
             )
             model_type = gr.Dropdown(
                 choices=[
@@ -213,6 +217,9 @@ with block:
             model_size = gr.Textbox(
                 label="Model size", placeholder="7B(Input content format must be 'number+B' or '-', default is '-')"
             )
+            notes = gr.Textbox(
+                label="Notes", placeholder="Other details of the model or evaluation, e.g., which answer prompt is used."
+            )

         with gr.Column():

@@ -229,6 +236,7 @@ with block:
             model_link,
             model_type,
             model_size,
+            notes,
         ],
         # outputs = submission_result,
     )
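The pattern behind this change: declare one more `gr.Textbox` in the submission form, add it to the `inputs` list of the submit button's `click` handler, and append its value to the row written into the results CSV. Below is a minimal, self-contained sketch of that wiring; the handler name `add_row` and the file `demo.csv` are illustrative placeholders, not the Space's actual code.

```python
# Minimal sketch of threading a new "Notes" field from a Gradio form into a CSV row.
# add_row and demo.csv are placeholders, not the Space's actual code.
import gradio as gr
import pandas as pd

CSV_PATH = "demo.csv"  # placeholder results file with columns: Model, Model Size, Notes

def add_row(model_name: str, model_size: str, notes: str) -> str:
    if not model_name:
        return "Error! Empty model name!"
    df = pd.read_csv(CSV_PATH)
    # The new field is simply one more element in the appended row,
    # so the CSV header must gain a matching "Notes" column.
    df.loc[len(df)] = [model_name, model_size, notes]
    df.to_csv(CSV_PATH, index=False)
    return "Submitted!"

with gr.Blocks() as demo:
    model_name = gr.Textbox(label="Model name", placeholder="Video-LLaVA")
    model_size = gr.Textbox(label="Model size", placeholder="7B")
    notes = gr.Textbox(label="Notes", placeholder="e.g., which answer prompt is used.")
    status = gr.Markdown()
    submit = gr.Button("Submit Eval")
    # The extra textbox must appear in `inputs` at the same position as the
    # corresponding parameter of the handler, mirroring the app.py change above.
    submit.click(add_row, inputs=[model_name, model_size, notes], outputs=status)

if __name__ == "__main__":
    pd.DataFrame(columns=["Model", "Model Size", "Notes"]).to_csv(CSV_PATH, index=False)
    demo.launch()
```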
constants.py CHANGED

@@ -6,16 +6,18 @@ TASK_INFO = ["Avg. All", "Avg. Multi-Choice", "Avg. Yes/No", "Avg. Caption Match
              "Direction. Multi-Choice", "Direction. Yes/No", "Direction. Caption Matching", "Direction. Caption Generation",
              "Speed. Multi-Choice", "Speed. Yes/No", "Speed. Caption Matching", "Speed. Caption Generation",
              "Event Order. Multi-Choice", "Event Order. Yes/No", "Event Order. Caption Matching", "Event Order. Caption Generation",
-             "Attribute Change. Multi-Choice", "Attribute Change. Yes/No", "Attribute Change. Caption Matching", "Attribute Change. Caption Generation"]
+             "Attribute Change. Multi-Choice", "Attribute Change. Yes/No", "Attribute Change. Caption Matching", "Attribute Change. Caption Generation",
+             "Notes"]

 AVG_INFO = ["Avg. All", "Avg. Multi-Choice", "Avg. Yes/No", "Avg. Caption Matching", "Avg. Caption Generation"]
-DATA_TITILE_TYPE = ["markdown",
+DATA_TITILE_TYPE = ["markdown", "markdown", "markdown",
                     "number", "number", "number", "number", "number",
                     "number", "number", "number", "number",
                     "number", "number", "number", "number",
                     "number", "number", "number", "number",
                     "number", "number", "number", "number",
-                    "number", "number", "number", "number",]
+                    "number", "number", "number", "number",
+                    "markdown"]
 CSV_DIR = "./file/result.csv"

 # COLUMN_NAMES = MODEL_INFO + TASK_INFO
@@ -30,49 +32,51 @@ TempCompass is a benchmark to evaluate the temporal perception ability of Video
 SUBMIT_INTRODUCTION = """
 # TempCompass Leaderboard

-Welcome to the leaderboard of the Video-Bench! 🏆
+Welcome to the leaderboard of the TempCompass! 🏆

 ## Submit Instruction
-Run inference and automatic evaluation according to our [github repository](https://github.com/llyx97/TempCompass?tab=readme-ov-file#-quick-start).
-
-You will obtain the JSON file `<task_type>.json`, where `<task_type>` correspond to one of the four categories: `multi-choice`, `yes_no`, `caption_matching` and `captioning`. (Example files can be found [here](https://github.com/llyx97/TempCompass/tree/main/auto_eval_results/video-llava))
-
-For `multi-choice`, `yes_no`, `caption_matching`, the evaluation result of each question contains five keys. A specific example is as follows:
-```python
-{
-    "question": "What activity is the monkey engaged in?\\nA. swimming\\nB. running\\nC. climbing\\nD. fighting",
-    "gt-answer": "D. fighting",
-    "video-llm-prediction": "D",
-    "match_success": true,  # whether the video-llm-prediction can be assessed by rule-based matching
-    "rating": 1
-}
-```
-
-For `captioning`, we prompt chatgpt to answer the multi-choice question, using the Video LLM generated caption as context. An example of evalution result is as follows:
-```python
-{
-    "chatgpt-reasoning": "The video description specifically mentions that the man is dribbling a basketball, dunking a basketball, and passing a basketball.",
-    "chatgpt-answer": "B. dribbling a basketball, C. passing a basketball",
-    "video-llm-prediction": "The video showcases a man dribbling a basketball, dunking a basketball, and passing a basketball. The man is seen moving around the court while performing these actions. The video captures the man's movements and the sound of the ball bouncing on the court. The man's dribbling skills are impressive, and he seems to be in control of the ball at all times. The dunking and passing actions are also executed with precision, and the man's movements are fluid and graceful. Overall, the video is a great display of basketball skills and is sure to impress any basketball",
-    "gt-answer": "A. dunking a basketball",
-    "rating": 0
-}
-```
+
+1.Run inference and automatic evaluation according to our [github repository](https://github.com/llyx97/TempCompass?tab=readme-ov-file#-quick-start). You will obtain the JSON file `<task_type>.json`, where `<task_type>` correspond to one of the four categories: `multi-choice`, `yes_no`, `caption_matching` and `captioning`. (Example files can be found [here](https://huggingface.co/spaces/lyx97/TempCompass/tree/main/file/example_eval_results))
+
+2. Merge the four JSON files using [merge_eval_result.py](https://huggingface.co/spaces/lyx97/TempCompass/blob/main/merge_eval_result.py) and obtain `merged_result.json` (an example can be found [here](https://huggingface.co/spaces/lyx97/TempCompass/blob/main/file/example_eval_results/merged_result.json)).


 ### Submit Example
 For example, if you want to submit Video-LLaVA's result in the leaderboard, you need to:
 1. Fill in ‘Video-LLaVA’ in ‘Model Name’ if it is your first time to submit your result (You can leave ‘Revision Model Name’ blank).
 2. Fill in ‘Video-LLaVA’ in ‘Revision Model Name’ if you want to update your result (You can leave ‘Model Name’ blank).
-3. Select ‘ImageLLM’ in ‘Model Type’.
+3. Select ‘VideoLLM’ in ‘Model Type’.
 4. Fill in ‘https://github.com/x/x’ in ‘Model Link’.
 5. Fill in ‘7B’ in ‘Model size’.
-6. Upload `<task_type>.json`.
+6. Upload `merged_result.json`.
 7. Click the ‘Submit Eval’ button.
 8. Click ‘Refresh’ to obtain the uploaded leaderboard.
-
 """

+# """
+# For `multi-choice`, `yes_no`, `caption_matching`, the evaluation result of each question contains five keys. A specific example is as follows:
+# ```python
+# {
+#     "question": "What activity is the monkey engaged in?\\nA. swimming\\nB. running\\nC. climbing\\nD. fighting",
+#     "gt-answer": "D. fighting",
+#     "video-llm-prediction": "D",
+#     "match_success": true,  # whether the video-llm-prediction can be assessed by rule-based matching
+#     "rating": 1
+# }
+# ```
+
+# For `captioning`, we prompt chatgpt to answer the multi-choice question, using the Video LLM generated caption as context. An example of evalution result is as follows:
+# ```python
+# {
+#     "chatgpt-reasoning": "The video description specifically mentions that the man is dribbling a basketball, dunking a basketball, and passing a basketball.",
+#     "chatgpt-answer": "B. dribbling a basketball, C. passing a basketball",
+#     "video-llm-prediction": "The video showcases a man dribbling a basketball, dunking a basketball, and passing a basketball. The man is seen moving around the court while performing these actions. The video captures the man's movements and the sound of the ball bouncing on the court. The man's dribbling skills are impressive, and he seems to be in control of the ball at all times. The dunking and passing actions are also executed with precision, and the man's movements are fluid and graceful. Overall, the video is a great display of basketball skills and is sure to impress any basketball",
+#     "gt-answer": "A. dunking a basketball",
+#     "rating": 0
+# }
+# ```
+# """
+
 TABLE_INTRODUCTION = """In the table below, we summarize each task performance of all the models.
     We use accurancy(%) as the primary evaluation metric for each tasks.
 """
file/result.csv CHANGED
@@ -1,6 +1,7 @@
- Model,Model Type,Model Size,Avg. All,Avg. Multi-Choice,Avg. Yes/No,Avg. Caption Matching,Avg. Caption Generation,Action. Multi-Choice,Action. Yes/No,Action. Caption Matching,Action. Caption Generation,Direction. Multi-Choice,Direction. Yes/No,Direction. Caption Matching,Direction. Caption Generation,Speed. Multi-Choice,Speed. Yes/No,Speed. Caption Matching,Speed. Caption Generation,Event Order. Multi-Choice,Event Order. Yes/No,Event Order. Caption Matching,Event Order. Caption Generation,Attribute Change. Multi-Choice,Attribute Change. Yes/No,Attribute Change. Caption Matching,Attribute Change. Caption Generation
- Random,Others,-,48.31,66.71,33.8,61.53,47.24,18.16,30.12,21.56,64.13,83.28,70.82,72.75,72.49,83.65,65.98,60.6,67.75,39.83,10.06,48.97,73.41,28.69,25.93,90.31,65.94
- [VideoChat-7B](https://github.com/OpenGVLab/Ask-Anything),Video-LLM,7B,26.47,94.12,42.23,55.56,71.9,35.08,86.8,97.23,95.45,91.23,69.17,19.82,45.5,32.3,48.16,31.83,19.13,44.73,20.71,36.68,61.13,87.71,28.19,26.12,16.33
- Gemini,Video-LLM,-,5.1,61.4,65.71,35.03,50.61,12.5,18.74,33.16,8.16,21.18,3.02,37.25,75.82,87.79,31.66,83.32,41.48,47.26,33.73,54.57,31.64,58.51,4.88,55.22,65.75
- llava_phi_2.7,Image-LLM,-,97.64,81.61,39.3,54.9,17.11,33.57,13.78,76.95,90.81,3.07,5.98,14.63,23.62,15.46,88.03,22.58,21.46,88.25,35.72,85.05,58.54,86.19,74.07,57.24,0.9
- ,[],-,49.77,45.57,56.38,63.34,34.83,76.04,74.32,87.88,50.76,35.65,50.28,58.42,23.2,35.22,51.82,53.82,28.67,37.75,49.21,59.0,38.25,40.97,51.12,58.33,33.59
+ Model,Model Type,Model Size,Avg. All,Avg. Multi-Choice,Avg. Yes/No,Avg. Caption Matching,Avg. Caption Generation,Action. Multi-Choice,Action. Yes/No,Action. Caption Matching,Action. Caption Generation,Direction. Multi-Choice,Direction. Yes/No,Direction. Caption Matching,Direction. Caption Generation,Speed. Multi-Choice,Speed. Yes/No,Speed. Caption Matching,Speed. Caption Generation,Event Order. Multi-Choice,Event Order. Yes/No,Event Order. Caption Matching,Event Order. Caption Generation,Attribute Change. Multi-Choice,Attribute Change. Yes/No,Attribute Change. Caption Matching,Attribute Change. Caption Generation,Notes
+ [Video-LLaVA](https://huggingface.co/LanguageBind/Video-LLaVA-7B),VideoLLM,7B,49.77,45.57,56.38,63.34,34.83,76.04,74.32,87.88,50.76,35.65,50.28,58.42,23.2,35.22,51.82,53.82,28.67,37.75,49.21,59.0,38.25,40.97,51.12,58.33,33.59,"answer prompt: ""Please directly give the best option:"""
+ [VideoChat2](https://github.com/OpenGVLab/Ask-Anything/tree/main/video_chat2),VideoLLM,7B,48.81,42.91,58.01,53.69,38.52,76.92,72.8,73.4,54.04,33.44,53.82,49.48,32.73,29.55,53.85,51.68,30.96,35.43,51.31,48.0,34.25,36.81,53.79,45.83,41.41,"answer prompt: ""Please directly give the best option:"""
+ [LLaMA-VID](https://github.com/dvlab-research/LLaMA-VID),VideoLLM,7B,45.61,38.04,52.96,56.02,34.78,61.24,63.01,73.4,53.03,29.65,49.16,51.2,21.91,29.85,48.79,50.46,27.98,33.77,48.43,53.67,35.5,34.03,52.68,51.74,35.94,"answer prompt: ""Please directly give the best option:"""
+ [mPLUG-Owl-video](https://huggingface.co/MAGAer13/mplug-owl-llama-7b-video),VideoLLM,7B,44.15,36.39,54.42,48.5,34.43,55.92,64.36,57.24,46.46,31.55,51.21,43.64,30.41,32.24,50.61,41.9,28.21,27.48,51.31,48.67,31.25,32.99,52.01,51.74,36.46,"answer prompt: ""Please directly give the best option:"""
+ [PandaGPT](https://huggingface.co/openllmplayground/pandagpt_13b_max_len_400),VideoLLM,13B,41.64,34.37,51.81,51.56,27.5,36.69,53.04,49.16,23.74,33.75,50.84,49.48,26.03,34.03,49.6,55.96,25.69,32.12,53.66,54.33,29.75,35.07,52.23,48.26,32.55,"answer prompt: ""Please directly give the best option:"""
+ [Valley2-7B](https://huggingface.co/luoruipu1/Valley2-7b),VideoLLM,7B,37.49,29.56,53.49,34.6,26.35,43.49,58.11,36.7,24.75,24.92,52.51,36.77,21.91,29.55,52.02,33.64,20.41,18.54,50.26,36.0,35.75,29.86,52.9,29.86,29.43,"answer prompt: ""Please directly give the best option:"""
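Because the new Notes values contain commas and embedded double quotes, they are written as quoted CSV fields with doubled inner quotes. A quick sketch, assuming result.csv as committed above, showing that standard CSV parsing recovers the field intact:

```python
# Sanity check (illustrative): the quoted Notes field in result.csv parses back
# into a single string, including the embedded quotes around the answer prompt.
import pandas as pd

df = pd.read_csv("./file/result.csv")
row = df[df["Model"].str.contains("Video-LLaVA", na=False)].iloc[0]
print(row["Notes"])     # answer prompt: "Please directly give the best option:"
print(row["Avg. All"])  # 49.77
```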