BreakLee committed
Commit: c4d90ef
Parent: 6d72d0f

SEED Benchmark Leaderboard Update

__pycache__/constants.cpython-38.pyc ADDED
Binary file (7.6 kB)
 
app.py CHANGED
@@ -126,6 +126,9 @@ def add_new_eval(
         model_type,
         model_name,
         LLM_name,
+        overall_accuracy,
+        average_accuracy_image,
+        average_accuracy_video,
         each_task_accuracy[1],
         each_task_accuracy[2],
         each_task_accuracy[3],
@@ -135,19 +138,25 @@ def add_new_eval(
         each_task_accuracy[7],
         each_task_accuracy[8],
         each_task_accuracy[9],
-        average_accuracy_image,
         each_task_accuracy[10],
         each_task_accuracy[11],
         each_task_accuracy[12],
-        average_accuracy_video,
-        overall_accuracy]
-    # pdb.set_trace()
+        ]
     csv_data.loc[col] = new_data
     csv_data = csv_data.to_csv(CSV_DIR, index=False)
     return 0
 
 def get_baseline_df():
+    # pdb.set_trace()
+    df = pd.read_csv(CSV_DIR)
+    df = df.sort_values(by="Avg. All", ascending=False)
+    present_columns = MODEL_INFO + checkbox_group.value
+    df = df[present_columns]
+    return df
+
+def get_all_df():
     df = pd.read_csv(CSV_DIR)
+    df = df.sort_values(by="Avg. All", ascending=False)
     return df
 
 block = gr.Blocks()
@@ -173,8 +182,8 @@ with block:
 
     # selection for column part:
    checkbox_group = gr.CheckboxGroup(
-        choices=TASK_INFO,
-        value=TASK_INFO,
+        choices=TASK_INFO_v2,
+        value=AVG_INFO,
         label="Select options",
         interactive=True,
     )
@@ -191,9 +200,9 @@ with block:
 
     def on_checkbox_group_change(selected_columns):
         # pdb.set_trace()
-        selected_columns = [item for item in TASK_INFO if item in selected_columns]
+        selected_columns = [item for item in TASK_INFO_v2 if item in selected_columns]
         present_columns = MODEL_INFO + selected_columns
-        updated_data = get_baseline_df()[present_columns]
+        updated_data = get_all_df()[present_columns]
         updated_headers = present_columns
         update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
 
@@ -229,10 +238,10 @@ with block:
     with gr.Row():
         with gr.Column():
             model_name_textbox = gr.Textbox(
-                label="Model Name", placeholder="LLaMA-7B"
+                label="Model name", placeholder="LLaMA-7B"
             )
             revision_name_textbox = gr.Textbox(
-                label="Revision Model Name", placeholder="LLaMA"
+                label="Revision Model Name", placeholder="LLaMA-7B"
             )
             model_type = gr.Dropdown(
                 choices=[
@@ -241,7 +250,7 @@ with block:
                     "VideoLLM",
                     "Other",
                 ],
-                label="Model Type",
+                label="Model type",
                 multiselect=False,
                 value="ImageLLM",
                 interactive=True,
@@ -254,18 +263,18 @@ with block:
 
             LLM_type = gr.Dropdown(
                 choices=["Vicuna-7B", "Flan-T5-XL", "LLaMA-7B", "Other"],
-                label="LLM Type",
+                label="LLM type",
                 multiselect=False,
                 value="LLaMA-7B",
                 interactive=True,
             )
             LLM_name_textbox = gr.Textbox(
-                label="LLM Model (for Other)",
+                label="LLM model (for Other)",
                 placeholder="LLaMA-13B"
             )
             Evaluation_dimension = gr.Dropdown(
                 choices=["All", "Image", "Video"],
-                label="Evaluation Dimension",
+                label="Evaluation dimension",
                 multiselect=False,
                 value="All",
                 interactive=True,
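
Net effect of the app.py changes: the submitted row now stores the three averages right after the language-model column (matching the new CSV layout), the leaderboard is sorted by "Avg. All", and the default view shows only MODEL_INFO + AVG_INFO, with the checkbox callback re-selecting columns from the full frame via get_all_df(). A minimal, self-contained sketch of that filtering pattern, using two sample rows copied from file/result.csv below; this is an illustration, not the app's actual code:

```python
import pandas as pd

# Column groups as defined in constants.py after this commit.
MODEL_INFO = ["Model Type", "Model", "Language Model"]
AVG_INFO = ["Avg. All", "Avg. Img", "Avg. Video"]

# Two sample rows taken from file/result.csv (trimmed to one task column).
df = pd.DataFrame({
    "Model Type": ["VideoLLM", "ImageLLM"],
    "Model": ["VideoChat", "InstructBLIP"],
    "Language Model": ["Vicuna-7B", "Flan-T5-XL"],
    "Avg. All": [37.6, 52.7],
    "Avg. Img": [39.0, 57.8],
    "Avg. Video": [33.7, 38.3],
    "Scene Understanding": [47.1, 60.3],
})

df = df.sort_values(by="Avg. All", ascending=False)  # what get_all_df() now does
default_view = df[MODEL_INFO + AVG_INFO]              # default get_baseline_df() view
selected = ["Avg. All", "Scene Understanding"]        # a CheckboxGroup selection
updated_view = df[MODEL_INFO + selected]              # what on_checkbox_group_change() returns
print(updated_view)
```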
constants.py CHANGED
@@ -1,11 +1,15 @@
 # this is .py for store constants
 MODEL_INFO = ["Model Type", "Model", "Language Model"]
 TASK_INFO = ["Scene Understanding", "Instance Identity", "Instance Attributes", "Instance Localization", "Instance Counting", "Spatial Relation", "Instance Interaction", "Visual Reasoning", "Text Recognition", "Avg. Img", "Action Recognition", "Action Prediction", "Procedure Understanding", "Avg. Video", "Avg. All"]
-AVG_INFO = ["Avg. Img", "Avg. Video", "Avg. All"]
+TASK_INFO_v2 = ["Avg. All", "Avg. Img", "Avg. Video", "Scene Understanding", "Instance Identity", "Instance Attributes", "Instance Localization", "Instance Counting", "Spatial Relation", "Instance Interaction", "Visual Reasoning", "Text Recognition", "Action Recognition", "Action Prediction", "Procedure Understanding"]
+
+AVG_INFO = ["Avg. All", "Avg. Img", "Avg. Video"]
 DATA_TITILE_TYPE = ["markdown", "markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
 CSV_DIR = "./file/result.csv"
 
-COLUMN_NAMES = MODEL_INFO + TASK_INFO
+# COLUMN_NAMES = MODEL_INFO + TASK_INFO
+COLUMN_NAMES = MODEL_INFO + TASK_INFO_v2
+
 DATA_NUM = [3158, 1831, 4649, 978, 2447, 657, 97, 331, 85, 1740, 2077, 1192]
 
 UNTUNED_MODEL_RESULTS = '''LLM & Flan-T5 & Flan-T5-XL &23.0 &29.0 &32.8 &31.8 &20.5 &31.8 &33.0 &18.2 &19.4 &23.2 &34.9 &25.4 \\
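
Since on_checkbox_group_change() in app.py looks up each column's Gradio datatype positionally via DATA_TITILE_TYPE[COLUMN_NAMES.index(x)], the new TASK_INFO_v2 ordering only works while COLUMN_NAMES and DATA_TITILE_TYPE stay the same length, with the text-like model columns first. A small standalone check of that invariant, using the values from this diff (illustrative only, not part of the commit):

```python
# Values copied from constants.py as of this commit.
MODEL_INFO = ["Model Type", "Model", "Language Model"]
TASK_INFO_v2 = ["Avg. All", "Avg. Img", "Avg. Video", "Scene Understanding",
                "Instance Identity", "Instance Attributes", "Instance Localization",
                "Instance Counting", "Spatial Relation", "Instance Interaction",
                "Visual Reasoning", "Text Recognition", "Action Recognition",
                "Action Prediction", "Procedure Understanding"]
DATA_TITILE_TYPE = ["markdown"] * 3 + ["number"] * 15
COLUMN_NAMES = MODEL_INFO + TASK_INFO_v2

# One datatype per column: markdown for the three model columns, number elsewhere.
assert len(COLUMN_NAMES) == len(DATA_TITILE_TYPE) == 18
assert DATA_TITILE_TYPE[COLUMN_NAMES.index("Model")] == "markdown"
assert all(DATA_TITILE_TYPE[COLUMN_NAMES.index(c)] == "number" for c in TASK_INFO_v2)
```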
file/result.csv CHANGED
@@ -1,22 +1,22 @@
-Model Type,Model,Language Model,Scene Understanding,Instance Identity,Instance Attributes,Instance Localization,Instance Counting,Spatial Relation,Instance Interaction,Visual Reasoning,Text Recognition,Avg. Img,Action Recognition,Action Prediction,Procedure Understanding,Avg. Video,Avg. All
-LLM,[Flan-T5](https://huggingface.co/google/flan-t5-xl),Flan-T5-XL,23.0,29.0,32.8,31.8,20.5,31.8,33.0,18.2,19.4,27.3,23.2,34.9,25.4,28.6,27.7
-LLM,[Vicuna](https://huggingface.co/lmsys/vicuna-7b-v1.3),Vicuna-7B,23.4,30.7,29.7,30.9,30.8,28.6,29.8,18.5,13.4,28.2,27.3,34.5,23.8,29.5,28.5
-LLM,[LLaMA](https://research.facebook.com/publications/llama-open-and-efficient-foundation-language-models/),LLaMA-7B,26.3,27.4,26.2,28.3,25.1,28.8,19.2,37.0,9.0,26.6,33.0,23.1,26.2,27.3,26.8
-ImageLLM,[BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,59.1,53.9,49.2,42.3,43.2,36.7,55.7,45.6,25.9,49.7,32.6,47.5,24.0,36.7,46.4
-ImageLLM,[InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,60.3,58.5,63.4,40.6,58.4,38.7,51.6,45.9,25.9,57.8,33.1,49.1,27.1,38.3,52.7
-ImageLLM,[InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,60.2,58.9,65.6,43.6,57.2,40.3,52.6,47.7,43.5,58.8,34.5,49.6,23.1,38.1,53.4
-ImageLLM,[LLaVA](https://github.com/haotian-liu/LLaVA),LLaMA-7B,42.7,34.9,33.5,28.4,41.9,30.8,27.8,46.8,27.7,37.0,29.7,21.4,19.1,23.8,33.5
-ImageLLM,[MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Flan-T5-XL,56.3,49.2,45.8,37.9,45.3,32.6,47.4,57.1,11.8,47.4,38.2,24.5,27.1,29.9,42.8
-ImageLLM,[VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,51.9,44.1,39.9,36.1,33.7,36.4,32.0,53.2,30.6,41.8,39.5,24.3,31.9,31.4,39.1
-ImageLLM,[MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,43.6,37.9,31.5,30.8,27.3,30.1,29.9,51.4,18.8,34.5,36.9,25.8,24.0,29.2,33.2
-ImageLLM,[Otter](https://github.com/Luodian/Otter),LLaMA-7B,44.9,38.6,32.2,30.9,26.3,31.8,32.0,51.4,31.8,35.2,37.9,27.2,24.8,30.4,33.9
-ImageLLM,[Otter](https://github.com/Luodian/Otter),MPT-7B,51.3,43.5,42.3,34.2,38.4,30.9,40.2,55.3,24.7,42.9,36.8,29.2,23.8,30.6,39.7
-ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,43.9,38.1,31.3,30.1,27.3,30.6,29.9,50.2,20.0,34.5,37.2,25.4,24.2,29.3,33.1
-ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),MPT-7B,53.2,45.3,40.0,31.2,39.3,32.6,36.1,51.4,25.9,42.7,42.9,34.7,26.9,35.7,40.9
-ImageLLM,[LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,45.2,38.5,29.3,33.0,29.7,35.5,39.2,52.0,24.7,35.2,38.6,18.5,19.6,25.8,32.7
-ImageLLM,[GVT](https://github.com/TencentARC/GVT),Vicuna-7B,41.7,35.5,31.8,29.5,36.2,32.0,32.0,51.1,27.1,35.5,33.9,25.4,23.0,27.8,33.5
-ImageLLM,[mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,49.7,45.3,32.5,36.7,27.3,32.7,44.3,54.7,28.8,37.9,26.7,17.9,26.5,23.0,34.0
-ImageLLM,[Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder Only 1.3B,63.4,57.1,58.5,44.0,41.4,37.9,55.7,60.7,25.9,54.4,41.3,40.4,27.0,37.5,50.0
-VideoLLM,[VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,47.1,43.8,34.9,40.0,32.8,34.6,42.3,50.5,17.7,39.0,34.9,36.4,27.3,33.7,37.6
-VideoLLM,[Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,37.2,31.4,33.2,28.4,35.5,29.5,23.7,42.3,25.9,33.9,27.6,21.3,21.1,23.5,31.2
-VideoLLM,[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,39.3,32.9,31.6,27.9,24.2,30.1,27.8,43.8,11.8,32.0,31.3,23.2,20.7,25.4,30.3
+Model Type,Model,Language Model,Avg. All,Avg. Img,Avg. Video,Scene Understanding,Instance Identity,Instance Attributes,Instance Localization,Instance Counting,Spatial Relation,Instance Interaction,Visual Reasoning,Text Recognition,Action Recognition,Action Prediction,Procedure Understanding
+LLM,[Flan-T5](https://huggingface.co/google/flan-t5-xl),Flan-T5-XL,27.7,27.3,28.6,23.0,29.0,32.8,31.8,20.5,31.8,33.0,18.2,19.4,23.2,34.9,25.4
+LLM,[Vicuna](https://huggingface.co/lmsys/vicuna-7b-v1.3),Vicuna-7B,28.5,28.2,29.5,23.4,30.7,29.7,30.9,30.8,28.6,29.8,18.5,13.4,27.3,34.5,23.8
+LLM,[LLaMA](https://research.facebook.com/publications/llama-open-and-efficient-foundation-language-models/),LLaMA-7B,26.8,26.6,27.3,26.3,27.4,26.2,28.3,25.1,28.8,19.2,37.0,9.0,33.0,23.1,26.2
+ImageLLM,[BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,46.4,49.7,36.7,59.1,53.9,49.2,42.3,43.2,36.7,55.7,45.6,25.9,32.6,47.5,24.0
+ImageLLM,[InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,52.7,57.8,38.3,60.3,58.5,63.4,40.6,58.4,38.7,51.6,45.9,25.9,33.1,49.1,27.1
+ImageLLM,[InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,53.4,58.8,38.1,60.2,58.9,65.6,43.6,57.2,40.3,52.6,47.7,43.5,34.5,49.6,23.1
+ImageLLM,[LLaVA](https://github.com/haotian-liu/LLaVA),LLaMA-7B,33.5,37.0,23.8,42.7,34.9,33.5,28.4,41.9,30.8,27.8,46.8,27.7,29.7,21.4,19.1
+ImageLLM,[MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Flan-T5-XL,42.8,47.4,29.9,56.3,49.2,45.8,37.9,45.3,32.6,47.4,57.1,11.8,38.2,24.5,27.1
+ImageLLM,[VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,39.1,41.8,31.4,51.9,44.1,39.9,36.1,33.7,36.4,32.0,53.2,30.6,39.5,24.3,31.9
+ImageLLM,[MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,33.2,34.5,29.2,43.6,37.9,31.5,30.8,27.3,30.1,29.9,51.4,18.8,36.9,25.8,24.0
+ImageLLM,[Otter](https://github.com/Luodian/Otter),LLaMA-7B,33.9,35.2,30.4,44.9,38.6,32.2,30.9,26.3,31.8,32.0,51.4,31.8,37.9,27.2,24.8
+ImageLLM,[Otter](https://github.com/Luodian/Otter),MPT-7B,39.7,42.9,30.6,51.3,43.5,42.3,34.2,38.4,30.9,40.2,55.3,24.7,36.8,29.2,23.8
+ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,33.1,34.5,29.3,43.9,38.1,31.3,30.1,27.3,30.6,29.9,50.2,20.0,37.2,25.4,24.2
+ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),MPT-7B,40.9,42.7,35.7,53.2,45.3,40.0,31.2,39.3,32.6,36.1,51.4,25.9,42.9,34.7,26.9
+ImageLLM,[LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,32.7,35.2,25.8,45.2,38.5,29.3,33.0,29.7,35.5,39.2,52.0,24.7,38.6,18.5,19.6
+ImageLLM,[GVT](https://github.com/TencentARC/GVT),Vicuna-7B,33.5,35.5,27.8,41.7,35.5,31.8,29.5,36.2,32.0,32.0,51.1,27.1,33.9,25.4,23.0
+ImageLLM,[mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,34.0,37.9,23.0,49.7,45.3,32.5,36.7,27.3,32.7,44.3,54.7,28.8,26.7,17.9,26.5
+ImageLLM,[Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder Only 1.3B,50.0,54.4,37.5,63.4,57.1,58.5,44.0,41.4,37.9,55.7,60.7,25.9,41.3,40.4,27.0
+VideoLLM,[VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,37.6,39.0,33.7,47.1,43.8,34.9,40.0,32.8,34.6,42.3,50.5,17.7,34.9,36.4,27.3
+VideoLLM,[Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,31.2,33.9,23.5,37.2,31.4,33.2,28.4,35.5,29.5,23.7,42.3,25.9,27.6,21.3,21.1
+VideoLLM,[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,30.3,32.0,25.4,39.3,32.9,31.6,27.9,24.2,30.1,27.8,43.8,11.8,31.3,23.2,20.7
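
file/result.csv was regenerated in the new column order: the three averages immediately after the model columns, then the nine image tasks, then the three video tasks. A quick illustrative check that the rewritten file lines up with COLUMN_NAMES and that sorting by "Avg. All" puts the InstructBLIP-Vicuna row on top, assuming the file is read from the CSV_DIR path in constants.py (not part of the commit):

```python
import pandas as pd

# Path taken from CSV_DIR in constants.py; adjust if running outside the Space.
df = pd.read_csv("./file/result.csv")

expected = ["Model Type", "Model", "Language Model",
            "Avg. All", "Avg. Img", "Avg. Video",
            "Scene Understanding", "Instance Identity", "Instance Attributes",
            "Instance Localization", "Instance Counting", "Spatial Relation",
            "Instance Interaction", "Visual Reasoning", "Text Recognition",
            "Action Recognition", "Action Prediction", "Procedure Understanding"]
assert list(df.columns) == expected

top = df.sort_values(by="Avg. All", ascending=False).iloc[0]
print(top["Model"], top["Avg. All"])  # expect the InstructBLIP-Vicuna row, 53.4
```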
src/__pycache__/utils_display.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/utils_display.cpython-38.pyc and b/src/__pycache__/utils_display.cpython-38.pyc differ
 
src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc CHANGED
Binary files a/src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc and b/src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc differ