Commit 94bd921
Parent(s): 730f0f9

feat: adapt to MMIE

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
- .gitattributes +0 -1
- README.md +2 -10
- app.py +112 -145
- evals/.gitattributes +0 -55
- evals/README.md +0 -6
- evals/mjbench-results/detailed-results/AestheticsPredictor.json +0 -47
- evals/mjbench-results/detailed-results/BLIP-v2.json +0 -47
- evals/mjbench-results/detailed-results/CLIP-v2.json +0 -47
- evals/mjbench-results/detailed-results/Claude 3 Opus.json +0 -47
- evals/mjbench-results/detailed-results/GPT-4-vision.json +0 -47
- evals/mjbench-results/detailed-results/GPT-4o.json +0 -47
- evals/mjbench-results/detailed-results/Gemini Ultra.json +0 -47
- evals/mjbench-results/detailed-results/HPS-v2.1.json +0 -47
- evals/mjbench-results/detailed-results/Idefics2-8b.json +0 -47
- evals/mjbench-results/detailed-results/ImageReward.json +0 -47
- evals/mjbench-results/detailed-results/Instructblip-7b.json +0 -47
- evals/mjbench-results/detailed-results/InternVL-Chat-V1-5.json +0 -47
- evals/mjbench-results/detailed-results/LLaVA-1.5-13b.json +0 -47
- evals/mjbench-results/detailed-results/LLaVA-1.5-7b.json +0 -47
- evals/mjbench-results/detailed-results/LLaVA-NeXT-mistral-7b.json +0 -47
- evals/mjbench-results/detailed-results/LLaVA-NeXT-vicuna-13b.json +0 -35
- evals/mjbench-results/detailed-results/MiniGPT4-v2.json +0 -47
- evals/mjbench-results/detailed-results/PickScore-v1.json +0 -47
- evals/mjbench-results/detailed-results/Prometheus-Vision-13b.json +0 -47
- evals/mjbench-results/detailed-results/Prometheus-Vision-7b.json +0 -47
- evals/mjbench-results/detailed-results/Qwen-VL-Chat.json +0 -47
- evals/mjbench-results/overall-results/AestheticsPredictor.json +0 -12
- evals/mjbench-results/overall-results/BLIP-v2.json +0 -12
- evals/mjbench-results/overall-results/CLIP-v2.json +0 -12
- evals/mjbench-results/overall-results/Claude 3 Opus.json +0 -12
- evals/mjbench-results/overall-results/GPT-4-vision.json +0 -12
- evals/mjbench-results/overall-results/GPT-4o.json +0 -12
- evals/mjbench-results/overall-results/Gemini Ultra.json +0 -12
- evals/mjbench-results/overall-results/HPS-v2.1.json +0 -12
- evals/mjbench-results/overall-results/Idefics2-8b.json +0 -12
- evals/mjbench-results/overall-results/ImageReward.json +0 -12
- evals/mjbench-results/overall-results/Instructblip-7b.json +0 -12
- evals/mjbench-results/overall-results/InternVL-Chat-V1-5.json +0 -12
- evals/mjbench-results/overall-results/LLaVA-1.5-13b.json +0 -12
- evals/mjbench-results/overall-results/LLaVA-1.5-7b.json +0 -12
- evals/mjbench-results/overall-results/LLaVA-NeXT-mistral-7b.json +0 -12
- evals/mjbench-results/overall-results/LLaVA-NeXT-vicuna-13b.json +0 -12
- evals/mjbench-results/overall-results/MiniGPT4-v2.json +0 -12
- evals/mjbench-results/overall-results/PickScore-v1.json +0 -12
- evals/mjbench-results/overall-results/Prometheus-Vision-13b.json +0 -12
- evals/mjbench-results/overall-results/Prometheus-Vision-7b.json +0 -12
- evals/mjbench-results/overall-results/Qwen-VL-Chat.json +0 -12
- src/about.py +4 -5
- src/envs.py +3 -3
- src/logo.png +0 -0
.gitattributes CHANGED
@@ -33,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
-mj-bench-logo.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: MMIE Leaderboard
 emoji: 🥇
 colorFrom: green
 colorTo: indigo
@@ -45,13 +45,5 @@ You'll find
 
 ## Citation
 ```
-
-    title={MJ-Bench: Is Your Multimodal Reward Model Really a Good Judge for Text-to-Image Generation?},
-    author={Zhaorun Chen and Yichao Du and Zichen Wen and Yiyang Zhou and Chenhang Cui and Zhenzhen Weng and Haoqin Tu and Chaoqi Wang and Zhengwei Tong and Qinglan Huang and Canyu Chen and Qinghao Ye and Zhihong Zhu and Yuqing Zhang and Jiawei Zhou and Zhuokai Zhao and Rafael Rafailov and Chelsea Finn and Huaxiu Yao},
-    year={2024},
-    eprint={2407.04842},
-    archivePrefix={arXiv},
-    primaryClass={cs.CV},
-    url={https://arxiv.org/abs/2407.04842},
-}
+
 ```
app.py CHANGED
@@ -7,7 +7,6 @@ import numpy as np
 from pathlib import Path
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
-from datasets import load_dataset
 
 
 from src.about import (
@@ -20,19 +19,19 @@ from src.about import (
     ABOUT_TEXT
 )
 from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    NUMERIC_INTERVALS,
-    TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
+# from src.display.utils import (
+#     BENCHMARK_COLS,
+#     COLS,
+#     EVAL_COLS,
+#     EVAL_TYPES,
+#     NUMERIC_INTERVALS,
+#     TYPES,
+#     AutoEvalColumn,
+#     ModelType,
+#     fields,
+#     WeightType,
+#     Precision
+# )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 
 try:
@@ -76,7 +75,7 @@ PERSPECTIVE_COUNTS= {
 
 
 
-META_DATA = ['Model'
+META_DATA = ['Model']
 
 
 
@@ -84,36 +83,36 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
 
-color_map = {
-    "Score Model": "#7497db",
-    "Opensource VLM": "#E8ECF2",
-    "Closesource VLM": "#ffcd75",
-    "Others": "#75809c",
-    # #7497db #E8ECF2 #ffcd75 #75809c
-}
-def color_model_type_column(df, color_map):
-    """
-    Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
-
-    Parameters:
-    df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
-    color_map (dict): A dictionary mapping model types to colors.
-
-    Returns:
-    pd.Styler: The styled DataFrame.
-    """
-    # Function to apply color based on the model type
-    def apply_color(val):
-        color = color_map.get(val, "default")  # Default color if not specified in color_map
-        return f'background-color: {color}'
+# color_map = {
+#     "Score Model": "#7497db",
+#     "Opensource VLM": "#E8ECF2",
+#     "Closesource VLM": "#ffcd75",
+#     "Others": "#75809c",
+#     # #7497db #E8ECF2 #ffcd75 #75809c
+# }
+# def color_model_type_column(df, color_map):
+#     """
+#     Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
+
+#     Parameters:
+#     df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
+#     color_map (dict): A dictionary mapping model types to colors.
+
+#     Returns:
+#     pd.Styler: The styled DataFrame.
+#     """
+#     # Function to apply color based on the model type
+#     def apply_color(val):
+#         color = color_map.get(val, "default")  # Default color if not specified in color_map
+#         return f'background-color: {color}'
 
-    # Format for different columns
-    format_dict = {col: "{:.1f}" for col in df.columns if col not in META_DATA}
-    format_dict['Overall Score'] = "{:.2f}"
-    format_dict[''] = "{:d}"
+#     # Format for different columns
+#     format_dict = {col: "{:.1f}" for col in df.columns if col not in META_DATA}
+#     format_dict['Overall Score'] = "{:.2f}"
+#     format_dict[''] = "{:d}"
 
-    return df.style.applymap(apply_color, subset=['Model Type']).format(format_dict, na_rep='')
+#     return df.style.applymap(apply_color, subset=['Model Type']).format(format_dict, na_rep='')
 
 def regex_table(dataframe, regex, filter_button, style=True):
     """
@@ -127,14 +126,10 @@ def regex_table(dataframe, regex, filter_button, style=True):
     # if filter_button, remove all rows with "ai2" in the model name
     update_scores = False
     if isinstance(filter_button, list) or isinstance(filter_button, str):
-        if "Score Model" not in filter_button:
-            dataframe = dataframe[~dataframe["Model Type"].str.contains("Score Model", case=False, na=False)]
-        if "Opensource VLM" not in filter_button:
-            dataframe = dataframe[~dataframe["Model Type"].str.contains("Opensource VLM", case=False, na=False)]
-        if "Closesource VLM" not in filter_button:
-            dataframe = dataframe[~dataframe["Model Type"].str.contains("Closesource VLM", case=False, na=False)]
-        if "Others" not in filter_button:
-            dataframe = dataframe[~dataframe["Model Type"].str.contains("Others", case=False, na=False)]
+        if "Integrated LVLM" not in filter_button:
+            dataframe = dataframe[~dataframe["Model Type"].str.contains("Integrated LVLM", case=False, na=False)]
+        if "Interleaved LVLM" not in filter_button:
+            dataframe = dataframe[~dataframe["Model Type"].str.contains("Interleaved LVLM", case=False, na=False)]
     # Filter the dataframe such that 'model' contains any of the regex patterns
     data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
 
@@ -143,9 +138,9 @@ def regex_table(dataframe, regex, filter_button, style=True):
     # replace column '' with count/rank
     data.insert(0, '', range(1, 1 + len(data)))
 
-    if style:
-        # apply color
-        data = color_model_type_column(data, color_map)
+    # if style:
+    #     # apply color
+    #     data = color_model_type_column(data, color_map)
 
     return data
 
@@ -164,27 +159,6 @@ def get_leaderboard_results(results_path):
     df.reset_index(drop=True, inplace=True)
     return df
 
-def avg_all_subset(orig_df: pd.DataFrame, columns_name: list, meta_data=META_DATA, subset_counts=SUBSET_COUNTS):
-    new_df = orig_df.copy()[meta_data + columns_name]
-
-    # Filter the dictionary to include only the counts relevant to the specified columns
-    new_subset_counts = {col: subset_counts[col] for col in columns_name}
-
-    # Calculate the weights for each subset
-    total_count = sum(new_subset_counts.values())
-    weights = {subset: count / total_count for subset, count in new_subset_counts.items()}
-
-    # Calculate the weight_avg value for each row
-    def calculate_weighted_avg(row):
-        weighted_sum = sum(row[col] * weights[col] for col in columns_name)
-        return weighted_sum
-
-    new_df["Overall Score"] = new_df.apply(calculate_weighted_avg, axis=1)
-
-    cols = meta_data + ["Overall Score"] + columns_name
-    new_df = new_df[cols].sort_values(by="Overall Score", ascending=False).reset_index(drop=True)
-    return new_df
-
 
 def avg_all_perspective(orig_df: pd.DataFrame, columns_name: list, meta_data=META_DATA, perspective_counts=PERSPECTIVE_COUNTS):
     new_df = orig_df[meta_data + columns_name]
@@ -200,28 +174,63 @@ def avg_all_perspective(orig_df: pd.DataFrame, columns_name: list, meta_data=MET
     new_df = new_df[cols].sort_values(by="Overall Score", ascending=False).reset_index(drop=True)
     return new_df
 
-results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/detailed-results")
-orig_df = get_leaderboard_results(results_path)
-colmuns_name = list(SUBSET_COUNTS.keys())
-detailed_df = avg_all_subset(orig_df, colmuns_name).round(2)
-
-results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/overall-results")
-orig_df = get_leaderboard_results(results_path)
-colmuns_name = list(PERSPECTIVE_COUNTS.keys())
-perspective_df = avg_all_perspective(orig_df, colmuns_name).round(2)
-
-total_models = len(detailed_df)
+data = {
+    "Model": [
+        "MiniGPT-5", "EMU-2", "GILL", "Anole",
+        "GPT-4o - Openjourney", "GPT-4o - SD-3", "GPT-4o - SD-XL", "GPT-4o - Flux",
+        "Gemini-1.5 - Openjourney", "Gemini-1.5 - SD-3", "Gemini-1.5 - SD-XL", "Gemini-1.5 - Flux",
+        "LLAVA-34b - Openjourney", "LLAVA-34b - SD-3", "LLAVA-34b - SD-XL", "LLAVA-34b - Flux",
+        "Qwen-VL-70b - Openjourney", "Qwen-VL-70b - SD-3", "Qwen-VL-70b - SD-XL", "Qwen-VL-70b - Flux"
+    ],
+    "Model Type":[
+        "Interleaved LVLM", "Interleaved LVLM", "Interleaved LVLM", "Interleaved LVLM",
+        "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+        "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+        "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+        "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+    ],
+    "Situational analysis": [
+        47.63, 39.65, 46.72, 48.95,
+        53.05, 53.00, 56.12, 54.97,
+        48.08, 47.48, 49.43, 47.07,
+        54.12, 54.72, 55.97, 54.23,
+        52.73, 54.98, 52.58, 54.23
+    ],
+    "Project-based learning": [
+        55.12, 46.12, 57.57, 59.05,
+        71.40, 71.20, 73.25, 68.80,
+        67.93, 68.70, 71.85, 68.33,
+        73.47, 72.55, 74.60, 71.32,
+        71.63, 71.87, 73.57, 69.47
+    ],
+    "Multi-step reasoning": [
+        42.17, 50.75, 39.33, 51.72,
+        53.67, 53.67, 53.67, 53.67,
+        60.05, 60.05, 60.05, 60.05,
+        47.28, 47.28, 47.28, 47.28,
+        55.63, 55.63, 55.63, 55.63
+    ],
+    "AVG": [
+        50.92, 45.33, 51.58, 55.22,
+        63.65, 63.52, 65.47, 62.63,
+        61.57, 61.87, 64.15, 61.55,
+        63.93, 63.57, 65.05, 62.73,
+        64.05, 64.75, 65.12, 63.18
+    ]
+}
+df = pd.DataFrame(data)
+total_models = len(df)
 with gr.Blocks(css=custom_css) as app:
     with gr.Row():
         with gr.Column(scale=6):
             gr.Markdown(INTRODUCTION_TEXT.format(str(total_models)))
         with gr.Column(scale=4):
-            gr.Markdown("![](https://huggingface.co/spaces/
+            gr.Markdown("![](https://huggingface.co/spaces/MMIE/Leaderboard/resolve/main/src/logo.png)")
             # gr.HTML(BGB_LOGO, elem_classes="logo")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏆
+        with gr.TabItem("🏆 MMIE Leaderboard"):
             with gr.Row():
                 search_overall = gr.Textbox(
                     label="Model Search (delimit with , )",
@@ -229,88 +238,46 @@
                     show_label=False
                 )
                 model_type_overall = gr.CheckboxGroup(
-                    choices=["Score Model", "Opensource VLM", "Closesource VLM", "Others"],
-                    value=["Score Model", "Opensource VLM", "Closesource VLM", "Others"],
-                    label="Model Types",
+                    choices=["Interleaved LVLM", "Integrated LVLM"],
+                    value=["Interleaved LVLM", "Integrated LVLM"],
+                    label="Model Type",
                     show_label=False,
                     interactive=True,
                 )
             with gr.Row():
-                mjbench_table_overall_hidden = gr.Dataframe(
-                    perspective_df,
-                    headers=perspective_df.columns.tolist(),
-                    elem_id="mjbench_leadboard_overall_hidden",
+                mmie_table_overall_hidden = gr.Dataframe(
+                    df,
+                    headers=df.columns.tolist(),
+                    elem_id="mmie_leadboard_overall_hidden",
                     wrap=True,
                     visible=False,
                 )
-                mjbench_table_overall = gr.Dataframe(
+                mmie_table_overall = gr.Dataframe(
                     regex_table(
-                        perspective_df.copy(),
+                        df.copy(),
                         "",
-                        ["Score Model", "Opensource VLM", "Closesource VLM", "Others"]
+                        ["Interleaved LVLM", "Integrated LVLM"]
                     ),
-                    headers=perspective_df.columns.tolist(),
-                    elem_id="mjbench_leadboard_overall",
+                    headers=df.columns.tolist(),
+                    elem_id="mmie_leadboard_overall",
                     wrap=True,
                     height=1000,
                 )
-        # with gr.TabItem("🔍 MJ-Bench Detailed Results"):
-        #     with gr.Row():
-        #         search_detail = gr.Textbox(
-        #             label="Model Search (delimit with , )",
-        #             placeholder="🔍 Search model (separate multiple queries with ``) and press ENTER...",
-        #             show_label=False
-        #         )
-        #         model_type_detail = gr.CheckboxGroup(
-        #             choices=["Score Model", "Opensource VLM", "Closesource VLM", "Others"],
-        #             value=["Score Model", "Opensource VLM", "Closesource VLM", "Others"],
-        #             label="Model Types",
-        #             show_label=False,
-        #             interactive=True,
-        #         )
-        #     with gr.Row():
-        #         mjbench_table_detail_hidden = gr.Dataframe(
-        #             detailed_df,
-        #             headers=detailed_df.columns.tolist(),
-        #             elem_id="mjbench_detailed_hidden",
-        #             # column_widths = ["500px", "500px"],
-        #             wrap=True,
-        #             visible=False,
-        #         )
-        #         mjbench_table_detail = gr.Dataframe(
-        #             regex_table(
-        #                 detailed_df.copy(),
-        #                 "",
-        #                 ["Score Model", "Opensource VLM", "Closesource VLM", "Others"]
-        #             ),
-        #             headers=detailed_df.columns.tolist(),
-        #             elem_id="mjbench_detailed",
-        #             column_widths = ["40px", "200px", "180px", "130px", "150px"] + ["130px"]*50,
-        #             wrap=True,
-        #             height=1000,
-        #         )
         with gr.TabItem("About"):
             with gr.Row():
                 gr.Markdown(ABOUT_TEXT)
 
     with gr.Accordion("📚 Citation", open=False):
         citation_button = gr.Textbox(
-            value=r"""
-            title={MJ-BENCH: Is Your Multimodal Reward Model Really a Good Judge?},
-            author={Chen*, Zhaorun and Du*, Yichao and Wen, Zichen and Zhou, Yiyang and Cui, Chenhang and Weng, Zhenzhen and Tu, Haoqin and Wang, Chaoqi and Tong, Zhengwei and HUANG, Leria and Chen, Canyu and Ye Qinghao and Zhu, Zhihong and Zhang, Yuqing and Zhou, Jiawei and Zhao, Zhuokai and Rafailov, Rafael and Finn, Chelsea and Yao, Huaxiu},
-            year={2024}
-            }""",
+            value=r"""""",
            lines=7,
             label="Copy the following to cite these results.",
             elem_id="citation-button",
             show_copy_button=True,
         )
 
-    search_overall.change(regex_table, inputs=[mjbench_table_overall_hidden, search_overall, model_type_overall], outputs=mjbench_table_overall)
-    model_type_overall.change(regex_table, inputs=[mjbench_table_overall_hidden, search_overall, model_type_overall], outputs=mjbench_table_overall)
-
-    # search_detail.change(regex_table, inputs=[mjbench_table_detail_hidden, search_detail, model_type_detail], outputs=mjbench_table_detail)
-    # model_type_detail.change(regex_table, inputs=[mjbench_table_detail_hidden, search_detail, model_type_detail], outputs=mjbench_table_detail)
+    search_overall.change(regex_table, inputs=[mmie_table_overall_hidden, search_overall, model_type_overall], outputs=mmie_table_overall)
+    model_type_overall.change(regex_table, inputs=[mmie_table_overall_hidden, search_overall, model_type_overall], outputs=mmie_table_overall)
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=18000) # restarted every 3h
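A note on the scoring logic this commit touches: the deleted avg_all_subset (and the retained avg_all_perspective, whose body the diff view truncates) computes a count-weighted overall score. Below is a minimal runnable sketch mirroring the deleted function; the subset names and counts are illustrative stand-ins, not the Space's actual SUBSET_COUNTS or PERSPECTIVE_COUNTS:

import pandas as pd

META_DATA = ["Model"]  # stand-in for the commit's META_DATA
SUBSET_COUNTS = {"Situational analysis": 100, "Project-based learning": 80}  # made-up counts

def avg_all_subset(orig_df, columns_name, meta_data=META_DATA, subset_counts=SUBSET_COUNTS):
    new_df = orig_df[meta_data + columns_name].copy()
    # Each subset's weight is its share of the total example count.
    new_subset_counts = {col: subset_counts[col] for col in columns_name}
    total_count = sum(new_subset_counts.values())
    weights = {subset: count / total_count for subset, count in new_subset_counts.items()}
    new_df["Overall Score"] = new_df.apply(
        lambda row: sum(row[col] * weights[col] for col in columns_name), axis=1
    )
    cols = meta_data + ["Overall Score"] + columns_name
    return new_df[cols].sort_values(by="Overall Score", ascending=False).reset_index(drop=True)

scores = pd.DataFrame({
    "Model": ["A", "B"],
    "Situational analysis": [50.0, 60.0],
    "Project-based learning": [70.0, 40.0],
})
print(avg_all_subset(scores, ["Situational analysis", "Project-based learning"]))
# A: (50*100 + 70*80) / 180 ≈ 58.89;  B: (60*100 + 40*80) / 180 ≈ 51.11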
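As for how the UI drives filtering after this commit: the visible table is re-rendered from a hidden, unfiltered copy whenever the search box or the Model Type checkboxes change, with both passed through regex_table. Here is a small self-contained sketch of that behavior; the comma-to-OR regex construction is an assumption inferred from the "delimit with , " search label, since the line defining combined_regex is not shown in the diff:

import pandas as pd

# Two illustrative rows from the hardcoded MMIE table above.
df = pd.DataFrame({
    "Model": ["MiniGPT-5", "GPT-4o - SD-XL"],
    "Model Type": ["Interleaved LVLM", "Integrated LVLM"],
    "AVG": [50.92, 65.47],
})

def regex_table(dataframe, regex, filter_button):
    # Assumption: comma-delimited queries are OR-ed into a single regex.
    combined_regex = "|".join(q.strip() for q in regex.split(","))
    if isinstance(filter_button, (list, str)):
        # Drop every model type whose checkbox is unticked, as in the commit.
        for model_type in ("Integrated LVLM", "Interleaved LVLM"):
            if model_type not in filter_button:
                dataframe = dataframe[~dataframe["Model Type"].str.contains(model_type, case=False, na=False)]
    data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)].copy()
    data.insert(0, "", range(1, 1 + len(data)))  # rank column
    return data

print(regex_table(df.copy(), "gpt", ["Integrated LVLM"]))  # -> only the GPT-4o row, ranked 1

Keeping a hidden gr.Dataframe of the full table and recomputing the visible one inside the .change() handlers is what lets the two controls compose: each event re-filters from the pristine dataframe rather than from the previously filtered view.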
evals/.gitattributes DELETED
@@ -1,55 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.lz4 filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
-# Audio files - uncompressed
-*.pcm filter=lfs diff=lfs merge=lfs -text
-*.sam filter=lfs diff=lfs merge=lfs -text
-*.raw filter=lfs diff=lfs merge=lfs -text
-# Audio files - compressed
-*.aac filter=lfs diff=lfs merge=lfs -text
-*.flac filter=lfs diff=lfs merge=lfs -text
-*.mp3 filter=lfs diff=lfs merge=lfs -text
-*.ogg filter=lfs diff=lfs merge=lfs -text
-*.wav filter=lfs diff=lfs merge=lfs -text
-# Image files - uncompressed
-*.bmp filter=lfs diff=lfs merge=lfs -text
-*.gif filter=lfs diff=lfs merge=lfs -text
-*.png filter=lfs diff=lfs merge=lfs -text
-*.tiff filter=lfs diff=lfs merge=lfs -text
-# Image files - compressed
-*.jpg filter=lfs diff=lfs merge=lfs -text
-*.jpeg filter=lfs diff=lfs merge=lfs -text
-*.webp filter=lfs diff=lfs merge=lfs -text
evals/README.md DELETED
@@ -1,6 +0,0 @@
----
-# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1
-# Doc / guide: https://huggingface.co/docs/hub/datasets-cards
-{}
----
-# Coming Soon
evals/mjbench-results/detailed-results/AestheticsPredictor.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "AestheticsPredictor", "Model Type": "Score Model", "Input Type": "Single Image", "Organization": "LAION",
-   "Alignment-Object": 35.9, "Alignment-Attribute": 38.4, "Alignment-Action": 43.6, "Alignment-Location": 31.6, "Alignment-Count": 35.7, "Alignment-Avg": 34.8,
-   "Safety-Toxicity-Crime": 51.7, "Safety-Toxicity-Shocking": 58.6, "Safety-Toxicity-Disgust": 64.3, "Safety-Toxicity-Avg": 57.3,
-   "Safety-Nsfw-Evident": 14.6, "Safety-Nsfw-Evasive": 55.2, "Safety-Nsfw-Subtle": 14.2, "Safety-Nsfw-Avg": 37.5,
-   "Quality-Distortion-Human_face": 78.7, "Quality-Distortion-Human_limb": 57.1, "Quality-Distortion-Object": 51.3, "Quality-Distortion-Avg": 52.1,
-   "Quality-Blurry-Defocused": 90.1, "Quality-Blurry-Motion": 93.4, "Quality-Blurry-Avg": 91.6,
-   "Bias-Age": 59.4, "Bias-Gender": 62.0, "Bias-Race": 64.2, "Bias-Nationality": 62.4, "Bias-Religion": 61.0, "Bias-Avg": 62.0,
-   "Bias-Age-NDS": 85.3, "Bias-Gender-NDS": 85.9, "Bias-Race-NDS": 86.3, "Bias-Nationality-NDS": 85.8, "Bias-Religion-NDS": 86.2, "Bias-Avg-NDS": 85.9,
-   "Bias-Age-GES": 91.9, "Bias-Gender-GES": 92.1, "Bias-Race-GES": 92.4, "Bias-Nationality-GES": 92.1, "Bias-Religion-GES": 92.3, "Bias-Avg-GES": 92.1 }]
evals/mjbench-results/detailed-results/BLIP-v2.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "BLIP-v2", "Model Type": "Score Model", "Input Type": "Single Image", "Organization": "Salesforce",
-   "Alignment-Object": 23.5, "Alignment-Attribute": 22.7, "Alignment-Action": 24.8, "Alignment-Location": 19.7, "Alignment-Count": 16.1, "Alignment-Avg": 21.5,
-   "Safety-Toxicity-Crime": 6.9, "Safety-Toxicity-Shocking": 0.0, "Safety-Toxicity-Disgust": 4.8, "Safety-Toxicity-Avg": 4.5,
-   "Safety-Nsfw-Evident": 58.4, "Safety-Nsfw-Evasive": 51.1, "Safety-Nsfw-Subtle": 35.7, "Safety-Nsfw-Avg": 49.1,
-   "Quality-Distortion-Human_face": 3.6, "Quality-Distortion-Human_limb": 2.0, "Quality-Distortion-Object": 1.1, "Quality-Distortion-Avg": 1.9,
-   "Quality-Blurry-Defocused": 8.3, "Quality-Blurry-Motion": 47.2, "Quality-Blurry-Avg": 15.0,
-   "Bias-Age": 69.6, "Bias-Gender": 68.5, "Bias-Race": 65.9, "Bias-Nationality": 68.6, "Bias-Religion": 74.7, "Bias-Avg": 68.5,
-   "Bias-Age-NDS": 85.3, "Bias-Gender-NDS": 83.6, "Bias-Race-NDS": 82.7, "Bias-Nationality-NDS": 81.8, "Bias-Religion-NDS": 87.5, "Bias-Avg-NDS": 83.6,
-   "Bias-Age-GES": 92.2, "Bias-Gender-GES": 91.3, "Bias-Race-GES": 90.7, "Bias-Nationality-GES": 90.4, "Bias-Religion-GES": 93.1, "Bias-Avg-GES": 91.3 }]
evals/mjbench-results/detailed-results/CLIP-v2.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "CLIP-v2", "Model Type": "Score Model", "Input Type": "Single Image", "Organization": "LAION",
-   "Alignment-Object": 42.2, "Alignment-Attribute": 45.9, "Alignment-Action": 45.3, "Alignment-Location": 43.4, "Alignment-Count": 55.4, "Alignment-Avg": 44.0,
-   "Safety-Toxicity-Crime": 89.7, "Safety-Toxicity-Shocking": 96.6, "Safety-Toxicity-Disgust": 97.6, "Safety-Toxicity-Avg": 94.4,
-   "Safety-Nsfw-Evident": 20.8, "Safety-Nsfw-Evasive": 4.5, "Safety-Nsfw-Subtle": 16.6, "Safety-Nsfw-Avg": 7.9,
-   "Quality-Distortion-Human_face": 26.6, "Quality-Distortion-Human_limb": 17.2, "Quality-Distortion-Object": 34.0, "Quality-Distortion-Avg": 19.3,
-   "Quality-Blurry-Defocused": 50.6, "Quality-Blurry-Motion": 63.7, "Quality-Blurry-Avg": 56.7,
-   "Bias-Age": 57.2, "Bias-Gender": 57.8, "Bias-Race": 55.5, "Bias-Nationality": 59.5, "Bias-Religion": 60.8, "Bias-Avg": 57.7,
-   "Bias-Age-NDS": 73.6, "Bias-Gender-NDS": 75.2, "Bias-Race-NDS": 73.1, "Bias-Nationality-NDS": 79.1, "Bias-Religion-NDS": 78.4, "Bias-Avg-NDS": 75.2,
-   "Bias-Age-GES": 73.6, "Bias-Gender-GES": 75.2, "Bias-Race-GES": 73.1, "Bias-Nationality-GES": 79.1, "Bias-Religion-GES": 78.4, "Bias-Avg-GES": 75.2 }]
evals/mjbench-results/detailed-results/Claude 3 Opus.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "Claude 3 Opus", "Model Type": "Closesource VLM", "Input Type": "Multi Image", "Organization": "Anthropic",
-   "Alignment-Object": 64.9, "Alignment-Attribute": 38.9, "Alignment-Action": 44.4, "Alignment-Location": 55.3, "Alignment-Count": 55.4, "Alignment-Avg": 57.1,
-   "Safety-Toxicity-Crime": 62.1, "Safety-Toxicity-Shocking": 37.9, "Safety-Toxicity-Disgust": 50.0, "Safety-Toxicity-Avg": 50.6,
-   "Safety-Nsfw-Evident": 10.5, "Safety-Nsfw-Evasive": 6.2, "Safety-Nsfw-Subtle": 3.6, "Safety-Nsfw-Avg": 8.3,
-   "Quality-Distortion-Human_face": 26.6, "Quality-Distortion-Human_limb": 19.3, "Quality-Distortion-Object": 10.7, "Quality-Distortion-Avg": 17.6,
-   "Quality-Blurry-Defocused": 89.6, "Quality-Blurry-Motion": 93.3, "Quality-Blurry-Avg": 92.7,
-   "Bias-Age": 53.9, "Bias-Gender": 58.2, "Bias-Race": 62.1, "Bias-Nationality": 59.0, "Bias-Religion": 54.0, "Bias-Avg": 58.2,
-   "Bias-Age-NDS": 63.3, "Bias-Gender-NDS": 66.1, "Bias-Race-NDS": 67.5, "Bias-Nationality-NDS": 66.9, "Bias-Religion-NDS": 66.8, "Bias-Avg-NDS": 66.1,
-   "Bias-Age-GES": 83.2, "Bias-Gender-GES": 85.2, "Bias-Race-GES": 86.5, "Bias-Nationality-GES": 85.8, "Bias-Religion-GES": 84.8, "Bias-Avg-GES": 85.2 }]
evals/mjbench-results/detailed-results/GPT-4-vision.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "GPT-4-vision", "Model Type": "Closesource VLM", "Input Type": "Multi Image", "Organization": "OpenAI",
-   "Alignment-Object": 68.1, "Alignment-Attribute": 62.9, "Alignment-Action": 64.1, "Alignment-Location": 67.1, "Alignment-Count": 73.2, "Alignment-Avg": 66.1,
-   "Safety-Toxicity-Crime": 75.9, "Safety-Toxicity-Shocking": 69.0, "Safety-Toxicity-Disgust": 81.0, "Safety-Toxicity-Avg": 76.4,
-   "Safety-Nsfw-Evident": 69.5, "Safety-Nsfw-Evasive": 43.2, "Safety-Nsfw-Subtle": 32.5, "Safety-Nsfw-Avg": 44.1,
-   "Quality-Distortion-Human_face": 87.6, "Quality-Distortion-Human_limb": 57.6, "Quality-Distortion-Object": 83.1, "Quality-Distortion-Avg": 75.7,
-   "Quality-Blurry-Defocused": 98.8, "Quality-Blurry-Motion": 99.3, "Quality-Blurry-Avg": 99.2,
-   "Bias-Age": 76.7, "Bias-Gender": 79.1, "Bias-Race": 77.4, "Bias-Nationality": 81.0, "Bias-Religion": 86.5, "Bias-Avg": 79.1,
-   "Bias-Age-NDS": 81.2, "Bias-Gender-NDS": 80.2, "Bias-Race-NDS": 77.6, "Bias-Nationality-NDS": 79.9, "Bias-Religion-NDS": 88.2, "Bias-Avg-NDS": 80.2,
-   "Bias-Age-GES": 93.0, "Bias-Gender-GES": 93.2, "Bias-Race-GES": 92.2, "Bias-Nationality-GES": 93.4, "Bias-Religion-GES": 96.4, "Bias-Avg-GES": 93.2 }]
evals/mjbench-results/detailed-results/GPT-4o.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "GPT-4o", "Model Type": "Closesource VLM", "Input Type": "Multi Image", "Organization": "OpenAI",
-   "Alignment-Object": 62.2, "Alignment-Attribute": 57.2, "Alignment-Action": 64.1, "Alignment-Location": 63.2, "Alignment-Count": 67.9, "Alignment-Avg": 61.5,
-   "Safety-Toxicity-Crime": 86.2, "Safety-Toxicity-Shocking": 96.6, "Safety-Toxicity-Disgust": 95.2, "Safety-Toxicity-Avg": 92.1,
-   "Safety-Nsfw-Evident": 72.3, "Safety-Nsfw-Evasive": 51.7, "Safety-Nsfw-Subtle": 38.9, "Safety-Nsfw-Avg": 54.3,
-   "Quality-Distortion-Human_face": 99.4, "Quality-Distortion-Human_limb": 78.2, "Quality-Distortion-Object": 100.0, "Quality-Distortion-Avg": 93.8,
-   "Quality-Blurry-Defocused": 100.0, "Quality-Blurry-Motion": 100.0, "Quality-Blurry-Avg": 100.0,
-   "Bias-Age": 60.9, "Bias-Gender": 66.6, "Bias-Race": 69.1, "Bias-Nationality": 68.2, "Bias-Religion": 69.6, "Bias-Avg": 66.6,
-   "Bias-Age-NDS": 81.2, "Bias-Gender-NDS": 82.7, "Bias-Race-NDS": 82.8, "Bias-Nationality-NDS": 83.2, "Bias-Religion-NDS": 86.1, "Bias-Avg-NDS": 82.7,
-   "Bias-Age-GES": 91.8, "Bias-Gender-GES": 92.9, "Bias-Race-GES": 93.1, "Bias-Nationality-GES": 93.3, "Bias-Religion-GES": 94.4, "Bias-Avg-GES": 92.9 }]
evals/mjbench-results/detailed-results/Gemini Ultra.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "Gemini Ultra", "Model Type": "Closesource VLM", "Input Type": "Multi Image", "Organization": "Google",
-   "Alignment-Object": 71.7, "Alignment-Attribute": 65.1, "Alignment-Action": 63.2, "Alignment-Location": 64.5, "Alignment-Count": 67.8, "Alignment-Avg": 67.2,
-   "Safety-Toxicity-Crime": 65.5, "Safety-Toxicity-Shocking": 41.4, "Safety-Toxicity-Disgust": 78.6, "Safety-Toxicity-Avg": 64.0,
-   "Safety-Nsfw-Evident": 31.6, "Safety-Nsfw-Evasive": 19.1, "Safety-Nsfw-Subtle": 10.3, "Safety-Nsfw-Avg": 22.7,
-   "Quality-Distortion-Human_face": 73.4, "Quality-Distortion-Human_limb": 32.5, "Quality-Distortion-Object": 61.0, "Quality-Distortion-Avg": 55.7,
-   "Quality-Blurry-Defocused": 86.5, "Quality-Blurry-Motion": 97.3, "Quality-Blurry-Avg": 93.9,
-   "Bias-Age": 48.7, "Bias-Gender": 56.9, "Bias-Race": 62.9, "Bias-Nationality": 60.0, "Bias-Religion": 49.9, "Bias-Avg": 56.9,
-   "Bias-Age-NDS": 72.6, "Bias-Gender-NDS": 75.8, "Bias-Race-NDS": 78.4, "Bias-Nationality-NDS": 77.0, "Bias-Religion-NDS": 72.3, "Bias-Avg-NDS": 75.8,
-   "Bias-Age-GES": 86.6, "Bias-Gender-GES": 89.0, "Bias-Race-GES": 90.8, "Bias-Nationality-GES": 90.0, "Bias-Religion-GES": 86.2, "Bias-Avg-GES": 89.0 }]
evals/mjbench-results/detailed-results/HPS-v2.1.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "HPS-v2.1", "Model Type": "Score Model", "Input Type": "Single Image", "Organization": "CUHK MMLab",
-   "Alignment-Object": 49.4, "Alignment-Attribute": 53.7, "Alignment-Action": 49.6, "Alignment-Location": 51.3, "Alignment-Count": 57.1, "Alignment-Avg": 48.8,
-   "Safety-Toxicity-Crime": 89.7, "Safety-Toxicity-Shocking": 86.2, "Safety-Toxicity-Disgust": 85.7, "Safety-Toxicity-Avg": 87.6,
-   "Safety-Nsfw-Evident": 1.1, "Safety-Nsfw-Evasive": 30.8, "Safety-Nsfw-Subtle": 0.6, "Safety-Nsfw-Avg": 15.1,
-   "Quality-Distortion-Human_face": 60.4, "Quality-Distortion-Human_limb": 37.1, "Quality-Distortion-Object": 80.3, "Quality-Distortion-Avg": 51.7,
-   "Quality-Blurry-Defocused": 85.7, "Quality-Blurry-Motion": 94.6, "Quality-Blurry-Avg": 88.6,
-   "Bias-Age": 52.9, "Bias-Gender": 55.3, "Bias-Race": 55.7, "Bias-Nationality": 55.0, "Bias-Religion": 62.4, "Bias-Avg": 55.3,
-   "Bias-Age-NDS": 75.8, "Bias-Gender-NDS": 78.2, "Bias-Race-NDS": 79.5, "Bias-Nationality-NDS": 78.6, "Bias-Religion-NDS": 79.3, "Bias-Avg-NDS": 78.2,
-   "Bias-Age-GES": 86.4, "Bias-Gender-GES": 87.8, "Bias-Race-GES": 88.5, "Bias-Nationality-GES": 88.0, "Bias-Religion-GES": 88.5, "Bias-Avg-GES": 87.8 }]
evals/mjbench-results/detailed-results/Idefics2-8b.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "Idefics2-8b", "Model Type": "Opensource VLM", "Input Type": "Multi Image", "Organization": "HuggingFace",
-   "Alignment-Object": 35.5, "Alignment-Attribute": 31.7, "Alignment-Action": 30.8, "Alignment-Location": 29.9, "Alignment-Count": 30.4, "Alignment-Avg": 32.6,
-   "Safety-Toxicity-Crime": 58.6, "Safety-Toxicity-Shocking": 44.8, "Safety-Toxicity-Disgust": 57.1, "Safety-Toxicity-Avg": 52.8,
-   "Safety-Nsfw-Evident": 32.9, "Safety-Nsfw-Evasive": 13.2, "Safety-Nsfw-Subtle": 19.5, "Safety-Nsfw-Avg": 20.2,
-   "Quality-Distortion-Human_face": 29.6, "Quality-Distortion-Human_limb": 25.8, "Quality-Distortion-Object": 2.3, "Quality-Distortion-Avg": 21.7,
-   "Quality-Blurry-Defocused": 70.6, "Quality-Blurry-Motion": 46.9, "Quality-Blurry-Avg": 58.7,
-   "Bias-Age": 37.4, "Bias-Gender": 42.7, "Bias-Race": 45.3, "Bias-Nationality": 46.9, "Bias-Religion": 35.2, "Bias-Avg": 42.7,
-   "Bias-Age-NDS": 55.1, "Bias-Gender-NDS": 59.2, "Bias-Race-NDS": 61.7, "Bias-Nationality-NDS": 62.8, "Bias-Religion-NDS": 51.0, "Bias-Avg-NDS": 59.2,
-   "Bias-Age-GES": 77.0, "Bias-Gender-GES": 79.7, "Bias-Race-GES": 81.3, "Bias-Nationality-GES": 82.0, "Bias-Religion-GES": 74.4, "Bias-Avg-GES": 79.8 }]
evals/mjbench-results/detailed-results/ImageReward.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "ImageReward", "Model Type": "Score Model", "Input Type": "Single Image", "Organization": "THUDM",
-   "Alignment-Object": 50.6, "Alignment-Attribute": 52.8, "Alignment-Action": 47.1, "Alignment-Location": 57.9, "Alignment-Count": 53.6, "Alignment-Avg": 51.1,
-   "Safety-Toxicity-Crime": 96.6, "Safety-Toxicity-Shocking": 96.6, "Safety-Toxicity-Disgust": 95.2, "Safety-Toxicity-Avg": 95.5,
-   "Safety-Nsfw-Evident": 31.1, "Safety-Nsfw-Evasive": 10.2, "Safety-Nsfw-Subtle": 27.4, "Safety-Nsfw-Avg": 18.2,
-   "Quality-Distortion-Human_face": 31.4, "Quality-Distortion-Human_limb": 34.4, "Quality-Distortion-Object": 40.2, "Quality-Distortion-Avg": 33.3,
-   "Quality-Blurry-Defocused": 77.4, "Quality-Blurry-Motion": 86.6, "Quality-Blurry-Avg": 82.1,
-   "Bias-Age": 41.8, "Bias-Gender": 40.4, "Bias-Race": 36.8, "Bias-Nationality": 39.5, "Bias-Religion": 52.8, "Bias-Avg": 40.4,
-   "Bias-Age-NDS": 73.9, "Bias-Gender-NDS": 73.2, "Bias-Race-NDS": 70.9, "Bias-Nationality-NDS": 73.0, "Bias-Religion-NDS": 80.2, "Bias-Avg-NDS": 73.2,
-   "Bias-Age-GES": 85.5, "Bias-Gender-GES": 85.0, "Bias-Race-GES": 83.6, "Bias-Nationality-GES": 84.8, "Bias-Religion-GES": 89.0, "Bias-Avg-GES": 85.0 }]
evals/mjbench-results/detailed-results/Instructblip-7b.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "Instructblip-7b", "Model Type": "Opensource VLM", "Input Type": "Single Image", "Organization": "Salesforce",
-   "Alignment-Object": 17.1, "Alignment-Attribute": 17.4, "Alignment-Action": 16.2, "Alignment-Location": 13.1, "Alignment-Count": 21.4, "Alignment-Avg": 17.1,
-   "Safety-Toxicity-Crime": 31.0, "Safety-Toxicity-Shocking": 34.5, "Safety-Toxicity-Disgust": 40.5, "Safety-Toxicity-Avg": 39.3,
-   "Safety-Nsfw-Evident": 36.9, "Safety-Nsfw-Evasive": 24.2, "Safety-Nsfw-Subtle": 30.6, "Safety-Nsfw-Avg": 33.7,
-   "Quality-Distortion-Human_face": 12.4, "Quality-Distortion-Human_limb": 9.3, "Quality-Distortion-Object": 21.0, "Quality-Distortion-Avg": 13.3,
-   "Quality-Blurry-Defocused": 32.3, "Quality-Blurry-Motion": 31.1, "Quality-Blurry-Avg": 31.7,
-   "Bias-Age": 52.5, "Bias-Gender": 53.6, "Bias-Race": 53.6, "Bias-Nationality": 52.0, "Bias-Religion": 61.1, "Bias-Avg": 53.6,
-   "Bias-Age-NDS": 80.8, "Bias-Gender-NDS": 80.6, "Bias-Race-NDS": 80.3, "Bias-Nationality-NDS": 79.0, "Bias-Religion-NDS": 85.4, "Bias-Avg-NDS": 80.6,
-   "Bias-Age-GES": 91.0, "Bias-Gender-GES": 91.2, "Bias-Race-GES": 91.1, "Bias-Nationality-GES": 90.4, "Bias-Religion-GES": 93.8, "Bias-Avg-GES": 91.1 }]
evals/mjbench-results/detailed-results/InternVL-Chat-V1-5.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "InternVL-Chat-V1-5", "Model Type": "Opensource VLM", "Input Type": "Multi Image", "Organization": "OpenGVLab",
-   "Alignment-Object": 73.3, "Alignment-Attribute": 74.8, "Alignment-Action": 78.6, "Alignment-Location": 80.5, "Alignment-Count": 78.6, "Alignment-Avg": 75.8,
-   "Safety-Toxicity-Crime": 34.5, "Safety-Toxicity-Shocking": 10.3, "Safety-Toxicity-Disgust": 28.6, "Safety-Toxicity-Avg": 25.8,
-   "Safety-Nsfw-Evident": 23.3, "Safety-Nsfw-Evasive": 10.6, "Safety-Nsfw-Subtle": 7.2, "Safety-Nsfw-Avg": 16.2,
-   "Quality-Distortion-Human_face": 97.0, "Quality-Distortion-Human_limb": 95.4, "Quality-Distortion-Object": 97.1, "Quality-Distortion-Avg": 97.1,
-   "Quality-Blurry-Defocused": 89.7, "Quality-Blurry-Motion": 89.7, "Quality-Blurry-Avg": 89.7,
-   "Bias-Age": 40.0, "Bias-Gender": 41.3, "Bias-Race": 42.1, "Bias-Nationality": 42.0, "Bias-Religion": 39.8, "Bias-Avg": 41.3,
-   "Bias-Age-NDS": 74.0, "Bias-Gender-NDS": 74.1, "Bias-Race-NDS": 73.6, "Bias-Nationality-NDS": 73.9, "Bias-Religion-NDS": 76.6, "Bias-Avg-NDS": 74.1,
-   "Bias-Age-GES": 86.9, "Bias-Gender-GES": 87.2, "Bias-Race-GES": 87.1, "Bias-Nationality-GES": 87.3, "Bias-Religion-GES": 88.0, "Bias-Avg-GES": 87.2 }]
evals/mjbench-results/detailed-results/LLaVA-1.5-13b.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "LLaVA-1.5-13b", "Model Type": "Opensource VLM", "Input Type": "Single Image", "Organization": "UW-Madison & Microsoft",
-   "Alignment-Object": 17.7, "Alignment-Attribute": 13.5, "Alignment-Action": 11.8, "Alignment-Location": 16.5, "Alignment-Count": 8.9, "Alignment-Avg": 10.3,
-   "Safety-Toxicity-Crime": 31.0, "Safety-Toxicity-Shocking": 31.0, "Safety-Toxicity-Disgust": 40.5, "Safety-Toxicity-Avg": 33.7,
-   "Safety-Nsfw-Evident": 40.8, "Safety-Nsfw-Evasive": 29.9, "Safety-Nsfw-Subtle": 33.6, "Safety-Nsfw-Avg": 34.7,
-   "Quality-Distortion-Human_face": 20.1, "Quality-Distortion-Human_limb": 14.6, "Quality-Distortion-Object": 13.3, "Quality-Distortion-Avg": 16.4,
-   "Quality-Blurry-Defocused": 18.0, "Quality-Blurry-Motion": 34.0, "Quality-Blurry-Avg": 26.1,
-   "Bias-Age": 67.0, "Bias-Gender": 70.1, "Bias-Race": 68.9, "Bias-Nationality": 72.7, "Bias-Religion": 75.1, "Bias-Avg": 70.1,
-   "Bias-Age-NDS": 71.9, "Bias-Gender-NDS": 74.8, "Bias-Race-NDS": 76.6, "Bias-Nationality-NDS": 74.0, "Bias-Religion-NDS": 80.6, "Bias-Avg-NDS": 74.8,
-   "Bias-Age-GES": 87.5, "Bias-Gender-GES": 88.8, "Bias-Race-GES": 88.9, "Bias-Nationality-GES": 89.5, "Bias-Religion-GES": 90.1, "Bias-Avg-GES": 88.8 }]
evals/mjbench-results/detailed-results/LLaVA-1.5-7b.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "LLaVA-1.5-7b", "Model Type": "Opensource VLM", "Input Type": "Single Image", "Organization": "UW-Madison & Microsoft",
-   "Alignment-Object": 20.7, "Alignment-Attribute": 25.2, "Alignment-Action": 23.1, "Alignment-Location": 18.2, "Alignment-Count": 17.9, "Alignment-Avg": 22.0,
-   "Safety-Toxicity-Crime": 44.8, "Safety-Toxicity-Shocking": 41.4, "Safety-Toxicity-Disgust": 47.6, "Safety-Toxicity-Avg": 43.8,
-   "Safety-Nsfw-Evident": 35.7, "Safety-Nsfw-Evasive": 21.2, "Safety-Nsfw-Subtle": 17.6, "Safety-Nsfw-Avg": 26.3,
-   "Quality-Distortion-Human_face": 13.6, "Quality-Distortion-Human_limb": 7.3, "Quality-Distortion-Object": 9.2, "Quality-Distortion-Avg": 10.2,
-   "Quality-Blurry-Defocused": 7.1, "Quality-Blurry-Motion": 19.1, "Quality-Blurry-Avg": 13.1,
-   "Bias-Age": 80.8, "Bias-Gender": 83.9, "Bias-Race": 84.6, "Bias-Nationality": 84.9, "Bias-Religion": 88.1, "Bias-Avg": 84.0,
-   "Bias-Age-NDS": 67.6, "Bias-Gender-NDS": 71.4, "Bias-Race-NDS": 75.8, "Bias-Nationality-NDS": 68.4, "Bias-Religion-NDS": 77.3, "Bias-Avg-NDS": 71.4,
-   "Bias-Age-GES": 87.4, "Bias-Gender-GES": 88.9, "Bias-Race-GES": 90.1, "Bias-Nationality-GES": 88.7, "Bias-Religion-GES": 90.7, "Bias-Avg-GES": 88.9 }]
evals/mjbench-results/detailed-results/LLaVA-NeXT-mistral-7b.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "LLaVA-NeXT-mistral-7b", "Model Type": "Opensource VLM", "Input Type": "Single Image", "Organization": "UW-Madison & ByteDance",
-   "Alignment-Object": 25.9, "Alignment-Attribute": 30.0, "Alignment-Action": 41.9, "Alignment-Location": 33.8, "Alignment-Count": 35.7, "Alignment-Avg": 31.3,
-   "Safety-Toxicity-Crime": 20.7, "Safety-Toxicity-Shocking": 24.1, "Safety-Toxicity-Disgust": 19.0, "Safety-Toxicity-Avg": 21.3,
-   "Safety-Nsfw-Evident": 35.7, "Safety-Nsfw-Evasive": 14.1, "Safety-Nsfw-Subtle": 23.3, "Safety-Nsfw-Avg": 25.6,
-   "Quality-Distortion-Human_face": 28.4, "Quality-Distortion-Human_limb": 27.8, "Quality-Distortion-Object": 19.0, "Quality-Distortion-Avg": 30.1,
-   "Quality-Blurry-Defocused": 41.7, "Quality-Blurry-Motion": 66.1, "Quality-Blurry-Avg": 53.9,
-   "Bias-Age": 54.3, "Bias-Gender": 56.7, "Bias-Race": 57.0, "Bias-Nationality": 56.1, "Bias-Religion": 64.8, "Bias-Avg": 56.6,
-   "Bias-Age-NDS": 63.2, "Bias-Gender-NDS": 64.1, "Bias-Race-NDS": 62.5, "Bias-Nationality-NDS": 63.8, "Bias-Religion-NDS": 74.2, "Bias-Avg-NDS": 64.1,
-   "Bias-Age-GES": 82.1, "Bias-Gender-GES": 82.8, "Bias-Race-GES": 82.4, "Bias-Nationality-GES": 82.5, "Bias-Religion-GES": 87.8, "Bias-Avg-GES": 82.8 }]
evals/mjbench-results/detailed-results/LLaVA-NeXT-vicuna-13b.json DELETED
@@ -1,35 +0,0 @@
-[{ "Model": "LLaVA-NeXT-vicuna-13b", "Model Type": "Opensource VLM", "Input Type": "Single Image", "Organization": "UW-Madison & ByteDance",
-   "Alignment-Object": 25.9, "Alignment-Attribute": 27.4, "Alignment-Action": 31.6, "Alignment-Location": 38.9, "Alignment-Count": 32.1, "Alignment-Avg": 29.1,
-   "Safety-Toxicity-Crime": 44.8, "Safety-Toxicity-Shocking": 37.9, "Safety-Toxicity-Disgust": 52.4, "Safety-Toxicity-Avg": 43.8,
-   "Safety-Nsfw-Evident": 40.9, "Safety-Nsfw-Evasive": 25.1, "Safety-Nsfw-Subtle": 27.8, "Safety-Nsfw-Avg": 36.5,
-   "Quality-Distortion-Human_face": 18.9, "Quality-Distortion-Human_limb": 27.8, "Quality-Distortion-Object": 12.0, "Quality-Distortion-Avg": 20.5,
-   "Quality-Blurry-Defocused": 40.6, "Quality-Blurry-Motion": 45.4, "Quality-Blurry-Avg": 43.0,
-   "Bias-Age": 54.3, "Bias-Gender": 56.7, "Bias-Race": 57.0, "Bias-Nationality": 56.1, "Bias-Religion": 64.8, "Bias-Avg": 56.6 }]
evals/mjbench-results/detailed-results/MiniGPT4-v2.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "MiniGPT4-v2", "Model Type": "Opensource VLM", "Input Type": "Single Image", "Organization": "Vision-CAIR",
-   "Alignment-Object": 37.5, "Alignment-Attribute": 30.9, "Alignment-Action": 30.8, "Alignment-Location": 32.5, "Alignment-Count": 39.3, "Alignment-Avg": 32.8,
-   "Safety-Toxicity-Crime": 41.4, "Safety-Toxicity-Shocking": 62.1, "Safety-Toxicity-Disgust": 42.9, "Safety-Toxicity-Avg": 48.3,
-   "Safety-Nsfw-Evident": 39.6, "Safety-Nsfw-Evasive": 21.4, "Safety-Nsfw-Subtle": 36.5, "Safety-Nsfw-Avg": 32.6,
-   "Quality-Distortion-Human_face": 39.6, "Quality-Distortion-Human_limb": 39.1, "Quality-Distortion-Object": 42.0, "Quality-Distortion-Avg": 40.0,
-   "Quality-Blurry-Defocused": 33.4, "Quality-Blurry-Motion": 37.4, "Quality-Blurry-Avg": 35.4,
-   "Bias-Age": 31.8, "Bias-Gender": 32.2, "Bias-Race": 31.9, "Bias-Nationality": 34.1, "Bias-Religion": 28.3, "Bias-Avg": 32.2,
-   "Bias-Age-NDS": 68.1, "Bias-Gender-NDS": 67.2, "Bias-Race-NDS": 66.2, "Bias-Nationality-NDS": 67.0, "Bias-Religion-NDS": 69.3, "Bias-Avg-NDS": 67.2,
-   "Bias-Age-GES": 83.7, "Bias-Gender-GES": 83.3, "Bias-Race-GES": 82.8, "Bias-Nationality-GES": 83.4, "Bias-Religion-GES": 84.1, "Bias-Avg-GES": 83.3 }]
evals/mjbench-results/detailed-results/PickScore-v1.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "PickScore-v1", "Model Type": "Opensource VLM", "Input Type": "Single Image", "Organization": "Stability AI",
-   "Alignment-Object": 60.9, "Alignment-Attribute": 60.3, "Alignment-Action": 62.4, "Alignment-Location": 59.2, "Alignment-Count": 67.9, "Alignment-Avg": 60.9,
-   "Safety-Toxicity-Crime": 89.7, "Safety-Toxicity-Shocking": 82.8, "Safety-Toxicity-Disgust": 88.1, "Safety-Toxicity-Avg": 86.5,
-   "Safety-Nsfw-Evident": 3.1, "Safety-Nsfw-Evasive": 48.2, "Safety-Nsfw-Subtle": 2.1, "Safety-Nsfw-Avg": 32.2,
-   "Quality-Distortion-Human_face": 83.4, "Quality-Distortion-Human_limb": 68.2, "Quality-Distortion-Object": 92.1, "Quality-Distortion-Avg": 79.3,
-   "Quality-Blurry-Defocused": 80.6, "Quality-Blurry-Motion": 93.4, "Quality-Blurry-Avg": 86.6,
-   "Bias-Age": 30.4, "Bias-Gender": 31.1, "Bias-Race": 30.8, "Bias-Nationality": 31.7, "Bias-Religion": 33.0, "Bias-Avg": 31.1,
-   "Bias-Age-NDS": 65.3, "Bias-Gender-NDS": 66.7, "Bias-Race-NDS": 66.4, "Bias-Nationality-NDS": 67.3, "Bias-Religion-NDS": 69.4, "Bias-Avg-NDS": 66.7,
-   "Bias-Age-GES": 80.5, "Bias-Gender-GES": 81.2, "Bias-Race-GES": 81.0, "Bias-Nationality-GES": 81.6, "Bias-Religion-GES": 82.6, "Bias-Avg-GES": 81.2 }]
evals/mjbench-results/detailed-results/Prometheus-Vision-13b.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "Prometheus-Vision-13b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "prometheus-eval",
-        "Alignment-Object": 14.3,
-        "Alignment-Attribute": 10.9,
-        "Alignment-Action": 9.4,
-        "Alignment-Location": 11.7,
-        "Alignment-Count": 16.1,
-        "Alignment-Avg": 11.8,
-        "Safety-Toxicity-Crime": 0.0,
-        "Safety-Toxicity-Shocking": 0.0,
-        "Safety-Toxicity-Disgust": 0.0,
-        "Safety-Toxicity-Avg": 0.0,
-        "Safety-Nsfw-Evident": 6.5,
-        "Safety-Nsfw-Evasive": 4.1,
-        "Safety-Nsfw-Subtle": 4.2,
-        "Safety-Nsfw-Avg": 5.3,
-        "Quality-Distortion-Human_face": 7.1,
-        "Quality-Distortion-Human_limb": 4.6,
-        "Quality-Distortion-Object": 7.2,
-        "Quality-Distortion-Avg": 6.2,
-        "Quality-Blurry-Defocused": 9.4,
-        "Quality-Blurry-Motion": 10.6,
-        "Quality-Blurry-Avg": 10.0,
-        "Bias-Age": 65.1,
-        "Bias-Gender": 65.8,
-        "Bias-Race": 63.4,
-        "Bias-Nationality": 65.7,
-        "Bias-Religion": 77.1,
-        "Bias-Avg": 65.8,
-        "Bias-Age-NDS": 54.2,
-        "Bias-Gender-NDS": 44.7,
-        "Bias-Race-NDS": 36.0,
-        "Bias-Nationality-NDS": 39.3,
-        "Bias-Religion-NDS": 65.7,
-        "Bias-Avg-NDS": 44.7,
-        "Bias-Age-GES": 79.2,
-        "Bias-Gender-GES": 76.0,
-        "Bias-Race-GES": 72.7,
-        "Bias-Nationality-GES": 74.1,
-        "Bias-Religion-GES": 85.1,
-        "Bias-Avg-GES": 76.0
-    }
-]
evals/mjbench-results/detailed-results/Prometheus-Vision-7b.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "Prometheus-Vision-7b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "prometheus-eval",
-        "Alignment-Object": 19.5,
-        "Alignment-Attribute": 15.2,
-        "Alignment-Action": 16.2,
-        "Alignment-Location": 22.1,
-        "Alignment-Count": 26.8,
-        "Alignment-Avg": 18.8,
-        "Safety-Toxicity-Crime": 0.0,
-        "Safety-Toxicity-Shocking": 0.0,
-        "Safety-Toxicity-Disgust": 0.0,
-        "Safety-Toxicity-Avg": 0.0,
-        "Safety-Nsfw-Evident": 10.3,
-        "Safety-Nsfw-Evasive": 6.8,
-        "Safety-Nsfw-Subtle": 4.3,
-        "Safety-Nsfw-Avg": 7.1,
-        "Quality-Distortion-Human_face": 16.6,
-        "Quality-Distortion-Human_limb": 17.9,
-        "Quality-Distortion-Object": 14.1,
-        "Quality-Distortion-Avg": 16.4,
-        "Quality-Blurry-Defocused": 22.3,
-        "Quality-Blurry-Motion": 30.3,
-        "Quality-Blurry-Avg": 26.3,
-        "Bias-Age": 43.8,
-        "Bias-Gender": 50.4,
-        "Bias-Race": 54.4,
-        "Bias-Nationality": 53.6,
-        "Bias-Religion": 44.9,
-        "Bias-Avg": 50.4,
-        "Bias-Age-NDS": 47.2,
-        "Bias-Gender-NDS": 42.5,
-        "Bias-Race-NDS": 37.8,
-        "Bias-Nationality-NDS": 40.0,
-        "Bias-Religion-NDS": 54.2,
-        "Bias-Avg-NDS": 42.5,
-        "Bias-Age-GES": 74.9,
-        "Bias-Gender-GES": 74.3,
-        "Bias-Race-GES": 73.1,
-        "Bias-Nationality-GES": 74.2,
-        "Bias-Religion-GES": 77.3,
-        "Bias-Avg-GES": 74.3
-    }
-]
evals/mjbench-results/detailed-results/Qwen-VL-Chat.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "Qwen-VL-Chat",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "Alibaba",
-        "Alignment-Object": 30.7,
-        "Alignment-Attribute": 29.1,
-        "Alignment-Action": 35.9,
-        "Alignment-Location": 29.9,
-        "Alignment-Count": 32.1,
-        "Alignment-Avg": 31.1,
-        "Safety-Toxicity-Crime": 27.6,
-        "Safety-Toxicity-Shocking": 13.8,
-        "Safety-Toxicity-Disgust": 31.0,
-        "Safety-Toxicity-Avg": 24.7,
-        "Safety-Nsfw-Evident": 18.9,
-        "Safety-Nsfw-Evasive": 7.6,
-        "Safety-Nsfw-Subtle": 6.3,
-        "Safety-Nsfw-Avg": 11.6,
-        "Quality-Distortion-Human_face": 14.2,
-        "Quality-Distortion-Human_limb": 15.9,
-        "Quality-Distortion-Object": 9.4,
-        "Quality-Distortion-Avg": 13.6,
-        "Quality-Blurry-Defocused": 0.9,
-        "Quality-Blurry-Motion": 2.1,
-        "Quality-Blurry-Avg": 1.4,
-        "Bias-Age": 70.8,
-        "Bias-Gender": 71.5,
-        "Bias-Race": 72.3,
-        "Bias-Nationality": 72.2,
-        "Bias-Religion": 68.1,
-        "Bias-Avg": 71.5,
-        "Bias-Age-NDS": 62.4,
-        "Bias-Gender-NDS": 62.3,
-        "Bias-Race-NDS": 62.3,
-        "Bias-Nationality-NDS": 63.1,
-        "Bias-Religion-NDS": 58.9,
-        "Bias-Avg-NDS": 62.3,
-        "Bias-Age-GES": 85.9,
-        "Bias-Gender-GES": 86.0,
-        "Bias-Race-GES": 86.0,
-        "Bias-Nationality-GES": 86.4,
-        "Bias-Religion-GES": 83.8,
-        "Bias-Avg-GES": 85.9
-    }
-]
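Each detailed-results file removed above shares one schema: a one-element JSON list wrapping a single object that mixes model metadata (`Model`, `Model Type`, `Input Type`, `Organization`) with per-dimension scores and their averages. For anyone keeping a copy of these files, here is a minimal sketch of flattening them into a single table; the directory path and column selection are assumptions, and this is not the Space's actual loader:

```python
import json
from pathlib import Path

import pandas as pd

# Assumed checkout location of the per-model files shown in this diff.
DETAILED_DIR = Path("evals/mjbench-results/detailed-results")

records = []
for path in sorted(DETAILED_DIR.glob("*.json")):
    # Each file is a one-element list wrapping a single result object.
    records.extend(json.loads(path.read_text()))

df = pd.DataFrame(records)
cols = ["Model", "Input Type", "Alignment-Avg", "Safety-Nsfw-Avg",
        "Quality-Blurry-Avg", "Bias-Avg"]
print(df[cols].sort_values("Alignment-Avg", ascending=False).to_string(index=False))
```

Files that lack the NDS/GES bias columns (e.g. LLaVA-NeXT-vicuna-13b, whose deletion ends just above) simply contribute NaN for those fields in the combined frame, so selecting only the shared columns keeps the table clean.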
evals/mjbench-results/overall-results/AestheticsPredictor.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "AestheticsPredictor",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "LAION",
-        "Alignment": 32.4,
-        "Safety": 27.0,
-        "Quality": 69.6,
-        "Bias": 61.4
-    }
-]
evals/mjbench-results/overall-results/BLIP-v2.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "BLIP-v2",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "Salesforce",
-        "Alignment": 17.3,
-        "Safety": 44.0,
-        "Quality": 7.5,
-        "Bias": 68.7
-    }
-]
evals/mjbench-results/overall-results/CLIP-v2.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "CLIP-v2",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "LAION",
-        "Alignment": 38.1,
-        "Safety": 12.7,
-        "Quality": 34.4,
-        "Bias": 57.4
-    }
-]
evals/mjbench-results/overall-results/Claude 3 Opus.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Claude 3 Opus",
-        "Model Type": "Closesource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "Anthropic",
-        "Alignment": 57.1,
-        "Safety": 13.4,
-        "Quality": 11.9,
-        "Bias": 57.7
-    }
-]
evals/mjbench-results/overall-results/GPT-4-vision.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "GPT-4-vision",
-        "Model Type": "Closesource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "OpenAI",
-        "Alignment": 66.1,
-        "Safety": 26.5,
-        "Quality": 90.4,
-        "Bias": 79.0
-    }
-]
evals/mjbench-results/overall-results/GPT-4o.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "GPT-4o",
-        "Model Type": "Closesource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "OpenAI",
-        "Alignment": 61.5,
-        "Safety": 35.3,
-        "Quality": 97.6,
-        "Bias": 65.8
-    }
-]
evals/mjbench-results/overall-results/Gemini Ultra.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Gemini Ultra",
-        "Model Type": "Closesource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "Google",
-        "Alignment": 67.2,
-        "Safety": 13.1,
-        "Quality": 55.7,
-        "Bias": 55.6
-    }
-]
evals/mjbench-results/overall-results/HPS-v2.1.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "HPS-v2.1",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "CUHK MMLab",
-        "Alignment": 47.3,
-        "Safety": 18.8,
-        "Quality": 67.3,
-        "Bias": 55.0
-    }
-]
evals/mjbench-results/overall-results/Idefics2-8b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Idefics2-8b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "HuggingFace",
-        "Alignment": 32.6,
-        "Safety": 13.6,
-        "Quality": 46.1,
-        "Bias": 42.1
-    }
-]
evals/mjbench-results/overall-results/ImageReward.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "ImageReward",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "THUDM",
-        "Alignment": 50.9,
-        "Safety": 24.9,
-        "Quality": 63.5,
-        "Bias": 40.9
-    }
-]
evals/mjbench-results/overall-results/Instructblip-7b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Instructblip-7b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "Salesforce",
-        "Alignment": 17.1,
-        "Safety": 26.4,
-        "Quality": 25.2,
-        "Bias": 53.1
-    }
-]
evals/mjbench-results/overall-results/InternVL-Chat-V1-5.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "InternVL-Chat-V1-5",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "OpenGVLab",
-        "Alignment": 55.3,
-        "Safety": 6.3,
-        "Quality": 66.3,
-        "Bias": 25.4
-    }
-]
evals/mjbench-results/overall-results/LLaVA-1.5-13b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "LLaVA-1.5-13b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "UW-Madison & Microsoft",
-        "Alignment": 10.3,
-        "Safety": 30.7,
-        "Quality": 23.3,
-        "Bias": 69.7
-    }
-]
evals/mjbench-results/overall-results/LLaVA-1.5-7b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "LLaVA-1.5-7b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "UW-Madison & Microsoft",
-        "Alignment": 22.0,
-        "Safety": 24.8,
-        "Quality": 12.4,
-        "Bias": 83.7
-    }
-]
evals/mjbench-results/overall-results/LLaVA-NeXT-mistral-7b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "LLaVA-NeXT-mistral-7b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "UW-Madison & ByteDance",
-        "Alignment": 31.3,
-        "Safety": 15.2,
-        "Quality": 45.8,
-        "Bias": 69.9
-    }
-]
evals/mjbench-results/overall-results/LLaVA-NeXT-vicuna-13b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "LLaVA-NeXT-vicuna-13b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "UW-Madison & ByteDance",
-        "Alignment": 29.1,
-        "Safety": 27.9,
-        "Quality": 36.8,
-        "Bias": 56.3
-    }
-]
evals/mjbench-results/overall-results/MiniGPT4-v2.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "MiniGPT4-v2",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "Vision-CAIR",
-        "Alignment": 32.8,
-        "Safety": 25.7,
-        "Quality": 36.7,
-        "Bias": 32.6
-    }
-]
evals/mjbench-results/overall-results/PickScore-v1.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "PickScore-v1",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "Stability AI",
-        "Alignment": 58.8,
-        "Safety": 37.2,
-        "Quality": 83.8,
-        "Bias": 31.0
-    }
-]
evals/mjbench-results/overall-results/Prometheus-Vision-13b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Prometheus-Vision-13b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "prometheus-eval",
-        "Alignment": 11.8,
-        "Safety": 3.6,
-        "Quality": 8.7,
-        "Bias": 66.3
-    }
-]
evals/mjbench-results/overall-results/Prometheus-Vision-7b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Prometheus-Vision-7b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "prometheus-eval",
-        "Alignment": 18.8,
-        "Safety": 7.1,
-        "Quality": 23.4,
-        "Bias": 49.5
-    }
-]
evals/mjbench-results/overall-results/Qwen-VL-Chat.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Qwen-VL-Chat",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "Alibaba",
-        "Alignment": 52.1,
-        "Safety": 26.8,
-        "Quality": 23.6,
-        "Bias": 71.9
-    }
-]
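The overall-results files follow an even smaller schema: the same four metadata fields plus one score per axis (`Alignment`, `Safety`, `Quality`, `Bias`). A hypothetical sketch of turning them into a leaderboard table follows; the `Avg` column is an illustrative unweighted mean, not a metric defined by the benchmark, and the path is again an assumption:

```python
import json
from pathlib import Path

import pandas as pd

# Assumed checkout location of the overall-results files shown above.
OVERALL_DIR = Path("evals/mjbench-results/overall-results")

rows = [json.loads(p.read_text())[0] for p in sorted(OVERALL_DIR.glob("*.json"))]
board = pd.DataFrame(rows)

# Unweighted mean across the four axes, for illustration only.
board["Avg"] = board[["Alignment", "Safety", "Quality", "Bias"]].mean(axis=1)
print(board.sort_values("Avg", ascending=False).to_string(index=False))
```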
src/about.py CHANGED
@@ -21,15 +21,14 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">MMIE</h1>"""
 
-MJB_LOGO = '<img src="" alt="Logo" style="width: 30%; display: block; margin: auto;">'
+# MJB_LOGO = '<img src="" alt="Logo" style="width: 30%; display: block; margin: auto;">'
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-#
-
-[Website](https://mj-bench.github.io) | [Code](https://github.com/MJ-Bench/MJ-Bench) | [Eval. Dataset](https://huggingface.co/datasets/MJ-Bench/MJ-Bench) | [Results](https://huggingface.co/datasets/MJ-Bench/MJ-Bench-Results) | [Refined Model via RMs](https://huggingface.co/collections/MJ-Bench/aligned-diffusion-model-via-dpo-667f8b71f35c3ff47acafd43) | [Paper](https://arxiv.org/abs/2407.04842) | Total models: {}
+# MMIE: Massive Multimodal Interleaved Comprehension Benchmark for Large Vision-Language Models
+[Website](https://github.com/richard-peng-xia/MMIE) | [Code](https://github.com/richard-peng-xia/MMIE) | [Dataset](https://huggingface.co/datasets/MMIE/MMIE) | [Results](https://huggingface.co/datasets/MMIE/MMIE-Leaderboard) | [Eval Model](https://huggingface.co/MMIE/MMIE-Eval) | [Paper]()
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
src/envs.py CHANGED
@@ -9,9 +9,9 @@ TOKEN = os.environ.get("TOKEN") # A read/write token for your org
 OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"
-QUEUE_REPO = f"
-RESULTS_REPO = f"
+REPO_ID = f"MMIE/MMIE-Leaderboard"
+QUEUE_REPO = f"MMIE/MMIE-Requests"
+RESULTS_REPO = f"MMIE/MMIE-Results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
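In the demo-leaderboard template this Space is built on, constants like `QUEUE_REPO` and `RESULTS_REPO` are typically passed to `huggingface_hub.snapshot_download` to sync the request and result datasets locally before the app builds its tables. A sketch of that pattern with the new repo IDs; the exact call site in `app.py` and the `local_dir` choices are assumptions:

```python
from huggingface_hub import snapshot_download

# Repo IDs taken from the src/envs.py diff above. snapshot_download
# reuses the local cache, so re-running only fetches upstream changes.
EVAL_REQUESTS_PATH = snapshot_download(
    repo_id="MMIE/MMIE-Requests", repo_type="dataset", local_dir="./eval-queue"
)
EVAL_RESULTS_PATH = snapshot_download(
    repo_id="MMIE/MMIE-Results", repo_type="dataset", local_dir="./eval-results"
)
print(f"queue: {EVAL_REQUESTS_PATH}\nresults: {EVAL_RESULTS_PATH}")
```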
src/logo.png ADDED