wandra13 commited on
Commit
06b3f52
·
1 Parent(s): 1efe882

Update app.py with new model entries and logos; enhance CSS for improved UI. Added new assets including HumaniBench logo and vector favicon. Refactored styles for better layout and responsiveness.

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
36
+ src/assets/teaser_figure_humanibench.png filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -14,55 +14,52 @@ GITHUB_URL = "https://github.com/VectorInstitute/humaniBench"
14
  DATASET_URL = "https://huggingface.co/datasets/vector-institute/HumaniBench"
15
  WEBSITE_URL = "https://vectorinstitute.github.io/humanibench/"
16
 
17
- black_logo_path = "src/assets/logo-icon-black.png"
18
- white_logo_path = "src/assets/logo-icon-white.png"
19
 
20
  # ========================
21
- # MODEL REGISTRY
22
- # Ordered by overall performance (update once paper results are confirmed)
23
  # ========================
24
 
25
  MODELS = [
26
- {"model": "GPT-4o", "link": "https://openai.com/gpt-4", "org": "OpenAI", "params": "-", "type": "Closed"},
27
- {"model": "Gemini-2.0-Flash", "link": "https://deepmind.google/technologies/gemini/", "org": "Google", "params": "-", "type": "Closed"},
28
- {"model": "InternVL2-26B", "link": "https://huggingface.co/OpenGVLab/InternVL2-26B", "org": "OpenGVLab", "params": "26B", "type": "Open"},
29
- {"model": "InternVL-Chat-v1.5", "link": "https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5", "org": "OpenGVLab", "params": "26B", "type": "Open"},
30
- {"model": "LLaVA-NeXT-34B", "link": "https://huggingface.co/lmms-lab/llava-next-34b", "org": "lmms-lab", "params": "34B", "type": "Open"},
31
- {"model": "LLaVA-NeXT-13B", "link": "https://huggingface.co/lmms-lab/llava-next-vicuna-13b", "org": "lmms-lab", "params": "13B", "type": "Open"},
32
- {"model": "LLaVA-1.5-13B", "link": "https://huggingface.co/llava-hf/llava-1.5-13b-hf", "org": "llava-hf", "params": "13B", "type": "Open"},
33
- {"model": "Qwen-VL-Chat", "link": "https://huggingface.co/Qwen/Qwen-VL-Chat", "org": "Alibaba", "params": "7B", "type": "Open"},
34
- {"model": "CogVLM2-19B", "link": "https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B", "org": "THUDM", "params": "19B", "type": "Open"},
35
- {"model": "IDEFICS2-8B", "link": "https://huggingface.co/HuggingFaceM4/idefics2-8b", "org": "HuggingFace", "params": "8B", "type": "Open"},
36
- {"model": "InstructBLIP-13B", "link": "https://huggingface.co/Salesforce/instructblip-vicuna-13b", "org": "Salesforce", "params": "13B", "type": "Open"},
37
- {"model": "Phi-3.5-Vision", "link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct", "org": "Microsoft", "params": "4B", "type": "Open"},
38
- {"model": "MiniCPM-V-2.6", "link": "https://huggingface.co/openbmb/MiniCPM-V-2_6", "org": "OpenBMB", "params": "8B", "type": "Open"},
39
- {"model": "BLIP-2-FlanT5-XXL", "link": "https://huggingface.co/Salesforce/blip2-flan-t5-xxl", "org": "Salesforce", "params": "11B", "type": "Open"},
40
- {"model": "mPLUG-Owl2", "link": "https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b", "org": "Alibaba DAMO", "params": "7B", "type": "Open"},
41
  ]
42
 
43
  # ========================
44
  # PRINCIPLE DATA (Table A2)
45
- # 7 Human-Centric (HC) principles, scores per model
46
- # Replace None with actual values from the paper
47
  # ========================
48
 
49
- # Columns: Fairness | Ethics | Understanding | Reasoning | Language | Empathy | Robustness
50
  PRINCIPLE_DATA = [
51
- {"model": "GPT-4o", "link": MODELS[0]["link"], "Fairness": None, "Ethics": None, "Understanding": None, "Reasoning": None, "Language": None, "Empathy": None, "Robustness": None, "Overall": None},
52
- {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Fairness": None, "Ethics": None, "Understanding": None, "Reasoning": None, "Language": None, "Empathy": None, "Robustness": None, "Overall": None},
53
- {"model": "InternVL2-26B", "link": MODELS[2]["link"], "Fairness": None, "Ethics": None, "Understanding": None, "Reasoning": None, "Language": None, "Empathy": None, "Robustness": None, "Overall": None},
54
- {"model": "InternVL-Chat-v1.5", "link": MODELS[3]["link"], "Fairness": None, "Ethics": None, "Understanding": None, "Reasoning": None, "Language": None, "Empathy": None, "Robustness": None, "Overall": None},
55
- {"model": "LLaVA-NeXT-34B", "link": MODELS[4]["link"], "Fairness": None, "Ethics": None, "Understanding": None, "Reasoning": None, "Language": None, "Empathy": None, "Robustness": None, "Overall": None},
56
- {"model": "LLaVA-NeXT-13B", "link": MODELS[5]["link"], "Fairness": None, "Ethics": None, "Understanding": None, "Reasoning": None, "Language": None, "Empathy": None, "Robustness": None, "Overall": None},
57
- {"model": "LLaVA-1.5-13B", "link": MODELS[6]["link"], "Fairness": None, "Ethics": None, "Understanding": None, "Reasoning": None, "Language": None, "Empathy": None, "Robustness": None, "Overall": None},
58
- {"model": "Qwen-VL-Chat", "link": MODELS[7]["link"], "Fairness": None, "Ethics": None, "Understanding": None, "Reasoning": None, "Language": None, "Empathy": None, "Robustness": None, "Overall": None},
59
- {"model": "CogVLM2-19B", "link": MODELS[8]["link"], "Fairness": None, "Ethics": None, "Understanding": None, "Reasoning": None, "Language": None, "Empathy": None, "Robustness": None, "Overall": None},
60
- {"model": "IDEFICS2-8B", "link": MODELS[9]["link"], "Fairness": None, "Ethics": None, "Understanding": None, "Reasoning": None, "Language": None, "Empathy": None, "Robustness": None, "Overall": None},
61
- {"model": "InstructBLIP-13B", "link": MODELS[10]["link"], "Fairness": None, "Ethics": None, "Understanding": None, "Reasoning": None, "Language": None, "Empathy": None, "Robustness": None, "Overall": None},
62
- {"model": "Phi-3.5-Vision", "link": MODELS[11]["link"], "Fairness": None, "Ethics": None, "Understanding": None, "Reasoning": None, "Language": None, "Empathy": None, "Robustness": None, "Overall": None},
63
- {"model": "MiniCPM-V-2.6", "link": MODELS[12]["link"], "Fairness": None, "Ethics": None, "Understanding": None, "Reasoning": None, "Language": None, "Empathy": None, "Robustness": None, "Overall": None},
64
- {"model": "BLIP-2-FlanT5-XXL", "link": MODELS[13]["link"], "Fairness": None, "Ethics": None, "Understanding": None, "Reasoning": None, "Language": None, "Empathy": None, "Robustness": None, "Overall": None},
65
- {"model": "mPLUG-Owl2", "link": MODELS[14]["link"], "Fairness": None, "Ethics": None, "Understanding": None, "Reasoning": None, "Language": None, "Empathy": None, "Robustness": None, "Overall": None},
66
  ]
67
 
68
  # ========================
@@ -70,139 +67,158 @@ PRINCIPLE_DATA = [
70
  T1–T7 per-model accuracy / scores
71
  # ========================
72
 
73
- # T1: Scene Understanding — Accuracy (%)
 
 
 
 
 
 
 
 
 
74
  T1_DATA = [
75
- {"model": "GPT-4o", "link": MODELS[0]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
76
- {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
77
- {"model": "InternVL2-26B", "link": MODELS[2]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
78
- {"model": "InternVL-Chat-v1.5", "link": MODELS[3]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
79
- {"model": "LLaVA-NeXT-34B", "link": MODELS[4]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
80
- {"model": "LLaVA-NeXT-13B", "link": MODELS[5]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
81
- {"model": "LLaVA-1.5-13B", "link": MODELS[6]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
82
- {"model": "Qwen-VL-Chat", "link": MODELS[7]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
83
- {"model": "CogVLM2-19B", "link": MODELS[8]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
84
- {"model": "IDEFICS2-8B", "link": MODELS[9]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
85
- {"model": "InstructBLIP-13B", "link": MODELS[10]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
86
- {"model": "Phi-3.5-Vision", "link": MODELS[11]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
87
- {"model": "MiniCPM-V-2.6", "link": MODELS[12]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
88
- {"model": "BLIP-2-FlanT5-XXL", "link": MODELS[13]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
89
- {"model": "mPLUG-Owl2", "link": MODELS[14]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
90
  ]
91
 
92
- # T2: Instance Identity — Accuracy (%)
 
 
93
  T2_DATA = [
94
- {"model": "GPT-4o", "link": MODELS[0]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
95
- {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
96
- {"model": "InternVL2-26B", "link": MODELS[2]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
97
- {"model": "InternVL-Chat-v1.5", "link": MODELS[3]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
98
- {"model": "LLaVA-NeXT-34B", "link": MODELS[4]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
99
- {"model": "LLaVA-NeXT-13B", "link": MODELS[5]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
100
- {"model": "LLaVA-1.5-13B", "link": MODELS[6]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
101
- {"model": "Qwen-VL-Chat", "link": MODELS[7]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
102
- {"model": "CogVLM2-19B", "link": MODELS[8]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
103
- {"model": "IDEFICS2-8B", "link": MODELS[9]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
104
- {"model": "InstructBLIP-13B", "link": MODELS[10]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
105
- {"model": "Phi-3.5-Vision", "link": MODELS[11]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
106
- {"model": "MiniCPM-V-2.6", "link": MODELS[12]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
107
- {"model": "BLIP-2-FlanT5-XXL", "link": MODELS[13]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
108
- {"model": "mPLUG-Owl2", "link": MODELS[14]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
109
  ]
110
 
111
- # T3: Multiple-Choice VQA — Accuracy (%)
 
 
112
  T3_DATA = [
113
- {"model": "GPT-4o", "link": MODELS[0]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
114
- {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
115
- {"model": "InternVL2-26B", "link": MODELS[2]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
116
- {"model": "InternVL-Chat-v1.5", "link": MODELS[3]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
117
- {"model": "LLaVA-NeXT-34B", "link": MODELS[4]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
118
- {"model": "LLaVA-NeXT-13B", "link": MODELS[5]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
119
- {"model": "LLaVA-1.5-13B", "link": MODELS[6]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
120
- {"model": "Qwen-VL-Chat", "link": MODELS[7]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
121
- {"model": "CogVLM2-19B", "link": MODELS[8]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
122
- {"model": "IDEFICS2-8B", "link": MODELS[9]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
123
- {"model": "InstructBLIP-13B", "link": MODELS[10]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
124
- {"model": "Phi-3.5-Vision", "link": MODELS[11]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
125
- {"model": "MiniCPM-V-2.6", "link": MODELS[12]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
126
- {"model": "BLIP-2-FlanT5-XXL", "link": MODELS[13]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
127
- {"model": "mPLUG-Owl2", "link": MODELS[14]["link"], "Age": None, "Gender": None, "Race": None, "Occupation": None, "Sports": None, "Overall": None},
128
  ]
129
 
130
- # T4: Multilingual VQA — Accuracy (%) per language
131
- LANGUAGES = ["English", "Spanish", "French", "German", "Chinese", "Japanese", "Arabic", "Hindi", "Portuguese", "Italian", "Korean"]
132
 
 
133
  T4_DATA = [
134
- {"model": "GPT-4o", "link": MODELS[0]["link"], "English": None, "Spanish": None, "French": None, "German": None, "Chinese": None, "Japanese": None, "Arabic": None, "Hindi": None, "Portuguese": None, "Italian": None, "Korean": None, "Avg": None},
135
- {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "English": None, "Spanish": None, "French": None, "German": None, "Chinese": None, "Japanese": None, "Arabic": None, "Hindi": None, "Portuguese": None, "Italian": None, "Korean": None, "Avg": None},
136
- {"model": "InternVL2-26B", "link": MODELS[2]["link"], "English": None, "Spanish": None, "French": None, "German": None, "Chinese": None, "Japanese": None, "Arabic": None, "Hindi": None, "Portuguese": None, "Italian": None, "Korean": None, "Avg": None},
137
- {"model": "InternVL-Chat-v1.5", "link": MODELS[3]["link"], "English": None, "Spanish": None, "French": None, "German": None, "Chinese": None, "Japanese": None, "Arabic": None, "Hindi": None, "Portuguese": None, "Italian": None, "Korean": None, "Avg": None},
138
- {"model": "LLaVA-NeXT-34B", "link": MODELS[4]["link"], "English": None, "Spanish": None, "French": None, "German": None, "Chinese": None, "Japanese": None, "Arabic": None, "Hindi": None, "Portuguese": None, "Italian": None, "Korean": None, "Avg": None},
139
- {"model": "LLaVA-NeXT-13B", "link": MODELS[5]["link"], "English": None, "Spanish": None, "French": None, "German": None, "Chinese": None, "Japanese": None, "Arabic": None, "Hindi": None, "Portuguese": None, "Italian": None, "Korean": None, "Avg": None},
140
- {"model": "LLaVA-1.5-13B", "link": MODELS[6]["link"], "English": None, "Spanish": None, "French": None, "German": None, "Chinese": None, "Japanese": None, "Arabic": None, "Hindi": None, "Portuguese": None, "Italian": None, "Korean": None, "Avg": None},
141
- {"model": "Qwen-VL-Chat", "link": MODELS[7]["link"], "English": None, "Spanish": None, "French": None, "German": None, "Chinese": None, "Japanese": None, "Arabic": None, "Hindi": None, "Portuguese": None, "Italian": None, "Korean": None, "Avg": None},
142
- {"model": "CogVLM2-19B", "link": MODELS[8]["link"], "English": None, "Spanish": None, "French": None, "German": None, "Chinese": None, "Japanese": None, "Arabic": None, "Hindi": None, "Portuguese": None, "Italian": None, "Korean": None, "Avg": None},
143
- {"model": "IDEFICS2-8B", "link": MODELS[9]["link"], "English": None, "Spanish": None, "French": None, "German": None, "Chinese": None, "Japanese": None, "Arabic": None, "Hindi": None, "Portuguese": None, "Italian": None, "Korean": None, "Avg": None},
144
- {"model": "InstructBLIP-13B", "link": MODELS[10]["link"], "English": None, "Spanish": None, "French": None, "German": None, "Chinese": None, "Japanese": None, "Arabic": None, "Hindi": None, "Portuguese": None, "Italian": None, "Korean": None, "Avg": None},
145
- {"model": "Phi-3.5-Vision", "link": MODELS[11]["link"], "English": None, "Spanish": None, "French": None, "German": None, "Chinese": None, "Japanese": None, "Arabic": None, "Hindi": None, "Portuguese": None, "Italian": None, "Korean": None, "Avg": None},
146
- {"model": "MiniCPM-V-2.6", "link": MODELS[12]["link"], "English": None, "Spanish": None, "French": None, "German": None, "Chinese": None, "Japanese": None, "Arabic": None, "Hindi": None, "Portuguese": None, "Italian": None, "Korean": None, "Avg": None},
147
- {"model": "BLIP-2-FlanT5-XXL", "link": MODELS[13]["link"], "English": None, "Spanish": None, "French": None, "German": None, "Chinese": None, "Japanese": None, "Arabic": None, "Hindi": None, "Portuguese": None, "Italian": None, "Korean": None, "Avg": None},
148
- {"model": "mPLUG-Owl2", "link": MODELS[14]["link"], "English": None, "Spanish": None, "French": None, "German": None, "Chinese": None, "Japanese": None, "Arabic": None, "Hindi": None, "Portuguese": None, "Italian": None, "Korean": None, "Avg": None},
149
  ]
150
 
151
- # T5: Visual Grounding — Accuracy (%)
 
 
152
  T5_DATA = [
153
- {"model": "GPT-4o", "link": MODELS[0]["link"], "Overall": None},
154
- {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Overall": None},
155
- {"model": "InternVL2-26B", "link": MODELS[2]["link"], "Overall": None},
156
- {"model": "InternVL-Chat-v1.5", "link": MODELS[3]["link"], "Overall": None},
157
- {"model": "LLaVA-NeXT-34B", "link": MODELS[4]["link"], "Overall": None},
158
- {"model": "LLaVA-NeXT-13B", "link": MODELS[5]["link"], "Overall": None},
159
- {"model": "LLaVA-1.5-13B", "link": MODELS[6]["link"], "Overall": None},
160
- {"model": "Qwen-VL-Chat", "link": MODELS[7]["link"], "Overall": None},
161
- {"model": "CogVLM2-19B", "link": MODELS[8]["link"], "Overall": None},
162
- {"model": "IDEFICS2-8B", "link": MODELS[9]["link"], "Overall": None},
163
- {"model": "InstructBLIP-13B", "link": MODELS[10]["link"], "Overall": None},
164
- {"model": "Phi-3.5-Vision", "link": MODELS[11]["link"], "Overall": None},
165
- {"model": "MiniCPM-V-2.6", "link": MODELS[12]["link"], "Overall": None},
166
- {"model": "BLIP-2-FlanT5-XXL", "link": MODELS[13]["link"], "Overall": None},
167
- {"model": "mPLUG-Owl2", "link": MODELS[14]["link"], "Overall": None},
168
  ]
169
 
170
- # T6: Empathetic Captioning — quality score
 
 
171
  T6_DATA = [
172
- {"model": "GPT-4o", "link": MODELS[0]["link"], "Score": None},
173
- {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Score": None},
174
- {"model": "InternVL2-26B", "link": MODELS[2]["link"], "Score": None},
175
- {"model": "InternVL-Chat-v1.5", "link": MODELS[3]["link"], "Score": None},
176
- {"model": "LLaVA-NeXT-34B", "link": MODELS[4]["link"], "Score": None},
177
- {"model": "LLaVA-NeXT-13B", "link": MODELS[5]["link"], "Score": None},
178
- {"model": "LLaVA-1.5-13B", "link": MODELS[6]["link"], "Score": None},
179
- {"model": "Qwen-VL-Chat", "link": MODELS[7]["link"], "Score": None},
180
- {"model": "CogVLM2-19B", "link": MODELS[8]["link"], "Score": None},
181
- {"model": "IDEFICS2-8B", "link": MODELS[9]["link"], "Score": None},
182
- {"model": "InstructBLIP-13B", "link": MODELS[10]["link"], "Score": None},
183
- {"model": "Phi-3.5-Vision", "link": MODELS[11]["link"], "Score": None},
184
- {"model": "MiniCPM-V-2.6", "link": MODELS[12]["link"], "Score": None},
185
- {"model": "BLIP-2-FlanT5-XXL", "link": MODELS[13]["link"], "Score": None},
186
- {"model": "mPLUG-Owl2", "link": MODELS[14]["link"], "Score": None},
187
  ]
188
 
189
- # T7: Image Resilience — Accuracy (%)
 
 
190
  T7_DATA = [
191
- {"model": "GPT-4o", "link": MODELS[0]["link"], "Overall": None},
192
- {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Overall": None},
193
- {"model": "InternVL2-26B", "link": MODELS[2]["link"], "Overall": None},
194
- {"model": "InternVL-Chat-v1.5", "link": MODELS[3]["link"], "Overall": None},
195
- {"model": "LLaVA-NeXT-34B", "link": MODELS[4]["link"], "Overall": None},
196
- {"model": "LLaVA-NeXT-13B", "link": MODELS[5]["link"], "Overall": None},
197
- {"model": "LLaVA-1.5-13B", "link": MODELS[6]["link"], "Overall": None},
198
- {"model": "Qwen-VL-Chat", "link": MODELS[7]["link"], "Overall": None},
199
- {"model": "CogVLM2-19B", "link": MODELS[8]["link"], "Overall": None},
200
- {"model": "IDEFICS2-8B", "link": MODELS[9]["link"], "Overall": None},
201
- {"model": "InstructBLIP-13B", "link": MODELS[10]["link"], "Overall": None},
202
- {"model": "Phi-3.5-Vision", "link": MODELS[11]["link"], "Overall": None},
203
- {"model": "MiniCPM-V-2.6", "link": MODELS[12]["link"], "Overall": None},
204
- {"model": "BLIP-2-FlanT5-XXL", "link": MODELS[13]["link"], "Overall": None},
205
- {"model": "mPLUG-Owl2", "link": MODELS[14]["link"], "Overall": None},
206
  ]
207
 
208
 
@@ -240,7 +256,7 @@ INTRODUCTION_HTML = f"""
240
  <div class="stat-label">Image–Question Pairs</div>
241
  </div>
242
  <div class="stat-box">
243
- <div class="stat-value">1,500</div>
244
  <div class="stat-label">Unique Images</div>
245
  </div>
246
  <div class="stat-box">
@@ -268,7 +284,7 @@ grounded in seven human-centric (HC) principles.
268
  ### Dataset Overview
269
 
270
  - **32,000+ expert-verified** image–question pairs from real-world news imagery
271
- - **1,500 unique images** spanning diverse social contexts
272
  - **7 evaluation tasks** (T1–T7) covering scene understanding, identity, reasoning, language, grounding, empathy, and robustness
273
  - **7 HC principles**: Fairness, Ethics, Understanding, Reasoning, Language, Empathy, Robustness
274
  - **5 social attributes**: Age, Gender, Race, Occupation, Sports
@@ -315,6 +331,7 @@ This dataset is released under **CC BY-NC-SA 4.0**.
315
 
316
  ### Contact
317
 
 
318
  - **Website:** [{WEBSITE_URL}]({WEBSITE_URL})
319
  - **Dataset:** [HuggingFace]({DATASET_URL})
320
  - **Code:** [GitHub]({GITHUB_URL})
@@ -329,7 +346,7 @@ This dataset is released under **CC BY-NC-SA 4.0**.
329
  # TABLE BUILDERS
330
  # ========================
331
 
332
- def _make_df(data: list[dict], score_cols: list[str], pct: bool = True) -> pd.DataFrame:
333
  rows = []
334
  for item in data:
335
  row = {"Model": make_clickable_model(item["model"], item.get("link"))}
@@ -364,7 +381,7 @@ def build_overall_leaderboard():
364
  )
365
 
366
 
367
- def build_task_leaderboard(task_data: list[dict], score_cols: list[str], pct: bool = True):
368
  df = _make_df(task_data, score_cols, pct=pct)
369
  return gr.Dataframe(
370
  value=df,
@@ -375,9 +392,9 @@ def build_task_leaderboard(task_data: list[dict], score_cols: list[str], pct: bo
375
  )
376
 
377
 
378
- def build_social_leaderboard(task_data: list[dict]):
379
- ATTR_COLS = ["Age", "Gender", "Race", "Occupation", "Sports", "Overall"]
380
- return build_task_leaderboard(task_data, ATTR_COLS, pct=True)
381
 
382
 
383
  def build_multilingual_leaderboard():
@@ -396,20 +413,33 @@ with demo:
396
  <div id="page-header">
397
  <div id="header-container">
398
  <div id="left-container">
399
- <img id="black-logo" src="/gradio_api/file={black_logo_path}" onerror="this.style.display='none'">
400
- <img id="white-logo" src="/gradio_api/file={white_logo_path}" onerror="this.style.display='none'">
 
 
401
  </div>
402
  <div id="centre-container">
403
  <h1>HumaniBench Leaderboard</h1>
404
  <p>A Human-Centric Evaluation Framework for Large Multimodal Models</p>
405
  </div>
406
- <div id="right-container"></div>
 
 
 
407
  </div>
408
  </div>
409
  """)
410
 
411
  gr.HTML(INTRODUCTION_HTML)
412
 
 
 
 
 
 
 
 
 
413
  with gr.Tabs():
414
 
415
  # ── Tab 1: Overall Rankings ──────────────────────────────────────
@@ -422,7 +452,7 @@ with demo:
422
  </div>
423
  """, elem_classes="markdown-text")
424
  build_overall_leaderboard()
425
- gr.Markdown("*Scores are averaged across tasks associated with each principle. -- indicates data not yet available.*")
426
 
427
  # ── Tab 2: Task Results ──────────────────────────────────────────
428
  with gr.Tab("Task Results"):
@@ -435,80 +465,34 @@ with demo:
435
 
436
  with gr.Tabs():
437
  with gr.Tab("T1 · Scene Understanding"):
438
- gr.Markdown("**Metric:** Accuracy (%) | Breakdown by social attribute (Age / Gender / Race / Occupation / Sports)")
439
- build_social_leaderboard(T1_DATA)
440
 
441
  with gr.Tab("T2 · Instance Identity"):
442
- gr.Markdown("**Metric:** Accuracy (%) | Breakdown by social attribute")
443
- build_social_leaderboard(T2_DATA)
444
 
445
  with gr.Tab("T3 · MC-VQA"):
446
- gr.Markdown("**Metric:** Accuracy (%) | Breakdown by social attribute")
447
- build_social_leaderboard(T3_DATA)
448
 
449
  with gr.Tab("T4 · Multilingual"):
450
- gr.Markdown("**Metric:** Accuracy (%) across 11 languages — see the Multilingual tab for the full view.")
451
  build_multilingual_leaderboard()
452
 
453
  with gr.Tab("T5 · Visual Grounding"):
454
- gr.Markdown("**Metric:** Accuracy (%)")
455
- build_task_leaderboard(T5_DATA, ["Overall"], pct=True)
456
 
457
  with gr.Tab("T6 · Empathetic Captioning"):
458
- gr.Markdown("**Metric:** Empathy score")
459
- build_task_leaderboard(T6_DATA, ["Score"], pct=False)
460
 
461
  with gr.Tab("T7 · Image Resilience"):
462
- gr.Markdown("**Metric:** Accuracy (%)")
463
- build_task_leaderboard(T7_DATA, ["Overall"], pct=True)
464
-
465
- # ── Tab 3: Social Attributes ─────────────────────────────────────
466
- with gr.Tab("Social Attributes"):
467
- gr.Markdown("""
468
- <div class="warning-box">
469
- <h3>Fairness Analysis: Accuracy by Social Attribute</h3>
470
- Performance breakdowns across Age, Gender, Race, Occupation, and Sports
471
- for Tasks T1, T2, and T3. Disparities reveal systematic biases.
472
- </div>
473
- """, elem_classes="markdown-text")
474
-
475
- with gr.Tabs():
476
- with gr.Tab("T1 · Scene Understanding"):
477
- gr.Markdown("**Metric:** Accuracy (%) per social attribute group")
478
- build_social_leaderboard(T1_DATA)
479
-
480
- with gr.Tab("T2 · Instance Identity"):
481
- gr.Markdown("**Metric:** Accuracy (%) per social attribute group")
482
- build_social_leaderboard(T2_DATA)
483
-
484
- with gr.Tab("T3 · MC-VQA"):
485
- gr.Markdown("**Metric:** Accuracy (%) per social attribute group")
486
- build_social_leaderboard(T3_DATA)
487
-
488
- gr.Markdown("""
489
- <div class="warning-box">
490
- ⚠️ Performance disparities across demographic groups should be addressed before deploying models
491
- in high-stakes settings.
492
- </div>
493
- """, elem_classes="markdown-text")
494
-
495
- # ── Tab 4: Multilingual ──────────────────────────────────────────
496
- with gr.Tab("Multilingual (T4)"):
497
- gr.Markdown("""
498
- <div class="info-box">
499
- <h3>Task T4: Multilingual VQA — Accuracy (%) per Language</h3>
500
- Models are evaluated on visual questions posed in 11 languages. Avg is the
501
- macro-average across all languages.
502
- </div>
503
- """, elem_classes="markdown-text")
504
- build_multilingual_leaderboard()
505
- gr.Markdown("""
506
- **Languages:** English · Spanish · French · German · Chinese · Japanese · Arabic · Hindi · Portuguese · Italian · Korean
507
-
508
- *Gaps between high-resource (English, French) and low-resource (Arabic, Hindi) languages expose multilingual inclusivity limitations.*
509
- """, elem_classes="markdown-text")
510
 
511
- # ── Tab 5: About ────────────────────────────────────────────────
512
  with gr.Tab("About"):
513
  gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
514
 
@@ -526,6 +510,4 @@ with demo:
526
 
527
 
528
  if __name__ == "__main__":
529
- import os
530
- assets = [p for p in [black_logo_path, white_logo_path] if os.path.exists(p)]
531
- demo.launch(allowed_paths=assets if assets else None)
 
14
  DATASET_URL = "https://huggingface.co/datasets/vector-institute/HumaniBench"
15
  WEBSITE_URL = "https://vectorinstitute.github.io/humanibench/"
16
 
17
+ vector_logo_path = "src/assets/vector-favicon-48x48.svg"
18
+ humanibench_logo_path = "src/assets/HumaniBenchLogo.ico"
19
 
20
  # ========================
21
+ # MODEL REGISTRY (Table A2 order)
 
22
  # ========================
23
 
24
  MODELS = [
25
+ {"model": "GPT-4o", "link": "https://openai.com/gpt-4o", "org": "OpenAI", "params": "-", "type": "Closed"},
26
+ {"model": "Gemini-2.0-Flash", "link": "https://deepmind.google/technologies/gemini/", "org": "Google", "params": "-", "type": "Closed"},
27
+ {"model": "Qwen-2.5-7B", "link": "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct", "org": "Alibaba", "params": "7B", "type": "Open"},
28
+ {"model": "LLaVA-v1.6", "link": "https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf", "org": "LLaVA", "params": "7B", "type": "Open"},
29
+ {"model": "Phi-4", "link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct", "org": "Microsoft", "params": "5.6B", "type": "Open"},
30
+ {"model": "Gemma-3", "link": "https://huggingface.co/google/gemma-3-4b-it", "org": "Google", "params": "4B", "type": "Open"},
31
+ {"model": "CogVLM2-19B", "link": "https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B", "org": "THUDM", "params": "19B", "type": "Open"},
32
+ {"model": "Phi-3.5", "link": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct", "org": "Microsoft", "params": "4B", "type": "Open"},
33
+ {"model": "Molmo-7V", "link": "https://huggingface.co/allenai/Molmo-7B-O-0924", "org": "Allen AI", "params": "7B", "type": "Open"},
34
+ {"model": "Aya-Vision-8B", "link": "https://huggingface.co/CohereForAI/aya-vision-8b", "org": "Cohere", "params": "8B", "type": "Open"},
35
+ {"model": "InternVL2.5", "link": "https://huggingface.co/OpenGVLab/InternVL2_5-8B", "org": "OpenGVLab", "params": "8B", "type": "Open"},
36
+ {"model": "Janus-Pro-7B", "link": "https://huggingface.co/deepseek-ai/Janus-Pro-7B", "org": "DeepSeek", "params": "7B", "type": "Open"},
37
+ {"model": "GLM-4V-9B", "link": "https://huggingface.co/THUDM/glm-4v-9b", "org": "THUDM", "params": "9B", "type": "Open"},
38
+ {"model": "Llama-3.2-11B", "link": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct", "org": "Meta", "params": "11B", "type": "Open"},
39
+ {"model": "DeepSeek-VL2-Small", "link": "https://huggingface.co/deepseek-ai/deepseek-vl2-small", "org": "DeepSeek", "params": "3B", "type": "Open"},
40
  ]
41
 
42
  # ========================
43
  # PRINCIPLE DATA (Table A2)
44
+ # Scores are percentages; Overall = mean of all 7 principles
 
45
  # ========================
46
 
 
47
  PRINCIPLE_DATA = [
48
+ {"model": "GPT-4o", "link": MODELS[0]["link"], "Fairness": 61.1, "Ethics": 99.0, "Understanding": 74.8, "Reasoning": 79.2, "Language": 62.5, "Empathy": 90.5, "Robustness": 50.90, "Overall": 74.00},
49
+ {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Fairness": 61.0, "Ethics": 98.9, "Understanding": 73.5, "Reasoning": 78.8, "Language": 62.2, "Empathy": 89.5, "Robustness": 57.20, "Overall": 74.44},
50
+ {"model": "Qwen-2.5-7B", "link": MODELS[2]["link"], "Fairness": 63.1, "Ethics": 96.5, "Understanding": 84.9, "Reasoning": 67.1, "Language": 57.4, "Empathy": 73.8, "Robustness": 53.60, "Overall": 70.91},
51
+ {"model": "LLaVA-v1.6", "link": MODELS[3]["link"], "Fairness": 59.7, "Ethics": 94.4, "Understanding": 80.3, "Reasoning": 68.1, "Language": 55.4, "Empathy": 66.3, "Robustness": 60.60, "Overall": 69.26},
52
+ {"model": "Phi-4", "link": MODELS[4]["link"], "Fairness": 59.2, "Ethics": 98.2, "Understanding": 78.6, "Reasoning": 77.4, "Language": 61.3, "Empathy": 79.0, "Robustness": 45.70, "Overall": 71.34},
53
+ {"model": "Gemma-3", "link": MODELS[5]["link"], "Fairness": 57.5, "Ethics": 94.6, "Understanding": 73.2, "Reasoning": 67.8, "Language": 57.7, "Empathy": 79.8, "Robustness": 58.30, "Overall": 69.84},
54
+ {"model": "CogVLM2-19B", "link": MODELS[6]["link"], "Fairness": 53.1, "Ethics": 96.3, "Understanding": 67.5, "Reasoning": 74.4, "Language": 60.4, "Empathy": 68.0, "Robustness": 35.12, "Overall": 64.97},
55
+ {"model": "Phi-3.5", "link": MODELS[7]["link"], "Fairness": 56.0, "Ethics": 96.1, "Understanding": 72.3, "Reasoning": 69.7, "Language": 57.3, "Empathy": 70.8, "Robustness": 50.50, "Overall": 67.53},
56
+ {"model": "Molmo-7V", "link": MODELS[8]["link"], "Fairness": 52.4, "Ethics": 94.8, "Understanding": 66.2, "Reasoning": 65.8, "Language": 55.0, "Empathy": 58.8, "Robustness": 49.70, "Overall": 63.24},
57
+ {"model": "Aya-Vision-8B", "link": MODELS[9]["link"], "Fairness": 51.7, "Ethics": 94.9, "Understanding": 64.4, "Reasoning": 68.1, "Language": 50.8, "Empathy": 77.8, "Robustness": 45.90, "Overall": 64.80},
58
+ {"model": "InternVL2.5", "link": MODELS[10]["link"], "Fairness": 50.9, "Ethics": 93.8, "Understanding": 63.8, "Reasoning": 64.4, "Language": 51.1, "Empathy": 74.5, "Robustness": 56.40, "Overall": 64.99},
59
+ {"model": "Janus-Pro-7B", "link": MODELS[11]["link"], "Fairness": 50.2, "Ethics": 96.9, "Understanding": 63.3, "Reasoning": 65.2, "Language": 57.6, "Empathy": 69.5, "Robustness": 52.80, "Overall": 65.07},
60
+ {"model": "GLM-4V-9B", "link": MODELS[12]["link"], "Fairness": 50.2, "Ethics": 94.4, "Understanding": 63.9, "Reasoning": 63.0, "Language": 50.0, "Empathy": 67.8, "Robustness": 50.50, "Overall": 62.83},
61
+ {"model": "Llama-3.2-11B", "link": MODELS[13]["link"], "Fairness": 50.2, "Ethics": 94.9, "Understanding": 58.9, "Reasoning": 63.0, "Language": 50.7, "Empathy": 71.3, "Robustness": 56.70, "Overall": 63.67},
62
+ {"model": "DeepSeek-VL2-Small", "link": MODELS[14]["link"], "Fairness": 48.8, "Ethics": 90.6, "Understanding": 54.8, "Reasoning": 61.6, "Language": 49.1, "Empathy": 59.3, "Robustness": 55.70, "Overall": 59.99},
63
  ]
64
 
65
  # ========================
 
67
  # T1–T7 per-model accuracy / scores
68
  # ========================
69
 
70
+ def _task_rows(extra_keys: list) -> list:
71
+ """Generate per-model rows with None scores for the given extra columns."""
72
+ return [
73
+ {"model": m["model"], "link": m["link"], **{k: None for k in extra_keys}}
74
+ for m in MODELS
75
+ ]
76
+
77
+ T1_COLS = ["Accuracy", "Bias", "Hallucination", "Faithfulness", "Context Rel.", "Coherence"]
78
+
79
+ # T1: Scene Understanding (Open-Ended VQA)
80
  T1_DATA = [
81
+ {"model": "GPT-4o", "link": MODELS[0]["link"], "Accuracy": 74.80, "Bias": 0.90, "Hallucination": 2.10, "Faithfulness": 76.50, "Context Rel.": 75.20, "Coherence": 75.80},
82
+ {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Accuracy": 73.20, "Bias": 1.10, "Hallucination": 1.70, "Faithfulness": 75.90, "Context Rel.": 74.30, "Coherence": 74.80},
83
+ {"model": "Qwen-2.5-7B", "link": MODELS[2]["link"], "Accuracy": 67.37, "Bias": 9.33, "Hallucination": 9.38, "Faithfulness": 67.92, "Context Rel.": 66.28, "Coherence": 66.40},
84
+ {"model": "LLaVA-v1.6", "link": MODELS[3]["link"], "Accuracy": 64.34, "Bias": 9.03, "Hallucination": 9.12, "Faithfulness": 65.33, "Context Rel.": 68.10, "Coherence": 66.90},
85
+ {"model": "Phi-4", "link": MODELS[4]["link"], "Accuracy": 68.10, "Bias": 1.23, "Hallucination": 3.12, "Faithfulness": 72.38, "Context Rel.": 73.47, "Coherence": 73.20},
86
+ {"model": "Gemma-3", "link": MODELS[5]["link"], "Accuracy": 66.50, "Bias": 8.50, "Hallucination": 8.20, "Faithfulness": 70.10, "Context Rel.": 68.30, "Coherence": 69.00},
87
+ {"model": "CogVLM2-19B", "link": MODELS[6]["link"], "Accuracy": 67.34, "Bias": 11.38, "Hallucination": 10.45, "Faithfulness": 69.01, "Context Rel.": 71.29, "Coherence": 69.80},
88
+ {"model": "Phi-3.5", "link": MODELS[7]["link"], "Accuracy": 67.19, "Bias": 2.40, "Hallucination": 5.21, "Faithfulness": 67.45, "Context Rel.": 65.28, "Coherence": 65.90},
89
+ {"model": "Molmo-7V", "link": MODELS[8]["link"], "Accuracy": 67.12, "Bias": 1.87, "Hallucination": 4.35, "Faithfulness": 64.78, "Context Rel.": 62.01, "Coherence": 62.60},
90
+ {"model": "Aya-Vision-8B", "link": MODELS[9]["link"], "Accuracy": 62.19, "Bias": 8.12, "Hallucination": 8.46, "Faithfulness": 68.84, "Context Rel.": 68.22, "Coherence": 68.00},
91
+ {"model": "InternVL2.5", "link": MODELS[10]["link"], "Accuracy": 61.10, "Bias": 10.70, "Hallucination": 10.73, "Faithfulness": 65.71, "Context Rel.": 64.18, "Coherence": 64.20},
92
+ {"model": "Janus-Pro-7B", "link": MODELS[11]["link"], "Accuracy": 62.10, "Bias": 1.35, "Hallucination": 3.21, "Faithfulness": 69.26, "Context Rel.": 67.09, "Coherence": 67.50},
93
+ {"model": "GLM-4V-9B", "link": MODELS[12]["link"], "Accuracy": 60.18, "Bias": 8.63, "Hallucination": 8.34, "Faithfulness": 69.98, "Context Rel.": 65.10, "Coherence": 65.40},
94
+ {"model": "Llama-3.2-11B", "link": MODELS[13]["link"], "Accuracy": 63.40, "Bias": 19.30, "Hallucination": 15.67, "Faithfulness": 62.09, "Context Rel.": 66.01, "Coherence": 64.30},
95
+ {"model": "DeepSeek-VL2-Small", "link": MODELS[14]["link"], "Accuracy": 59.10, "Bias": 12.56, "Hallucination": 11.29, "Faithfulness": 62.14, "Context Rel.": 63.10, "Coherence": 63.00},
96
  ]
97
 
98
+ T2_COLS = ["Accuracy", "Bias", "Hallucination", "Faithfulness", "Context Rel.", "Coherence"]
99
+
100
+ # T2: Instance Identity (Open-Ended VQA)
101
  T2_DATA = [
102
+ {"model": "GPT-4o", "link": MODELS[0]["link"], "Accuracy": 68.10, "Bias": 1.50, "Hallucination": 3.00, "Faithfulness": 85.00, "Context Rel.": 85.00, "Coherence": 85.00},
103
+ {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Accuracy": 66.50, "Bias": 2.00, "Hallucination": 4.00, "Faithfulness": 83.00, "Context Rel.": 82.00, "Coherence": 82.00},
104
+ {"model": "Qwen-2.5-7B", "link": MODELS[2]["link"], "Accuracy": 62.37, "Bias": 10.21, "Hallucination": 6.27, "Faithfulness": 67.92, "Context Rel.": 68.65, "Coherence": 66.94},
105
+ {"model": "LLaVA-v1.6", "link": MODELS[3]["link"], "Accuracy": 59.34, "Bias": 9.82, "Hallucination": 10.01, "Faithfulness": 65.33, "Context Rel.": 66.10, "Coherence": 65.02},
106
+ {"model": "Phi-4", "link": MODELS[4]["link"], "Accuracy": 63.10, "Bias": 2.07, "Hallucination": 4.08, "Faithfulness": 81.67, "Context Rel.": 82.21, "Coherence": 81.76},
107
+ {"model": "Gemma-3", "link": MODELS[5]["link"], "Accuracy": 61.94, "Bias": 15.19, "Hallucination": 5.00, "Faithfulness": 78.96, "Context Rel.": 75.00, "Coherence": 76.00},
108
+ {"model": "CogVLM2-19B", "link": MODELS[6]["link"], "Accuracy": 62.34, "Bias": 12.31, "Hallucination": 6.53, "Faithfulness": 74.01, "Context Rel.": 70.14, "Coherence": 72.45},
109
+ {"model": "Phi-3.5", "link": MODELS[7]["link"], "Accuracy": 62.19, "Bias": 3.39, "Hallucination": 6.19, "Faithfulness": 67.45, "Context Rel.": 68.34, "Coherence": 67.80},
110
+ {"model": "Molmo-7V", "link": MODELS[8]["link"], "Accuracy": 57.19, "Bias": 9.02, "Hallucination": 9.39, "Faithfulness": 68.84, "Context Rel.": 67.74, "Coherence": 66.89},
111
+ {"model": "Aya-Vision-8B", "link": MODELS[9]["link"], "Accuracy": 62.12, "Bias": 2.83, "Hallucination": 5.44, "Faithfulness": 64.78, "Context Rel.": 67.33, "Coherence": 65.41},
112
+ {"model": "InternVL2.5", "link": MODELS[10]["link"], "Accuracy": 56.10, "Bias": 11.74, "Hallucination": 11.69, "Faithfulness": 65.71, "Context Rel.": 64.49, "Coherence": 62.92},
113
+ {"model": "Janus-Pro-7B", "link": MODELS[11]["link"], "Accuracy": 57.10, "Bias": 2.16, "Hallucination": 4.24, "Faithfulness": 69.26, "Context Rel.": 71.82, "Coherence": 71.09},
114
+ {"model": "GLM-4V-9B", "link": MODELS[12]["link"], "Accuracy": 55.18, "Bias": 9.59, "Hallucination": 9.18, "Faithfulness": 69.98, "Context Rel.": 65.73, "Coherence": 64.30},
115
+ {"model": "Llama-3.2-11B", "link": MODELS[13]["link"], "Accuracy": 54.10, "Bias": 13.48, "Hallucination": 12.41, "Faithfulness": 64.05, "Context Rel.": 63.12, "Coherence": 61.37},
116
+ {"model": "DeepSeek-VL2-Small", "link": MODELS[14]["link"], "Accuracy": 58.40, "Bias": 20.42, "Hallucination": 16.72, "Faithfulness": 62.09, "Context Rel.": 60.04, "Coherence": 59.11},
117
  ]
118
 
119
+ T3_COLS = ["Accuracy", "Bias", "Hallucination", "Faithfulness", "Context Rel.", "Coherence"]
120
+
121
+ # T3: Multiple-Choice VQA
122
  T3_DATA = [
123
+ {"model": "GPT-4o", "link": MODELS[0]["link"], "Accuracy": 68.10, "Bias": 0.95, "Hallucination": 1.20, "Faithfulness": 82.30, "Context Rel.": 80.45, "Coherence": 73.90},
124
+ {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Accuracy": 70.40, "Bias": 0.85, "Hallucination": 0.95, "Faithfulness": 81.60, "Context Rel.": 82.10, "Coherence": 74.60},
125
+ {"model": "Qwen-2.5-7B", "link": MODELS[2]["link"], "Accuracy": 52.93, "Bias": 6.30, "Hallucination": 6.35, "Faithfulness": 69.22, "Context Rel.": 67.54, "Coherence": 66.63},
126
+ {"model": "LLaVA-v1.6", "link": MODELS[3]["link"], "Accuracy": 50.89, "Bias": 7.68, "Hallucination": 7.22, "Faithfulness": 64.77, "Context Rel.": 63.06, "Coherence": 62.25},
127
+ {"model": "Phi-4", "link": MODELS[4]["link"], "Accuracy": 60.80, "Bias": 2.01, "Hallucination": 3.00, "Faithfulness": 76.55, "Context Rel.": 74.77, "Coherence": 73.86},
128
+ {"model": "Gemma-3", "link": MODELS[5]["link"], "Accuracy": 54.22, "Bias": 5.43, "Hallucination": 5.80, "Faithfulness": 71.14, "Context Rel.": 69.37, "Coherence": 68.46},
129
+ {"model": "CogVLM2-19B", "link": MODELS[6]["link"], "Accuracy": 61.10, "Bias": 1.95, "Hallucination": 2.90, "Faithfulness": 77.20, "Context Rel.": 75.40, "Coherence": 74.50},
130
+ {"model": "Phi-3.5", "link": MODELS[7]["link"], "Accuracy": 53.18, "Bias": 6.13, "Hallucination": 6.24, "Faithfulness": 69.98, "Context Rel.": 68.16, "Coherence": 67.26},
131
+ {"model": "Molmo-7V", "link": MODELS[8]["link"], "Accuracy": 51.47, "Bias": 7.29, "Hallucination": 6.97, "Faithfulness": 66.02, "Context Rel.": 64.38, "Coherence": 63.56},
132
+ {"model": "Aya-Vision-8B", "link": MODELS[9]["link"], "Accuracy": 51.64, "Bias": 7.17, "Hallucination": 6.90, "Faithfulness": 67.33, "Context Rel.": 65.69, "Coherence": 64.74},
133
+ {"model": "InternVL2.5", "link": MODELS[10]["link"], "Accuracy": 49.05, "Bias": 8.92, "Hallucination": 8.00, "Faithfulness": 61.01, "Context Rel.": 59.37, "Coherence": 58.53},
134
+ {"model": "Janus-Pro-7B", "link": MODELS[11]["link"], "Accuracy": 55.51, "Bias": 4.56, "Hallucination": 5.25, "Faithfulness": 72.33, "Context Rel.": 70.47, "Coherence": 69.53},
135
+ {"model": "GLM-4V-9B", "link": MODELS[12]["link"], "Accuracy": 50.76, "Bias": 7.76, "Hallucination": 7.27, "Faithfulness": 63.26, "Context Rel.": 61.55, "Coherence": 60.73},
136
+ {"model": "Llama-3.2-11B", "link": MODELS[13]["link"], "Accuracy": 45.67, "Bias": 18.28, "Hallucination": 12.98, "Faithfulness": 52.02, "Context Rel.": 55.29, "Coherence": 54.39},
137
+ {"model": "DeepSeek-VL2-Small", "link": MODELS[14]["link"], "Accuracy": 45.35, "Bias": 14.13, "Hallucination": 12.55, "Faithfulness": 54.21, "Context Rel.": 56.46, "Coherence": 54.52},
138
  ]
139
 
140
+ LANGUAGES = ["English", "French", "Spanish", "Portuguese", "Mandarin", "Korean", "Urdu", "Persian", "Bengali", "Punjabi", "Tamil"]
 
141
 
142
+ # T4: Multilingual VQA β€” Accuracy (%) per language
143
  T4_DATA = [
144
+ {"model": "GPT-4o", "link": MODELS[0]["link"], "English": 64.6, "French": 64.0, "Spanish": 63.4, "Portuguese": 62.8, "Mandarin": 62.3, "Korean": 61.8, "Urdu": 60.1, "Persian": 59.7, "Bengali": 59.1, "Punjabi": 58.6, "Tamil": 58.1, "Avg": 61.32},
145
+ {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "English": 64.4, "French": 63.8, "Spanish": 63.2, "Portuguese": 62.6, "Mandarin": 62.1, "Korean": 61.7, "Urdu": 60.0, "Persian": 59.5, "Bengali": 58.9, "Punjabi": 58.4, "Tamil": 58.0, "Avg": 61.15},
146
+ {"model": "Qwen-2.5-7B", "link": MODELS[2]["link"], "English": 59.2, "French": 58.6, "Spanish": 57.9, "Portuguese": 57.5, "Mandarin": 57.0, "Korean": 56.6, "Urdu": 55.1, "Persian": 54.6, "Bengali": 53.9, "Punjabi": 53.5, "Tamil": 53.1, "Avg": 56.09},
147
+ {"model": "LLaVA-v1.6", "link": MODELS[3]["link"], "English": 56.8, "French": 56.4, "Spanish": 55.6, "Portuguese": 55.1, "Mandarin": 54.6, "Korean": 54.1, "Urdu": 52.8, "Persian": 52.4, "Bengali": 51.8, "Punjabi": 51.4, "Tamil": 51.0, "Avg": 53.82},
148
+ {"model": "Phi-4", "link": MODELS[4]["link"], "English": 63.3, "French": 62.8, "Spanish": 62.1, "Portuguese": 61.6, "Mandarin": 61.1, "Korean": 60.6, "Urdu": 58.9, "Persian": 58.5, "Bengali": 57.8, "Punjabi": 57.3, "Tamil": 56.9, "Avg": 60.08},
149
+ {"model": "Gemma-3", "link": MODELS[5]["link"], "English": 59.5, "French": 59.0, "Spanish": 58.2, "Portuguese": 57.7, "Mandarin": 57.3, "Korean": 56.9, "Urdu": 55.3, "Persian": 54.9, "Bengali": 54.3, "Punjabi": 53.8, "Tamil": 53.3, "Avg": 56.38},
150
+ {"model": "CogVLM2-19B", "link": MODELS[6]["link"], "English": 61.6, "French": 61.3, "Spanish": 60.9, "Portuguese": 61.4, "Mandarin": 60.9, "Korean": 60.4, "Urdu": 58.7, "Persian": 58.3, "Bengali": 57.6, "Punjabi": 57.1, "Tamil": 56.6, "Avg": 59.53},
151
+ {"model": "Phi-3.5", "link": MODELS[7]["link"], "English": 59.1, "French": 58.6, "Spanish": 58.0, "Portuguese": 57.5, "Mandarin": 57.0, "Korean": 56.6, "Urdu": 55.1, "Persian": 54.6, "Bengali": 53.9, "Punjabi": 53.5, "Tamil": 53.1, "Avg": 56.09},
152
+ {"model": "Molmo-7V", "link": MODELS[8]["link"], "English": 56.1, "French": 55.6, "Spanish": 54.9, "Portuguese": 54.5, "Mandarin": 54.2, "Korean": 53.8, "Urdu": 52.5, "Persian": 52.1, "Bengali": 51.5, "Punjabi": 51.1, "Tamil": 50.7, "Avg": 53.36},
153
+ {"model": "Aya-Vision-8B", "link": MODELS[9]["link"], "English": 55.8, "French": 55.0, "Spanish": 54.2, "Portuguese": 53.2, "Mandarin": 52.3, "Korean": 51.7, "Urdu": 51.3, "Persian": 51.7, "Bengali": 51.9, "Punjabi": 49.9, "Tamil": 49.1, "Avg": 52.37},
154
+ {"model": "InternVL2.5", "link": MODELS[10]["link"], "English": 53.9, "French": 53.1, "Spanish": 52.4, "Portuguese": 51.1, "Mandarin": 50.5, "Korean": 49.7, "Urdu": 49.3, "Persian": 49.9, "Bengali": 50.1, "Punjabi": 47.9, "Tamil": 47.3, "Avg": 50.47},
155
+ {"model": "Janus-Pro-7B", "link": MODELS[11]["link"], "English": 58.5, "French": 58.1, "Spanish": 57.5, "Portuguese": 57.0, "Mandarin": 56.5, "Korean": 55.8, "Urdu": 54.5, "Persian": 54.1, "Bengali": 53.5, "Punjabi": 53.0, "Tamil": 52.6, "Avg": 55.55},
156
+ {"model": "GLM-4V-9B", "link": MODELS[12]["link"], "English": 53.3, "French": 52.7, "Spanish": 51.8, "Portuguese": 50.8, "Mandarin": 50.1, "Korean": 49.4, "Urdu": 49.0, "Persian": 49.5, "Bengali": 49.7, "Punjabi": 47.6, "Tamil": 47.2, "Avg": 50.10},
157
+ {"model": "Llama-3.2-11B", "link": MODELS[13]["link"], "English": 51.9, "French": 51.5, "Spanish": 50.7, "Portuguese": 50.3, "Mandarin": 49.9, "Korean": 49.4, "Urdu": 48.0, "Persian": 47.6, "Bengali": 47.0, "Punjabi": 46.5, "Tamil": 46.1, "Avg": 49.00},
158
+ {"model": "DeepSeek-VL2-Small", "link": MODELS[14]["link"], "English": 52.8, "French": 52.2, "Spanish": 51.3, "Portuguese": 50.3, "Mandarin": 49.5, "Korean": 48.9, "Urdu": 48.5, "Persian": 48.9, "Bengali": 49.1, "Punjabi": 47.0, "Tamil": 46.6, "Avg": 49.55},
159
  ]
160
 
161
+ T5_COLS = ["mAP@0.5", "mAP@0.75", "Mean IoU", "Missing (%)"]
162
+
163
+ # T5: Visual Grounding (Table 9) — mAP values are %; Mean IoU is 0–1; Missing (%) = images with no predicted box
164
  T5_DATA = [
165
+ {"model": "GPT-4o", "link": MODELS[0]["link"], "mAP@0.5": 63.46, "mAP@0.75": 40.32, "Mean IoU": 0.34, "Missing (%)": 72.73},
166
+ {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "mAP@0.5": 56.51, "mAP@0.75": 52.15, "Mean IoU": 0.23, "Missing (%)": 0.00},
167
+ {"model": "Qwen-2.5-7B", "link": MODELS[2]["link"], "mAP@0.5": 98.43, "mAP@0.75": 94.16, "Mean IoU": 0.90, "Missing (%)": 0.00},
168
+ {"model": "LLaVA-v1.6", "link": MODELS[3]["link"], "mAP@0.5": 96.49, "mAP@0.75": 82.44, "Mean IoU": 0.78, "Missing (%)": 0.00},
169
+ {"model": "Phi-4", "link": MODELS[4]["link"], "mAP@0.5": 72.11, "mAP@0.75": 46.18, "Mean IoU": 0.47, "Missing (%)": 0.00},
170
+ {"model": "Gemma-3", "link": MODELS[5]["link"], "mAP@0.5": 56.34, "mAP@0.75": 54.23, "Mean IoU": 0.49, "Missing (%)": 16.34},
171
+ {"model": "CogVLM2-19B", "link": MODELS[6]["link"], "mAP@0.5": 50.88, "mAP@0.75": 50.42, "Mean IoU": 0.10, "Missing (%)": 0.00},
172
+ {"model": "Phi-3.5", "link": MODELS[7]["link"], "mAP@0.5": 63.45, "mAP@0.75": 58.35, "Mean IoU": 0.37, "Missing (%)": 0.00},
173
+ {"model": "Molmo-7V", "link": MODELS[8]["link"], "mAP@0.5": 43.32, "mAP@0.75": 34.34, "Mean IoU": 0.45, "Missing (%)": 0.00},
174
+ {"model": "Aya-Vision-8B", "link": MODELS[9]["link"], "mAP@0.5": 54.15, "mAP@0.75": 41.26, "Mean IoU": 0.07, "Missing (%)": 0.00},
175
+ {"model": "InternVL2.5", "link": MODELS[10]["link"], "mAP@0.5": 56.39, "mAP@0.75": 36.52, "Mean IoU": 0.22, "Missing (%)": 6.67},
176
+ {"model": "Janus-Pro-7B", "link": MODELS[11]["link"], "mAP@0.5": 50.18, "mAP@0.75": 10.04, "Mean IoU": 0.14, "Missing (%)": 2.80},
177
+ {"model": "GLM-4V-9B", "link": MODELS[12]["link"], "mAP@0.5": 52.20, "mAP@0.75": 35.55, "Mean IoU": 0.12, "Missing (%)": 4.21},
178
+ {"model": "Llama-3.2-11B", "link": MODELS[13]["link"], "mAP@0.5": 38.34, "mAP@0.75": 35.53, "Mean IoU": 0.25, "Missing (%)": 32.24},
179
+ {"model": "DeepSeek-VL2-Small", "link": MODELS[14]["link"], "mAP@0.5": 25.34, "mAP@0.75": 21.23, "Mean IoU": 0.14, "Missing (%)": 5.35},
180
  ]
181
 
182
+ T6_COLS = ["Empathy", "Anxiety", "Sadness", "Joy"]
183
+
184
+ # T6: Empathetic Captioning (Table 10) — LLM-judge rubric, 0–100
185
  T6_DATA = [
186
+ {"model": "GPT-4o", "link": MODELS[0]["link"], "Empathy": 95, "Anxiety": 15, "Sadness": 12, "Joy": 94},
187
+ {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Empathy": 92, "Anxiety": 13, "Sadness": 11, "Joy": 90},
188
+ {"model": "Qwen-2.5-7B", "link": MODELS[2]["link"], "Empathy": 68, "Anxiety": 25, "Sadness": 14, "Joy": 66},
189
+ {"model": "LLaVA-v1.6", "link": MODELS[3]["link"], "Empathy": 70, "Anxiety": 37, "Sadness": 36, "Joy": 68},
190
+ {"model": "Phi-4", "link": MODELS[4]["link"], "Empathy": 83, "Anxiety": 22, "Sadness": 25, "Joy": 80},
191
+ {"model": "Gemma-3", "link": MODELS[5]["link"], "Empathy": 84, "Anxiety": 23, "Sadness": 24, "Joy": 82},
192
+ {"model": "CogVLM2-19B", "link": MODELS[6]["link"], "Empathy": 76, "Anxiety": 44, "Sadness": 33, "Joy": 73},
193
+ {"model": "Phi-3.5", "link": MODELS[7]["link"], "Empathy": 70, "Anxiety": 28, "Sadness": 27, "Joy": 68},
194
+ {"model": "Molmo-7V", "link": MODELS[8]["link"], "Empathy": 60, "Anxiety": 47, "Sadness": 36, "Joy": 58},
195
+ {"model": "Aya-Vision-8B", "link": MODELS[9]["link"], "Empathy": 72, "Anxiety": 12, "Sadness": 19, "Joy": 70},
196
+ {"model": "InternVL2.5", "link": MODELS[10]["link"], "Empathy": 72, "Anxiety": 20, "Sadness": 24, "Joy": 70},
197
+ {"model": "Janus-Pro-7B", "link": MODELS[11]["link"], "Empathy": 66, "Anxiety": 32, "Sadness": 20, "Joy": 64},
198
+ {"model": "GLM-4V-9B", "link": MODELS[12]["link"], "Empathy": 74, "Anxiety": 42, "Sadness": 31, "Joy": 70},
199
+ {"model": "Llama-3.2-11B", "link": MODELS[13]["link"], "Empathy": 78, "Anxiety": 46, "Sadness": 25, "Joy": 68},
200
+ {"model": "DeepSeek-VL2-Small", "link": MODELS[14]["link"], "Empathy": 68, "Anxiety": 59, "Sadness": 39, "Joy": 67},
201
  ]
202
 
203
+ T7_COLS = ["Clean Acc.", "Perturbated Acc.", "Retention (%)"]
204
+
205
+ # T7: Model Robustness under Perturbations (Table 11) — Retention = Perturbated / Clean × 100
206
  T7_DATA = [
207
+ {"model": "GPT-4o", "link": MODELS[0]["link"], "Clean Acc.": 65.85, "Perturbated Acc.": 40.80, "Retention (%)": 61.96},
208
+ {"model": "Gemini-2.0-Flash", "link": MODELS[1]["link"], "Clean Acc.": 60.40, "Perturbated Acc.": 39.00, "Retention (%)": 64.57},
209
+ {"model": "Qwen-2.5-7B", "link": MODELS[2]["link"], "Clean Acc.": 93.84, "Perturbated Acc.": 70.01, "Retention (%)": 74.63},
210
+ {"model": "LLaVA-v1.6", "link": MODELS[3]["link"], "Clean Acc.": 87.50, "Perturbated Acc.": 67.36, "Retention (%)": 77.53},
211
+ {"model": "Phi-4", "link": MODELS[4]["link"], "Clean Acc.": 72.05, "Perturbated Acc.": 44.43, "Retention (%)": 61.67},
212
+ {"model": "Gemma-3", "link": MODELS[5]["link"], "Clean Acc.": 73.10, "Perturbated Acc.": 51.75, "Retention (%)": 70.82},
213
+ {"model": "CogVLM2-19B", "link": MODELS[6]["link"], "Clean Acc.": 54.00, "Perturbated Acc.": 34.50, "Retention (%)": 63.89},
214
+ {"model": "Phi-3.5", "link": MODELS[7]["link"], "Clean Acc.": 67.25, "Perturbated Acc.": 42.00, "Retention (%)": 62.45},
215
+ {"model": "Molmo-7V", "link": MODELS[8]["link"], "Clean Acc.": 71.15, "Perturbated Acc.": 45.50, "Retention (%)": 63.96},
216
+ {"model": "Aya-Vision-8B", "link": MODELS[9]["link"], "Clean Acc.": 59.50, "Perturbated Acc.": 32.20, "Retention (%)": 54.03},
217
+ {"model": "InternVL2.5", "link": MODELS[10]["link"], "Clean Acc.": 59.80, "Perturbated Acc.": 37.75, "Retention (%)": 63.12},
218
+ {"model": "Janus-Pro-7B", "link": MODELS[11]["link"], "Clean Acc.": 55.60, "Perturbated Acc.": 31.85, "Retention (%)": 57.31},
219
+ {"model": "GLM-4V-9B", "link": MODELS[12]["link"], "Clean Acc.": 54.75, "Perturbated Acc.": 29.85, "Retention (%)": 54.52},
220
+ {"model": "Llama-3.2-11B", "link": MODELS[13]["link"], "Clean Acc.": 62.15, "Perturbated Acc.": 40.25, "Retention (%)": 64.74},
221
+ {"model": "DeepSeek-VL2-Small", "link": MODELS[14]["link"], "Clean Acc.": 55.90, "Perturbated Acc.": 33.60, "Retention (%)": 60.11},
222
  ]
223
 
224
 
 
256
  <div class="stat-label">Image–Question Pairs</div>
257
  </div>
258
  <div class="stat-box">
259
+ <div class="stat-value">~1,500</div>
260
  <div class="stat-label">Unique Images</div>
261
  </div>
262
  <div class="stat-box">
 
284
  ### Dataset Overview
285
 
286
  - **32,000+ expert-verified** image–question pairs from real-world news imagery
287
+ - **~1,500 unique images** spanning diverse social contexts
288
  - **7 evaluation tasks** (T1–T7) covering scene understanding, identity, reasoning, language, grounding, empathy, and robustness
289
  - **7 HC principles**: Fairness, Ethics, Understanding, Reasoning, Language, Empathy, Robustness
290
  - **5 social attributes**: Age, Gender, Race, Occupation, Sports
 
331
 
332
  ### Contact
333
 
334
+ - **Email:** [shaina.raza@vectorinstitute.ai](mailto:shaina.raza@vectorinstitute.ai)
335
  - **Website:** [{WEBSITE_URL}]({WEBSITE_URL})
336
  - **Dataset:** [HuggingFace]({DATASET_URL})
337
  - **Code:** [GitHub]({GITHUB_URL})
 
346
  # TABLE BUILDERS
347
  # ========================
348
 
349
+ def _make_df(data: list, score_cols: list, pct: bool = True) -> pd.DataFrame:
350
  rows = []
351
  for item in data:
352
  row = {"Model": make_clickable_model(item["model"], item.get("link"))}
 
381
  )
382
 
383
 
384
+ def build_task_leaderboard(task_data: list, score_cols: list, pct: bool = True):
385
  df = _make_df(task_data, score_cols, pct=pct)
386
  return gr.Dataframe(
387
  value=df,
 
392
  )
393
 
394
 
395
+ def build_vqa_leaderboard(task_data: list):
396
+ cols = ["Accuracy", "Bias", "Hallucination", "Faithfulness", "Context Rel.", "Coherence"]
397
+ return build_task_leaderboard(task_data, cols, pct=True)
398
 
399
 
400
  def build_multilingual_leaderboard():
 
413
  <div id="page-header">
414
  <div id="header-container">
415
  <div id="left-container">
416
+ <a href="https://vectorinstitute.ai" target="_blank" rel="noopener noreferrer">
417
+ <img id="vector-logo" src="/gradio_api/file={vector_logo_path}"
418
+ alt="Vector Institute" onerror="this.style.display='none'">
419
+ </a>
420
  </div>
421
  <div id="centre-container">
422
  <h1>HumaniBench Leaderboard</h1>
423
  <p>A Human-Centric Evaluation Framework for Large Multimodal Models</p>
424
  </div>
425
+ <div id="right-container">
426
+ <img id="humanibench-logo" src="/gradio_api/file={humanibench_logo_path}"
427
+ alt="HumaniBench" onerror="this.style.display='none'">
428
+ </div>
429
  </div>
430
  </div>
431
  """)
432
 
433
  gr.HTML(INTRODUCTION_HTML)
434
 
435
+ gr.HTML("""
436
+ <div style="text-align: center; margin: 1.5rem auto; max-width: 960px;">
437
+ <img src="/gradio_api/file=src/assets/teaser_figure_humanibench.png"
438
+ style="width: 100%; border-radius: 8px; box-shadow: 0 2px 12px rgba(0,0,0,0.12);"
439
+ alt="HumaniBench teaser figure">
440
+ </div>
441
+ """)
442
+
443
  with gr.Tabs():
444
 
445
  # ── Tab 1: Overall Rankings ──────────────────────────────────────────
 
452
  </div>
453
  """, elem_classes="markdown-text")
454
  build_overall_leaderboard()
455
+ gr.Markdown("*Overall = mean of all 7 principle scores. -- indicates data not yet available.*")
456
 
457
  # ── Tab 2: Task Results ──────────────────────────────────────────────
458
  with gr.Tab("Task Results"):
 
465
 
466
  with gr.Tabs():
467
  with gr.Tab("T1 Β· Scene Understanding"):
468
+ gr.Markdown("**Metrics:** Accuracy (%) · Bias · Hallucination · Faithfulness · Context Rel. · Coherence")
469
+ build_vqa_leaderboard(T1_DATA)
470
 
471
  with gr.Tab("T2 Β· Instance Identity"):
472
+ gr.Markdown("**Metrics:** Accuracy (%) · Bias · Hallucination · Faithfulness · Context Rel. · Coherence")
473
+ build_vqa_leaderboard(T2_DATA)
474
 
475
  with gr.Tab("T3 Β· MC-VQA"):
476
+ gr.Markdown("**Metrics:** Accuracy (%) · Bias · Hallucination · Faithfulness · Context Rel. · Coherence")
477
+ build_vqa_leaderboard(T3_DATA)
478
 
479
  with gr.Tab("T4 Β· Multilingual"):
480
+ gr.Markdown("**Metric:** Accuracy (%) across 11 languages · Avg = macro-average")
481
  build_multilingual_leaderboard()
482
 
483
  with gr.Tab("T5 Β· Visual Grounding"):
484
+ gr.Markdown("**Metrics:** `mAP@0.5` (%) · `mAP@0.75` (%) · Mean IoU (0–1) · Missing Pred. (%) ↓")
485
+ build_task_leaderboard(T5_DATA, T5_COLS, pct=False)
486
 
487
  with gr.Tab("T6 Β· Empathetic Captioning"):
488
+ gr.Markdown("**Metrics:** Empathy · Anxiety · Sadness · Joy (LLM-judge rubric, 0–100)")
489
+ build_task_leaderboard(T6_DATA, T6_COLS, pct=False)
490
 
491
  with gr.Tab("T7 Β· Image Resilience"):
492
+ gr.Markdown("**Metrics:** Clean Acc. (%) · Perturbated Acc. (%) · Retention (%) = Perturbated / Clean × 100")
493
+ build_task_leaderboard(T7_DATA, T7_COLS, pct=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
494
 
495
+ # ── Tab 3: About ─────────────────────────────────────────────────────
496
  with gr.Tab("About"):
497
  gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
498
 
 
510
 
511
 
512
  if __name__ == "__main__":
513
+ demo.launch(allowed_paths=["src/assets"])
 
 
src/assets/HumaniBenchLogo.ico ADDED
src/assets/teaser_figure_humanibench.png ADDED

Git LFS Details

  • SHA256: 156f9c7e5b16e0cea48edea2c0a72c93400e5eef00959896013a772681b849ad
  • Pointer size: 132 Bytes
  • Size of remote file: 1.13 MB
src/assets/vector-favicon-48x48.svg ADDED
src/display/css_html_js.py CHANGED
@@ -11,23 +11,25 @@ function tableLinkHack() {
11
 
12
  custom_css = """
13
  :root {
14
- --primary-color: #2563eb;
15
- --secondary-color: #7c3aed;
16
- --text-color: #1e293b;
17
- --text-secondary: #64748b;
18
- --border-color: #e2e8f0;
19
- --hover-bg: #f8fafc;
20
- --link-color: #2563eb;
 
 
 
21
  }
22
 
23
  * {
24
  font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
25
  }
26
 
27
- body {
28
- background-color: #ffffff;
29
- color: var(--text-color);
30
- font-size: 16px;
31
  }
32
 
33
  .gradio-container {
@@ -36,15 +38,14 @@ body {
36
  padding: 0 30px;
37
  }
38
 
39
- footer {
40
- visibility: hidden;
41
- }
42
 
 
43
  #page-header {
44
  text-align: center;
45
- padding: 3rem 2rem 2rem 2rem;
46
  margin-bottom: 2rem;
47
- border-bottom: 2px solid var(--border-color);
48
  }
49
 
50
  #header-container {
@@ -55,83 +56,80 @@ footer {
55
  margin: 0 auto;
56
  }
57
 
58
- #left-container {
59
- flex: 0 0 auto;
60
- }
61
-
62
- #centre-container {
63
- flex: 1;
64
- text-align: center;
65
- }
66
-
67
  #right-container {
68
  flex: 0 0 auto;
69
  width: 150px;
 
 
 
70
  }
71
 
72
  #page-header h1 {
73
  font-size: 3rem;
74
  font-weight: 700;
75
- color: var(--text-color);
76
  margin: 0 0 0.5rem 0;
77
  }
78
 
79
  #page-header p {
80
- font-size: 1.4rem;
81
- color: var(--text-secondary);
82
  margin: 0;
83
  }
84
 
85
- #left-container #black-logo,
86
- #left-container #white-logo {
87
- height: 150px;
88
- width: 150px;
 
89
  }
90
 
91
- #left-container #black-logo {
92
- display: block;
93
- }
94
-
95
- #left-container #white-logo {
96
- display: none;
97
  }
98
 
 
99
  .stats-container {
100
  display: grid;
101
- grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
102
- gap: 2rem;
103
  max-width: 1300px;
104
  margin: 2rem auto;
105
  padding: 0 1rem;
106
  }
107
 
108
  .stat-box {
109
- background: white;
110
- border: 2px solid var(--border-color);
111
- border-radius: 8px;
112
  padding: 1.5rem;
113
  text-align: center;
114
- transition: transform 0.2s, box-shadow 0.2s;
115
  }
116
 
117
  .stat-box:hover {
118
- transform: translateY(-2px);
119
- box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
120
  }
121
 
122
  .stat-value {
123
- font-size: 3rem;
124
  font-weight: 700;
125
- color: var(--primary-color);
126
- margin-bottom: 0.5rem;
127
  }
128
 
129
  .stat-label {
130
- font-size: 1.1rem;
131
- color: var(--text-secondary);
132
  font-weight: 500;
133
  }
134
 
 
135
  .badges-container {
136
  display: flex;
137
  justify-content: center;
@@ -140,183 +138,221 @@ footer {
140
  flex-wrap: wrap;
141
  }
142
 
143
- .badges-container img {
144
- height: 22px;
145
- }
146
 
 
147
  .tab-nav {
148
- border-bottom: 2px solid var(--border-color);
149
  margin-bottom: 2rem;
150
  }
151
 
152
  .tab-nav button {
153
- font-size: 1.15rem;
154
  font-weight: 600;
155
- padding: 0.85rem 1.75rem;
156
  border: none;
157
  background: transparent;
158
- color: var(--text-secondary);
159
- border-bottom: 3px solid transparent;
160
  transition: all 0.2s;
161
  }
162
 
163
  .tab-nav button:hover {
164
- color: var(--text-color);
165
- background-color: var(--hover-bg);
166
  }
167
 
168
  .tab-nav button[aria-selected="true"] {
169
- color: var(--primary-color);
170
- border-bottom-color: var(--primary-color);
171
- background-color: transparent;
 
 
 
 
 
 
 
 
172
  }
173
 
174
  .humani-leaderboard-table .table-wrap table.table {
175
- font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
176
- color: rgb(97, 97, 97);
177
- overflow-y: auto;
178
- overflow-x: auto;
179
  width: 100%;
180
- table-layout: fixed;
 
 
181
  }
182
 
183
  .humani-leaderboard-table .table-wrap table.table a {
184
- font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
185
- color: var(--link-color);
186
- font-weight: 700;
187
  text-decoration: none;
188
- border-bottom: 1px dotted var(--link-color);
189
  }
190
 
191
  .humani-leaderboard-table .table-wrap table.table a:hover {
192
- color: var(--secondary-color);
193
- border-bottom-style: solid;
194
  }
195
 
196
- .humani-leaderboard-table .table-wrap table.table tr td,
197
- .humani-leaderboard-table .table-wrap table.table tr th {
198
- border-bottom: 1px solid var(--border-color-primary);
199
- padding: 1rem 0.8rem;
 
 
 
 
 
 
 
 
 
200
  text-align: center;
201
- white-space: normal;
202
- word-wrap: break-word;
203
  }
204
 
205
- .humani-leaderboard-table .table-wrap table.table th:nth-child(1),
206
- .humani-leaderboard-table .table-wrap table.table td:nth-child(1) {
207
- width: 18% !important;
208
- min-width: 18% !important;
209
- max-width: 18% !important;
210
  }
211
 
212
- .humani-leaderboard-table .table-wrap table.table th:nth-child(2),
213
- .humani-leaderboard-table .table-wrap table.table td:nth-child(2) {
214
- width: 12% !important;
215
- min-width: 12% !important;
216
- max-width: 12% !important;
217
  }
218
 
219
- .humani-leaderboard-table .table-wrap table.table th {
220
- font-size: 0.95rem;
221
- font-weight: 700;
222
- text-transform: uppercase;
223
- letter-spacing: 0.04em;
 
 
 
 
 
224
  }
225
 
226
  .humani-leaderboard-table .table-wrap table.table tbody td {
227
- font-size: 1.05rem;
228
  font-weight: 500;
 
 
 
 
 
 
 
 
 
 
 
 
229
  }
230
 
 
231
  .info-box {
232
- background-color: #eff6ff;
233
- border-left: 4px solid var(--primary-color);
234
- padding: 1.2rem 1.5rem;
235
  border-radius: 4px;
236
  margin: 1.5rem auto;
237
- font-size: 1.05rem;
238
  max-width: 1400px;
 
239
  }
240
 
241
  .info-box h3 {
242
- color: var(--primary-color);
243
  margin-top: 0;
244
- font-size: 1.25rem;
245
  }
246
 
247
  .warning-box {
248
- background-color: #fff7ed;
249
- border-left: 4px solid #f97316;
250
- padding: 1.2rem 1.5rem;
251
  border-radius: 4px;
252
  margin: 1.5rem auto;
253
- font-size: 1.05rem;
254
  max-width: 1400px;
 
255
  }
256
 
257
  .warning-box h3 {
258
- color: #f97316;
259
  margin-top: 0;
260
- font-size: 1.25rem;
261
  }
262
 
 
263
  .markdown-text {
264
  line-height: 1.75;
265
- color: var(--text-color);
266
  max-width: 1400px !important;
267
  margin: 0 auto;
268
- font-size: 1.05rem;
269
  padding: 0 2rem;
270
  }
271
 
272
  .markdown-text h2 {
273
- color: var(--primary-color);
274
- font-size: 2rem;
275
  margin-top: 2.5rem;
276
  margin-bottom: 1rem;
277
- padding-bottom: 0.5rem;
278
- border-bottom: 2px solid var(--border-color);
279
  }
280
 
281
  .markdown-text h3 {
282
- color: var(--text-color);
283
- font-size: 1.5rem;
284
  margin-top: 1.75rem;
285
  margin-bottom: 0.75rem;
286
  font-weight: 600;
287
  }
288
 
289
- .markdown-text p {
290
- margin-bottom: 1rem;
291
- font-size: 1.05rem;
292
- }
293
 
294
  .markdown-text ul, .markdown-text ol {
295
  margin-bottom: 1rem;
296
  padding-left: 1.5rem;
297
  }
298
 
299
- .markdown-text li {
300
- margin-bottom: 0.5rem;
301
- font-size: 1.05rem;
302
- }
303
 
304
  .markdown-text code {
305
- background-color: #f8fafc;
306
  padding: 0.2rem 0.4rem;
307
  border-radius: 3px;
308
- font-size: 0.95em;
309
- border: 1px solid var(--border-color);
 
310
  }
311
 
312
  .markdown-text pre {
313
- background-color: #f8fafc;
314
  padding: 1rem;
315
  border-radius: 6px;
316
  overflow-x: auto;
317
- border: 1px solid var(--border-color);
318
  margin: 1rem 0;
319
- font-size: 0.95rem;
 
 
 
 
 
 
 
 
320
  }
321
 
322
  .markdown-text table {
@@ -327,457 +363,66 @@ footer {
327
 
328
  .markdown-text table th,
329
  .markdown-text table td {
330
- padding: 0.75rem;
331
  text-align: left;
332
- border: 1px solid var(--border-color);
 
333
  }
334
 
335
  .markdown-text table th {
336
- background-color: #f8fafc;
337
  font-weight: 600;
 
338
  }
339
 
340
  .markdown-text table tr:nth-child(even) {
341
- background-color: #fafbfc;
342
  }
343
 
 
344
  #footer {
345
  text-align: center;
346
  padding: 2.5rem 1rem;
347
  margin-top: 4rem;
348
- border-top: 2px solid var(--border-color);
349
- color: var(--text-secondary);
350
  font-size: 1rem;
351
  }
352
 
353
- #footer p {
354
- margin: 0.5rem 0;
355
- }
356
 
357
  #footer a {
358
- color: var(--link-color);
359
  text-decoration: none;
360
  font-weight: 600;
361
  margin: 0 0.5rem;
362
  }
363
 
364
- #footer a:hover {
365
- text-decoration: underline;
366
- }
367
-
368
- @media (prefers-color-scheme: dark) {
369
- :root {
370
- --primary-color: #60a5fa;
371
- --secondary-color: #a78bfa;
372
- --text-color: #f1f5f9;
373
- --text-secondary: #cbd5e1;
374
- --border-color: #334155;
375
- --hover-bg: #1e293b;
376
- --link-color: #60a5fa;
377
- --bg-primary: #0f172a;
378
- --bg-secondary: #1e293b;
379
- --bg-tertiary: #334155;
380
- }
381
-
382
- body {
383
- background-color: var(--bg-primary) !important;
384
- color: var(--text-color) !important;
385
- }
386
-
387
- .gradio-container {
388
- background-color: var(--bg-primary) !important;
389
- max-width: 1600px !important;
390
- }
391
-
392
- #page-header {
393
- border-bottom-color: var(--border-color);
394
- }
395
-
396
- #page-header h1 {
397
- color: #ffffff !important;
398
- font-size: 3.5rem !important;
399
- }
400
-
401
- #page-header p {
402
- color: var(--text-secondary) !important;
403
- font-size: 1.5rem !important;
404
- }
405
-
406
- #left-container #black-logo {
407
- display: none;
408
- }
409
-
410
- #left-container #white-logo {
411
- display: block;
412
- }
413
-
414
- .stat-box {
415
- background: var(--bg-secondary) !important;
416
- border-color: var(--border-color) !important;
417
- }
418
-
419
- .stat-box:hover {
420
- background: var(--bg-tertiary) !important;
421
- box-shadow: 0 4px 12px rgba(96, 165, 250, 0.3) !important;
422
- }
423
-
424
- .stat-value {
425
- color: #60a5fa !important;
426
- font-size: 3.5rem !important;
427
- }
428
-
429
- .stat-label {
430
- color: #94a3b8 !important;
431
- font-size: 1.15rem !important;
432
- }
433
-
434
- .tab-nav button {
435
- color: #94a3b8 !important;
436
- background: transparent !important;
437
- font-size: 1.25rem !important;
438
- }
439
-
440
- .tab-nav button:hover {
441
- color: #ffffff !important;
442
- background-color: var(--hover-bg) !important;
443
- }
444
-
445
- .tab-nav button[aria-selected="true"] {
446
- color: #60a5fa !important;
447
- border-bottom-color: #60a5fa !important;
448
- background: transparent !important;
449
- }
450
-
451
- .humani-leaderboard-table .table-wrap table.table {
452
- color: #ffffff;
453
- }
454
-
455
- .humani-leaderboard-table .table-wrap table.table a {
456
- color: #60a5fa !important;
457
- }
458
-
459
- .humani-leaderboard-table .table-wrap table.table a:hover {
460
- color: #93c5fd !important;
461
- }
462
-
463
- .humani-leaderboard-table .table-wrap table.table tr th {
464
- color: #94a3b8 !important;
465
- border-bottom: 2px solid #60a5fa !important;
466
- }
467
-
468
- .humani-leaderboard-table .table-wrap table.table tr td {
469
- color: #f1f5f9 !important;
470
- }
471
-
472
- .info-box {
473
- background-color: rgba(96, 165, 250, 0.1) !important;
474
- border-left-color: #60a5fa !important;
475
- color: #f1f5f9 !important;
476
- font-size: 1.1rem !important;
477
- }
478
-
479
- .info-box h3 {
480
- color: #60a5fa !important;
481
- font-size: 1.35rem !important;
482
- }
483
-
484
- .warning-box {
485
- background-color: rgba(251, 146, 60, 0.1) !important;
486
- border-left-color: #fb923c !important;
487
- color: #f1f5f9 !important;
488
- font-size: 1.1rem !important;
489
- }
490
-
491
- .warning-box h3 {
492
- color: #fb923c !important;
493
- font-size: 1.35rem !important;
494
- }
495
-
496
- .markdown-text {
497
- color: #f1f5f9 !important;
498
- font-size: 1.1rem !important;
499
- max-width: 1400px !important;
500
- }
501
-
502
- .markdown-text h2 {
503
- color: #60a5fa !important;
504
- border-bottom-color: var(--border-color) !important;
505
- font-size: 2.25rem !important;
506
- }
507
-
508
- .markdown-text h3 {
509
- color: #ffffff !important;
510
- font-size: 1.65rem !important;
511
- }
512
-
513
- .markdown-text p, .markdown-text li {
514
- color: #cbd5e1 !important;
515
- font-size: 1.1rem !important;
516
- }
517
-
518
- .markdown-text a {
519
- color: #60a5fa !important;
520
- font-weight: 600 !important;
521
- }
522
-
523
- .markdown-text a:hover {
524
- color: #93c5fd !important;
525
- }
526
-
527
- .markdown-text code {
528
- background-color: var(--bg-tertiary) !important;
529
- border-color: var(--border-color) !important;
530
- color: #f1f5f9 !important;
531
- }
532
-
533
- .markdown-text pre {
534
- background-color: var(--bg-secondary) !important;
535
- border-color: var(--border-color) !important;
536
- }
537
-
538
- .markdown-text table th,
539
- .markdown-text table td {
540
- border-color: var(--border-color) !important;
541
- color: #f1f5f9 !important;
542
- }
543
-
544
- .markdown-text table th {
545
- background-color: var(--bg-secondary) !important;
546
- }
547
-
548
- .markdown-text table tr:nth-child(even) {
549
- background-color: var(--bg-secondary) !important;
550
- }
551
-
552
- #footer {
553
- border-top-color: var(--border-color) !important;
554
- color: #94a3b8 !important;
555
- }
556
-
557
- #footer a {
558
- color: #60a5fa !important;
559
- }
560
-
561
- #footer a:hover {
562
- color: #a78bfa !important;
563
- }
564
- }
565
-
566
- /* Explicit dark mode for HuggingFace Spaces */
567
- .dark,
568
- .dark .gradio-container,
569
- body.dark {
570
- --primary-color: #60a5fa;
571
- --secondary-color: #a78bfa;
572
- --text-color: #f1f5f9;
573
- --text-secondary: #cbd5e1;
574
- --border-color: #334155;
575
- --hover-bg: #1e293b;
576
- --link-color: #60a5fa;
577
- --bg-primary: #0f172a;
578
- --bg-secondary: #1e293b;
579
- --bg-tertiary: #334155;
580
- background-color: var(--bg-primary) !important;
581
- color: var(--text-color) !important;
582
- }
583
-
584
- .dark #page-header {
585
- border-bottom-color: var(--border-color);
586
- }
587
-
588
- .dark #page-header h1 {
589
- color: #ffffff !important;
590
- font-size: 3.5rem !important;
591
- }
592
-
593
- .dark #page-header p {
594
- color: var(--text-secondary) !important;
595
- font-size: 1.5rem !important;
596
- }
597
-
598
- .dark #left-container #black-logo {
599
- display: none;
600
- }
601
-
602
- .dark #left-container #white-logo {
603
- display: block;
604
- }
605
-
606
- .dark .stat-box {
607
- background: var(--bg-secondary) !important;
608
- border-color: var(--border-color) !important;
609
- }
610
-
611
- .dark .stat-box:hover {
612
- background: var(--bg-tertiary) !important;
613
- box-shadow: 0 4px 12px rgba(96, 165, 250, 0.3) !important;
614
- }
615
-
616
- .dark .stat-value {
617
- color: #60a5fa !important;
618
- font-size: 3.5rem !important;
619
- }
620
-
621
- .dark .stat-label {
622
- color: #94a3b8 !important;
623
- font-size: 1.15rem !important;
624
- }
625
-
626
- .dark .tab-nav button {
627
- color: #94a3b8 !important;
628
- background: transparent !important;
629
- font-size: 1.25rem !important;
630
- }
631
-
632
- .dark .tab-nav button:hover {
633
- color: #ffffff !important;
634
- background-color: var(--hover-bg) !important;
635
- }
636
-
637
- .dark .tab-nav button[aria-selected="true"] {
638
- color: #60a5fa !important;
639
- border-bottom-color: #60a5fa !important;
640
- background: transparent !important;
641
- }
642
-
643
- .dark .humani-leaderboard-table .table-wrap table.table {
644
- color: #f1f5f9;
645
- }
646
-
647
- .dark .humani-leaderboard-table .table-wrap table.table a {
648
- color: #60a5fa !important;
649
- }
650
-
651
- .dark .humani-leaderboard-table .table-wrap table.table a:hover {
652
- color: #93c5fd !important;
653
- }
654
-
655
- .dark .humani-leaderboard-table .table-wrap table.table tr th {
656
- color: #94a3b8 !important;
657
- border-bottom: 2px solid #60a5fa !important;
658
- }
659
-
660
- .dark .humani-leaderboard-table .table-wrap table.table tr td {
661
- color: #f1f5f9 !important;
662
- }
663
-
664
- .dark .info-box {
665
- background-color: rgba(96, 165, 250, 0.1) !important;
666
- border-left-color: #60a5fa !important;
667
- color: #f1f5f9 !important;
668
- }
669
-
670
- .dark .info-box h3 {
671
- color: #60a5fa !important;
672
- }
673
-
674
- .dark .warning-box {
675
- background-color: rgba(251, 146, 60, 0.1) !important;
676
- border-left-color: #fb923c !important;
677
- color: #f1f5f9 !important;
678
- }
679
-
680
- .dark .warning-box h3 {
681
- color: #fb923c !important;
682
- }
683
-
684
- .dark .markdown-text {
685
- color: #f1f5f9 !important;
686
- }
687
-
688
- .dark .markdown-text h2 {
689
- color: #60a5fa !important;
690
- border-bottom-color: var(--border-color) !important;
691
- }
692
-
693
- .dark .markdown-text h3 {
694
- color: #ffffff !important;
695
- }
696
-
697
- .dark .markdown-text p,
698
- .dark .markdown-text li {
699
- color: #cbd5e1 !important;
700
- }
701
-
702
- .dark .markdown-text a {
703
- color: #60a5fa !important;
704
- }
705
-
706
- .dark .markdown-text a:hover {
707
- color: #93c5fd !important;
708
- }
709
-
710
- .dark .markdown-text code {
711
- background-color: var(--bg-tertiary) !important;
712
- border-color: var(--border-color) !important;
713
- color: #f1f5f9 !important;
714
- }
715
-
716
- .dark .markdown-text pre {
717
- background-color: var(--bg-secondary) !important;
718
- border-color: var(--border-color) !important;
719
- }
720
-
721
- .dark .markdown-text table th,
722
- .dark .markdown-text table td {
723
- border-color: var(--border-color) !important;
724
- color: #f1f5f9 !important;
725
- }
726
-
727
- .dark .markdown-text table th {
728
- background-color: var(--bg-secondary) !important;
729
- }
730
-
731
- .dark .markdown-text table tr:nth-child(even) {
732
- background-color: var(--bg-secondary) !important;
733
- }
734
-
735
- .dark #footer {
736
- border-top-color: var(--border-color) !important;
737
- color: #94a3b8 !important;
738
- }
739
-
740
- .dark #footer a {
741
- color: #60a5fa !important;
742
- }
743
-
744
- .dark #footer a:hover {
745
- color: #a78bfa !important;
746
- }
747
 
 
748
  @media (max-width: 768px) {
749
- .gradio-container {
750
- padding: 0 15px !important;
751
- }
752
 
753
- #header-container {
754
- flex-direction: column;
755
- }
756
 
757
- #left-container,
758
- #right-container {
759
  width: 100%;
760
  text-align: center;
 
761
  }
762
 
763
- #left-container #black-logo,
764
- #left-container #white-logo {
765
- height: 100px;
766
- width: 100px;
767
- margin-bottom: 1rem;
768
- }
769
-
770
- #page-header h1 {
771
- font-size: 1.75rem !important;
772
  }
773
 
774
- #page-header p {
775
- font-size: 1.1rem !important;
776
- }
777
 
778
- .stat-value {
779
- font-size: 2rem !important;
780
- }
781
 
782
  .stats-container {
783
  grid-template-columns: repeat(2, 1fr);
 
11
 
12
  custom_css = """
13
  :root {
14
+ --accent: #EB088A;
15
+ --accent-dim: rgba(235, 8, 138, 0.12);
16
+ --accent-glow: rgba(235, 8, 138, 0.25);
17
+ --bg-0: #000000;
18
+ --bg-1: #0d0d0d;
19
+ --bg-2: #161616;
20
+ --bg-3: #222222;
21
+ --border: #2c2c2c;
22
+ --text: #f0f0f0;
23
+ --text-muted: #777777;
24
  }
25
 
26
  * {
27
  font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
28
  }
29
 
30
+ body, .gradio-container, .main, .wrap {
31
+ background-color: var(--bg-0) !important;
32
+ color: var(--text) !important;
 
33
  }
34
 
35
  .gradio-container {
 
38
  padding: 0 30px;
39
  }
40
 
41
+ footer { visibility: hidden; }
 
 
42
 
43
+ /* ── PAGE HEADER ───────────────────────────────────────────── */
44
  #page-header {
45
  text-align: center;
46
+ padding: 3rem 2rem 2rem;
47
  margin-bottom: 2rem;
48
+ border-bottom: 1px solid var(--border);
49
  }
50
 
51
  #header-container {
 
56
  margin: 0 auto;
57
  }
58
 
59
+ #left-container { flex: 0 0 auto; }
60
+ #centre-container { flex: 1; text-align: center; }
 
 
 
 
 
 
 
61
  #right-container {
62
  flex: 0 0 auto;
63
  width: 150px;
64
+ display: flex;
65
+ align-items: center;
66
+ justify-content: flex-end;
67
  }
68
 
69
  #page-header h1 {
70
  font-size: 3rem;
71
  font-weight: 700;
72
+ color: #ffffff;
73
  margin: 0 0 0.5rem 0;
74
  }
75
 
76
  #page-header p {
77
+ font-size: 1.3rem;
78
+ color: var(--text-muted);
79
  margin: 0;
80
  }
81
 
82
+ #left-container #vector-logo {
83
+ height: 80px;
84
+ width: 80px;
85
+ object-fit: contain;
86
+ filter: brightness(0) invert(1);
87
  }
88
 
89
+ #right-container #humanibench-logo {
90
+ height: 80px;
91
+ width: 80px;
92
+ object-fit: contain;
 
 
93
  }
94
 
95
+ /* ── STATS ─────────────────────────────────────────────────── */
96
  .stats-container {
97
  display: grid;
98
+ grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
99
+ gap: 1.5rem;
100
  max-width: 1300px;
101
  margin: 2rem auto;
102
  padding: 0 1rem;
103
  }
104
 
105
  .stat-box {
106
+ background: var(--bg-2);
107
+ border: 1px solid var(--border);
108
+ border-radius: 10px;
109
  padding: 1.5rem;
110
  text-align: center;
111
+ transition: border-color 0.2s, box-shadow 0.2s;
112
  }
113
 
114
  .stat-box:hover {
115
+ border-color: var(--accent);
116
+ box-shadow: 0 0 18px var(--accent-glow);
117
  }
118
 
119
  .stat-value {
120
+ font-size: 2.5rem;
121
  font-weight: 700;
122
+ color: var(--accent);
123
+ margin-bottom: 0.4rem;
124
  }
125
 
126
  .stat-label {
127
+ font-size: 1rem;
128
+ color: var(--text-muted);
129
  font-weight: 500;
130
  }
131
 
132
+ /* ── BADGES ────────────────────────────────────────────────── */
133
  .badges-container {
134
  display: flex;
135
  justify-content: center;
 
138
  flex-wrap: wrap;
139
  }
140
 
141
+ .badges-container img { height: 22px; }
 
 
142
 
143
+ /* ── TABS ──────────────────────────────────────────────────── */
144
  .tab-nav {
145
+ border-bottom: 1px solid var(--border);
146
  margin-bottom: 2rem;
147
  }
148
 
149
  .tab-nav button {
150
+ font-size: 1rem;
151
  font-weight: 600;
152
+ padding: 0.75rem 1.5rem;
153
  border: none;
154
  background: transparent;
155
+ color: var(--text-muted);
156
+ border-bottom: 2px solid transparent;
157
  transition: all 0.2s;
158
  }
159
 
160
  .tab-nav button:hover {
161
+ color: var(--text);
162
+ background: rgba(255,255,255,0.04);
163
  }
164
 
165
  .tab-nav button[aria-selected="true"] {
166
+ color: var(--accent);
167
+ border-bottom-color: var(--accent);
168
+ background: transparent;
169
+ }
170
+
171
+ /* ── TABLE ─────────────────────────────────────────────────── */
172
+ .humani-leaderboard-table .table-wrap {
173
+ border-radius: 10px;
174
+ overflow: hidden;
175
+ border: 1px solid var(--border);
176
+ box-shadow: 0 4px 20px rgba(0,0,0,0.5);
177
  }
178
 
179
  .humani-leaderboard-table .table-wrap table.table {
180
+ font-family: inherit;
181
+ color: var(--text);
 
 
182
  width: 100%;
183
+ table-layout: auto;
184
+ border-collapse: collapse;
185
+ background: var(--bg-1);
186
  }
187
 
188
  .humani-leaderboard-table .table-wrap table.table a {
189
+ color: var(--accent);
190
+ font-weight: 600;
 
191
  text-decoration: none;
 
192
  }
193
 
194
  .humani-leaderboard-table .table-wrap table.table a:hover {
195
+ color: #ff3da5;
196
+ text-decoration: underline;
197
  }
198
 
199
+ /* Header row */
200
+ .humani-leaderboard-table .table-wrap table.table thead tr {
201
+ background: var(--bg-3);
202
+ border-bottom: 2px solid var(--accent);
203
+ }
204
+
205
+ .humani-leaderboard-table .table-wrap table.table th {
206
+ font-size: 0.78rem;
207
+ font-weight: 700;
208
+ text-transform: uppercase;
209
+ letter-spacing: 0.07em;
210
+ color: var(--text-muted) !important;
211
+ padding: 0.9rem 0.9rem;
212
  text-align: center;
213
+ border: none;
214
+ white-space: nowrap;
215
  }
216
 
217
+ .humani-leaderboard-table .table-wrap table.table th:first-child {
218
+ text-align: left;
219
+ padding-left: 1.2rem;
220
+ color: var(--text) !important;
 
221
  }
222
 
223
+ /* Body rows */
224
+ .humani-leaderboard-table .table-wrap table.table tbody tr {
225
+ border-bottom: 1px solid var(--border);
226
+ transition: background-color 0.15s;
 
227
  }
228
 
229
+ .humani-leaderboard-table .table-wrap table.table tbody tr:nth-child(even) {
230
+ background-color: var(--bg-2);
231
+ }
232
+
233
+ .humani-leaderboard-table .table-wrap table.table tbody tr:hover {
234
+ background-color: var(--accent-dim) !important;
235
+ }
236
+
237
+ .humani-leaderboard-table .table-wrap table.table tbody tr:last-child {
238
+ border-bottom: none;
239
  }
240
 
241
  .humani-leaderboard-table .table-wrap table.table tbody td {
242
+ font-size: 0.95rem;
243
  font-weight: 500;
244
+ color: var(--text);
245
+ padding: 0.75rem 0.9rem;
246
+ text-align: center;
247
+ white-space: nowrap;
248
+ border: none;
249
+ }
250
+
251
+ .humani-leaderboard-table .table-wrap table.table td:first-child {
252
+ text-align: left !important;
253
+ padding-left: 1.2rem;
254
+ font-weight: 600;
255
+ min-width: 160px;
256
  }
257
 
258
+ /* ── INFO / WARNING BOXES ──────────────────────────────────── */
259
  .info-box {
260
+ background-color: var(--accent-dim);
261
+ border-left: 3px solid var(--accent);
262
+ padding: 1.1rem 1.5rem;
263
  border-radius: 4px;
264
  margin: 1.5rem auto;
265
+ font-size: 1rem;
266
  max-width: 1400px;
267
+ color: var(--text);
268
  }
269
 
270
  .info-box h3 {
271
+ color: var(--accent);
272
  margin-top: 0;
273
+ font-size: 1.1rem;
274
  }
275
 
276
  .warning-box {
277
+ background-color: var(--accent-dim);
278
+ border-left: 3px solid var(--accent);
279
+ padding: 1.1rem 1.5rem;
280
  border-radius: 4px;
281
  margin: 1.5rem auto;
282
+ font-size: 1rem;
283
  max-width: 1400px;
284
+ color: var(--text);
285
  }
286
 
287
  .warning-box h3 {
288
+ color: var(--accent);
289
  margin-top: 0;
290
+ font-size: 1.1rem;
291
  }
292
 
293
+ /* ── MARKDOWN TEXT ─────────────────────────────────────────── */
294
  .markdown-text {
295
  line-height: 1.75;
296
+ color: var(--text);
297
  max-width: 1400px !important;
298
  margin: 0 auto;
299
+ font-size: 1rem;
300
  padding: 0 2rem;
301
  }
302
 
303
  .markdown-text h2 {
304
+ color: var(--text);
305
+ font-size: 1.8rem;
306
  margin-top: 2.5rem;
307
  margin-bottom: 1rem;
308
+ padding-bottom: 0.4rem;
309
+ border-bottom: 1px solid var(--border);
310
  }
311
 
312
  .markdown-text h3 {
313
+ color: var(--accent);
314
+ font-size: 1.3rem;
315
  margin-top: 1.75rem;
316
  margin-bottom: 0.75rem;
317
  font-weight: 600;
318
  }
319
 
320
+ .markdown-text p { margin-bottom: 1rem; color: var(--text); }
321
+ .markdown-text li { margin-bottom: 0.4rem; color: var(--text); }
 
 
322
 
323
  .markdown-text ul, .markdown-text ol {
324
  margin-bottom: 1rem;
325
  padding-left: 1.5rem;
326
  }
327
 
328
+ .markdown-text a { color: #b0b0b0; text-decoration: underline; text-decoration-color: #444; }
329
+ .markdown-text a:hover { color: var(--accent); text-decoration-color: var(--accent); }
 
 
330
 
331
  .markdown-text code {
332
+ background-color: var(--bg-3);
333
  padding: 0.2rem 0.4rem;
334
  border-radius: 3px;
335
+ font-size: 0.9em;
336
+ border: 1px solid var(--border);
337
+ color: var(--accent);
338
  }
339
 
340
  .markdown-text pre {
341
+ background-color: var(--bg-2);
342
  padding: 1rem;
343
  border-radius: 6px;
344
  overflow-x: auto;
345
+ border: 1px solid var(--border);
346
  margin: 1rem 0;
347
+ font-size: 0.9rem;
348
+ }
349
+
350
+ .markdown-text pre code {
351
+ color: var(--text-muted);
352
+ background: transparent;
353
+ border: none;
354
+ padding: 0;
355
+ font-size: inherit;
356
  }
357
 
358
  .markdown-text table {
 
363
 
364
  .markdown-text table th,
365
  .markdown-text table td {
366
+ padding: 0.65rem 0.9rem;
367
  text-align: left;
368
+ border: 1px solid var(--border);
369
+ color: var(--text);
370
  }
371
 
372
  .markdown-text table th {
373
+ background-color: var(--bg-3);
374
  font-weight: 600;
375
+ color: var(--accent);
376
  }
377
 
378
  .markdown-text table tr:nth-child(even) {
379
+ background-color: var(--bg-2);
380
  }
381
 
382
+ /* ── FOOTER ────────────────────────────────────────────────── */
383
  #footer {
384
  text-align: center;
385
  padding: 2.5rem 1rem;
386
  margin-top: 4rem;
387
+ border-top: 1px solid var(--border);
388
+ color: var(--text-muted);
389
  font-size: 1rem;
390
  }
391
 
392
+ #footer p { margin: 0.5rem 0; }
 
 
393
 
394
  #footer a {
395
+ color: var(--accent);
396
  text-decoration: none;
397
  font-weight: 600;
398
  margin: 0 0.5rem;
399
  }
400
 
401
+ #footer a:hover { color: #ff3da5; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
 
403
+ /* ── MOBILE ────────────────────────────────────────────────── */
404
  @media (max-width: 768px) {
405
+ .gradio-container { padding: 0 15px !important; }
 
 
406
 
407
+ #header-container { flex-direction: column; }
 
 
408
 
409
+ #left-container, #right-container {
 
410
  width: 100%;
411
  text-align: center;
412
+ justify-content: center;
413
  }
414
 
415
+ #left-container #vector-logo,
416
+ #right-container #humanibench-logo {
417
+ height: 60px;
418
+ width: 60px;
419
+ margin-bottom: 0.5rem;
 
 
 
 
420
  }
421
 
422
+ #page-header h1 { font-size: 1.75rem !important; }
423
+ #page-header p { font-size: 1.1rem !important; }
 
424
 
425
+ .stat-value { font-size: 2rem !important; }
 
 
426
 
427
  .stats-container {
428
  grid-template-columns: repeat(2, 1fr);