KL4805 commited on
Commit
b2cdec4
1 Parent(s): aec9529

Upload OpenVLM_subset.json

Browse files
Files changed (1) hide show
  1. OpenVLM_subset.json +656 -0
OpenVLM_subset.json ADDED
@@ -0,0 +1,656 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "time": "241031154353",
3
+ "results": {
4
+ "GPT-4o (0513, detail-high)": {
5
+ "META": {
6
+ "Method": [
7
+ "GPT-4o (0513, detail-high)",
8
+ "https://openai.com/index/hello-gpt-4o/"
9
+ ],
10
+ "Parameters": "",
11
+ "Language Model": "",
12
+ "Vision Model": "",
13
+ "Org": "OpenAI",
14
+ "Time": "2024/05/31",
15
+ "Verified": "Yes",
16
+ "OpenSource": "No",
17
+ "key": 270,
18
+ "dir_name": "GPT4o_HIGH"
19
+ },
20
+ "SEEDBench_IMG": {
21
+ "Overall": 77.1,
22
+ "Instance Attributes": 79.3,
23
+ "Instance Identity": 81.0,
24
+ "Instance Interaction": 80.4,
25
+ "Instance Location": 72.9,
26
+ "Instances Counting": 69.5,
27
+ "Scene Understanding": 80.1,
28
+ "Spatial Relation": 67.9,
29
+ "Text Understanding": 72.6,
30
+ "Visual Reasoning": 83.1,
31
+ "Overall (official)": "N/A"
32
+ },
33
+ "CCBench": {
34
+ "Overall": 71.2,
35
+ "Sketch Reasoning": 91.1,
36
+ "Historical Figure": 37.1,
37
+ "Calligraphy Painting": 70.2,
38
+ "Scenery Building": 89.5,
39
+ "Food Clothes": 62.6,
40
+ "Cultural Relic": 67.0,
41
+ "Traditional Show": 71.2
42
+ },
43
+ "MMBench_TEST_EN": {
44
+ "Overall": 83.4,
45
+ "CP": 87.4,
46
+ "FP-S": 78.9,
47
+ "FP-C": 83.8,
48
+ "AR": 86.5,
49
+ "LR": 80.3,
50
+ "RR": 80.6
51
+ },
52
+ "MMBench_TEST_CN": {
53
+ "Overall": 82.1,
54
+ "CP": 87.6,
55
+ "FP-S": 76.6,
56
+ "FP-C": 83.4,
57
+ "AR": 83.7,
58
+ "LR": 78.0,
59
+ "RR": 80.1
60
+ },
61
+ "MMBench_TEST_EN_V11": {
62
+ "Overall": 83.0,
63
+ "AR": 90.2,
64
+ "CP": 81.3,
65
+ "FP-C": 86.1,
66
+ "FP-S": 81.4,
67
+ "LR": 78.8,
68
+ "RR": 82.2,
69
+ "Action Recognition": 93.2,
70
+ "Attribute Comparison": 82.7,
71
+ "Attribute Recognition": 91.0,
72
+ "Celebrity Recognition": 62.6,
73
+ "Function Reasoning": 93.3,
74
+ "Future Prediction": 82.7,
75
+ "Identity Reasoning": 98.7,
76
+ "Image Emotion": 81.1,
77
+ "Image Quality": 59.7,
78
+ "Image Scene": 88.2,
79
+ "Image Style": 83.7,
80
+ "Image Topic": 97.8,
81
+ "Nature Relation": 92.4,
82
+ "Object Localization": 84.8,
83
+ "Ocr": 98.9,
84
+ "Physical Property Reasoning": 78.5,
85
+ "Physical Relation": 61.3,
86
+ "Social Relation": 89.0,
87
+ "Spatial Relationship": 78.7,
88
+ "Structuralized Imagetext Understanding": 76.1
89
+ },
90
+ "MMBench_TEST_CN_V11": {
91
+ "Overall": 81.5,
92
+ "AR": 86.5,
93
+ "CP": 81.5,
94
+ "FP-C": 85.0,
95
+ "FP-S": 79.1,
96
+ "LR": 77.2,
97
+ "RR": 79.8,
98
+ "Action Recognition": 94.0,
99
+ "Attribute Comparison": 81.3,
100
+ "Attribute Recognition": 91.0,
101
+ "Celebrity Recognition": 57.8,
102
+ "Function Reasoning": 94.4,
103
+ "Future Prediction": 82.7,
104
+ "Identity Reasoning": 97.4,
105
+ "Image Emotion": 85.6,
106
+ "Image Quality": 58.9,
107
+ "Image Scene": 88.2,
108
+ "Image Style": 80.4,
109
+ "Image Topic": 98.9,
110
+ "Nature Relation": 94.6,
111
+ "Object Localization": 82.9,
112
+ "Ocr": 97.8,
113
+ "Physical Property Reasoning": 67.1,
114
+ "Physical Relation": 53.3,
115
+ "Social Relation": 86.8,
116
+ "Spatial Relationship": 74.7,
117
+ "Structuralized Imagetext Understanding": 73.4
118
+ },
119
+ "MME": {
120
+ "Overall": 2310.3,
121
+ "Perception": 1614.2,
122
+ "Cognition": 696.1,
123
+ "OCR": 192.5,
124
+ "Artwork": 145.2,
125
+ "Celebrity": 67.9,
126
+ "Code Reasoning": 177.5,
127
+ "Color": 185.0,
128
+ "Commonsense Reasoning": 178.6,
129
+ "Count": 185.0,
130
+ "Existence": 185.0,
131
+ "Landmark": 182.0,
132
+ "Numerical Calculation": 147.5,
133
+ "Position": 133.3,
134
+ "Posters": 191.2,
135
+ "Scene": 147.0,
136
+ "Text Translation": 192.5
137
+ },
138
+ "MMVet": {
139
+ "Rec": 67.8,
140
+ "Ocr": 76.8,
141
+ "Know": 58.3,
142
+ "Gen": 56.9,
143
+ "Spat": 74.3,
144
+ "Math": 76.2,
145
+ "Overall": 69.1,
146
+ "Overall (official)": "N/A"
147
+ },
148
+ "MMMU_VAL": {
149
+ "Overall": 69.2,
150
+ "Art & Design": 72.5,
151
+ "Business": 73.3,
152
+ "Science": 64.7,
153
+ "Health & Medicine": 74.0,
154
+ "Humanities & Social Science": 80.8,
155
+ "Tech & Engineering": 57.6
156
+ },
157
+ "MathVista": {
158
+ "Overall": 61.3,
159
+ "SCI": 64.8,
160
+ "TQA": 70.3,
161
+ "NUM": 44.4,
162
+ "ARI": 58.4,
163
+ "VQA": 47.5,
164
+ "GEO": 61.5,
165
+ "ALG": 62.3,
166
+ "GPS": 60.1,
167
+ "MWP": 69.9,
168
+ "LOG": 43.2,
169
+ "FQA": 60.2,
170
+ "STA": 68.4
171
+ },
172
+ "HallusionBench": {
173
+ "aAcc": 70.2,
174
+ "fAcc": 49.1,
175
+ "qAcc": 45.5,
176
+ "Overall": 55.0
177
+ },
178
+ "LLaVABench": {
179
+ "Overall": 102.0,
180
+ "Conv": 93.6,
181
+ "Complex": 111.2,
182
+ "Detail": 93.6,
183
+ "Overall (official)": "N/A"
184
+ },
185
+ "AI2D": {
186
+ "Overall": 84.6,
187
+ "atomStructure": 75.0,
188
+ "eclipses": 90.3,
189
+ "faultsEarthquakes": 78.6,
190
+ "foodChainsWebs": 92.2,
191
+ "lifeCycles": 83.5,
192
+ "moonPhaseEquinox": 68.2,
193
+ "partsOfA": 80.9,
194
+ "partsOfTheEarth": 82.7,
195
+ "photosynthesisRespiration": 83.5,
196
+ "rockCycle": 73.1,
197
+ "rockStrata": 87.8,
198
+ "solarSystem": 97.2,
199
+ "typesOf": 81.0,
200
+ "volcano": 100.0,
201
+ "waterCNPCycle": 68.2
202
+ },
203
+ "ScienceQA_VAL": {
204
+ "Overall": 89.7,
205
+ "Adaptations": 97.9,
206
+ "Adaptations and natural selection": 100.0,
207
+ "Age of Exploration": 100.0,
208
+ "Ancient Egypt and Kush": 100.0,
209
+ "Ancient Mesopotamia": 100.0,
210
+ "Animals": 100.0,
211
+ "Astronomy": 100.0,
212
+ "Atoms and molecules": 100.0,
213
+ "Basic economic principles": 32.8,
214
+ "Chemical reactions": 100.0,
215
+ "Cities": 87.5,
216
+ "Classification": 98.8,
217
+ "Classification and scientific names": 100.0,
218
+ "Climate change": 100.0,
219
+ "Colonial America": 90.5,
220
+ "Context clues": 100.0,
221
+ "Descriptive details": 100.0,
222
+ "Designing experiments": 100.0,
223
+ "Domain-specific vocabulary": 60.0,
224
+ "Early 19th century American history": 100.0,
225
+ "Early Americas": 50.0,
226
+ "Earth events": 100.0,
227
+ "Ecological interactions": 76.0,
228
+ "Ecosystems": 95.5,
229
+ "Engineering practices": 100.0,
230
+ "English colonies in North America": 74.4,
231
+ "Force and motion": 84.0,
232
+ "Fossils": 82.4,
233
+ "Genes to traits": 83.0,
234
+ "Geography": 98.6,
235
+ "Government": 100.0,
236
+ "Independent reading comprehension": 100.0,
237
+ "Informational texts: level 1": 100.0,
238
+ "Magnets": 72.2,
239
+ "Maps": 96.8,
240
+ "Materials": 96.6,
241
+ "Medieval Asia": 100.0,
242
+ "Natural resources and human impacts": 100.0,
243
+ "Oceania: geography": 59.6,
244
+ "Oceans and continents": 100.0,
245
+ "Oceans and continents\t": 100.0,
246
+ "Particle motion and energy": 92.6,
247
+ "Persuasive strategies": 100.0,
248
+ "Physical Geography": 83.7,
249
+ "Plant reproduction": 90.0,
250
+ "Plants": 100.0,
251
+ "Plate tectonics": 100.0,
252
+ "Read-alone texts": 100.0,
253
+ "Rocks and minerals": 100.0,
254
+ "Rome and the Byzantine Empire": 100.0,
255
+ "Scientific names": 100.0,
256
+ "Solutions": 65.7,
257
+ "State capitals": 100.0,
258
+ "States": 100.0,
259
+ "States of matter": 97.4,
260
+ "The American Revolution": 100.0,
261
+ "The Americas: geography": 83.3,
262
+ "The Antebellum period": 100.0,
263
+ "The Civil War and Reconstruction": 100.0,
264
+ "The Silk Road": 100.0,
265
+ "Thermal energy": 100.0,
266
+ "Velocity, acceleration, and forces": 68.6,
267
+ "Visual elements": 100.0,
268
+ "Water cycle": 100.0,
269
+ "Weather and climate": 90.6,
270
+ "World religions": 100.0
271
+ },
272
+ "ScienceQA_TEST": {
273
+ "Overall": 90.7,
274
+ "Adaptations": 100.0,
275
+ "Ancient Egypt and Kush": 100.0,
276
+ "Ancient Mesopotamia": 100.0,
277
+ "Animals": 100.0,
278
+ "Astronomy": 100.0,
279
+ "Atoms and molecules": 100.0,
280
+ "Basic economic principles": 38.0,
281
+ "Cells": 100.0,
282
+ "Chemical reactions": 100.0,
283
+ "Cities": 91.7,
284
+ "Classification": 100.0,
285
+ "Classification and scientific names": 100.0,
286
+ "Climate change": 100.0,
287
+ "Colonial America": 81.6,
288
+ "Context clues": 100.0,
289
+ "Descriptive details": 100.0,
290
+ "Designing experiments": 100.0,
291
+ "Domain-specific vocabulary": 100.0,
292
+ "Early 19th century American history": 100.0,
293
+ "Earth events": 100.0,
294
+ "Ecological interactions": 66.7,
295
+ "Ecosystems": 90.4,
296
+ "Engineering practices": 98.2,
297
+ "English colonies in North America": 92.3,
298
+ "Force and motion": 100.0,
299
+ "Fossils": 100.0,
300
+ "Genes to traits": 76.3,
301
+ "Geography": 95.2,
302
+ "Government": 100.0,
303
+ "Greece": 100.0,
304
+ "Independent reading comprehension": 100.0,
305
+ "Informational texts: level 1": 100.0,
306
+ "Kinetic and potential energy": 100.0,
307
+ "Magnets": 77.3,
308
+ "Maps": 97.8,
309
+ "Materials": 96.5,
310
+ "Medieval Asia": 100.0,
311
+ "Oceania: geography": 76.5,
312
+ "Oceans and continents": 100.0,
313
+ "Oceans and continents\t": 100.0,
314
+ "Particle motion and energy": 97.6,
315
+ "Persuasive strategies": 100.0,
316
+ "Photosynthesis": 100.0,
317
+ "Physical Geography": 92.2,
318
+ "Plant reproduction": 100.0,
319
+ "Plants": 66.7,
320
+ "Plate tectonics": 100.0,
321
+ "Read-alone texts": 100.0,
322
+ "Rocks and minerals": 100.0,
323
+ "Scientific names": 100.0,
324
+ "Solutions": 72.2,
325
+ "State capitals": 100.0,
326
+ "States": 94.4,
327
+ "States of matter": 100.0,
328
+ "The American Revolution": 100.0,
329
+ "The Americas: geography": 71.1,
330
+ "The Antebellum period": 100.0,
331
+ "The Civil War and Reconstruction": 100.0,
332
+ "Thermal energy": 95.5,
333
+ "Topographic maps": 100.0,
334
+ "Velocity, acceleration, and forces": 67.7,
335
+ "Visual elements": 100.0,
336
+ "Water cycle": 100.0,
337
+ "Weather and climate": 91.4,
338
+ "World religions": 100.0
339
+ },
340
+ "OCRBench": {
341
+ "Text Recognition": 199,
342
+ "Scene Text-centric VQA": 181,
343
+ "Doc-oriented VQA": 168,
344
+ "Key Information Extraction": 170,
345
+ "Handwritten Mathematical Expression Recognition": 18,
346
+ "Final Score": 736
347
+ },
348
+ "MMStar": {
349
+ "Overall": 63.9,
350
+ "coarse perception": 73.6,
351
+ "fine-grained perception": 54.8,
352
+ "instance reasoning": 66.4,
353
+ "logical reasoning": 72.0,
354
+ "math": 66.4,
355
+ "science & technology": 50.0
356
+ },
357
+ "RealWorldQA": {
358
+ "Overall": 75.4
359
+ },
360
+ "POPE": {
361
+ "Overall": 85.6,
362
+ "acc": 86.7,
363
+ "precision": 93.0,
364
+ "recall": 79.3
365
+ },
366
+ "SEEDBench2_Plus": {
367
+ "Overall": 72.0,
368
+ "chart": 71.4,
369
+ "map": 62.0,
370
+ "web": 85.2
371
+ },
372
+ "MMT-Bench_VAL": {
373
+ "Overall": 67.3,
374
+ "VR": 85.3,
375
+ "Loc": 68.1,
376
+ "OCR": 82.5,
377
+ "Count": 57.2,
378
+ "HLN": 75.0,
379
+ "IR": 85.0,
380
+ "3D": 57.5,
381
+ "VC": 87.9,
382
+ "VG": 46.2,
383
+ "DU": 72.9,
384
+ "AR": 51.0,
385
+ "PLP": 43.5,
386
+ "I2IT": 50.0,
387
+ "RR": 76.2,
388
+ "IQT": 15.0,
389
+ "Emo": 58.3,
390
+ "VI": 33.9,
391
+ "MemU": 87.5,
392
+ "VPU": 84.9,
393
+ "AND": 57.0,
394
+ "KD": 57.1,
395
+ "VCR": 80.0,
396
+ "IEJ": 40.0,
397
+ "MIA": 42.5,
398
+ "CIM": 61.7,
399
+ "TU": 49.5,
400
+ "VP": 66.7,
401
+ "MedU": 74.0,
402
+ "AUD": 58.0,
403
+ "DKR": 64.6,
404
+ "EA": 90.0,
405
+ "GN": 46.2,
406
+ "abstract_visual_recognition": 85.0,
407
+ "action_quality_assessment": 15.0,
408
+ "age_gender_race_recognition": 60.0,
409
+ "anatomy_identification": 75.0,
410
+ "animal_keypoint_detection": 35.0,
411
+ "animals_recognition": 100.0,
412
+ "animated_character_recognition": 90.0,
413
+ "art_design": 81.8,
414
+ "artwork_emotion_recognition": 55.0,
415
+ "astronomical_recognition": 100.0,
416
+ "attribute_hallucination": 80.0,
417
+ "behavior_anomaly_detection": 30.0,
418
+ "body_emotion_recognition": 40.0,
419
+ "building_recognition": 90.0,
420
+ "business": 66.7,
421
+ "camouflage_object_detection": 55.0,
422
+ "celebrity_recognition": 0.0,
423
+ "chart_to_table": 95.0,
424
+ "chart_to_text": 90.0,
425
+ "chart_vqa": 70.0,
426
+ "chemical_apparatusn_recognition": 80.0,
427
+ "clock_reading": 30.0,
428
+ "clothes_keypoint_detection": 70.0,
429
+ "color_assimilation": 35.0,
430
+ "color_constancy": 14.3,
431
+ "color_contrast": 40.0,
432
+ "color_recognition": 95.0,
433
+ "counting_by_category": 33.8,
434
+ "counting_by_reasoning": 95.0,
435
+ "counting_by_visual_prompting": 50.0,
436
+ "crowd_counting": 50.0,
437
+ "deepfake_detection": 60.0,
438
+ "depth_estimation": 40.0,
439
+ "disaster_recognition": 85.0,
440
+ "disease_diagnose": 60.0,
441
+ "doc_vqa": 80.0,
442
+ "electronic_object_recognition": 100.0,
443
+ "eqn2latex": 90.0,
444
+ "exist_hallucination": 90.0,
445
+ "facail_expression_change_recognition": 95.0,
446
+ "face_detection": 90.0,
447
+ "face_mask_anomaly_dectection": 70.0,
448
+ "face_retrieval": 100.0,
449
+ "facial_expression_recognition": 75.0,
450
+ "fashion_recognition": 75.0,
451
+ "film_and_television_recognition": 95.0,
452
+ "font_recognition": 50.0,
453
+ "food_recognition": 100.0,
454
+ "furniture_keypoint_detection": 55.0,
455
+ "gaze_estimation": 10.0,
456
+ "general_action_recognition": 95.0,
457
+ "geometrical_perspective": 50.0,
458
+ "geometrical_relativity": 30.0,
459
+ "gesture_recognition": 65.0,
460
+ "google_apps": 50.0,
461
+ "gui_general": 45.0,
462
+ "gui_install": 50.0,
463
+ "handwritten_mathematical_expression_recognition": 90.0,
464
+ "handwritten_retrieval": 90.0,
465
+ "handwritten_text_recognition": 100.0,
466
+ "health_medicine": 92.9,
467
+ "helmet_anomaly_detection": 90.0,
468
+ "human_interaction_understanding": 95.0,
469
+ "human_keypoint_detection": 70.0,
470
+ "human_object_interaction_recognition": 75.0,
471
+ "humanitites_social_science": 54.5,
472
+ "image2image_retrieval": 75.0,
473
+ "image_based_action_recognition": 95.0,
474
+ "image_captioning": 100.0,
475
+ "image_captioning_paragraph": 95.0,
476
+ "image_colorization": 60.0,
477
+ "image_dense_captioning": 68.4,
478
+ "image_matting": 15.0,
479
+ "image_quality_assessment": 35.0,
480
+ "image_season_recognition": 80.0,
481
+ "industrial_produce_anomaly_detection": 40.0,
482
+ "instance_captioning": 95.0,
483
+ "interactive_segmentation": 85.7,
484
+ "jigsaw_puzzle_solving": 40.0,
485
+ "landmark_recognition": 100.0,
486
+ "lesion_grading": 90.0,
487
+ "logo_and_brand_recognition": 95.0,
488
+ "lvlm_response_judgement": 45.0,
489
+ "medical_modality_recognition": 100.0,
490
+ "meme_image_understanding": 95.0,
491
+ "meme_vedio_understanding": 80.0,
492
+ "mevis": 30.0,
493
+ "micro_expression_recognition": 20.0,
494
+ "multiple_image_captioning": 95.0,
495
+ "multiple_instance_captioning": 95.0,
496
+ "multiple_view_image_understanding": 10.0,
497
+ "muscial_instrument_recognition": 95.0,
498
+ "national_flag_recognition": 100.0,
499
+ "navigation": 90.0,
500
+ "next_img_prediction": 65.0,
501
+ "object_detection": 90.0,
502
+ "one_shot_detection": 85.0,
503
+ "order_hallucination": 50.0,
504
+ "other_biological_attributes": 45.0,
505
+ "painting_recognition": 90.0,
506
+ "person_reid": 95.0,
507
+ "pixel_localization": 25.0,
508
+ "pixel_recognition": 55.0,
509
+ "plant_recognition": 90.0,
510
+ "point_tracking": 35.0,
511
+ "polygon_localization": 40.0,
512
+ "profession_recognition": 90.0,
513
+ "ravens_progressive_matrices": 15.0,
514
+ "reason_seg": 47.4,
515
+ "referring_detection": 45.0,
516
+ "relation_hallucination": 80.0,
517
+ "religious_recognition": 75.0,
518
+ "remote_sensing_object_detection": 60.0,
519
+ "rock_recognition": 80.0,
520
+ "rotated_object_detection": 77.8,
521
+ "salient_object_detection_rgb": 55.0,
522
+ "salient_object_detection_rgbd": 50.0,
523
+ "scene_emotion_recognition": 65.0,
524
+ "scene_graph_recognition": 85.0,
525
+ "scene_recognition": 65.0,
526
+ "scene_text_recognition": 90.0,
527
+ "science": 58.3,
528
+ "screenshot2code": 60.0,
529
+ "sculpture_recognition": 80.0,
530
+ "shape_recognition": 95.0,
531
+ "sign_language_recognition": 40.0,
532
+ "single_object_tracking": 65.0,
533
+ "sketch2code": 50.0,
534
+ "sketch2image_retrieval": 95.0,
535
+ "small_object_detection": 60.0,
536
+ "social_relation_recognition": 50.0,
537
+ "som_recognition": 94.7,
538
+ "sports_recognition": 95.0,
539
+ "spot_the_diff": 10.0,
540
+ "spot_the_similarity": 75.0,
541
+ "table_structure_recognition": 50.0,
542
+ "tech_engineering": 33.3,
543
+ "temporal_anticipation": 75.0,
544
+ "temporal_localization": 52.6,
545
+ "temporal_ordering": 25.0,
546
+ "temporal_sequence_understanding": 25.0,
547
+ "text2image_retrieval": 55.0,
548
+ "texture_material_recognition": 75.0,
549
+ "threed_cad_recognition": 70.0,
550
+ "threed_indoor_recognition": 45.0,
551
+ "traffic_anomaly_detection": 55.0,
552
+ "traffic_light_understanding": 100.0,
553
+ "traffic_participants_understanding": 60.0,
554
+ "traffic_sign_understanding": 95.0,
555
+ "transparent_object_detection": 75.0,
556
+ "vehicle_keypoint_detection": 55.6,
557
+ "vehicle_recognition": 100.0,
558
+ "vehicle_retrieval": 85.0,
559
+ "video_captioning": 95.0,
560
+ "visual_document_information_extraction": 95.0,
561
+ "visual_prompt_understanding": 75.0,
562
+ "waste_recognition": 100.0,
563
+ "weapon_recognition": 100.0,
564
+ "weather_recognition": 100.0,
565
+ "web_shopping": 40.0,
566
+ "whoops": 80.0,
567
+ "writing_poetry_from_image": 60.0
568
+ },
569
+ "BLINK": {
570
+ "Overall": 68.0,
571
+ "Art_Style": 82.9,
572
+ "Counting": 66.7,
573
+ "Forensic_Detection": 90.9,
574
+ "Functional_Correspondence": 43.1,
575
+ "IQ_Test": 32.0,
576
+ "Jigsaw": 76.7,
577
+ "Multi-view_Reasoning": 58.6,
578
+ "Object_Localization": 69.7,
579
+ "Relative_Depth": 75.8,
580
+ "Relative_Reflectance": 32.8,
581
+ "Semantic_Correspondence": 61.2,
582
+ "Spatial_Relation": 83.2,
583
+ "Visual_Correspondence": 92.4,
584
+ "Visual_Similarity": 83.0
585
+ },
586
+ "QBench": {
587
+ "Overall": 78.9,
588
+ "type_0_concern_0": 82.4,
589
+ "type_0_concern_1": 82.3,
590
+ "type_0_concern_2": 81.2,
591
+ "type_0_concern_3": 87.1,
592
+ "type_1_concern_0": 76.7,
593
+ "type_1_concern_1": 84.8,
594
+ "type_1_concern_2": 87.0,
595
+ "type_1_concern_3": 88.9,
596
+ "type_2_concern_0": 66.5,
597
+ "type_2_concern_1": 72.4,
598
+ "type_2_concern_2": 66.7,
599
+ "type_2_concern_3": 80.0
600
+ },
601
+ "ABench": {
602
+ "Overall": 79.2,
603
+ "part1 -> bag_of_words -> attribute": 92.7,
604
+ "part1 -> bag_of_words -> composition -> arrangement": 86.7,
605
+ "part1 -> bag_of_words -> composition -> occlusion": 60.0,
606
+ "part1 -> bag_of_words -> composition -> orientation": 76.9,
607
+ "part1 -> bag_of_words -> composition -> size": 71.4,
608
+ "part1 -> bag_of_words -> counting": 79.6,
609
+ "part1 -> bag_of_words -> noun_as_adjective": 81.4,
610
+ "part1 -> basic_recognition -> major": 92.9,
611
+ "part1 -> basic_recognition -> minor": 93.2,
612
+ "part1 -> outside_knowledge -> contradiction overcome": 70.8,
613
+ "part1 -> outside_knowledge -> specific-terms -> company": 100.0,
614
+ "part1 -> outside_knowledge -> specific-terms -> creature": 83.3,
615
+ "part1 -> outside_knowledge -> specific-terms -> daily": 94.1,
616
+ "part1 -> outside_knowledge -> specific-terms -> food": 95.5,
617
+ "part1 -> outside_knowledge -> specific-terms -> geography": 81.0,
618
+ "part1 -> outside_knowledge -> specific-terms -> material": 95.2,
619
+ "part1 -> outside_knowledge -> specific-terms -> science": 100.0,
620
+ "part1 -> outside_knowledge -> specific-terms -> sports": 68.2,
621
+ "part1 -> outside_knowledge -> specific-terms -> style -> abstract": 100.0,
622
+ "part1 -> outside_knowledge -> specific-terms -> style -> art": 100.0,
623
+ "part1 -> outside_knowledge -> specific-terms -> style -> art_deco": 100.0,
624
+ "part1 -> outside_knowledge -> specific-terms -> style -> cubism": 100.0,
625
+ "part1 -> outside_knowledge -> specific-terms -> style -> dadaism": 100.0,
626
+ "part1 -> outside_knowledge -> specific-terms -> style -> deco": 100.0,
627
+ "part1 -> outside_knowledge -> specific-terms -> style -> expressionism": 100.0,
628
+ "part1 -> outside_knowledge -> specific-terms -> style -> fauvism": 100.0,
629
+ "part1 -> outside_knowledge -> specific-terms -> style -> futurism": 66.7,
630
+ "part1 -> outside_knowledge -> specific-terms -> style -> minimalism": 100.0,
631
+ "part1 -> outside_knowledge -> specific-terms -> style -> pop": 100.0,
632
+ "part1 -> outside_knowledge -> specific-terms -> style -> psychedelic": 100.0,
633
+ "part1 -> outside_knowledge -> specific-terms -> style -> steampunk": 100.0,
634
+ "part1 -> outside_knowledge -> specific-terms -> style -> surrealism": 100.0,
635
+ "part1 -> outside_knowledge -> specific-terms -> style -> victorian": 0.0,
636
+ "part1 -> outside_knowledge -> specific-terms -> vehicle": 94.7,
637
+ "part1 -> outside_knowledge -> specific-terms -> weather": 92.3,
638
+ "part2 -> aesthetic": 62.6,
639
+ "part2 -> generative": 72.4,
640
+ "part2 -> technical": 74.9
641
+ },
642
+ "MTVQA": {
643
+ "Overall": 31.2,
644
+ "AR": 21.3,
645
+ "DE": 35.1,
646
+ "FR": 42.2,
647
+ "IT": 37.2,
648
+ "JA": 19.9,
649
+ "KR": 35.1,
650
+ "RU": 15.9,
651
+ "TH": 26.0,
652
+ "VI": 39.6
653
+ }
654
+ }
655
+ }
656
+ }