Yilun Jin committed
Commit 0302c93
1 Parent(s): 3647aad

is there a bug?

ShoppingMMLU.json CHANGED
@@ -17,125 +17,7 @@
17
  "key": 270,
18
  "dir_name": "GPT4o_HIGH"
19
  },
20
- "SEEDBench_IMG": {
21
- "Overall": 77.1,
22
- "Instance Attributes": 79.3,
23
- "Instance Identity": 81.0,
24
- "Instance Interaction": 80.4,
25
- "Instance Location": 72.9,
26
- "Instances Counting": 69.5,
27
- "Scene Understanding": 80.1,
28
- "Spatial Relation": 67.9,
29
- "Text Understanding": 72.6,
30
- "Visual Reasoning": 83.1,
31
- "Overall (official)": "N/A"
32
- },
33
- "CCBench": {
34
- "Overall": 71.2,
35
- "Sketch Reasoning": 91.1,
36
- "Historical Figure": 37.1,
37
- "Calligraphy Painting": 70.2,
38
- "Scenery Building": 89.5,
39
- "Food Clothes": 62.6,
40
- "Cultural Relic": 67.0,
41
- "Traditional Show": 71.2
42
- },
43
- "MMBench_TEST_EN": {
44
- "Overall": 83.4,
45
- "CP": 87.4,
46
- "FP-S": 78.9,
47
- "FP-C": 83.8,
48
- "AR": 86.5,
49
- "LR": 80.3,
50
- "RR": 80.6
51
- },
52
- "MMBench_TEST_CN": {
53
- "Overall": 82.1,
54
- "CP": 87.6,
55
- "FP-S": 76.6,
56
- "FP-C": 83.4,
57
- "AR": 83.7,
58
- "LR": 78.0,
59
- "RR": 80.1
60
- },
61
- "MMBench_TEST_EN_V11": {
62
- "Overall": 83.0,
63
- "AR": 90.2,
64
- "CP": 81.3,
65
- "FP-C": 86.1,
66
- "FP-S": 81.4,
67
- "LR": 78.8,
68
- "RR": 82.2,
69
- "Action Recognition": 93.2,
70
- "Attribute Comparison": 82.7,
71
- "Attribute Recognition": 91.0,
72
- "Celebrity Recognition": 62.6,
73
- "Function Reasoning": 93.3,
74
- "Future Prediction": 82.7,
75
- "Identity Reasoning": 98.7,
76
- "Image Emotion": 81.1,
77
- "Image Quality": 59.7,
78
- "Image Scene": 88.2,
79
- "Image Style": 83.7,
80
- "Image Topic": 97.8,
81
- "Nature Relation": 92.4,
82
- "Object Localization": 84.8,
83
- "Ocr": 98.9,
84
- "Physical Property Reasoning": 78.5,
85
- "Physical Relation": 61.3,
86
- "Social Relation": 89.0,
87
- "Spatial Relationship": 78.7,
88
- "Structuralized Imagetext Understanding": 76.1
89
- },
90
- "MMBench_TEST_CN_V11": {
91
- "Overall": 81.5,
92
- "AR": 86.5,
93
- "CP": 81.5,
94
- "FP-C": 85.0,
95
- "FP-S": 79.1,
96
- "LR": 77.2,
97
- "RR": 79.8,
98
- "Action Recognition": 94.0,
99
- "Attribute Comparison": 81.3,
100
- "Attribute Recognition": 91.0,
101
- "Celebrity Recognition": 57.8,
102
- "Function Reasoning": 94.4,
103
- "Future Prediction": 82.7,
104
- "Identity Reasoning": 97.4,
105
- "Image Emotion": 85.6,
106
- "Image Quality": 58.9,
107
- "Image Scene": 88.2,
108
- "Image Style": 80.4,
109
- "Image Topic": 98.9,
110
- "Nature Relation": 94.6,
111
- "Object Localization": 82.9,
112
- "Ocr": 97.8,
113
- "Physical Property Reasoning": 67.1,
114
- "Physical Relation": 53.3,
115
- "Social Relation": 86.8,
116
- "Spatial Relationship": 74.7,
117
- "Structuralized Imagetext Understanding": 73.4
118
- },
119
- "MME": {
120
- "Overall": 2310.3,
121
- "Perception": 1614.2,
122
- "Cognition": 696.1,
123
- "OCR": 192.5,
124
- "Artwork": 145.2,
125
- "Celebrity": 67.9,
126
- "Code Reasoning": 177.5,
127
- "Color": 185.0,
128
- "Commonsense Reasoning": 178.6,
129
- "Count": 185.0,
130
- "Existence": 185.0,
131
- "Landmark": 182.0,
132
- "Numerical Calculation": 147.5,
133
- "Position": 133.3,
134
- "Posters": 191.2,
135
- "Scene": 147.0,
136
- "Text Translation": 192.5
137
- },
138
- "MMVet": {
139
  "Rec": 67.8,
140
  "Ocr": 76.8,
141
  "Know": 58.3,
@@ -145,16 +27,7 @@
145
  "Overall": 69.1,
146
  "Overall (official)": "N/A"
147
  },
148
- "MMMU_VAL": {
149
- "Overall": 69.2,
150
- "Art & Design": 72.5,
151
- "Business": 73.3,
152
- "Science": 64.7,
153
- "Health & Medicine": 74.0,
154
- "Humanities & Social Science": 80.8,
155
- "Tech & Engineering": 57.6
156
- },
157
- "MathVista": {
158
  "Overall": 61.3,
159
  "SCI": 64.8,
160
  "TQA": 70.3,
@@ -169,183 +42,15 @@
169
  "FQA": 60.2,
170
  "STA": 68.4
171
  },
172
- "HallusionBench": {
173
- "aAcc": 70.2,
174
- "fAcc": 49.1,
175
- "qAcc": 45.5,
176
- "Overall": 55.0
177
- },
178
- "LLaVABench": {
179
- "Overall": 102.0,
180
- "Conv": 93.6,
181
- "Complex": 111.2,
182
- "Detail": 93.6,
183
- "Overall (official)": "N/A"
184
- },
185
- "AI2D": {
186
- "Overall": 84.6,
187
- "atomStructure": 75.0,
188
- "eclipses": 90.3,
189
- "faultsEarthquakes": 78.6,
190
- "foodChainsWebs": 92.2,
191
- "lifeCycles": 83.5,
192
- "moonPhaseEquinox": 68.2,
193
- "partsOfA": 80.9,
194
- "partsOfTheEarth": 82.7,
195
- "photosynthesisRespiration": 83.5,
196
- "rockCycle": 73.1,
197
- "rockStrata": 87.8,
198
- "solarSystem": 97.2,
199
- "typesOf": 81.0,
200
- "volcano": 100.0,
201
- "waterCNPCycle": 68.2
202
- },
203
- "ScienceQA_VAL": {
204
- "Overall": 89.7,
205
- "Adaptations": 97.9,
206
- "Adaptations and natural selection": 100.0,
207
- "Age of Exploration": 100.0,
208
- "Ancient Egypt and Kush": 100.0,
209
- "Ancient Mesopotamia": 100.0,
210
- "Animals": 100.0,
211
- "Astronomy": 100.0,
212
- "Atoms and molecules": 100.0,
213
- "Basic economic principles": 32.8,
214
- "Chemical reactions": 100.0,
215
- "Cities": 87.5,
216
- "Classification": 98.8,
217
- "Classification and scientific names": 100.0,
218
- "Climate change": 100.0,
219
- "Colonial America": 90.5,
220
- "Context clues": 100.0,
221
- "Descriptive details": 100.0,
222
- "Designing experiments": 100.0,
223
- "Domain-specific vocabulary": 60.0,
224
- "Early 19th century American history": 100.0,
225
- "Early Americas": 50.0,
226
- "Earth events": 100.0,
227
- "Ecological interactions": 76.0,
228
- "Ecosystems": 95.5,
229
- "Engineering practices": 100.0,
230
- "English colonies in North America": 74.4,
231
- "Force and motion": 84.0,
232
- "Fossils": 82.4,
233
- "Genes to traits": 83.0,
234
- "Geography": 98.6,
235
- "Government": 100.0,
236
- "Independent reading comprehension": 100.0,
237
- "Informational texts: level 1": 100.0,
238
- "Magnets": 72.2,
239
- "Maps": 96.8,
240
- "Materials": 96.6,
241
- "Medieval Asia": 100.0,
242
- "Natural resources and human impacts": 100.0,
243
- "Oceania: geography": 59.6,
244
- "Oceans and continents": 100.0,
245
- "Oceans and continents\t": 100.0,
246
- "Particle motion and energy": 92.6,
247
- "Persuasive strategies": 100.0,
248
- "Physical Geography": 83.7,
249
- "Plant reproduction": 90.0,
250
- "Plants": 100.0,
251
- "Plate tectonics": 100.0,
252
- "Read-alone texts": 100.0,
253
- "Rocks and minerals": 100.0,
254
- "Rome and the Byzantine Empire": 100.0,
255
- "Scientific names": 100.0,
256
- "Solutions": 65.7,
257
- "State capitals": 100.0,
258
- "States": 100.0,
259
- "States of matter": 97.4,
260
- "The American Revolution": 100.0,
261
- "The Americas: geography": 83.3,
262
- "The Antebellum period": 100.0,
263
- "The Civil War and Reconstruction": 100.0,
264
- "The Silk Road": 100.0,
265
- "Thermal energy": 100.0,
266
- "Velocity, acceleration, and forces": 68.6,
267
- "Visual elements": 100.0,
268
- "Water cycle": 100.0,
269
- "Weather and climate": 90.6,
270
- "World religions": 100.0
271
- },
272
- "ScienceQA_TEST": {
273
- "Overall": 90.7,
274
- "Adaptations": 100.0,
275
- "Ancient Egypt and Kush": 100.0,
276
- "Ancient Mesopotamia": 100.0,
277
- "Animals": 100.0,
278
- "Astronomy": 100.0,
279
- "Atoms and molecules": 100.0,
280
- "Basic economic principles": 38.0,
281
- "Cells": 100.0,
282
- "Chemical reactions": 100.0,
283
- "Cities": 91.7,
284
- "Classification": 100.0,
285
- "Classification and scientific names": 100.0,
286
- "Climate change": 100.0,
287
- "Colonial America": 81.6,
288
- "Context clues": 100.0,
289
- "Descriptive details": 100.0,
290
- "Designing experiments": 100.0,
291
- "Domain-specific vocabulary": 100.0,
292
- "Early 19th century American history": 100.0,
293
- "Earth events": 100.0,
294
- "Ecological interactions": 66.7,
295
- "Ecosystems": 90.4,
296
- "Engineering practices": 98.2,
297
- "English colonies in North America": 92.3,
298
- "Force and motion": 100.0,
299
- "Fossils": 100.0,
300
- "Genes to traits": 76.3,
301
- "Geography": 95.2,
302
- "Government": 100.0,
303
- "Greece": 100.0,
304
- "Independent reading comprehension": 100.0,
305
- "Informational texts: level 1": 100.0,
306
- "Kinetic and potential energy": 100.0,
307
- "Magnets": 77.3,
308
- "Maps": 97.8,
309
- "Materials": 96.5,
310
- "Medieval Asia": 100.0,
311
- "Oceania: geography": 76.5,
312
- "Oceans and continents": 100.0,
313
- "Oceans and continents\t": 100.0,
314
- "Particle motion and energy": 97.6,
315
- "Persuasive strategies": 100.0,
316
- "Photosynthesis": 100.0,
317
- "Physical Geography": 92.2,
318
- "Plant reproduction": 100.0,
319
- "Plants": 66.7,
320
- "Plate tectonics": 100.0,
321
- "Read-alone texts": 100.0,
322
- "Rocks and minerals": 100.0,
323
- "Scientific names": 100.0,
324
- "Solutions": 72.2,
325
- "State capitals": 100.0,
326
- "States": 94.4,
327
- "States of matter": 100.0,
328
- "The American Revolution": 100.0,
329
- "The Americas: geography": 71.1,
330
- "The Antebellum period": 100.0,
331
- "The Civil War and Reconstruction": 100.0,
332
- "Thermal energy": 95.5,
333
- "Topographic maps": 100.0,
334
- "Velocity, acceleration, and forces": 67.7,
335
- "Visual elements": 100.0,
336
- "Water cycle": 100.0,
337
- "Weather and climate": 91.4,
338
- "World religions": 100.0
339
- },
340
- "OCRBench": {
341
  "Text Recognition": 199,
342
  "Scene Text-centric VQA": 181,
343
  "Doc-oriented VQA": 168,
344
  "Key Information Extraction": 170,
345
  "Handwritten Mathematical Expression Recognition": 18,
346
- "Final Score": 736
347
  },
348
- "MMStar": {
349
  "Overall": 63.9,
350
  "coarse perception": 73.6,
351
  "fine-grained perception": 54.8,
@@ -353,303 +58,6 @@
353
  "logical reasoning": 72.0,
354
  "math": 66.4,
355
  "science & technology": 50.0
356
- },
357
- "RealWorldQA": {
358
- "Overall": 75.4
359
- },
360
- "POPE": {
361
- "Overall": 85.6,
362
- "acc": 86.7,
363
- "precision": 93.0,
364
- "recall": 79.3
365
- },
366
- "SEEDBench2_Plus": {
367
- "Overall": 72.0,
368
- "chart": 71.4,
369
- "map": 62.0,
370
- "web": 85.2
371
- },
372
- "MMT-Bench_VAL": {
373
- "Overall": 67.3,
374
- "VR": 85.3,
375
- "Loc": 68.1,
376
- "OCR": 82.5,
377
- "Count": 57.2,
378
- "HLN": 75.0,
379
- "IR": 85.0,
380
- "3D": 57.5,
381
- "VC": 87.9,
382
- "VG": 46.2,
383
- "DU": 72.9,
384
- "AR": 51.0,
385
- "PLP": 43.5,
386
- "I2IT": 50.0,
387
- "RR": 76.2,
388
- "IQT": 15.0,
389
- "Emo": 58.3,
390
- "VI": 33.9,
391
- "MemU": 87.5,
392
- "VPU": 84.9,
393
- "AND": 57.0,
394
- "KD": 57.1,
395
- "VCR": 80.0,
396
- "IEJ": 40.0,
397
- "MIA": 42.5,
398
- "CIM": 61.7,
399
- "TU": 49.5,
400
- "VP": 66.7,
401
- "MedU": 74.0,
402
- "AUD": 58.0,
403
- "DKR": 64.6,
404
- "EA": 90.0,
405
- "GN": 46.2,
406
- "abstract_visual_recognition": 85.0,
407
- "action_quality_assessment": 15.0,
408
- "age_gender_race_recognition": 60.0,
409
- "anatomy_identification": 75.0,
410
- "animal_keypoint_detection": 35.0,
411
- "animals_recognition": 100.0,
412
- "animated_character_recognition": 90.0,
413
- "art_design": 81.8,
414
- "artwork_emotion_recognition": 55.0,
415
- "astronomical_recognition": 100.0,
416
- "attribute_hallucination": 80.0,
417
- "behavior_anomaly_detection": 30.0,
418
- "body_emotion_recognition": 40.0,
419
- "building_recognition": 90.0,
420
- "business": 66.7,
421
- "camouflage_object_detection": 55.0,
422
- "celebrity_recognition": 0.0,
423
- "chart_to_table": 95.0,
424
- "chart_to_text": 90.0,
425
- "chart_vqa": 70.0,
426
- "chemical_apparatusn_recognition": 80.0,
427
- "clock_reading": 30.0,
428
- "clothes_keypoint_detection": 70.0,
429
- "color_assimilation": 35.0,
430
- "color_constancy": 14.3,
431
- "color_contrast": 40.0,
432
- "color_recognition": 95.0,
433
- "counting_by_category": 33.8,
434
- "counting_by_reasoning": 95.0,
435
- "counting_by_visual_prompting": 50.0,
436
- "crowd_counting": 50.0,
437
- "deepfake_detection": 60.0,
438
- "depth_estimation": 40.0,
439
- "disaster_recognition": 85.0,
440
- "disease_diagnose": 60.0,
441
- "doc_vqa": 80.0,
442
- "electronic_object_recognition": 100.0,
443
- "eqn2latex": 90.0,
444
- "exist_hallucination": 90.0,
445
- "facail_expression_change_recognition": 95.0,
446
- "face_detection": 90.0,
447
- "face_mask_anomaly_dectection": 70.0,
448
- "face_retrieval": 100.0,
449
- "facial_expression_recognition": 75.0,
450
- "fashion_recognition": 75.0,
451
- "film_and_television_recognition": 95.0,
452
- "font_recognition": 50.0,
453
- "food_recognition": 100.0,
454
- "furniture_keypoint_detection": 55.0,
455
- "gaze_estimation": 10.0,
456
- "general_action_recognition": 95.0,
457
- "geometrical_perspective": 50.0,
458
- "geometrical_relativity": 30.0,
459
- "gesture_recognition": 65.0,
460
- "google_apps": 50.0,
461
- "gui_general": 45.0,
462
- "gui_install": 50.0,
463
- "handwritten_mathematical_expression_recognition": 90.0,
464
- "handwritten_retrieval": 90.0,
465
- "handwritten_text_recognition": 100.0,
466
- "health_medicine": 92.9,
467
- "helmet_anomaly_detection": 90.0,
468
- "human_interaction_understanding": 95.0,
469
- "human_keypoint_detection": 70.0,
470
- "human_object_interaction_recognition": 75.0,
471
- "humanitites_social_science": 54.5,
472
- "image2image_retrieval": 75.0,
473
- "image_based_action_recognition": 95.0,
474
- "image_captioning": 100.0,
475
- "image_captioning_paragraph": 95.0,
476
- "image_colorization": 60.0,
477
- "image_dense_captioning": 68.4,
478
- "image_matting": 15.0,
479
- "image_quality_assessment": 35.0,
480
- "image_season_recognition": 80.0,
481
- "industrial_produce_anomaly_detection": 40.0,
482
- "instance_captioning": 95.0,
483
- "interactive_segmentation": 85.7,
484
- "jigsaw_puzzle_solving": 40.0,
485
- "landmark_recognition": 100.0,
486
- "lesion_grading": 90.0,
487
- "logo_and_brand_recognition": 95.0,
488
- "lvlm_response_judgement": 45.0,
489
- "medical_modality_recognition": 100.0,
490
- "meme_image_understanding": 95.0,
491
- "meme_vedio_understanding": 80.0,
492
- "mevis": 30.0,
493
- "micro_expression_recognition": 20.0,
494
- "multiple_image_captioning": 95.0,
495
- "multiple_instance_captioning": 95.0,
496
- "multiple_view_image_understanding": 10.0,
497
- "muscial_instrument_recognition": 95.0,
498
- "national_flag_recognition": 100.0,
499
- "navigation": 90.0,
500
- "next_img_prediction": 65.0,
501
- "object_detection": 90.0,
502
- "one_shot_detection": 85.0,
503
- "order_hallucination": 50.0,
504
- "other_biological_attributes": 45.0,
505
- "painting_recognition": 90.0,
506
- "person_reid": 95.0,
507
- "pixel_localization": 25.0,
508
- "pixel_recognition": 55.0,
509
- "plant_recognition": 90.0,
510
- "point_tracking": 35.0,
511
- "polygon_localization": 40.0,
512
- "profession_recognition": 90.0,
513
- "ravens_progressive_matrices": 15.0,
514
- "reason_seg": 47.4,
515
- "referring_detection": 45.0,
516
- "relation_hallucination": 80.0,
517
- "religious_recognition": 75.0,
518
- "remote_sensing_object_detection": 60.0,
519
- "rock_recognition": 80.0,
520
- "rotated_object_detection": 77.8,
521
- "salient_object_detection_rgb": 55.0,
522
- "salient_object_detection_rgbd": 50.0,
523
- "scene_emotion_recognition": 65.0,
524
- "scene_graph_recognition": 85.0,
525
- "scene_recognition": 65.0,
526
- "scene_text_recognition": 90.0,
527
- "science": 58.3,
528
- "screenshot2code": 60.0,
529
- "sculpture_recognition": 80.0,
530
- "shape_recognition": 95.0,
531
- "sign_language_recognition": 40.0,
532
- "single_object_tracking": 65.0,
533
- "sketch2code": 50.0,
534
- "sketch2image_retrieval": 95.0,
535
- "small_object_detection": 60.0,
536
- "social_relation_recognition": 50.0,
537
- "som_recognition": 94.7,
538
- "sports_recognition": 95.0,
539
- "spot_the_diff": 10.0,
540
- "spot_the_similarity": 75.0,
541
- "table_structure_recognition": 50.0,
542
- "tech_engineering": 33.3,
543
- "temporal_anticipation": 75.0,
544
- "temporal_localization": 52.6,
545
- "temporal_ordering": 25.0,
546
- "temporal_sequence_understanding": 25.0,
547
- "text2image_retrieval": 55.0,
548
- "texture_material_recognition": 75.0,
549
- "threed_cad_recognition": 70.0,
550
- "threed_indoor_recognition": 45.0,
551
- "traffic_anomaly_detection": 55.0,
552
- "traffic_light_understanding": 100.0,
553
- "traffic_participants_understanding": 60.0,
554
- "traffic_sign_understanding": 95.0,
555
- "transparent_object_detection": 75.0,
556
- "vehicle_keypoint_detection": 55.6,
557
- "vehicle_recognition": 100.0,
558
- "vehicle_retrieval": 85.0,
559
- "video_captioning": 95.0,
560
- "visual_document_information_extraction": 95.0,
561
- "visual_prompt_understanding": 75.0,
562
- "waste_recognition": 100.0,
563
- "weapon_recognition": 100.0,
564
- "weather_recognition": 100.0,
565
- "web_shopping": 40.0,
566
- "whoops": 80.0,
567
- "writing_poetry_from_image": 60.0
568
- },
569
- "BLINK": {
570
- "Overall": 68.0,
571
- "Art_Style": 82.9,
572
- "Counting": 66.7,
573
- "Forensic_Detection": 90.9,
574
- "Functional_Correspondence": 43.1,
575
- "IQ_Test": 32.0,
576
- "Jigsaw": 76.7,
577
- "Multi-view_Reasoning": 58.6,
578
- "Object_Localization": 69.7,
579
- "Relative_Depth": 75.8,
580
- "Relative_Reflectance": 32.8,
581
- "Semantic_Correspondence": 61.2,
582
- "Spatial_Relation": 83.2,
583
- "Visual_Correspondence": 92.4,
584
- "Visual_Similarity": 83.0
585
- },
586
- "QBench": {
587
- "Overall": 78.9,
588
- "type_0_concern_0": 82.4,
589
- "type_0_concern_1": 82.3,
590
- "type_0_concern_2": 81.2,
591
- "type_0_concern_3": 87.1,
592
- "type_1_concern_0": 76.7,
593
- "type_1_concern_1": 84.8,
594
- "type_1_concern_2": 87.0,
595
- "type_1_concern_3": 88.9,
596
- "type_2_concern_0": 66.5,
597
- "type_2_concern_1": 72.4,
598
- "type_2_concern_2": 66.7,
599
- "type_2_concern_3": 80.0
600
- },
601
- "ABench": {
602
- "Overall": 79.2,
603
- "part1 -> bag_of_words -> attribute": 92.7,
604
- "part1 -> bag_of_words -> composition -> arrangement": 86.7,
605
- "part1 -> bag_of_words -> composition -> occlusion": 60.0,
606
- "part1 -> bag_of_words -> composition -> orientation": 76.9,
607
- "part1 -> bag_of_words -> composition -> size": 71.4,
608
- "part1 -> bag_of_words -> counting": 79.6,
609
- "part1 -> bag_of_words -> noun_as_adjective": 81.4,
610
- "part1 -> basic_recognition -> major": 92.9,
611
- "part1 -> basic_recognition -> minor": 93.2,
612
- "part1 -> outside_knowledge -> contradiction overcome": 70.8,
613
- "part1 -> outside_knowledge -> specific-terms -> company": 100.0,
614
- "part1 -> outside_knowledge -> specific-terms -> creature": 83.3,
615
- "part1 -> outside_knowledge -> specific-terms -> daily": 94.1,
616
- "part1 -> outside_knowledge -> specific-terms -> food": 95.5,
617
- "part1 -> outside_knowledge -> specific-terms -> geography": 81.0,
618
- "part1 -> outside_knowledge -> specific-terms -> material": 95.2,
619
- "part1 -> outside_knowledge -> specific-terms -> science": 100.0,
620
- "part1 -> outside_knowledge -> specific-terms -> sports": 68.2,
621
- "part1 -> outside_knowledge -> specific-terms -> style -> abstract": 100.0,
622
- "part1 -> outside_knowledge -> specific-terms -> style -> art": 100.0,
623
- "part1 -> outside_knowledge -> specific-terms -> style -> art_deco": 100.0,
624
- "part1 -> outside_knowledge -> specific-terms -> style -> cubism": 100.0,
625
- "part1 -> outside_knowledge -> specific-terms -> style -> dadaism": 100.0,
626
- "part1 -> outside_knowledge -> specific-terms -> style -> deco": 100.0,
627
- "part1 -> outside_knowledge -> specific-terms -> style -> expressionism": 100.0,
628
- "part1 -> outside_knowledge -> specific-terms -> style -> fauvism": 100.0,
629
- "part1 -> outside_knowledge -> specific-terms -> style -> futurism": 66.7,
630
- "part1 -> outside_knowledge -> specific-terms -> style -> minimalism": 100.0,
631
- "part1 -> outside_knowledge -> specific-terms -> style -> pop": 100.0,
632
- "part1 -> outside_knowledge -> specific-terms -> style -> psychedelic": 100.0,
633
- "part1 -> outside_knowledge -> specific-terms -> style -> steampunk": 100.0,
634
- "part1 -> outside_knowledge -> specific-terms -> style -> surrealism": 100.0,
635
- "part1 -> outside_knowledge -> specific-terms -> style -> victorian": 0.0,
636
- "part1 -> outside_knowledge -> specific-terms -> vehicle": 94.7,
637
- "part1 -> outside_knowledge -> specific-terms -> weather": 92.3,
638
- "part2 -> aesthetic": 62.6,
639
- "part2 -> generative": 72.4,
640
- "part2 -> technical": 74.9
641
- },
642
- "MTVQA": {
643
- "Overall": 31.2,
644
- "AR": 21.3,
645
- "DE": 35.1,
646
- "FR": 42.2,
647
- "IT": 37.2,
648
- "JA": 19.9,
649
- "KR": 35.1,
650
- "RU": 15.9,
651
- "TH": 26.0,
652
- "VI": 39.6
653
  }
654
  }
655
  }
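After this commit, ShoppingMMLU.json keeps each model entry but swaps the old VLM benchmark blocks for the four shopping-skill blocks added above, while the sub-metric names inside those blocks still come from the old benchmarks (e.g. `Rec`/`Ocr`/`Know` under Shopping Concept Understanding). A minimal sketch of how the reshaped file can be walked, assuming the same `{model: {skill: {metric: value}}}` nesting that `BUILD_L2_DF` iterates and that the file is loaded directly with `json.load` (the actual loading code is not part of this diff):

```python
# Sketch only: walks the reshaped ShoppingMMLU.json and prints each skill's
# overall score. Assumes {model: {skill: {metric: value}}} nesting, matching
# the `for m in results: item = results[m]; item[dataset][d]` loop in gen_table.py.
import json

SKILLS = [
    'Shopping Concept Understanding',
    'Shopping Knowledge Reasoning',
    'User Behavior Alignment',
    'Multi-lingual Abilities',
]

with open('ShoppingMMLU.json') as f:
    results = json.load(f)

for model, item in results.items():
    for skill in SKILLS:
        block = item.get(skill, {})
        # The 'Overall' under 'User Behavior Alignment' exists only because the
        # diff renames OCRBench's 'Final Score' (the '+ "Overall": 736' line).
        print(model, '|', skill, '|', block.get('Overall', 'missing'))
```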
 
__pycache__/gen_table.cpython-38.pyc CHANGED
Binary files a/__pycache__/gen_table.cpython-38.pyc and b/__pycache__/gen_table.cpython-38.pyc differ
 
__pycache__/meta_data.cpython-38.pyc CHANGED
Binary files a/__pycache__/meta_data.cpython-38.pyc and b/__pycache__/meta_data.cpython-38.pyc differ
 
app.py CHANGED
@@ -98,7 +98,7 @@ with gr.Blocks() as demo:
  s.table, s.check_box = BUILD_L2_DF(results, dataset)
  s.type_map = s.check_box['type_map']
  s.type_map['Rank'] = 'number'
-
+
  s.checkbox_group = gr.CheckboxGroup(
  choices=s.check_box['all'],
  value=s.check_box['required'],
@@ -106,8 +106,10 @@ with gr.Blocks() as demo:
  interactive=True,
  )
  s.headers = ['Rank'] + s.check_box['essential'] + s.checkbox_group.value
+ print(s.check_box['essential'])
+ print(s.checkbox_group.value)
  s.table['Rank'] = list(range(1, len(s.table) + 1))
-
+ print(s.headers)
  with gr.Row():
  s.model_size = gr.CheckboxGroup(
  choices=MODEL_SIZE,
@@ -121,6 +123,7 @@ with gr.Blocks() as demo:
  label='Model Type',
  interactive=True
  )
+
  s.data_component = gr.components.DataFrame(
  value=s.table[s.headers],
  type='pandas',
@@ -128,7 +131,7 @@ with gr.Blocks() as demo:
  interactive=False,
  visible=True)
  s.dataset = gr.Textbox(value=dataset, label=dataset, visible=False)
-
+ """
  def filter_df_l2(dataset_name, fields, model_size, model_type):
  s = structs[DATASETS.index(dataset_name)]
  headers = ['Rank'] + s.check_box['essential'] + fields
@@ -155,6 +158,8 @@ with gr.Blocks() as demo:
  fn=filter_df_l2,
  inputs=[s.dataset, s.checkbox_group, s.model_size, s.model_type],
  outputs=s.data_component)
+ print(s)
+ """

  with gr.Row():
  with gr.Accordion('Citation', open=False):
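The three added `print` calls (together with the `"""` strings that temporarily disable `filter_df_l2`) are debugging aids for the header/column assembly just above them. A self-contained sketch of that assembly without Gradio, using invented column names rather than real leaderboard fields, shows where a mismatch would surface: selecting `s.table[s.headers]` raises a `KeyError` as soon as any header is not an actual column of the DataFrame.

```python
# Standalone sketch (no Gradio) of the header assembly the new prints inspect.
# Column names below are placeholders, not real leaderboard values.
import pandas as pd

table = pd.DataFrame({
    'Method': ['model-a', 'model-b'],
    'Shopping Concept Understanding': [69.1, 65.0],
    'Shopping Knowledge Reasoning': [61.3, 58.2],
})

essential = ['Method']                          # stand-in for s.check_box['essential']
selected = ['Shopping Concept Understanding']   # stand-in for s.checkbox_group.value
headers = ['Rank'] + essential + selected
table['Rank'] = list(range(1, len(table) + 1))

print(headers)
print(table[headers])  # fine: every header exists as a column
# table[headers + ['Multi-lingual Abilities']] would raise KeyError: that column is absent
```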
gen_table.py CHANGED
@@ -95,6 +95,8 @@ def BUILD_L2_DF(results, dataset):
  if dataset == 'OCRBench':
  non_overall_fields = [x for x in non_overall_fields if not listinstr(['Final Score'], x)]
  overall_fields = ['Final Score']
+ print(overall_fields)
+ print(non_overall_fields)

  for m in results:
  item = results[m]
@@ -118,6 +120,7 @@ def BUILD_L2_DF(results, dataset):
  res[d].append(item[dataset][d])

  df = pd.DataFrame(res)
+ print(df)
  all_fields = overall_fields + non_overall_fields
  # Use the first 5 non-overall fields as required fields
  required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
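The prints added here expose the field split that feeds the table: which columns count as "overall" and which become the fallback `required_fields`. The split itself is computed earlier in `BUILD_L2_DF` and is not part of this hunk, so the comprehension below is only a guess at it; the `required_fields` line is taken verbatim from the code above.

```python
# Sketch with made-up field names. The 'Overall' substring test is an assumption
# about how overall_fields is derived upstream of the hunk shown above; only the
# required_fields line mirrors gen_table.py exactly.
fields = ['Overall', 'Rec', 'Ocr', 'Know', 'Gen', 'Spat', 'Math', 'Score']

overall_fields = [x for x in fields if 'Overall' in x]
non_overall_fields = [x for x in fields if 'Overall' not in x]
print(overall_fields)      # ['Overall']
print(non_overall_fields)  # ['Rec', 'Ocr', 'Know', 'Gen', 'Spat', 'Math', 'Score']

# Use the first 5 non-overall fields as required fields (as in BUILD_L2_DF)
required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
print(required_fields)     # ['Overall']; would be the first five fields if no overall column existed
```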
meta_data.py CHANGED
@@ -1,6 +1,6 @@
  # CONSTANTS-URL
  URL = "http://opencompass.openxlab.space/assets/OpenVLM.json"
- RESULTS = 'OpenVLM_subset.json'
+ RESULTS = 'ShoppingMMLU.json'
  SHOPPINGMMLU_README = 'https://raw.githubusercontent.com/KL4805/ShoppingMMLU/refs/heads/main/README.md'
  # CONSTANTS-CITATION
  CITATION_BUTTON_TEXT = r"""
@@ -10,7 +10,7 @@ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
  LEADERBORAD_INTRODUCTION = """# Shopping MMLU Leaderboard
  ### Welcome to Shopping MMLU Leaderboard! On this leaderboard we share the evaluation results of LLMs obtained by the OpenSource Framework:
  ### [Shopping MMLU: A Massive Multi-Task Online Shopping Benchmark for Large Language Models](https://github.com/KL4805/ShoppingMMLU) 🏆
- ### Currently, Shopping MMLU Leaderboard covers {} main online shopping skills and {} different LLMs.
+ ### Currently, Shopping MMLU Leaderboard covers {} different LLMs and {} main online shopping skills.

  This leaderboard was last updated: {}.

@@ -26,14 +26,13 @@ META_FIELDS = ['Method', 'Param (B)', 'OpenSource', 'Verified']
  # 'SEEDBench2_Plus', 'MMT-Bench_VAL', 'BLINK'
  # ]
  MAIN_FIELDS = [
- 'OCRBench', 'MMStar', 'MMVet','MathVista'
+ 'Shopping Concept Understanding', 'Shopping Knowledge Reasoning', 'User Behavior Alignment','Multi-lingual Abilities'
  ]
  # DEFAULT_BENCH = [
  # 'MMBench_V11', 'MMStar', 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
  # 'HallusionBench', 'MMVet'
  # ]
- DEFAULT_BENCH = ['OCRBench', 'MMStar', 'MMVet','MathVista']
- MMBENCH_FIELDS = ['MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench']
+ DEFAULT_BENCH = ['Shopping Concept Understanding', 'Shopping Knowledge Reasoning', 'User Behavior Alignment','Multi-lingual Abilities']
  MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
  MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']

@@ -41,20 +40,23 @@ MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
  LEADERBOARD_MD = {}

  LEADERBOARD_MD['MAIN'] = f"""
+ ## Included Shopping Skills:
+
+ - Shopping Concept Understanding: Understanding domain-specific short texts in online shopping (e.g. brands, product models).
+ - Shopping Knowledge Reasoning: Reasoning over commonsense, numeric, and implicit product-product multi-hop knowledge.
+ - User Behavior Alignment: Modeling heterogeneous and implicit user behaviors (e.g. click, query, purchase).
+ - Multi-lingual Abilities: Online shopping across marketplaces around the globe.
+
  ## Main Evaluation Results

  - Metrics:
- - Avg Score: The average score on all VLM Benchmarks (normalized to 0 - 100, the higher the better).
- - Avg Rank: The average rank on all VLM Benchmarks (the lower the better).
- - Avg Score & Rank are calculated based on selected benchmark. **When results for some selected benchmarks are missing, Avg Score / Rank will be None!!!**
- - By default, we present the overall evaluation results based on {len(DEFAULT_BENCH)} VLM benchmarks, sorted by the descending order of Avg Score.
- - The following datasets are included in the main results: {', '.join(DEFAULT_BENCH)}.
- - Detailed evaluation results for each dataset (included or not included in main) are provided in the consequent tabs.
+ - Avg Score: The average score on all 4 online shopping skills (normalized to 0 - 100, the higher the better).
+ - Detailed metrics and evaluation results for each skill are provided in the consequent tabs.
  """



- LEADERBOARD_MD['MMVet'] = """
+ LEADERBOARD_MD['Shopping Concept Understanding'] = """
  ## MMVet Evaluation Results

  - In MMVet Evaluation, we use GPT-4-Turbo (gpt-4-1106-preview) as the judge LLM to assign scores to the VLM outputs. We only perform the evaluation once due to the limited variance among results of multiple evaluation pass originally reported.
@@ -63,7 +65,7 @@ LEADERBOARD_MD['MMVet'] = """
  """


- LEADERBOARD_MD['MathVista'] = """
+ LEADERBOARD_MD['Shopping Knowledge Reasoning'] = """
  ## MMMU TestMini Evaluation Results

  - We report the evaluation results on MathVista **TestMini**, which include 1000 test samples.
@@ -72,14 +74,14 @@ LEADERBOARD_MD['MathVista'] = """
  **Category Definitions:** **FQA:** figure QA, **GPS:** geometry problem solving, **MWP:** math word problem, **TQA:** textbook QA, **VQA:** visual QA, **ALG:** algebraic, **ARI:** arithmetic, **GEO:** geometry, **LOG:** logical , **NUM:** numeric, **SCI:** scientific, **STA:** statistical.
  """

- LEADERBOARD_MD['OCRBench'] = """
+ LEADERBOARD_MD['User Behavior Alignment'] = """
  ## OCRBench Evaluation Results

  - The evaluation of OCRBench is implemented by the official team: https://github.com/Yuliang-Liu/MultimodalOCR.
  - The performance of GPT4V might be underestimated: GPT4V rejects to answer 12 percent of the questions due to the policy of OpenAI. For those questions, the returned answer is "Your input image may contain content that is not allowed by our safety system."
  """

- LEADERBOARD_MD['MMStar'] = """
+ LEADERBOARD_MD['Multi-lingual Abilities'] = """
  ## MMStar Evaluation Results

  - MMStar is an elite vision-indispensable multi-modal benchmark, including 1,500 challenging samples meticulously selected by humans.
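The only substantive change to `LEADERBORAD_INTRODUCTION` is the order of its two `{}` placeholders. Since `str.format` fills empty braces strictly by position, the wording has to match the order in which the caller supplies the counts. A minimal sketch, assuming (the actual `format()` call lives in app.py and is not shown in this diff) that the caller passes the number of models first, then the number of skills, then the timestamp:

```python
# Hypothetical illustration; the real format() call is not part of this diff.
# str.format fills `{}` placeholders in order, so the sentence must name "LLMs"
# before "skills" if the counts arrive in that order.
intro_template = (
    "Currently, Shopping MMLU Leaderboard covers {} different LLMs "
    "and {} main online shopping skills.\n"
    "This leaderboard was last updated: {}."
)
num_models, num_skills, last_updated = 42, 4, "2024-10-01"  # made-up values
print(intro_template.format(num_models, num_skills, last_updated))
```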