binwang commited on
Commit
1d3a534
•
1 Parent(s): 7d66eb7

new format

Browse files
Files changed (1) hide show
  1. app.py +1001 -1044
app.py CHANGED
@@ -38,19 +38,12 @@ def make_clickable_model(model_name, link=None):
38
 
39
 
40
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
41
-
42
-
43
-
44
-
45
  with open('all_results.json', 'r') as f:
46
  ALL_RESULTS = json.load(f)
47
 
48
-
49
  MODEL_LIST = list(ALL_RESULTS.keys())
50
  NUM_MODELS = len(set(MODEL_LIST))
51
  MODEL_TO_SIZE = {model: ALL_RESULTS[model]["model_size"] for model in MODEL_LIST}
52
-
53
-
54
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
55
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
56
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
@@ -1966,8 +1959,68 @@ MRPC_FIVE_SHOT = get_data_mrpc(eval_mode="five_shot")
1966
 
1967
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1968
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1969
 
1970
- block = gr.Blocks()
1971
  with block:
1972
 
1973
  gr.Markdown(f"""
@@ -1979,1054 +2032,954 @@ with block:
1979
  - **Mode of Evaluation**: Zero-Shot, Five-Shot
1980
 
1981
  The following table shows the performance of the models on the SeaEval benchmark.
 
1982
 
1983
  """)
1984
 
1985
- with gr.Tabs():
1986
-
1987
-
1988
- # dataset 1: cross-mmlu
1989
- with gr.TabItem("Cross-MMLU"):
1990
- with gr.Row():
1991
- gr.Markdown("""
1992
- **Cross-MMLU Leaderboard** 🔮
1993
-
1994
- - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
1995
- - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
1996
- """)
1997
-
1998
- with gr.TabItem("zero_shot"):
1999
-
2000
-
2001
- with gr.TabItem("Overall"):
2002
-
2003
- with gr.Row():
2004
- cross_mmlu_zero_shot_overall = gr.components.Dataframe(
2005
- CROSS_MMLU_ZERO_SHOT_OVERALL,
2006
- datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_OVERALL.columns),
2007
- type="pandas",
2008
- )
2009
-
2010
-
2011
- with gr.TabItem("Language Performance"):
2012
-
2013
- with gr.Row():
2014
- cross_mmlu_zero_shot_overall = gr.components.Dataframe(
2015
- CROSS_MMLU_ZERO_SHOT_LANGUAGE,
2016
- datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_LANGUAGE.columns),
2017
- type="pandas",
2018
- )
2019
-
2020
-
2021
- with gr.TabItem("five_shot"):
2022
-
2023
-
2024
- with gr.TabItem("Overall"):
2025
-
2026
- with gr.Row():
2027
- cross_mmlu_zero_shot_overall = gr.components.Dataframe(
2028
- CROSS_MMLU_FIVE_SHOT_OVERALL,
2029
- datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_OVERALL.columns),
2030
- type="pandas",
2031
- )
2032
 
2033
 
2034
- with gr.TabItem("Language Performance"):
2035
-
2036
- with gr.Row():
2037
- gr.components.Dataframe(
2038
- CROSS_MMLU_FIVE_SHOT_LANGUAGE,
2039
- datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_LANGUAGE.columns),
2040
- type="pandas",
2041
- )
2042
-
2043
-
2044
-
2045
- # dataset 2: cross-logiqa
2046
- with gr.TabItem("Cross-LogiQA"):
2047
- with gr.Row():
2048
- gr.Markdown("""
2049
- **Cross-LogiQA Leaderboard** 🔮
2050
-
2051
- - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
2052
- - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
2053
- """)
2054
-
2055
- with gr.TabItem("zero_shot"):
2056
-
2057
-
2058
- with gr.TabItem("Overall"):
2059
-
2060
- with gr.Row():
2061
- gr.components.Dataframe(
2062
- CROSS_LOGIQA_ZERO_SHOT_OVERALL,
2063
- datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_OVERALL.columns),
2064
- type="pandas",
2065
- )
2066
-
2067
-
2068
- with gr.TabItem("Language Performance"):
2069
-
2070
- with gr.Row():
2071
- gr.components.Dataframe(
2072
- CROSS_LOGIQA_ZERO_SHOT_LANGUAGE,
2073
- datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_LANGUAGE.columns),
2074
- type="pandas",
2075
- )
2076
-
2077
-
2078
- with gr.TabItem("five_shot"):
2079
-
2080
-
2081
- with gr.TabItem("Overall"):
2082
-
2083
- with gr.Row():
2084
- gr.components.Dataframe(
2085
- CROSS_LOGIQA_FIVE_SHOT_OVERALL,
2086
- datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_OVERALL.columns),
2087
- type="pandas",
2088
- )
2089
-
2090
-
2091
- with gr.TabItem("Language Performance"):
2092
-
2093
- with gr.Row():
2094
- gr.components.Dataframe(
2095
- CROSS_LOGIQA_FIVE_SHOT_LANGUAGE,
2096
- datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_LANGUAGE.columns),
2097
- type="pandas",
2098
- )
2099
-
2100
-
2101
- # dataset 3: SG_EVAL
2102
- with gr.TabItem("SG_EVAL"):
2103
- with gr.Row():
2104
- gr.Markdown("""
2105
- **SG_EVAL Leaderboard** 🔮
2106
-
2107
- - **Metric:** Accuracy
2108
- - **Languages:** English
2109
- """)
2110
-
2111
- with gr.TabItem("zero_shot"):
2112
- with gr.TabItem("Overall"):
2113
- with gr.Row():
2114
- gr.components.Dataframe(
2115
- SG_EVAL_ZERO_SHOT,
2116
- datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_ZERO_SHOT.columns),
2117
- type="pandas",
2118
- )
2119
-
2120
- with gr.TabItem("five_shot"):
2121
- with gr.TabItem("Overall"):
2122
- with gr.Row():
2123
- gr.components.Dataframe(
2124
- SG_EVAL_FIVE_SHOT,
2125
- datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_FIVE_SHOT.columns),
2126
- type="pandas",
2127
- )
2128
-
2129
-
2130
- # dataset 4:
2131
- with gr.TabItem("US_EVAL"):
2132
- with gr.Row():
2133
- gr.Markdown("""
2134
- **US_EVAL Leaderboard** 🔮
2135
-
2136
- - **Metric:** Accuracy
2137
- - **Languages:** English
2138
- """)
2139
-
2140
- with gr.TabItem("zero_shot"):
2141
- with gr.TabItem("Overall"):
2142
- with gr.Row():
2143
- gr.components.Dataframe(
2144
- US_EVAL_ZERO_SHOT,
2145
- datatype=["number", "markdown"] + ["number"] * len(US_EVAL_ZERO_SHOT.columns),
2146
- type="pandas",
2147
- )
2148
-
2149
- with gr.TabItem("five_shot"):
2150
- with gr.TabItem("Overall"):
2151
- with gr.Row():
2152
- gr.components.Dataframe(
2153
- US_EVAL_FIVE_SHOT,
2154
- datatype=["number", "markdown"] + ["number"] * len(US_EVAL_FIVE_SHOT.columns),
2155
- type="pandas",
2156
- )
2157
-
2158
-
2159
- # dataset 5:
2160
- with gr.TabItem("CN_EVAL"):
2161
- with gr.Row():
2162
- gr.Markdown("""
2163
- **CN_EVAL Leaderboard** 🔮
2164
-
2165
- - **Metric:** Accuracy
2166
- - **Languages:** Chinese
2167
- """)
2168
-
2169
- with gr.TabItem("zero_shot"):
2170
- with gr.TabItem("Overall"):
2171
- with gr.Row():
2172
- gr.components.Dataframe(
2173
- CN_EVAL_ZERO_SHOT,
2174
- datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_ZERO_SHOT.columns),
2175
- type="pandas",
2176
- )
2177
-
2178
- with gr.TabItem("five_shot"):
2179
- with gr.TabItem("Overall"):
2180
- with gr.Row():
2181
- gr.components.Dataframe(
2182
- CN_EVAL_FIVE_SHOT,
2183
- datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_FIVE_SHOT.columns),
2184
- type="pandas",
2185
- )
2186
-
2187
-
2188
-
2189
- # dataset 6:
2190
- with gr.TabItem("PH_EVAL"):
2191
- with gr.Row():
2192
- gr.Markdown("""
2193
- **PH_EVAL Leaderboard** 🔮
2194
-
2195
- - **Metric:** Accuracy
2196
- - **Languages:** English
2197
- """)
2198
-
2199
- with gr.TabItem("zero_shot"):
2200
- with gr.TabItem("Overall"):
2201
- with gr.Row():
2202
- gr.components.Dataframe(
2203
- PH_EVAL_ZERO_SHOT,
2204
- datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_ZERO_SHOT.columns),
2205
- type="pandas",
2206
- )
2207
-
2208
- with gr.TabItem("five_shot"):
2209
- with gr.TabItem("Overall"):
2210
- with gr.Row():
2211
- gr.components.Dataframe(
2212
- PH_EVAL_FIVE_SHOT,
2213
- datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_FIVE_SHOT.columns),
2214
- type="pandas",
2215
- )
2216
-
2217
-
2218
- # dataset 7:
2219
- with gr.TabItem("Singlish to English Translation"):
2220
- with gr.Row():
2221
- gr.Markdown("""
2222
- **SING2ENG Leaderboard** 🔮
2223
-
2224
- - **Metric:** BLEU Avg.
2225
- - **Languages:** English
2226
- """)
2227
-
2228
- with gr.TabItem("zero_shot"):
2229
- with gr.TabItem("Overall"):
2230
- with gr.Row():
2231
- gr.components.Dataframe(
2232
- SING2ENG_ZERO_SHOT,
2233
- datatype=["number", "markdown"] + ["number"] * len(SING2ENG_ZERO_SHOT.columns),
2234
- type="pandas",
2235
- )
2236
-
2237
- with gr.TabItem("five_shot"):
2238
- with gr.TabItem("Overall"):
2239
- with gr.Row():
2240
- gr.components.Dataframe(
2241
- SING2ENG_FIVE_SHOT,
2242
- datatype=["number", "markdown"] + ["number"] * len(SING2ENG_FIVE_SHOT.columns),
2243
- type="pandas",
2244
- )
2245
-
2246
-
2247
- gr.Markdown(f"""
2248
- The following are datasets that are not originally collected by SeaEval, but are included in the leaderboard for completeness.
2249
- """)
2250
-
2251
- with gr.Tabs():
2252
-
2253
-
2254
-
2255
-
2256
- # dataset 8:
2257
- with gr.TabItem("FLORES Indonesian to English Translation"):
2258
- with gr.Row():
2259
- gr.Markdown("""
2260
- **flores_ind2eng Leaderboard** 🔮
2261
-
2262
- - **Metric:** BLEU Avg.
2263
- - **Languages:** English
2264
- """)
2265
-
2266
- with gr.TabItem("zero_shot"):
2267
- with gr.TabItem("Overall"):
2268
- with gr.Row():
2269
- gr.components.Dataframe(
2270
- FLORES_IND2ENG_ZERO_SHOT,
2271
- datatype=["number", "markdown"] + ["number"] * len(FLORES_IND2ENG_ZERO_SHOT.columns),
2272
- type="pandas",
2273
- )
2274
-
2275
- with gr.TabItem("five_shot"):
2276
- with gr.TabItem("Overall"):
2277
- with gr.Row():
2278
- gr.components.Dataframe(
2279
- FLORES_IND2ENG_FIVE_SHOT,
2280
- datatype=["number", "markdown"] + ["number"] * len(FLORES_IND2ENG_FIVE_SHOT.columns),
2281
- type="pandas",
2282
- )
2283
-
2284
-
2285
- # dataset 9:
2286
- with gr.TabItem("FLORES Vitenamese to English Translation"):
2287
- with gr.Row():
2288
- gr.Markdown("""
2289
- **flores_vie2eng Leaderboard** 🔮
2290
-
2291
- - **Metric:** BLEU Avg.
2292
- - **Languages:** English
2293
- """)
2294
-
2295
- with gr.TabItem("zero_shot"):
2296
- with gr.TabItem("Overall"):
2297
- with gr.Row():
2298
- gr.components.Dataframe(
2299
- FLORES_VIE2ENG_ZERO_SHOT,
2300
- datatype=["number", "markdown"] + ["number"] * len(FLORES_VIE2ENG_ZERO_SHOT.columns),
2301
- type="pandas",
2302
- )
2303
-
2304
- with gr.TabItem("five_shot"):
2305
- with gr.TabItem("Overall"):
2306
- with gr.Row():
2307
- gr.components.Dataframe(
2308
- FLORES_VIE2ENG_FIVE_SHOT,
2309
- datatype=["number", "markdown"] + ["number"] * len(FLORES_VIE2ENG_FIVE_SHOT.columns),
2310
- type="pandas",
2311
- )
2312
-
2313
-
2314
-
2315
- # dataset 10:
2316
- with gr.TabItem("FLORES Chinese to English Translation"):
2317
- with gr.Row():
2318
- gr.Markdown("""
2319
- **flores_zho2eng Leaderboard** 🔮
2320
-
2321
- - **Metric:** BLEU Avg.
2322
- - **Languages:** English
2323
- """)
2324
-
2325
- with gr.TabItem("zero_shot"):
2326
- with gr.TabItem("Overall"):
2327
- with gr.Row():
2328
- gr.components.Dataframe(
2329
- FLORES_ZHO2ENG_ZERO_SHOT,
2330
- datatype=["number", "markdown"] + ["number"] * len(FLORES_ZHO2ENG_ZERO_SHOT.columns),
2331
- type="pandas",
2332
- )
2333
-
2334
- with gr.TabItem("five_shot"):
2335
- with gr.TabItem("Overall"):
2336
- with gr.Row():
2337
- gr.components.Dataframe(
2338
- FLORES_ZHO2ENG_FIVE_SHOT,
2339
- datatype=["number", "markdown"] + ["number"] * len(FLORES_ZHO2ENG_FIVE_SHOT.columns),
2340
- type="pandas",
2341
- )
2342
-
2343
-
2344
-
2345
- # dataset 11:
2346
- with gr.TabItem("FLORES Malay to English Translation"):
2347
- with gr.Row():
2348
- gr.Markdown("""
2349
- **flores_zsm2eng Leaderboard** 🔮
2350
-
2351
- - **Metric:** BLEU Avg.
2352
- - **Languages:** English
2353
- """)
2354
-
2355
- with gr.TabItem("zero_shot"):
2356
- with gr.TabItem("Overall"):
2357
- with gr.Row():
2358
- gr.components.Dataframe(
2359
- FLORES_ZSM2ENG_ZERO_SHOT,
2360
- datatype=["number", "markdown"] + ["number"] * len(FLORES_ZSM2ENG_ZERO_SHOT.columns),
2361
- type="pandas",
2362
- )
2363
-
2364
- with gr.TabItem("five_shot"):
2365
- with gr.TabItem("Overall"):
2366
- with gr.Row():
2367
- gr.components.Dataframe(
2368
- FLORES_ZSM2ENG_FIVE_SHOT,
2369
- datatype=["number", "markdown"] + ["number"] * len(FLORES_ZSM2ENG_FIVE_SHOT.columns),
2370
- type="pandas",
2371
- )
2372
-
2373
-
2374
- # dataset 12:
2375
- with gr.TabItem("MMLU"):
2376
- with gr.Row():
2377
- gr.Markdown("""
2378
- **MMLU Leaderboard** 🔮
2379
-
2380
- - **Metric:** Accuracy.
2381
- - **Languages:** English
2382
- """)
2383
-
2384
- with gr.TabItem("zero_shot"):
2385
- with gr.TabItem("Overall"):
2386
- with gr.Row():
2387
- gr.components.Dataframe(
2388
- MMLU_ZERO_SHOT,
2389
- datatype=["number", "markdown"] + ["number"] * len(MMLU_ZERO_SHOT.columns),
2390
- type="pandas",
2391
- )
2392
-
2393
- with gr.TabItem("five_shot"):
2394
- with gr.TabItem("Overall"):
2395
- with gr.Row():
2396
- gr.components.Dataframe(
2397
- MMLU_FIVE_SHOT,
2398
- datatype=["number", "markdown"] + ["number"] * len(MMLU_FIVE_SHOT.columns),
2399
- type="pandas",
2400
- )
2401
-
2402
-
2403
- # dataset 13:
2404
- with gr.TabItem("MMLU Full"):
2405
- with gr.Row():
2406
- gr.Markdown("""
2407
- **MMLU Full Leaderboard** 🔮
2408
-
2409
- - **Metric:** Accuracy.
2410
- - **Languages:** English
2411
- """)
2412
-
2413
- with gr.TabItem("zero_shot"):
2414
- with gr.TabItem("Overall"):
2415
- with gr.Row():
2416
- gr.components.Dataframe(
2417
- MMLU_FULL_ZERO_SHOT,
2418
- datatype=["number", "markdown"] + ["number"] * len(MMLU_FULL_ZERO_SHOT.columns),
2419
- type="pandas",
2420
- )
2421
-
2422
-
2423
-
2424
- with gr.TabItem("five_shot"):
2425
- with gr.TabItem("Overall"):
2426
- with gr.Row():
2427
- gr.components.Dataframe(
2428
- MMLU_FULL_FIVE_SHOT,
2429
- datatype=["number", "markdown"] + ["number"] * len(MMLU_FULL_FIVE_SHOT.columns),
2430
- type="pandas",
2431
- )
2432
-
2433
- # dataset 14:
2434
- with gr.TabItem("C_EVAL"):
2435
- with gr.Row():
2436
- gr.Markdown("""
2437
- **C_EVAL Leaderboard** 🔮
2438
-
2439
- - **Metric:** Accuracy.
2440
- - **Languages:** Chinese
2441
- """)
2442
-
2443
- with gr.TabItem("zero_shot"):
2444
- with gr.TabItem("Overall"):
2445
- with gr.Row():
2446
- gr.components.Dataframe(
2447
- C_EVAL_ZERO_SHOT,
2448
- datatype=["number", "markdown"] + ["number"] * len(C_EVAL_ZERO_SHOT.columns),
2449
- type="pandas",
2450
- )
2451
-
2452
-
2453
-
2454
- with gr.TabItem("five_shot"):
2455
- with gr.TabItem("Overall"):
2456
- with gr.Row():
2457
- gr.components.Dataframe(
2458
- C_EVAL_FIVE_SHOT,
2459
- datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FIVE_SHOT.columns),
2460
- type="pandas",
2461
- )
2462
-
2463
-
2464
- # dataset 15:
2465
- with gr.TabItem("C_EVAL Full"):
2466
- with gr.Row():
2467
- gr.Markdown("""
2468
- **C_EVAL Full Leaderboard** 🔮
2469
-
2470
- - **Metric:** Accuracy.
2471
- - **Languages:** Chinese
2472
- """)
2473
-
2474
- with gr.TabItem("zero_shot"):
2475
- with gr.TabItem("Overall"):
2476
- with gr.Row():
2477
- gr.components.Dataframe(
2478
- C_EVAL_FULL_ZERO_SHOT,
2479
- datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FULL_ZERO_SHOT.columns),
2480
- type="pandas",
2481
- )
2482
-
2483
-
2484
-
2485
- with gr.TabItem("five_shot"):
2486
- with gr.TabItem("Overall"):
2487
- with gr.Row():
2488
- gr.components.Dataframe(
2489
- C_EVAL_FULL_FIVE_SHOT,
2490
- datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FULL_FIVE_SHOT.columns),
2491
- type="pandas",
2492
- )
2493
-
2494
- # dataset 16:
2495
- with gr.TabItem("CMMLU"):
2496
- with gr.Row():
2497
- gr.Markdown("""
2498
- **CMMLU Leaderboard** 🔮
2499
-
2500
- - **Metric:** Accuracy.
2501
- - **Languages:** Chinese
2502
- """)
2503
-
2504
- with gr.TabItem("zero_shot"):
2505
- with gr.TabItem("Overall"):
2506
- with gr.Row():
2507
- gr.components.Dataframe(
2508
- CMMLU_ZERO_SHOT,
2509
- datatype=["number", "markdown"] + ["number"] * len(CMMLU_ZERO_SHOT.columns),
2510
- type="pandas",
2511
- )
2512
-
2513
-
2514
-
2515
- with gr.TabItem("five_shot"):
2516
- with gr.TabItem("Overall"):
2517
- with gr.Row():
2518
- gr.components.Dataframe(
2519
- CMMLU_FIVE_SHOT,
2520
- datatype=["number", "markdown"] + ["number"] * len(CMMLU_FIVE_SHOT.columns),
2521
- type="pandas",
2522
- )
2523
-
2524
- # dataset 17:
2525
- with gr.TabItem("CMMLU Full"):
2526
- with gr.Row():
2527
- gr.Markdown("""
2528
- **CMMLU Full Leaderboard** 🔮
2529
-
2530
- - **Metric:** Accuracy.
2531
- - **Languages:** Chinese
2532
- """)
2533
-
2534
- with gr.TabItem("zero_shot"):
2535
- with gr.TabItem("Overall"):
2536
- with gr.Row():
2537
- gr.components.Dataframe(
2538
- CMMLU_FULL_ZERO_SHOT,
2539
- datatype=["number", "markdown"] + ["number"] * len(CMMLU_FULL_ZERO_SHOT.columns),
2540
- type="pandas",
2541
- )
2542
-
2543
-
2544
-
2545
- with gr.TabItem("five_shot"):
2546
- with gr.TabItem("Overall"):
2547
- with gr.Row():
2548
- gr.components.Dataframe(
2549
- CMMLU_FULL_FIVE_SHOT,
2550
- datatype=["number", "markdown"] + ["number"] * len(CMMLU_FULL_FIVE_SHOT.columns),
2551
- type="pandas",
2552
- )
2553
-
2554
- # dataset 18:
2555
- with gr.TabItem("ZBench"):
2556
- with gr.Row():
2557
- gr.Markdown("""
2558
- **ZBench Leaderboard** 🔮
2559
-
2560
- - **Metric:** Accuracy.
2561
- - **Languages:** Chinese
2562
- """)
2563
-
2564
- with gr.TabItem("zero_shot"):
2565
- with gr.TabItem("Overall"):
2566
- with gr.Row():
2567
- gr.components.Dataframe(
2568
- ZBENCH_ZERO_SHOT,
2569
- datatype=["number", "markdown"] + ["number"] * len(ZBENCH_ZERO_SHOT.columns),
2570
- type="pandas",
2571
- )
2572
-
2573
-
2574
-
2575
- with gr.TabItem("five_shot"):
2576
- with gr.TabItem("Overall"):
2577
- with gr.Row():
2578
- gr.components.Dataframe(
2579
- ZBENCH_FIVE_SHOT,
2580
- datatype=["number", "markdown"] + ["number"] * len(ZBENCH_FIVE_SHOT.columns),
2581
- type="pandas",
2582
- )
2583
-
2584
-
2585
- # dataset 18:
2586
- with gr.TabItem("ind_emotion"):
2587
- with gr.Row():
2588
- gr.Markdown("""
2589
- **ind_emotion Leaderboard** 🔮
2590
-
2591
- - **Metric:** Accuracy.
2592
- - **Languages:** Indonesian
2593
- """)
2594
-
2595
- with gr.TabItem("zero_shot"):
2596
- with gr.TabItem("Overall"):
2597
- with gr.Row():
2598
- gr.components.Dataframe(
2599
- IND_EMOTION_ZERO_SHOT,
2600
- datatype=["number", "markdown"] + ["number"] * len(IND_EMOTION_ZERO_SHOT.columns),
2601
- type="pandas",
2602
- )
2603
-
2604
-
2605
-
2606
- with gr.TabItem("five_shot"):
2607
- with gr.TabItem("Overall"):
2608
- with gr.Row():
2609
- gr.components.Dataframe(
2610
- IND_EMOTION_FIVE_SHOT,
2611
- datatype=["number", "markdown"] + ["number"] * len(IND_EMOTION_FIVE_SHOT.columns),
2612
- type="pandas",
2613
- )
2614
-
2615
-
2616
- # dataset
2617
- with gr.TabItem("OCNLI"):
2618
- with gr.Row():
2619
- gr.Markdown("""
2620
- **OCNLI Leaderboard** 🔮
2621
-
2622
- - **Metric:** Accuracy.
2623
- - **Languages:** Chinese
2624
- """)
2625
-
2626
- with gr.TabItem("zero_shot"):
2627
- with gr.TabItem("Overall"):
2628
- with gr.Row():
2629
- gr.components.Dataframe(
2630
- OCNLI_ZERO_SHOT,
2631
- datatype=["number", "markdown"] + ["number"] * len(OCNLI_ZERO_SHOT.columns),
2632
- type="pandas",
2633
- )
2634
-
2635
-
2636
-
2637
- with gr.TabItem("five_shot"):
2638
- with gr.TabItem("Overall"):
2639
- with gr.Row():
2640
- gr.components.Dataframe(
2641
- OCNLI_FIVE_SHOT,
2642
- datatype=["number", "markdown"] + ["number"] * len(OCNLI_FIVE_SHOT.columns),
2643
- type="pandas",
2644
- )
2645
-
2646
- # dataset
2647
- with gr.TabItem("C3"):
2648
- with gr.Row():
2649
- gr.Markdown("""
2650
- **C3 Leaderboard** 🔮
2651
-
2652
- - **Metric:** Accuracy.
2653
- - **Languages:** Chinese
2654
- """)
2655
-
2656
- with gr.TabItem("zero_shot"):
2657
- with gr.TabItem("Overall"):
2658
- with gr.Row():
2659
- gr.components.Dataframe(
2660
- C3_ZERO_SHOT,
2661
- datatype=["number", "markdown"] + ["number"] * len(C3_ZERO_SHOT.columns),
2662
- type="pandas",
2663
- )
2664
-
2665
-
2666
-
2667
- with gr.TabItem("five_shot"):
2668
- with gr.TabItem("Overall"):
2669
- with gr.Row():
2670
- gr.components.Dataframe(
2671
- C3_FIVE_SHOT,
2672
- datatype=["number", "markdown"] + ["number"] * len(C3_FIVE_SHOT.columns),
2673
- type="pandas",
2674
- )
2675
-
2676
-
2677
- # dataset
2678
- with gr.TabItem("DREAM"):
2679
- with gr.Row():
2680
- gr.Markdown("""
2681
- **DREAM Leaderboard** 🔮
2682
-
2683
- - **Metric:** Accuracy.
2684
- - **Languages:** English
2685
- """)
2686
-
2687
- with gr.TabItem("zero_shot"):
2688
- with gr.TabItem("Overall"):
2689
- with gr.Row():
2690
- gr.components.Dataframe(
2691
- DREAM_ZERO_SHOT,
2692
- datatype=["number", "markdown"] + ["number"] * len(DREAM_ZERO_SHOT.columns),
2693
- type="pandas",
2694
- )
2695
-
2696
-
2697
-
2698
- with gr.TabItem("five_shot"):
2699
- with gr.TabItem("Overall"):
2700
- with gr.Row():
2701
- gr.components.Dataframe(
2702
- DREAM_FIVE_SHOT,
2703
- datatype=["number", "markdown"] + ["number"] * len(DREAM_FIVE_SHOT.columns),
2704
- type="pandas",
2705
- )
2706
-
2707
- # dataset
2708
- with gr.TabItem("SAMSum"):
2709
- with gr.Row():
2710
- gr.Markdown("""
2711
- **SAMSum Leaderboard** 🔮
2712
-
2713
- - **Metric:** ROUGE.
2714
- - **Languages:** English
2715
- """)
2716
-
2717
- with gr.TabItem("zero_shot"):
2718
- with gr.TabItem("Overall"):
2719
- with gr.Row():
2720
- gr.components.Dataframe(
2721
- SAMSUM_ZERO_SHOT,
2722
- datatype=["number", "markdown"] + ["number"] * len(SAMSUM_ZERO_SHOT.columns),
2723
- type="pandas",
2724
- )
2725
-
2726
-
2727
-
2728
- with gr.TabItem("five_shot"):
2729
- with gr.TabItem("Overall"):
2730
- with gr.Row():
2731
- gr.components.Dataframe(
2732
- SAMSUM_FIVE_SHOT,
2733
- datatype=["number", "markdown"] + ["number"] * len(SAMSUM_FIVE_SHOT.columns),
2734
- type="pandas",
2735
- )
2736
-
2737
-
2738
-
2739
- # dataset
2740
- with gr.TabItem("DialogSum"):
2741
- with gr.Row():
2742
- gr.Markdown("""
2743
- **DialogSum Leaderboard** 🔮
2744
-
2745
- - **Metric:** ROUGE.
2746
- - **Languages:** English
2747
- """)
2748
-
2749
- with gr.TabItem("zero_shot"):
2750
- with gr.TabItem("Overall"):
2751
- with gr.Row():
2752
- gr.components.Dataframe(
2753
- DIALOGSUM_ZERO_SHOT,
2754
- datatype=["number", "markdown"] + ["number"] * len(DIALOGSUM_ZERO_SHOT.columns),
2755
- type="pandas",
2756
- )
2757
-
2758
-
2759
-
2760
- with gr.TabItem("five_shot"):
2761
- with gr.TabItem("Overall"):
2762
- with gr.Row():
2763
- gr.components.Dataframe(
2764
- DIALOGSUM_FIVE_SHOT,
2765
- datatype=["number", "markdown"] + ["number"] * len(DIALOGSUM_FIVE_SHOT.columns),
2766
- type="pandas",
2767
- )
2768
-
2769
-
2770
- # dataset
2771
- with gr.TabItem("SST2"):
2772
- with gr.Row():
2773
- gr.Markdown("""
2774
- **SST2 Leaderboard** 🔮
2775
-
2776
- - **Metric:** Accuracy.
2777
- - **Languages:** English
2778
- """)
2779
-
2780
- with gr.TabItem("zero_shot"):
2781
- with gr.TabItem("Overall"):
2782
- with gr.Row():
2783
- gr.components.Dataframe(
2784
- SST2_ZERO_SHOT,
2785
- datatype=["number", "markdown"] + ["number"] * len(SST2_ZERO_SHOT.columns),
2786
- type="pandas",
2787
- )
2788
-
2789
-
2790
-
2791
- with gr.TabItem("five_shot"):
2792
- with gr.TabItem("Overall"):
2793
- with gr.Row():
2794
- gr.components.Dataframe(
2795
- SST2_FIVE_SHOT,
2796
- datatype=["number", "markdown"] + ["number"] * len(SST2_FIVE_SHOT.columns),
2797
- type="pandas",
2798
- )
2799
-
2800
-
2801
- # dataset
2802
- with gr.TabItem("COLA"):
2803
- with gr.Row():
2804
- gr.Markdown("""
2805
- **COLA Leaderboard** 🔮
2806
-
2807
- - **Metric:** Accuracy.
2808
- - **Languages:** English
2809
- """)
2810
-
2811
- with gr.TabItem("zero_shot"):
2812
- with gr.TabItem("Overall"):
2813
- with gr.Row():
2814
- gr.components.Dataframe(
2815
- COLA_ZERO_SHOT,
2816
- datatype=["number", "markdown"] + ["number"] * len(COLA_ZERO_SHOT.columns),
2817
- type="pandas",
2818
- )
2819
-
2820
-
2821
-
2822
- with gr.TabItem("five_shot"):
2823
- with gr.TabItem("Overall"):
2824
- with gr.Row():
2825
- gr.components.Dataframe(
2826
- COLA_FIVE_SHOT,
2827
- datatype=["number", "markdown"] + ["number"] * len(COLA_FIVE_SHOT.columns),
2828
- type="pandas",
2829
- )
2830
-
2831
-
2832
- # dataset
2833
- with gr.TabItem("QQP"):
2834
- with gr.Row():
2835
- gr.Markdown("""
2836
- **QQP Leaderboard** 🔮
2837
-
2838
- - **Metric:** Accuracy.
2839
- - **Languages:** English
2840
- """)
2841
-
2842
- with gr.TabItem("zero_shot"):
2843
- with gr.TabItem("Overall"):
2844
- with gr.Row():
2845
- gr.components.Dataframe(
2846
- QQP_ZERO_SHOT,
2847
- datatype=["number", "markdown"] + ["number"] * len(QQP_ZERO_SHOT.columns),
2848
- type="pandas",
2849
- )
2850
-
2851
-
2852
-
2853
- with gr.TabItem("five_shot"):
2854
- with gr.TabItem("Overall"):
2855
- with gr.Row():
2856
- gr.components.Dataframe(
2857
- QQP_FIVE_SHOT,
2858
- datatype=["number", "markdown"] + ["number"] * len(QQP_FIVE_SHOT.columns),
2859
- type="pandas",
2860
- )
2861
-
2862
-
2863
- # dataset
2864
- with gr.TabItem("MNLI"):
2865
- with gr.Row():
2866
- gr.Markdown("""
2867
- **MNLI Leaderboard** 🔮
2868
-
2869
- - **Metric:** Accuracy.
2870
- - **Languages:** English
2871
- """)
2872
-
2873
- with gr.TabItem("zero_shot"):
2874
- with gr.TabItem("Overall"):
2875
- with gr.Row():
2876
- gr.components.Dataframe(
2877
- MNLI_ZERO_SHOT,
2878
- datatype=["number", "markdown"] + ["number"] * len(MNLI_ZERO_SHOT.columns),
2879
- type="pandas",
2880
- )
2881
-
2882
-
2883
-
2884
- with gr.TabItem("five_shot"):
2885
- with gr.TabItem("Overall"):
2886
- with gr.Row():
2887
- gr.components.Dataframe(
2888
- MNLI_FIVE_SHOT,
2889
- datatype=["number", "markdown"] + ["number"] * len(MNLI_FIVE_SHOT.columns),
2890
- type="pandas",
2891
- )
2892
-
2893
-
2894
- # dataset
2895
- with gr.TabItem("QNLI"):
2896
- with gr.Row():
2897
- gr.Markdown("""
2898
- **QNLI Leaderboard** 🔮
2899
-
2900
- - **Metric:** Accuracy.
2901
- - **Languages:** English
2902
- """)
2903
-
2904
- with gr.TabItem("zero_shot"):
2905
- with gr.TabItem("Overall"):
2906
- with gr.Row():
2907
- gr.components.Dataframe(
2908
- QNLI_ZERO_SHOT,
2909
- datatype=["number", "markdown"] + ["number"] * len(QNLI_ZERO_SHOT.columns),
2910
- type="pandas",
2911
- )
2912
-
2913
-
2914
-
2915
- with gr.TabItem("five_shot"):
2916
- with gr.TabItem("Overall"):
2917
- with gr.Row():
2918
- gr.components.Dataframe(
2919
- QNLI_FIVE_SHOT,
2920
- datatype=["number", "markdown"] + ["number"] * len(QNLI_FIVE_SHOT.columns),
2921
- type="pandas",
2922
- )
2923
-
2924
-
2925
- # dataset
2926
- with gr.TabItem("WNLI"):
2927
- with gr.Row():
2928
- gr.Markdown("""
2929
- **WNLI Leaderboard** 🔮
2930
-
2931
- - **Metric:** Accuracy.
2932
- - **Languages:** English
2933
- """)
2934
-
2935
- with gr.TabItem("zero_shot"):
2936
- with gr.TabItem("Overall"):
2937
- with gr.Row():
2938
- gr.components.Dataframe(
2939
- WNLI_ZERO_SHOT,
2940
- datatype=["number", "markdown"] + ["number"] * len(WNLI_ZERO_SHOT.columns),
2941
- type="pandas",
2942
- )
2943
-
2944
-
2945
-
2946
- with gr.TabItem("five_shot"):
2947
- with gr.TabItem("Overall"):
2948
- with gr.Row():
2949
- gr.components.Dataframe(
2950
- WNLI_FIVE_SHOT,
2951
- datatype=["number", "markdown"] + ["number"] * len(WNLI_FIVE_SHOT.columns),
2952
- type="pandas",
2953
- )
2954
-
2955
-
2956
- # dataset
2957
- with gr.TabItem("RTE"):
2958
- with gr.Row():
2959
- gr.Markdown("""
2960
- **RTE Leaderboard** 🔮
2961
-
2962
- - **Metric:** Accuracy.
2963
- - **Languages:** English
2964
- """)
2965
-
2966
- with gr.TabItem("zero_shot"):
2967
- with gr.TabItem("Overall"):
2968
- with gr.Row():
2969
- gr.components.Dataframe(
2970
- RTE_ZERO_SHOT,
2971
- datatype=["number", "markdown"] + ["number"] * len(RTE_ZERO_SHOT.columns),
2972
- type="pandas",
2973
- )
2974
-
2975
-
2976
-
2977
- with gr.TabItem("five_shot"):
2978
- with gr.TabItem("Overall"):
2979
- with gr.Row():
2980
- gr.components.Dataframe(
2981
- RTE_FIVE_SHOT,
2982
- datatype=["number", "markdown"] + ["number"] * len(RTE_FIVE_SHOT.columns),
2983
- type="pandas",
2984
- )
2985
-
2986
-
2987
- # dataset
2988
- with gr.TabItem("MRPC"):
2989
- with gr.Row():
2990
- gr.Markdown("""
2991
- **MRPC Leaderboard** 🔮
2992
-
2993
- - **Metric:** Accuracy.
2994
- - **Languages:** English
2995
- """)
2996
-
2997
- with gr.TabItem("zero_shot"):
2998
- with gr.TabItem("Overall"):
2999
- with gr.Row():
3000
- gr.components.Dataframe(
3001
- MRPC_ZERO_SHOT,
3002
- datatype=["number", "markdown"] + ["number"] * len(MRPC_ZERO_SHOT.columns),
3003
- type="pandas",
3004
- )
3005
-
3006
-
3007
-
3008
- with gr.TabItem("five_shot"):
3009
- with gr.TabItem("Overall"):
3010
- with gr.Row():
3011
- gr.components.Dataframe(
3012
- MRPC_FIVE_SHOT,
3013
- datatype=["number", "markdown"] + ["number"] * len(MRPC_FIVE_SHOT.columns),
3014
- type="pandas",
3015
- )
3016
-
3017
 
3018
 
3019
  gr.Markdown(r"""
3020
-
3021
- If this work is useful to you, please citing our work:
3022
-
3023
  ```bibtex
3024
  @article{SeaEval2023,
3025
  title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
3026
  author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
3027
  journal={arXiv preprint arXiv:2309.04766},
3028
- year={2023}
3029
- }
3030
  ```
3031
  """)
3032
  # Running the functions on page load in addition to when the button is clicked
@@ -3035,8 +2988,12 @@ with block:
3035
  block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
3036
  """
3037
 
 
 
 
 
3038
  block.queue(max_size=10)
3039
- block.launch(server_name="0.0.0.0", share=True)
3040
 
3041
 
3042
  # Possible changes:
 
38
 
39
 
40
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
 
 
 
 
41
  with open('all_results.json', 'r') as f:
42
  ALL_RESULTS = json.load(f)
43
 
 
44
  MODEL_LIST = list(ALL_RESULTS.keys())
45
  NUM_MODELS = len(set(MODEL_LIST))
46
  MODEL_TO_SIZE = {model: ALL_RESULTS[model]["model_size"] for model in MODEL_LIST}
 
 
47
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
48
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
49
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
 
1959
 
1960
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1961
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1962
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1963
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1964
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1965
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1966
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1967
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1968
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1969
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1970
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1971
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1972
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1973
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1974
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1975
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1976
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1977
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1978
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1979
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1980
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1981
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1982
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1983
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1984
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1985
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1986
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1987
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1988
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1989
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1990
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1991
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1992
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1993
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1994
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1995
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1996
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1997
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1998
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1999
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2000
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2001
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2002
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2003
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2004
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2005
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2006
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2007
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2008
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2009
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2010
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2011
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2012
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2013
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2014
+
2015
+ # block = gr.Blocks(theme=gr.themes.Soft())
2016
+
2017
+ theme = gr.themes.Soft().set(
2018
+ background_fill_primary='*secondary_50'
2019
+ )
2020
+
2021
+ block = gr.Blocks(theme='finlaymacklon/smooth_slate')
2022
+
2023
 
 
2024
  with block:
2025
 
2026
  gr.Markdown(f"""
 
2032
  - **Mode of Evaluation**: Zero-Shot, Five-Shot
2033
 
2034
  The following table shows the performance of the models on the SeaEval benchmark.
2035
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2036
 
2037
  """)
2038
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2039
 
2040
 
2041
+ with gr.Tabs():
2042
+
2043
+
2044
+ with gr.TabItem("Cross-Lingual Consistency"):
2045
+
2046
+ # dataset 1: cross-mmlu
2047
+ with gr.TabItem("Cross-MMLU"):
2048
+ with gr.TabItem("Zero Shot"):
2049
+ with gr.TabItem("Overall"):
2050
+ with gr.Row():
2051
+ cross_mmlu_zero_shot_overall = gr.components.Dataframe(
2052
+ CROSS_MMLU_ZERO_SHOT_OVERALL,
2053
+ datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_OVERALL.columns),
2054
+ type="pandas",
2055
+ )
2056
+ with gr.TabItem("Language Performance"):
2057
+
2058
+ with gr.Row():
2059
+ cross_mmlu_zero_shot_overall = gr.components.Dataframe(
2060
+ CROSS_MMLU_ZERO_SHOT_LANGUAGE,
2061
+ datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_LANGUAGE.columns),
2062
+ type="pandas",
2063
+ )
2064
+ with gr.TabItem("Five Shot"):
2065
+ with gr.TabItem("Overall"):
2066
+
2067
+ with gr.Row():
2068
+ cross_mmlu_zero_shot_overall = gr.components.Dataframe(
2069
+ CROSS_MMLU_FIVE_SHOT_OVERALL,
2070
+ datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_OVERALL.columns),
2071
+ type="pandas",
2072
+ )
2073
+ with gr.TabItem("Language Performance"):
2074
+
2075
+ with gr.Row():
2076
+ gr.components.Dataframe(
2077
+ CROSS_MMLU_FIVE_SHOT_LANGUAGE,
2078
+ datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_LANGUAGE.columns),
2079
+ type="pandas",
2080
+ )
2081
+
2082
+ with gr.Row():
2083
+ gr.Markdown("""
2084
+ **Cross-MMLU Leaderboard** 🔮
2085
+
2086
+ - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
2087
+ - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
2088
+ """)
2089
+
2090
+
2091
+ # dataset 2: cross-logiqa
2092
+ with gr.TabItem("Cross-LogiQA"):
2093
+ with gr.TabItem("Zero Shot"):
2094
+ with gr.TabItem("Overall"):
2095
+ with gr.Row():
2096
+ gr.components.Dataframe(
2097
+ CROSS_LOGIQA_ZERO_SHOT_OVERALL,
2098
+ datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_OVERALL.columns),
2099
+ type="pandas",
2100
+ )
2101
+ with gr.TabItem("Language Performance"):
2102
+
2103
+ with gr.Row():
2104
+ gr.components.Dataframe(
2105
+ CROSS_LOGIQA_ZERO_SHOT_LANGUAGE,
2106
+ datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_LANGUAGE.columns),
2107
+ type="pandas",
2108
+ )
2109
+ with gr.TabItem("Five Shot"):
2110
+ with gr.TabItem("Overall"):
2111
+ with gr.Row():
2112
+ gr.components.Dataframe(
2113
+ CROSS_LOGIQA_FIVE_SHOT_OVERALL,
2114
+ datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_OVERALL.columns),
2115
+ type="pandas",
2116
+ )
2117
+ with gr.TabItem("Language Performance"):
2118
+ with gr.Row():
2119
+ gr.components.Dataframe(
2120
+ CROSS_LOGIQA_FIVE_SHOT_LANGUAGE,
2121
+ datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_LANGUAGE.columns),
2122
+ type="pandas",
2123
+ )
2124
+ with gr.Row():
2125
+ gr.Markdown("""
2126
+ **Cross-LogiQA Leaderboard** 🔮
2127
+
2128
+ - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
2129
+ - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
2130
+ """)
2131
+
2132
+
2133
+
2134
+ with gr.TabItem("Cultural Reasoning and Understanding"):
2135
+
2136
+ # dataset 3: SG_EVAL
2137
+ with gr.TabItem("SG_EVAL"):
2138
+ with gr.TabItem("Zero Shot"):
2139
+ with gr.TabItem("Overall"):
2140
+ with gr.Row():
2141
+ gr.components.Dataframe(
2142
+ SG_EVAL_ZERO_SHOT,
2143
+ datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_ZERO_SHOT.columns),
2144
+ type="pandas",
2145
+ )
2146
+ with gr.TabItem("Five Shot"):
2147
+ with gr.TabItem("Overall"):
2148
+ with gr.Row():
2149
+ gr.components.Dataframe(
2150
+ SG_EVAL_FIVE_SHOT,
2151
+ datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_FIVE_SHOT.columns),
2152
+ type="pandas",
2153
+ )
2154
+ with gr.Row():
2155
+ gr.Markdown("""
2156
+ **SG_EVAL Leaderboard** 🔮
2157
+
2158
+ - **Metric:** Accuracy
2159
+ - **Languages:** English
2160
+ """)
2161
+
2162
+
2163
+
2164
+
2165
+ # dataset 4:
2166
+ with gr.TabItem("US_EVAL"):
2167
+ with gr.TabItem("Zero Shot"):
2168
+ with gr.TabItem("Overall"):
2169
+ with gr.Row():
2170
+ gr.components.Dataframe(
2171
+ US_EVAL_ZERO_SHOT,
2172
+ datatype=["number", "markdown"] + ["number"] * len(US_EVAL_ZERO_SHOT.columns),
2173
+ type="pandas",
2174
+ )
2175
+ with gr.TabItem("Five Shot"):
2176
+ with gr.TabItem("Overall"):
2177
+ with gr.Row():
2178
+ gr.components.Dataframe(
2179
+ US_EVAL_FIVE_SHOT,
2180
+ datatype=["number", "markdown"] + ["number"] * len(US_EVAL_FIVE_SHOT.columns),
2181
+ type="pandas",
2182
+ )
2183
+ with gr.Row():
2184
+ gr.Markdown("""
2185
+ **US_EVAL Leaderboard** 🔮
2186
+
2187
+ - **Metric:** Accuracy
2188
+ - **Languages:** English
2189
+ """)
2190
+
2191
+
2192
+
2193
+ # dataset 5:
2194
+ with gr.TabItem("CN_EVAL"):
2195
+ with gr.TabItem("Zero Shot"):
2196
+ with gr.TabItem("Overall"):
2197
+ with gr.Row():
2198
+ gr.components.Dataframe(
2199
+ CN_EVAL_ZERO_SHOT,
2200
+ datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_ZERO_SHOT.columns),
2201
+ type="pandas",
2202
+ )
2203
+ with gr.TabItem("Five Shot"):
2204
+ with gr.TabItem("Overall"):
2205
+ with gr.Row():
2206
+ gr.components.Dataframe(
2207
+ CN_EVAL_FIVE_SHOT,
2208
+ datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_FIVE_SHOT.columns),
2209
+ type="pandas",
2210
+ )
2211
+ with gr.Row():
2212
+ gr.Markdown("""
2213
+ **CN_EVAL Leaderboard** 🔮
2214
+
2215
+ - **Metric:** Accuracy
2216
+ - **Languages:** Chinese
2217
+ """)
2218
+
2219
+
2220
+ # dataset 6:
2221
+ with gr.TabItem("PH_EVAL"):
2222
+ with gr.TabItem("Zero Shot"):
2223
+ with gr.TabItem("Overall"):
2224
+ with gr.Row():
2225
+ gr.components.Dataframe(
2226
+ PH_EVAL_ZERO_SHOT,
2227
+ datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_ZERO_SHOT.columns),
2228
+ type="pandas",
2229
+ )
2230
+ with gr.TabItem("Five Shot"):
2231
+ with gr.TabItem("Overall"):
2232
+ with gr.Row():
2233
+ gr.components.Dataframe(
2234
+ PH_EVAL_FIVE_SHOT,
2235
+ datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_FIVE_SHOT.columns),
2236
+ type="pandas",
2237
+ )
2238
+ with gr.Row():
2239
+ gr.Markdown("""
2240
+ **PH_EVAL Leaderboard** 🔮
2241
+
2242
+ - **Metric:** Accuracy
2243
+ - **Languages:** English
2244
+ """)
2245
+
2246
+
2247
+ # dataset 7:
2248
+ with gr.TabItem("Singlish to English Translation"):
2249
+ with gr.TabItem("Zero Shot"):
2250
+ with gr.TabItem("Overall"):
2251
+ with gr.Row():
2252
+ gr.components.Dataframe(
2253
+ SING2ENG_ZERO_SHOT,
2254
+ datatype=["number", "markdown"] + ["number"] * len(SING2ENG_ZERO_SHOT.columns),
2255
+ type="pandas",
2256
+ )
2257
+ with gr.TabItem("Five Shot"):
2258
+ with gr.TabItem("Overall"):
2259
+ with gr.Row():
2260
+ gr.components.Dataframe(
2261
+ SING2ENG_FIVE_SHOT,
2262
+ datatype=["number", "markdown"] + ["number"] * len(SING2ENG_FIVE_SHOT.columns),
2263
+ type="pandas",
2264
+ )
2265
+ with gr.Row():
2266
+ gr.Markdown("""
2267
+ **SING2ENG Leaderboard** 🔮
2268
+
2269
+ - **Metric:** BLEU Avg.
2270
+ - **Languages:** English
2271
+ """)
2272
+
2273
+
2274
+ with gr.TabItem("Reasoning"):
2275
+
2276
+
2277
+ # dataset 12:
2278
+ with gr.TabItem("MMLU"):
2279
+ with gr.TabItem("Zero Shot"):
2280
+ with gr.TabItem("Overall"):
2281
+ with gr.Row():
2282
+ gr.components.Dataframe(
2283
+ MMLU_ZERO_SHOT,
2284
+ datatype=["number", "markdown"] + ["number"] * len(MMLU_ZERO_SHOT.columns),
2285
+ type="pandas",
2286
+ )
2287
+ with gr.TabItem("Five Shot"):
2288
+ with gr.TabItem("Overall"):
2289
+ with gr.Row():
2290
+ gr.components.Dataframe(
2291
+ MMLU_FIVE_SHOT,
2292
+ datatype=["number", "markdown"] + ["number"] * len(MMLU_FIVE_SHOT.columns),
2293
+ type="pandas",
2294
+ )
2295
+ with gr.Row():
2296
+ gr.Markdown("""
2297
+ **MMLU Leaderboard** 🔮
2298
+
2299
+ - **Metric:** Accuracy.
2300
+ - **Languages:** English
2301
+ """)
2302
+
2303
+
2304
+
2305
+ # dataset 13:
2306
+ with gr.TabItem("MMLU Full"):
2307
+ with gr.TabItem("Zero Shot"):
2308
+ with gr.TabItem("Overall"):
2309
+ with gr.Row():
2310
+ gr.components.Dataframe(
2311
+ MMLU_FULL_ZERO_SHOT,
2312
+ datatype=["number", "markdown"] + ["number"] * len(MMLU_FULL_ZERO_SHOT.columns),
2313
+ type="pandas",
2314
+ )
2315
+ with gr.TabItem("Five Shot"):
2316
+ with gr.TabItem("Overall"):
2317
+ with gr.Row():
2318
+ gr.components.Dataframe(
2319
+ MMLU_FULL_FIVE_SHOT,
2320
+ datatype=["number", "markdown"] + ["number"] * len(MMLU_FULL_FIVE_SHOT.columns),
2321
+ type="pandas",
2322
+ )
2323
+ with gr.Row():
2324
+ gr.Markdown("""
2325
+ **MMLU Full Leaderboard** 🔮
2326
+
2327
+ - **Metric:** Accuracy.
2328
+ - **Languages:** English
2329
+ """)
2330
+
2331
+
2332
+
2333
+ # dataset 14:
2334
+ with gr.TabItem("C_EVAL"):
2335
+ with gr.TabItem("Zero Shot"):
2336
+ with gr.TabItem("Overall"):
2337
+ with gr.Row():
2338
+ gr.components.Dataframe(
2339
+ C_EVAL_ZERO_SHOT,
2340
+ datatype=["number", "markdown"] + ["number"] * len(C_EVAL_ZERO_SHOT.columns),
2341
+ type="pandas",
2342
+ )
2343
+ with gr.TabItem("Five Shot"):
2344
+ with gr.TabItem("Overall"):
2345
+ with gr.Row():
2346
+ gr.components.Dataframe(
2347
+ C_EVAL_FIVE_SHOT,
2348
+ datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FIVE_SHOT.columns),
2349
+ type="pandas",
2350
+ )
2351
+ with gr.Row():
2352
+ gr.Markdown("""
2353
+ **C_EVAL Leaderboard** 🔮
2354
+
2355
+ - **Metric:** Accuracy.
2356
+ - **Languages:** Chinese
2357
+ """)
2358
+
2359
+
2360
+
2361
+ # dataset 15:
2362
+ with gr.TabItem("C_EVAL Full"):
2363
+ with gr.TabItem("Zero Shot"):
2364
+ with gr.TabItem("Overall"):
2365
+ with gr.Row():
2366
+ gr.components.Dataframe(
2367
+ C_EVAL_FULL_ZERO_SHOT,
2368
+ datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FULL_ZERO_SHOT.columns),
2369
+ type="pandas",
2370
+ )
2371
+ with gr.TabItem("Five Shot"):
2372
+ with gr.TabItem("Overall"):
2373
+ with gr.Row():
2374
+ gr.components.Dataframe(
2375
+ C_EVAL_FULL_FIVE_SHOT,
2376
+ datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FULL_FIVE_SHOT.columns),
2377
+ type="pandas",
2378
+ )
2379
+ with gr.Row():
2380
+ gr.Markdown("""
2381
+ **C_EVAL Full Leaderboard** 🔮
2382
+
2383
+ - **Metric:** Accuracy.
2384
+ - **Languages:** Chinese
2385
+ """)
2386
+
2387
+
2388
+ # dataset 16:
2389
+ with gr.TabItem("CMMLU"):
2390
+ with gr.TabItem("Zero Shot"):
2391
+ with gr.TabItem("Overall"):
2392
+ with gr.Row():
2393
+ gr.components.Dataframe(
2394
+ CMMLU_ZERO_SHOT,
2395
+ datatype=["number", "markdown"] + ["number"] * len(CMMLU_ZERO_SHOT.columns),
2396
+ type="pandas",
2397
+ )
2398
+ with gr.TabItem("Five Shot"):
2399
+ with gr.TabItem("Overall"):
2400
+ with gr.Row():
2401
+ gr.components.Dataframe(
2402
+ CMMLU_FIVE_SHOT,
2403
+ datatype=["number", "markdown"] + ["number"] * len(CMMLU_FIVE_SHOT.columns),
2404
+ type="pandas",
2405
+ )
2406
+ with gr.Row():
2407
+ gr.Markdown("""
2408
+ **CMMLU Leaderboard** 🔮
2409
+
2410
+ - **Metric:** Accuracy.
2411
+ - **Languages:** Chinese
2412
+ """)
2413
+
2414
+
2415
+
2416
+ # dataset 17:
2417
+ with gr.TabItem("CMMLU Full"):
2418
+ with gr.TabItem("Zero Shot"):
2419
+ with gr.TabItem("Overall"):
2420
+ with gr.Row():
2421
+ gr.components.Dataframe(
2422
+ CMMLU_FULL_ZERO_SHOT,
2423
+ datatype=["number", "markdown"] + ["number"] * len(CMMLU_FULL_ZERO_SHOT.columns),
2424
+ type="pandas",
2425
+ )
2426
+ with gr.TabItem("Five Shot"):
2427
+ with gr.TabItem("Overall"):
2428
+ with gr.Row():
2429
+ gr.components.Dataframe(
2430
+ CMMLU_FULL_FIVE_SHOT,
2431
+ datatype=["number", "markdown"] + ["number"] * len(CMMLU_FULL_FIVE_SHOT.columns),
2432
+ type="pandas",
2433
+ )
2434
+ with gr.Row():
2435
+ gr.Markdown("""
2436
+ **CMMLU Full Leaderboard** 🔮
2437
+
2438
+ - **Metric:** Accuracy.
2439
+ - **Languages:** Chinese
2440
+ """)
2441
+
2442
+
2443
+ # dataset 18:
2444
+ with gr.TabItem("ZBench"):
2445
+ with gr.TabItem("Zero Shot"):
2446
+ with gr.TabItem("Overall"):
2447
+ with gr.Row():
2448
+ gr.components.Dataframe(
2449
+ ZBENCH_ZERO_SHOT,
2450
+ datatype=["number", "markdown"] + ["number"] * len(ZBENCH_ZERO_SHOT.columns),
2451
+ type="pandas",
2452
+ )
2453
+ with gr.TabItem("Five Shot"):
2454
+ with gr.TabItem("Overall"):
2455
+ with gr.Row():
2456
+ gr.components.Dataframe(
2457
+ ZBENCH_FIVE_SHOT,
2458
+ datatype=["number", "markdown"] + ["number"] * len(ZBENCH_FIVE_SHOT.columns),
2459
+ type="pandas",
2460
+ )
2461
+ with gr.Row():
2462
+ gr.Markdown("""
2463
+ **ZBench Leaderboard** 🔮
2464
+
2465
+ - **Metric:** Accuracy.
2466
+ - **Languages:** Chinese
2467
+ """)
2468
+
2469
+
2470
+
2471
+ with gr.TabItem("FLORES Translation"):
2472
+
2473
+
2474
+ # dataset 8:
2475
+ with gr.TabItem("FLORES Indonesian to English Translation"):
2476
+ with gr.TabItem("Zero Shot"):
2477
+ with gr.TabItem("Overall"):
2478
+ with gr.Row():
2479
+ gr.components.Dataframe(
2480
+ FLORES_IND2ENG_ZERO_SHOT,
2481
+ datatype=["number", "markdown"] + ["number"] * len(FLORES_IND2ENG_ZERO_SHOT.columns),
2482
+ type="pandas",
2483
+ )
2484
+ with gr.TabItem("Five Shot"):
2485
+ with gr.TabItem("Overall"):
2486
+ with gr.Row():
2487
+ gr.components.Dataframe(
2488
+ FLORES_IND2ENG_FIVE_SHOT,
2489
+ datatype=["number", "markdown"] + ["number"] * len(FLORES_IND2ENG_FIVE_SHOT.columns),
2490
+ type="pandas",
2491
+ )
2492
+ with gr.Row():
2493
+ gr.Markdown("""
2494
+ **flores_ind2eng Leaderboard** 🔮
2495
+
2496
+ - **Metric:** BLEU Avg.
2497
+ - **Languages:** English
2498
+ """)
2499
+
2500
+
2501
+ # dataset 9:
2502
+ with gr.TabItem("FLORES Vietnamese to English Translation"):
2503
+ with gr.TabItem("Zero Shot"):
2504
+ with gr.TabItem("Overall"):
2505
+ with gr.Row():
2506
+ gr.components.Dataframe(
2507
+ FLORES_VIE2ENG_ZERO_SHOT,
2508
+ datatype=["number", "markdown"] + ["number"] * len(FLORES_VIE2ENG_ZERO_SHOT.columns),
2509
+ type="pandas",
2510
+ )
2511
+ with gr.TabItem("Five Shot"):
2512
+ with gr.TabItem("Overall"):
2513
+ with gr.Row():
2514
+ gr.components.Dataframe(
2515
+ FLORES_VIE2ENG_FIVE_SHOT,
2516
+ datatype=["number", "markdown"] + ["number"] * len(FLORES_VIE2ENG_FIVE_SHOT.columns),
2517
+ type="pandas",
2518
+ )
2519
+ with gr.Row():
2520
+ gr.Markdown("""
2521
+ **flores_vie2eng Leaderboard** 🔮
2522
+
2523
+ - **Metric:** BLEU Avg.
2524
+ - **Languages:** English
2525
+ """)
2526
+
2527
+
2528
+
2529
+ # dataset 10:
2530
+ with gr.TabItem("FLORES Chinese to English Translation"):
2531
+ with gr.TabItem("Zero Shot"):
2532
+ with gr.TabItem("Overall"):
2533
+ with gr.Row():
2534
+ gr.components.Dataframe(
2535
+ FLORES_ZHO2ENG_ZERO_SHOT,
2536
+ datatype=["number", "markdown"] + ["number"] * len(FLORES_ZHO2ENG_ZERO_SHOT.columns),
2537
+ type="pandas",
2538
+ )
2539
+ with gr.TabItem("Five Shot"):
2540
+ with gr.TabItem("Overall"):
2541
+ with gr.Row():
2542
+ gr.components.Dataframe(
2543
+ FLORES_ZHO2ENG_FIVE_SHOT,
2544
+ datatype=["number", "markdown"] + ["number"] * len(FLORES_ZHO2ENG_FIVE_SHOT.columns),
2545
+ type="pandas",
2546
+ )
2547
+ with gr.Row():
2548
+ gr.Markdown("""
2549
+ **flores_zho2eng Leaderboard** 🔮
2550
+
2551
+ - **Metric:** BLEU Avg.
2552
+ - **Languages:** English
2553
+ """)
2554
+
2555
+
2556
+ # dataset 11:
2557
+ with gr.TabItem("FLORES Malay to English Translation"):
2558
+ with gr.TabItem("Zero Shot"):
2559
+ with gr.TabItem("Overall"):
2560
+ with gr.Row():
2561
+ gr.components.Dataframe(
2562
+ FLORES_ZSM2ENG_ZERO_SHOT,
2563
+ datatype=["number", "markdown"] + ["number"] * len(FLORES_ZSM2ENG_ZERO_SHOT.columns),
2564
+ type="pandas",
2565
+ )
2566
+ with gr.TabItem("Five Shot"):
2567
+ with gr.TabItem("Overall"):
2568
+ with gr.Row():
2569
+ gr.components.Dataframe(
2570
+ FLORES_ZSM2ENG_FIVE_SHOT,
2571
+ datatype=["number", "markdown"] + ["number"] * len(FLORES_ZSM2ENG_FIVE_SHOT.columns),
2572
+ type="pandas",
2573
+ )
2574
+ with gr.Row():
2575
+ gr.Markdown("""
2576
+ **flores_zsm2eng Leaderboard** 🔮
2577
+
2578
+ - **Metric:** BLEU Avg.
2579
+ - **Languages:** English
2580
+ """)
2581
+
2582
+
2583
+ with gr.TabItem("Emotion Recognition"):
2584
+
2585
+ # dataset 18:
2586
+ with gr.TabItem("ind_emotion"):
2587
+ with gr.TabItem("Zero Shot"):
2588
+ with gr.TabItem("Overall"):
2589
+ with gr.Row():
2590
+ gr.components.Dataframe(
2591
+ IND_EMOTION_ZERO_SHOT,
2592
+ datatype=["number", "markdown"] + ["number"] * len(IND_EMOTION_ZERO_SHOT.columns),
2593
+ type="pandas",
2594
+ )
2595
+ with gr.TabItem("Five Shot"):
2596
+ with gr.TabItem("Overall"):
2597
+ with gr.Row():
2598
+ gr.components.Dataframe(
2599
+ IND_EMOTION_FIVE_SHOT,
2600
+ datatype=["number", "markdown"] + ["number"] * len(IND_EMOTION_FIVE_SHOT.columns),
2601
+ type="pandas",
2602
+ )
2603
+ with gr.Row():
2604
+ gr.Markdown("""
2605
+ **ind_emotion Leaderboard** 🔮
2606
+
2607
+ - **Metric:** Accuracy.
2608
+ - **Languages:** Indonesian
2609
+ """)
2610
+
2611
+
2612
+ # dataset
2613
+ with gr.TabItem("SST2"):
2614
+ with gr.TabItem("Zero Shot"):
2615
+ with gr.TabItem("Overall"):
2616
+ with gr.Row():
2617
+ gr.components.Dataframe(
2618
+ SST2_ZERO_SHOT,
2619
+ datatype=["number", "markdown"] + ["number"] * len(SST2_ZERO_SHOT.columns),
2620
+ type="pandas",
2621
+ )
2622
+ with gr.TabItem("Five Shot"):
2623
+ with gr.TabItem("Overall"):
2624
+ with gr.Row():
2625
+ gr.components.Dataframe(
2626
+ SST2_FIVE_SHOT,
2627
+ datatype=["number", "markdown"] + ["number"] * len(SST2_FIVE_SHOT.columns),
2628
+ type="pandas",
2629
+ )
2630
+ with gr.Row():
2631
+ gr.Markdown("""
2632
+ **SST2 Leaderboard** 🔮
2633
+
2634
+ - **Metric:** Accuracy.
2635
+ - **Languages:** English
2636
+ """)
2637
+
2638
+
2639
+
2640
+ with gr.TabItem("Dialogue"):
2641
+
2642
+
2643
+ # dataset
2644
+ with gr.TabItem("DREAM"):
2645
+ with gr.TabItem("Zero Shot"):
2646
+ with gr.TabItem("Overall"):
2647
+ with gr.Row():
2648
+ gr.components.Dataframe(
2649
+ DREAM_ZERO_SHOT,
2650
+ datatype=["number", "markdown"] + ["number"] * len(DREAM_ZERO_SHOT.columns),
2651
+ type="pandas",
2652
+ )
2653
+ with gr.TabItem("Five Shot"):
2654
+ with gr.TabItem("Overall"):
2655
+ with gr.Row():
2656
+ gr.components.Dataframe(
2657
+ DREAM_FIVE_SHOT,
2658
+ datatype=["number", "markdown"] + ["number"] * len(DREAM_FIVE_SHOT.columns),
2659
+ type="pandas",
2660
+ )
2661
+ with gr.Row():
2662
+ gr.Markdown("""
2663
+ **DREAM Leaderboard** 🔮
2664
+
2665
+ - **Metric:** Accuracy.
2666
+ - **Languages:** English
2667
+ """)
2668
+
2669
+ # dataset
2670
+ with gr.TabItem("SAMSum"):
2671
+ with gr.TabItem("Zero Shot"):
2672
+ with gr.TabItem("Overall"):
2673
+ with gr.Row():
2674
+ gr.components.Dataframe(
2675
+ SAMSUM_ZERO_SHOT,
2676
+ datatype=["number", "markdown"] + ["number"] * len(SAMSUM_ZERO_SHOT.columns),
2677
+ type="pandas",
2678
+ )
2679
+ with gr.TabItem("Five Shot"):
2680
+ with gr.TabItem("Overall"):
2681
+ with gr.Row():
2682
+ gr.components.Dataframe(
2683
+ SAMSUM_FIVE_SHOT,
2684
+ datatype=["number", "markdown"] + ["number"] * len(SAMSUM_FIVE_SHOT.columns),
2685
+ type="pandas",
2686
+ )
2687
+ with gr.Row():
2688
+ gr.Markdown("""
2689
+ **SAMSum Leaderboard** 🔮
2690
+
2691
+ - **Metric:** ROUGE.
2692
+ - **Languages:** English
2693
+ """)
2694
+
2695
+
2696
+ # dataset
2697
+ with gr.TabItem("DialogSum"):
2698
+ with gr.TabItem("Zero Shot"):
2699
+ with gr.TabItem("Overall"):
2700
+ with gr.Row():
2701
+ gr.components.Dataframe(
2702
+ DIALOGSUM_ZERO_SHOT,
2703
+ datatype=["number", "markdown"] + ["number"] * len(DIALOGSUM_ZERO_SHOT.columns),
2704
+ type="pandas",
2705
+ )
2706
+ with gr.TabItem("Five Shot"):
2707
+ with gr.TabItem("Overall"):
2708
+ with gr.Row():
2709
+ gr.components.Dataframe(
2710
+ DIALOGSUM_FIVE_SHOT,
2711
+ datatype=["number", "markdown"] + ["number"] * len(DIALOGSUM_FIVE_SHOT.columns),
2712
+ type="pandas",
2713
+ )
2714
+ with gr.Row():
2715
+ gr.Markdown("""
2716
+ **DialogSum Leaderboard** 🔮
2717
+
2718
+ - **Metric:** ROUGE.
2719
+ - **Languages:** English
2720
+ """)
2721
+
2722
+
2723
+
2724
+ with gr.TabItem("Fundamental NLP"):
2725
+
2726
+
2727
+ # dataset
2728
+ with gr.TabItem("OCNLI"):
2729
+ with gr.TabItem("Zero Shot"):
2730
+ with gr.TabItem("Overall"):
2731
+ with gr.Row():
2732
+ gr.components.Dataframe(
2733
+ OCNLI_ZERO_SHOT,
2734
+ datatype=["number", "markdown"] + ["number"] * len(OCNLI_ZERO_SHOT.columns),
2735
+ type="pandas",
2736
+ )
2737
+ with gr.TabItem("Five Shot"):
2738
+ with gr.TabItem("Overall"):
2739
+ with gr.Row():
2740
+ gr.components.Dataframe(
2741
+ OCNLI_FIVE_SHOT,
2742
+ datatype=["number", "markdown"] + ["number"] * len(OCNLI_FIVE_SHOT.columns),
2743
+ type="pandas",
2744
+ )
2745
+ with gr.Row():
2746
+ gr.Markdown("""
2747
+ **OCNLI Leaderboard** 🔮
2748
+
2749
+ - **Metric:** Accuracy.
2750
+ - **Languages:** Chinese
2751
+ """)
2752
+
2753
+
2754
+ # dataset
2755
+ with gr.TabItem("C3"):
2756
+ with gr.TabItem("Zero Shot"):
2757
+ with gr.TabItem("Overall"):
2758
+ with gr.Row():
2759
+ gr.components.Dataframe(
2760
+ C3_ZERO_SHOT,
2761
+ datatype=["number", "markdown"] + ["number"] * len(C3_ZERO_SHOT.columns),
2762
+ type="pandas",
2763
+ )
2764
+ with gr.TabItem("Five Shot"):
2765
+ with gr.TabItem("Overall"):
2766
+ with gr.Row():
2767
+ gr.components.Dataframe(
2768
+ C3_FIVE_SHOT,
2769
+ datatype=["number", "markdown"] + ["number"] * len(C3_FIVE_SHOT.columns),
2770
+ type="pandas",
2771
+ )
2772
+ with gr.Row():
2773
+ gr.Markdown("""
2774
+ **C3 Leaderboard** 🔮
2775
+
2776
+ - **Metric:** Accuracy.
2777
+ - **Languages:** Chinese
2778
+ """)
2779
+
2780
+
2781
+
2782
+
2783
+ # dataset
2784
+ with gr.TabItem("COLA"):
2785
+ with gr.TabItem("Zero Shot"):
2786
+ with gr.TabItem("Overall"):
2787
+ with gr.Row():
2788
+ gr.components.Dataframe(
2789
+ COLA_ZERO_SHOT,
2790
+ datatype=["number", "markdown"] + ["number"] * len(COLA_ZERO_SHOT.columns),
2791
+ type="pandas",
2792
+ )
2793
+ with gr.TabItem("Five Shot"):
2794
+ with gr.TabItem("Overall"):
2795
+ with gr.Row():
2796
+ gr.components.Dataframe(
2797
+ COLA_FIVE_SHOT,
2798
+ datatype=["number", "markdown"] + ["number"] * len(COLA_FIVE_SHOT.columns),
2799
+ type="pandas",
2800
+ )
2801
+ with gr.Row():
2802
+ gr.Markdown("""
2803
+ **COLA Leaderboard** 🔮
2804
+
2805
+ - **Metric:** Accuracy.
2806
+ - **Languages:** English
2807
+ """)
2808
+
2809
+
2810
+ # dataset
2811
+ with gr.TabItem("QQP"):
2812
+ with gr.TabItem("Zero Shot"):
2813
+ with gr.TabItem("Overall"):
2814
+ with gr.Row():
2815
+ gr.components.Dataframe(
2816
+ QQP_ZERO_SHOT,
2817
+ datatype=["number", "markdown"] + ["number"] * len(QQP_ZERO_SHOT.columns),
2818
+ type="pandas",
2819
+ )
2820
+ with gr.TabItem("Five Shot"):
2821
+ with gr.TabItem("Overall"):
2822
+ with gr.Row():
2823
+ gr.components.Dataframe(
2824
+ QQP_FIVE_SHOT,
2825
+ datatype=["number", "markdown"] + ["number"] * len(QQP_FIVE_SHOT.columns),
2826
+ type="pandas",
2827
+ )
2828
+ with gr.Row():
2829
+ gr.Markdown("""
2830
+ **QQP Leaderboard** 🔮
2831
+
2832
+ - **Metric:** Accuracy.
2833
+ - **Languages:** English
2834
+ """)
2835
+
2836
+
2837
+ # dataset
2838
+ with gr.TabItem("MNLI"):
2839
+ with gr.TabItem("Zero Shot"):
2840
+ with gr.TabItem("Overall"):
2841
+ with gr.Row():
2842
+ gr.components.Dataframe(
2843
+ MNLI_ZERO_SHOT,
2844
+ datatype=["number", "markdown"] + ["number"] * len(MNLI_ZERO_SHOT.columns),
2845
+ type="pandas",
2846
+ )
2847
+ with gr.TabItem("Five Shot"):
2848
+ with gr.TabItem("Overall"):
2849
+ with gr.Row():
2850
+ gr.components.Dataframe(
2851
+ MNLI_FIVE_SHOT,
2852
+ datatype=["number", "markdown"] + ["number"] * len(MNLI_FIVE_SHOT.columns),
2853
+ type="pandas",
2854
+ )
2855
+ with gr.Row():
2856
+ gr.Markdown("""
2857
+ **MNLI Leaderboard** 🔮
2858
+
2859
+ - **Metric:** Accuracy.
2860
+ - **Languages:** English
2861
+ """)
2862
+
2863
+
2864
+ # dataset: QNLI
2865
+ with gr.TabItem("QNLI"):
2866
+ with gr.TabItem("Zero Shot"):
2867
+ with gr.TabItem("Overall"):
2868
+ with gr.Row():
2869
+ gr.components.Dataframe(
2870
+ QNLI_ZERO_SHOT,
2871
+ datatype=["number", "markdown"] + ["number"] * len(QNLI_ZERO_SHOT.columns),
2872
+ type="pandas",
2873
+ )
2874
+ with gr.TabItem("Five Shot"):
2875
+ with gr.TabItem("Overall"):
2876
+ with gr.Row():
2877
+ gr.components.Dataframe(
2878
+ QNLI_FIVE_SHOT,
2879
+ datatype=["number", "markdown"] + ["number"] * len(QNLI_FIVE_SHOT.columns),
2880
+ type="pandas",
2881
+ )
2882
+ with gr.Row():
2883
+ gr.Markdown("""
2884
+ **QNLI Leaderboard** 🔮
2885
+
2886
+ - **Metric:** Accuracy.
2887
+ - **Languages:** English
2888
+ """)
2889
+
2890
+
2891
+
2892
+ # dataset: WNLI
2893
+ with gr.TabItem("WNLI"):
2894
+ with gr.TabItem("Zero Shot"):
2895
+ with gr.TabItem("Overall"):
2896
+ with gr.Row():
2897
+ gr.components.Dataframe(
2898
+ WNLI_ZERO_SHOT,
2899
+ datatype=["number", "markdown"] + ["number"] * len(WNLI_ZERO_SHOT.columns),
2900
+ type="pandas",
2901
+ )
2902
+ with gr.TabItem("Five Shot"):
2903
+ with gr.TabItem("Overall"):
2904
+ with gr.Row():
2905
+ gr.components.Dataframe(
2906
+ WNLI_FIVE_SHOT,
2907
+ datatype=["number", "markdown"] + ["number"] * len(WNLI_FIVE_SHOT.columns),
2908
+ type="pandas",
2909
+ )
2910
+ with gr.Row():
2911
+ gr.Markdown("""
2912
+ **WNLI Leaderboard** 🔮
2913
+
2914
+ - **Metric:** Accuracy.
2915
+ - **Languages:** English
2916
+ """)
2917
+
2918
+
2919
+
2920
+ # dataset: RTE
2921
+ with gr.TabItem("RTE"):
2922
+ with gr.TabItem("Zero Shot"):
2923
+ with gr.TabItem("Overall"):
2924
+ with gr.Row():
2925
+ gr.components.Dataframe(
2926
+ RTE_ZERO_SHOT,
2927
+ datatype=["number", "markdown"] + ["number"] * len(RTE_ZERO_SHOT.columns),
2928
+ type="pandas",
2929
+ )
2930
+ with gr.TabItem("Five Shot"):
2931
+ with gr.TabItem("Overall"):
2932
+ with gr.Row():
2933
+ gr.components.Dataframe(
2934
+ RTE_FIVE_SHOT,
2935
+ datatype=["number", "markdown"] + ["number"] * len(RTE_FIVE_SHOT.columns),
2936
+ type="pandas",
2937
+ )
2938
+ with gr.Row():
2939
+ gr.Markdown("""
2940
+ **RTE Leaderboard** 🔮
2941
+
2942
+ - **Metric:** Accuracy.
2943
+ - **Languages:** English
2944
+ """)
2945
+
2946
+
2947
+
2948
+ # dataset: MRPC
2949
+ with gr.TabItem("MRPC"):
2950
+ with gr.TabItem("Zero Shot"):
2951
+ with gr.TabItem("Overall"):
2952
+ with gr.Row():
2953
+ gr.components.Dataframe(
2954
+ MRPC_ZERO_SHOT,
2955
+ datatype=["number", "markdown"] + ["number"] * len(MRPC_ZERO_SHOT.columns),
2956
+ type="pandas",
2957
+ )
2958
+ with gr.TabItem("Five Shot"):
2959
+ with gr.TabItem("Overall"):
2960
+ with gr.Row():
2961
+ gr.components.Dataframe(
2962
+ MRPC_FIVE_SHOT,
2963
+ datatype=["number", "markdown"] + ["number"] * len(MRPC_FIVE_SHOT.columns),
2964
+ type="pandas",
2965
+ )
2966
+ with gr.Row():
2967
+ gr.Markdown("""
2968
+ **MRPC Leaderboard** 🔮
2969
+
2970
+ - **Metric:** Accuracy.
2971
+ - **Languages:** English
2972
+ """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2973
 
2974
 
2975
  gr.Markdown(r"""
2976
+ If our datasets and leaderboard are useful, please consider citing:
 
 
2977
  ```bibtex
2978
  @article{SeaEval2023,
2979
  title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
2980
  author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
2981
  journal={arXiv preprint arXiv:2309.04766},
2982
+ year={2023}}
 
2983
  ```
2984
  """)
2985
  # Running the functions on page load in addition to when the button is clicked
 
2988
  block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
2989
  """
2990
 
2991
+
2992
+
2993
+
2994
+
2995
  block.queue(max_size=10)
2996
+ block.launch(server_name="0.0.0.0", share=False)
2997
 
2998
 
2999
  # Possible changes: