Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
•
ec01232
1
Parent(s):
6bd3956
fix due to vision leaderboard addition upstream
Browse files
- app.py +3 -0
- release_date_mapping.json +75 -0
- utils.py +15 -0
app.py
CHANGED
@@ -48,6 +48,9 @@ latest_elo_file_local = download_latest_data_from_space(
|
|
48 |
with open(latest_elo_file_local, "rb") as fin:
|
49 |
elo_results = pickle.load(fin)
|
50 |
|
|
|
|
|
|
|
51 |
arena_dfs = {}
|
52 |
for k in KEY_TO_CATEGORY_NAME.keys():
|
53 |
if k not in elo_results:
|
|
|
48 |
with open(latest_elo_file_local, "rb") as fin:
|
49 |
elo_results = pickle.load(fin)
|
50 |
|
51 |
+
# TO-DO: need to also include vision
|
52 |
+
elo_results = elo_results["text"]
|
53 |
+
|
54 |
arena_dfs = {}
|
55 |
for k in KEY_TO_CATEGORY_NAME.keys():
|
56 |
if k not in elo_results:
|
release_date_mapping.json
CHANGED
@@ -493,5 +493,80 @@
|
|
493 |
"key": "yi-large-preview",
|
494 |
"Model": "Yi-Large-preview",
|
495 |
"Release Date": "2024-05-23"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
496 |
}
|
497 |
]
|
|
|
493 |
"key": "yi-large-preview",
|
494 |
"Model": "Yi-Large-preview",
|
495 |
"Release Date": "2024-05-23"
|
496 |
+
},
|
497 |
+
{
|
498 |
+
"key": "claude-3-5-sonnet-20240620",
|
499 |
+
"Model": "Claude 3.5 Sonnet",
|
500 |
+
"Release Date": "2024-07-01"
|
501 |
+
},
|
502 |
+
{
|
503 |
+
"key": "deepseek-coder-v2",
|
504 |
+
"Model": "DeepSeek-Coder-V2-Instruct",
|
505 |
+
"Release Date": "2024-07-01"
|
506 |
+
},
|
507 |
+
{
|
508 |
+
"key": "gemini-1.5-flash-api-0514",
|
509 |
+
"Model": "Gemini-1.5-Flash-API-0514",
|
510 |
+
"Release Date": "2024-07-01"
|
511 |
+
},
|
512 |
+
{
|
513 |
+
"key": "gemini-1.5-pro-api-0514",
|
514 |
+
"Model": "Gemini-1.5-Pro-API-0514",
|
515 |
+
"Release Date": "2024-07-01"
|
516 |
+
},
|
517 |
+
{
|
518 |
+
"key": "gemini-advanced-0514",
|
519 |
+
"Model": "Gemini-Advanced-0514",
|
520 |
+
"Release Date": "2024-07-01"
|
521 |
+
},
|
522 |
+
{
|
523 |
+
"key": "gemma-2-27b-it",
|
524 |
+
"Model": "Gemma-2-27B-it",
|
525 |
+
"Release Date": "2024-07-01"
|
526 |
+
},
|
527 |
+
{
|
528 |
+
"key": "gemma-2-9b-it",
|
529 |
+
"Model": "Gemma-2-9B-it",
|
530 |
+
"Release Date": "2024-07-01"
|
531 |
+
},
|
532 |
+
{
|
533 |
+
"key": "glm-4-0520",
|
534 |
+
"Model": "GLM-4-0520",
|
535 |
+
"Release Date": "2024-07-01"
|
536 |
+
},
|
537 |
+
{
|
538 |
+
"key": "nemotron-4-340b-instruct",
|
539 |
+
"Model": "Nemotron-4-340B-Instruct",
|
540 |
+
"Release Date": "2024-07-01"
|
541 |
+
},
|
542 |
+
{
|
543 |
+
"key": "phi-3-medium-4k-instruct",
|
544 |
+
"Model": "Phi-3-Medium-4k-Instruct",
|
545 |
+
"Release Date": "2024-07-01"
|
546 |
+
},
|
547 |
+
{
|
548 |
+
"key": "phi-3-small-8k-instruct",
|
549 |
+
"Model": "Phi-3-Small-8k-Instruct",
|
550 |
+
"Release Date": "2024-07-01"
|
551 |
+
},
|
552 |
+
{
|
553 |
+
"key": "qwen2-72b-instruct",
|
554 |
+
"Model": "Qwen2-72B-Instruct",
|
555 |
+
"Release Date": "2024-07-01"
|
556 |
+
},
|
557 |
+
{
|
558 |
+
"key": "reka-flash-preview-20240611",
|
559 |
+
"Model": "Reka-Flash-Preview-20240611",
|
560 |
+
"Release Date": "2024-07-01"
|
561 |
+
},
|
562 |
+
{
|
563 |
+
"key": "yi-1.5-34b-chat",
|
564 |
+
"Model": "Yi-1.5-34B-Chat",
|
565 |
+
"Release Date": "2024-07-01"
|
566 |
+
},
|
567 |
+
{
|
568 |
+
"key": "yi-large",
|
569 |
+
"Model": "Yi-Large",
|
570 |
+
"Release Date": "2024-07-01"
|
571 |
}
|
572 |
]
|
utils.py
CHANGED
@@ -11,6 +11,7 @@ from huggingface_hub import HfFileSystem, hf_hub_download
|
|
11 |
KEY_TO_CATEGORY_NAME = {
|
12 |
"full": "Overall",
|
13 |
"dedup": "De-duplicate Top Redundant Queries (soon to be default)",
|
|
|
14 |
"coding": "Coding",
|
15 |
"hard_6": "Hard Prompts (Overall)",
|
16 |
"hard_english_6": "Hard Prompts (English)",
|
@@ -18,14 +19,22 @@ KEY_TO_CATEGORY_NAME = {
|
|
18 |
"english": "English",
|
19 |
"chinese": "Chinese",
|
20 |
"french": "French",
|
|
|
|
|
|
|
|
|
|
|
21 |
"no_tie": "Exclude Ties",
|
22 |
"no_short": "Exclude Short Query (< 5 tokens)",
|
23 |
"no_refusal": "Exclude Refusal",
|
24 |
"overall_limit_5_user_vote": "overall_limit_5_user_vote",
|
|
|
25 |
}
|
|
|
26 |
CAT_NAME_TO_EXPLANATION = {
|
27 |
"Overall": "Overall Questions",
|
28 |
"De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
|
|
|
29 |
"Coding": "Coding: whether conversation contains code snippets",
|
30 |
"Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
|
31 |
"Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
|
@@ -33,10 +42,16 @@ CAT_NAME_TO_EXPLANATION = {
|
|
33 |
"English": "English Prompts",
|
34 |
"Chinese": "Chinese Prompts",
|
35 |
"French": "French Prompts",
|
|
|
|
|
|
|
|
|
|
|
36 |
"Exclude Ties": "Exclude Ties and Bothbad",
|
37 |
"Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
|
38 |
"Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
|
39 |
"overall_limit_5_user_vote": "overall_limit_5_user_vote",
|
|
|
40 |
}
|
41 |
|
42 |
PROPRIETARY_LICENSES = ["Proprietary", "Proprietory"]
|
|
|
11 |
KEY_TO_CATEGORY_NAME = {
|
12 |
"full": "Overall",
|
13 |
"dedup": "De-duplicate Top Redundant Queries (soon to be default)",
|
14 |
+
"multiturn": "Multi-Turn",
|
15 |
"coding": "Coding",
|
16 |
"hard_6": "Hard Prompts (Overall)",
|
17 |
"hard_english_6": "Hard Prompts (English)",
|
|
|
19 |
"english": "English",
|
20 |
"chinese": "Chinese",
|
21 |
"french": "French",
|
22 |
+
"german": "German",
|
23 |
+
"spanish": "Spanish",
|
24 |
+
"russian": "Russian",
|
25 |
+
"japanese": "Japanese",
|
26 |
+
"korean": "Korean",
|
27 |
"no_tie": "Exclude Ties",
|
28 |
"no_short": "Exclude Short Query (< 5 tokens)",
|
29 |
"no_refusal": "Exclude Refusal",
|
30 |
"overall_limit_5_user_vote": "overall_limit_5_user_vote",
|
31 |
+
"full_old": "Overall (Deprecated)",
|
32 |
}
|
33 |
+
|
34 |
CAT_NAME_TO_EXPLANATION = {
|
35 |
"Overall": "Overall Questions",
|
36 |
"De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
|
37 |
+
"Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
|
38 |
"Coding": "Coding: whether conversation contains code snippets",
|
39 |
"Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
|
40 |
"Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
|
|
|
42 |
"English": "English Prompts",
|
43 |
"Chinese": "Chinese Prompts",
|
44 |
"French": "French Prompts",
|
45 |
+
"German": "German Prompts",
|
46 |
+
"Spanish": "Spanish Prompts",
|
47 |
+
"Russian": "Russian Prompts",
|
48 |
+
"Japanese": "Japanese Prompts",
|
49 |
+
"Korean": "Korean Prompts",
|
50 |
"Exclude Ties": "Exclude Ties and Bothbad",
|
51 |
"Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
|
52 |
"Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
|
53 |
"overall_limit_5_user_vote": "overall_limit_5_user_vote",
|
54 |
+
"Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
|
55 |
}
|
56 |
|
57 |
PROPRIETARY_LICENSES = ["Proprietary", "Proprietory"]
|