andrewrreed HF staff committed on
Commit
ec01232
1 Parent(s): 6bd3956

fix due to vision leaderboard addition upstream

Browse files
Files changed (3) hide show
  1. app.py +3 -0
  2. release_date_mapping.json +75 -0
  3. utils.py +15 -0
app.py CHANGED
@@ -48,6 +48,9 @@ latest_elo_file_local = download_latest_data_from_space(
48
  with open(latest_elo_file_local, "rb") as fin:
49
  elo_results = pickle.load(fin)
50
 
 
 
 
51
  arena_dfs = {}
52
  for k in KEY_TO_CATEGORY_NAME.keys():
53
  if k not in elo_results:
 
48
  with open(latest_elo_file_local, "rb") as fin:
49
  elo_results = pickle.load(fin)
50
 
51
+ # TO-DO: need to also include vision
52
+ elo_results = elo_results["text"]
53
+
54
  arena_dfs = {}
55
  for k in KEY_TO_CATEGORY_NAME.keys():
56
  if k not in elo_results:
release_date_mapping.json CHANGED
@@ -493,5 +493,80 @@
493
  "key": "yi-large-preview",
494
  "Model": "Yi-Large-preview",
495
  "Release Date": "2024-05-23"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
496
  }
497
  ]
 
493
  "key": "yi-large-preview",
494
  "Model": "Yi-Large-preview",
495
  "Release Date": "2024-05-23"
496
+ },
497
+ {
498
+ "key": "claude-3-5-sonnet-20240620",
499
+ "Model": "Claude 3.5 Sonnet",
500
+ "Release Date": "2024-07-01"
501
+ },
502
+ {
503
+ "key": "deepseek-coder-v2",
504
+ "Model": "DeepSeek-Coder-V2-Instruct",
505
+ "Release Date": "2024-07-01"
506
+ },
507
+ {
508
+ "key": "gemini-1.5-flash-api-0514",
509
+ "Model": "Gemini-1.5-Flash-API-0514",
510
+ "Release Date": "2024-07-01"
511
+ },
512
+ {
513
+ "key": "gemini-1.5-pro-api-0514",
514
+ "Model": "Gemini-1.5-Pro-API-0514",
515
+ "Release Date": "2024-07-01"
516
+ },
517
+ {
518
+ "key": "gemini-advanced-0514",
519
+ "Model": "Gemini-Advanced-0514",
520
+ "Release Date": "2024-07-01"
521
+ },
522
+ {
523
+ "key": "gemma-2-27b-it",
524
+ "Model": "Gemma-2-27B-it",
525
+ "Release Date": "2024-07-01"
526
+ },
527
+ {
528
+ "key": "gemma-2-9b-it",
529
+ "Model": "Gemma-2-9B-it",
530
+ "Release Date": "2024-07-01"
531
+ },
532
+ {
533
+ "key": "glm-4-0520",
534
+ "Model": "GLM-4-0520",
535
+ "Release Date": "2024-07-01"
536
+ },
537
+ {
538
+ "key": "nemotron-4-340b-instruct",
539
+ "Model": "Nemotron-4-340B-Instruct",
540
+ "Release Date": "2024-07-01"
541
+ },
542
+ {
543
+ "key": "phi-3-medium-4k-instruct",
544
+ "Model": "Phi-3-Medium-4k-Instruct",
545
+ "Release Date": "2024-07-01"
546
+ },
547
+ {
548
+ "key": "phi-3-small-8k-instruct",
549
+ "Model": "Phi-3-Small-8k-Instruct",
550
+ "Release Date": "2024-07-01"
551
+ },
552
+ {
553
+ "key": "qwen2-72b-instruct",
554
+ "Model": "Qwen2-72B-Instruct",
555
+ "Release Date": "2024-07-01"
556
+ },
557
+ {
558
+ "key": "reka-flash-preview-20240611",
559
+ "Model": "Reka-Flash-Preview-20240611",
560
+ "Release Date": "2024-07-01"
561
+ },
562
+ {
563
+ "key": "yi-1.5-34b-chat",
564
+ "Model": "Yi-1.5-34B-Chat",
565
+ "Release Date": "2024-07-01"
566
+ },
567
+ {
568
+ "key": "yi-large",
569
+ "Model": "Yi-Large",
570
+ "Release Date": "2024-07-01"
571
  }
572
  ]
utils.py CHANGED
@@ -11,6 +11,7 @@ from huggingface_hub import HfFileSystem, hf_hub_download
11
  KEY_TO_CATEGORY_NAME = {
12
  "full": "Overall",
13
  "dedup": "De-duplicate Top Redundant Queries (soon to be default)",
 
14
  "coding": "Coding",
15
  "hard_6": "Hard Prompts (Overall)",
16
  "hard_english_6": "Hard Prompts (English)",
@@ -18,14 +19,22 @@ KEY_TO_CATEGORY_NAME = {
18
  "english": "English",
19
  "chinese": "Chinese",
20
  "french": "French",
 
 
 
 
 
21
  "no_tie": "Exclude Ties",
22
  "no_short": "Exclude Short Query (< 5 tokens)",
23
  "no_refusal": "Exclude Refusal",
24
  "overall_limit_5_user_vote": "overall_limit_5_user_vote",
 
25
  }
 
26
  CAT_NAME_TO_EXPLANATION = {
27
  "Overall": "Overall Questions",
28
  "De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
 
29
  "Coding": "Coding: whether conversation contains code snippets",
30
  "Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
31
  "Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
@@ -33,10 +42,16 @@ CAT_NAME_TO_EXPLANATION = {
33
  "English": "English Prompts",
34
  "Chinese": "Chinese Prompts",
35
  "French": "French Prompts",
 
 
 
 
 
36
  "Exclude Ties": "Exclude Ties and Bothbad",
37
  "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
38
  "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
39
  "overall_limit_5_user_vote": "overall_limit_5_user_vote",
 
40
  }
41
 
42
  PROPRIETARY_LICENSES = ["Proprietary", "Proprietory"]
 
11
  KEY_TO_CATEGORY_NAME = {
12
  "full": "Overall",
13
  "dedup": "De-duplicate Top Redundant Queries (soon to be default)",
14
+ "multiturn": "Multi-Turn",
15
  "coding": "Coding",
16
  "hard_6": "Hard Prompts (Overall)",
17
  "hard_english_6": "Hard Prompts (English)",
 
19
  "english": "English",
20
  "chinese": "Chinese",
21
  "french": "French",
22
+ "german": "German",
23
+ "spanish": "Spanish",
24
+ "russian": "Russian",
25
+ "japanese": "Japanese",
26
+ "korean": "Korean",
27
  "no_tie": "Exclude Ties",
28
  "no_short": "Exclude Short Query (< 5 tokens)",
29
  "no_refusal": "Exclude Refusal",
30
  "overall_limit_5_user_vote": "overall_limit_5_user_vote",
31
+ "full_old": "Overall (Deprecated)",
32
  }
33
+
34
  CAT_NAME_TO_EXPLANATION = {
35
  "Overall": "Overall Questions",
36
  "De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
37
+ "Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
38
  "Coding": "Coding: whether conversation contains code snippets",
39
  "Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
40
  "Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
 
42
  "English": "English Prompts",
43
  "Chinese": "Chinese Prompts",
44
  "French": "French Prompts",
45
+ "German": "German Prompts",
46
+ "Spanish": "Spanish Prompts",
47
+ "Russian": "Russian Prompts",
48
+ "Japanese": "Japanese Prompts",
49
+ "Korean": "Korean Prompts",
50
  "Exclude Ties": "Exclude Ties and Bothbad",
51
  "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
52
  "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
53
  "overall_limit_5_user_vote": "overall_limit_5_user_vote",
54
+ "Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
55
  }
56
 
57
  PROPRIETARY_LICENSES = ["Proprietary", "Proprietory"]