xuanricheng committed
Commit b6b9254 • Parent: 85128b4

update result format
README.md CHANGED

@@ -1,5 +1,5 @@
 ---
-title: Open LLM Leaderboard
+title: Chinese Open LLM Leaderboard
 emoji: 🏆
 colorFrom: green
 colorTo: indigo
@@ -8,7 +8,7 @@ sdk_version: 4.9.0
 app_file: app.py
 pinned: true
 license: apache-2.0
-duplicated_from: HuggingFaceH4/open_llm_leaderboard
+# duplicated_from: HuggingFaceH4/open_llm_leaderboard
 fullWidth: true
 space_ci: # See https://huggingface.co/spaces/Wauplin/gradio-space-ci
 private: true
src/display/about.py CHANGED

@@ -1,10 +1,10 @@
 from src.display.utils import ModelType
 
-TITLE = """<h1 align="center" id="space-title">🤗 FlagEval Chinese LLM Leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">🤗 Open Chinese LLM Leaderboard</h1>"""
 
 INTRODUCTION_TEXT = """
-📝 The 🤗 FlagEval Chinese LLM Leaderboard aims to track, rank and evaluate open LLMs and chatbots.
-[FlagEval](https://flageval.baai.ac.cn/)
+📝 The 🤗 Open Chinese LLM Leaderboard aims to track, rank and evaluate open LLMs and chatbots.
+This leaderboard is a subset of [FlagEval](https://flageval.baai.ac.cn/).
 
 🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
 The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
@@ -69,8 +69,8 @@ To get more information about quantization, see:
 - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
 
 ## Useful links
-- [Community resources](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174)
-- [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03)
+- [Community resources](https://huggingface.co/spaces/BAAI/open_cn_llm_leaderboard/discussions/174)
+- [Collection of best models](https://huggingface.co/collections/open-cn-llm-leaderboard/chinese-llm-leaderboard-best-models-65b0d4511dbd85fd0c3ad9cd)
 """
 
 FAQ_TEXT = """
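These constants are plain strings consumed by the Space's UI. A minimal sketch of how they are typically wired into a Gradio app; the `demo` layout below is an assumption in the usual leaderboard-Space style, not code from this commit:

```python
import gradio as gr

from src.display.about import TITLE, INTRODUCTION_TEXT

# Hypothetical layout: render the HTML title and the markdown intro
# the way leaderboard Spaces usually do.
with gr.Blocks() as demo:
    gr.HTML(TITLE)                  # raw HTML heading
    gr.Markdown(INTRODUCTION_TEXT)  # markdown, links rendered

if __name__ == "__main__":
    demo.launch()
```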
src/display/utils.py CHANGED

@@ -14,12 +14,13 @@ class Task:
     col_name: str
 
 class Tasks(Enum):
-    arc = Task("arc:challenge", "acc_norm", "ARC")
-    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
-    mmlu = Task("hendrycksTest", "acc", "MMLU")
-    truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
-    winogrande = Task("winogrande", "acc", "Winogrande")
-    gsm8k = Task("gsm8k", "acc", "GSM8K")
+    arc = Task("arc:challenge", "acc_norm", "C-ARC")
+    hellaswag = Task("hellaswag", "acc_norm", "C-HellaSwag")
+    truthfulqa = Task("truthfulqa:mc", "mc2", "C-TruthfulQA")
+    winogrande = Task("winogrande", "acc", "C-Winogrande")
+    gsm8k = Task("gsm8k", "acc", "C-GSM8K")
+    c_sem = Task("c-sem-v2", "acc", "C-SEM")
+    mmlu = Task("cmmlu", "acc", "C-MMLU")
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -82,6 +83,7 @@ baseline_row = {
     AutoEvalColumn.truthfulqa.name: 25.0,
     AutoEvalColumn.winogrande.name: 50.0,
     AutoEvalColumn.gsm8k.name: 0.21,
+    AutoEvalColumn.c_sem.name: 25.0,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
@@ -107,6 +109,7 @@ human_baseline_row = {
     AutoEvalColumn.truthfulqa.name: 94.0,
     AutoEvalColumn.winogrande.name: 94.0,
     AutoEvalColumn.gsm8k.name: 100,
+    AutoEvalColumn.c_sem.name: 100,
     AutoEvalColumn.dummy.name: "human_baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
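The hunk header shows these entries live in a `Task` dataclass wrapped in a `Tasks` enum. A self-contained sketch of that structure and of how the rest of the code iterates it; the field comments are inferred from usage, not taken from this commit:

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # key under which scores appear in the results files
    metric: str     # which metric to read ("acc", "acc_norm", "mc2")
    col_name: str   # user-facing column name in the leaderboard table


class Tasks(Enum):
    # Two of the new Chinese-benchmark entries from this commit:
    c_sem = Task("c-sem-v2", "acc", "C-SEM")
    mmlu = Task("cmmlu", "acc", "C-MMLU")


# Downstream code iterates the enum and reaches the dataclass via .value:
for task in Tasks:
    print(task.value.benchmark, "->", task.value.col_name)
```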
src/leaderboard/read_evals.py CHANGED

@@ -87,7 +87,7 @@ class EvalResult:
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-            mean_acc = np.mean(accs) * 100.0
+            mean_acc = np.mean(accs)
             results[task.benchmark] = mean_acc
 
         return self(
@@ -149,7 +149,7 @@ class EvalResult:
         }
 
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results.get(task.value.benchmark, 0)
+            data_dict[task.value.col_name] = self.results.get(task.value.benchmark, 0)
 
         return data_dict
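Two behavioral changes here: dropping `* 100.0` implies the parsed scores now arrive already on their final scale (consistent with the "update result format" commit message), and switching from `[]` indexing to `dict.get` with a default keeps older result files, which predate the newly added benchmarks, from raising `KeyError`. A small illustration with hypothetical data:

```python
# Hypothetical results parsed from an eval file written before
# the "c-sem-v2" task existed on this leaderboard.
results = {"arc:challenge": 52.1, "cmmlu": 47.3}

# Direct indexing would crash the whole table build:
try:
    _ = results["c-sem-v2"]
except KeyError:
    print("KeyError: c-sem-v2 missing from older result files")

# .get() degrades gracefully, rendering 0 for the missing column:
print(results.get("c-sem-v2", 0))  # -> 0
```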
 
src/scripts/create_request_file.py CHANGED

@@ -11,7 +11,7 @@ from src.submission.check_validity import get_model_size
 from src.display.utils import ModelType, WeightType
 
 EVAL_REQUESTS_PATH = "eval-queue"
-QUEUE_REPO = "open_cn_llm_leaderboard/requests"
+QUEUE_REPO = "open-cn-llm-leaderboard/requests"
 
 precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
 model_types = [e.name for e in ModelType]
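The fix swaps underscores for hyphens so `QUEUE_REPO` matches the actual Hub organization name. A sketch of how such a constant is usually consumed; the `snapshot_download` call is the common pattern in these leaderboard Spaces, assumed here rather than taken from this file:

```python
from huggingface_hub import snapshot_download

EVAL_REQUESTS_PATH = "eval-queue"
QUEUE_REPO = "open-cn-llm-leaderboard/requests"  # must be a valid "<org>/<name>" repo id

# Pull the request-queue dataset locally; a wrong org name
# (e.g. with underscores) would 404 here.
snapshot_download(
    repo_id=QUEUE_REPO,
    repo_type="dataset",
    local_dir=EVAL_REQUESTS_PATH,
)
```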