Clémentine committed
Commit 12cea14
Parent: 99b25b8

FT: precision and adapter models

app.py CHANGED
@@ -28,7 +28,6 @@ PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
 PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
 
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
-ADD_PLOTS = False
 
 EVAL_REQUESTS_PATH = "eval-queue"
 EVAL_RESULTS_PATH = "eval-results"
@@ -56,8 +55,8 @@ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default an
 TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
 if not IS_PUBLIC:
-    COLS.insert(2, AutoEvalColumn.is_8bit.name)
-    TYPES.insert(2, AutoEvalColumn.is_8bit.type)
+    COLS.insert(2, AutoEvalColumn.precision.name)
+    TYPES.insert(2, AutoEvalColumn.precision.type)
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
@@ -177,25 +176,27 @@ def add_new_eval(
     model: str,
     base_model: str,
     revision: str,
-    is_8_bit_eval: bool,
+    precision: str,
     private: bool,
-    is_delta_weight: bool,
+    weight_type: str,
 ):
+    precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 
     # check the model actually exists before adding the eval
     if revision == "":
         revision = "main"
 
-    if is_delta_weight:
+    if weight_type in ["Delta", "Adapter"]:
         base_model_on_hub, error = is_model_on_hub(base_model, revision)
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
+
 
     model_on_hub, error = is_model_on_hub(model, revision)
     if not model_on_hub:
         return styled_error(f'Model "{model}" {error}')
-
+
     print("adding new eval")
 
     eval_entry = {
@@ -203,8 +204,8 @@ def add_new_eval(
         "base_model": base_model,
         "revision": revision,
         "private": private,
-        "8bit_eval": is_8_bit_eval,
-        "is_delta_weight": is_delta_weight,
+        "precision": precision,
+        "weight_type": weight_type,
         "status": "PENDING",
         "submitted_time": current_time,
     }
@@ -217,7 +218,7 @@ def add_new_eval(
 
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
 
     # Check for duplicate submission
     if out_path.split("eval-queue/")[1].lower() in requested_models:
@@ -381,17 +382,29 @@ with demo:
                 revision_name_textbox = gr.Textbox(
                     label="revision", placeholder="main"
                 )
+                private = gr.Checkbox(
+                    False, label="Private", visible=not IS_PUBLIC
+                )
 
             with gr.Column():
-                is_8bit_toggle = gr.Checkbox(
-                    False, label="8 bit eval", visible=not IS_PUBLIC
+                precision = gr.Dropdown(
+                    choices=["float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)"],
+                    label="Precision",
+                    multiselect=False,
+                    value="float16",
+                    max_choices=1,
+                    interactive=True,
                 )
-                private = gr.Checkbox(
-                    False, label="Private", visible=not IS_PUBLIC
+                weight_type = gr.Dropdown(
+                    choices=["Original", "Delta", "Adapter"],
+                    label="Weights type",
+                    multiselect=False,
+                    value="Original",
+                    max_choices=1,
+                    interactive=True,
                 )
-                is_delta_weight = gr.Checkbox(False, label="Delta weights")
                 base_model_name_textbox = gr.Textbox(
-                    label="base model (for delta)"
+                    label="Base model (for delta or adapter weights)"
                )
 
                submit_button = gr.Button("Submit Eval")
@@ -402,9 +415,9 @@ with demo:
            model_name_textbox,
            base_model_name_textbox,
            revision_name_textbox,
-           is_8bit_toggle,
+           precision,
            private,
-           is_delta_weight,
        ],
        submission_result,
    )
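For context on the new submission flow: the Precision dropdown values carry a descriptive suffix, and add_new_eval keeps only the short tag before the first space when building the request file name. A minimal sketch (not part of the commit, with a hypothetical org/model path) of the resulting values:

choices = ["float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)"]
for choice in choices:
    precision = choice.split(" ")[0]  # "float16", "bfloat16", "8bit", "4bit"
    # hypothetical request path for a non-private, original-weights submission
    print(f"eval-queue/some-org/some-model_eval_request_False_{precision}_Original.json")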
src/assets/hardcoded_evals.py CHANGED
@@ -3,7 +3,7 @@ from src.utils_display import AutoEvalColumn, model_hyperlink
 gpt4_values = {
     AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
     AutoEvalColumn.revision.name: "tech report",
-    AutoEvalColumn.is_8bit.name: None,
+    AutoEvalColumn.precision.name: None,
     AutoEvalColumn.average.name: 84.3,
     AutoEvalColumn.arc.name: 96.3,
     AutoEvalColumn.hellaswag.name: 95.3,
@@ -15,7 +15,7 @@ gpt4_values = {
 gpt35_values = {
     AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt3.5"),
     AutoEvalColumn.revision.name: "tech report",
-    AutoEvalColumn.is_8bit.name: None,
+    AutoEvalColumn.precision.name: None,
     AutoEvalColumn.average.name: 71.9,
     AutoEvalColumn.arc.name: 85.2,
     AutoEvalColumn.hellaswag.name: 85.5,
@@ -27,7 +27,7 @@ gpt35_values = {
 baseline = {
     AutoEvalColumn.model.name: "<p>Baseline</p>",
     AutoEvalColumn.revision.name: "N/A",
-    AutoEvalColumn.is_8bit.name: None,
+    AutoEvalColumn.precision.name: None,
     AutoEvalColumn.average.name: 25.0,
     AutoEvalColumn.arc.name: 25.0,
     AutoEvalColumn.hellaswag.name: 25.0,
src/assets/text_content.py CHANGED
@@ -122,12 +122,16 @@ The tasks and few shots parameters are:
 - TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
 - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (`acc` of `all`)
 
+### Quantization
+To get more information about quantization, see:
+- 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
+- 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
+
 # In case of model failure
 If your model is displayed in the `FAILED` category, its execution stopped.
 Make sure you have followed the above steps first.
 If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 
-
 """
 
 EVALUATION_QUEUE_TEXT = f"""
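The new quantization pointers correspond to the bitsandbytes integrations in transformers. As a rough, hedged sketch (assuming transformers and bitsandbytes are installed, and using a placeholder model id), an 8-bit or 4-bit model would be loaded along these lines:

from transformers import AutoModelForCausalLM

# 8-bit weights (LLM.int8); requires the bitsandbytes package
model_8bit = AutoModelForCausalLM.from_pretrained(
    "some-org/some-model",  # placeholder model id
    load_in_8bit=True,
    device_map="auto",
)

# 4-bit weights (FP4/NF4, as used by QLoRA)
model_4bit = AutoModelForCausalLM.from_pretrained(
    "some-org/some-model",
    load_in_4bit=True,
    device_map="auto",
)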
src/auto_leaderboard/get_model_metadata.py CHANGED
@@ -36,7 +36,7 @@ def get_model_license(model_info):
 def get_model_likes(model_info):
     return model_info.likes
 
-size_pattern = re.compile(r"\d+(b|m)")
+size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
 
 def get_model_size(model_name, model_info):
     # In billions
@@ -46,7 +46,7 @@ def get_model_size(model_name, model_info):
     try:
         size_match = re.search(size_pattern, model_name.lower())
         size = size_match.group(0)
-        return round(int(size[:-1]) if size[-1] == "b" else int(size[:-1]) / 1e3, 3)
+        return round(float(size[:-1]) if size[-1] == "b" else float(size[:-1]) / 1e3, 3)
     except AttributeError:
         return None
 
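The widened size_pattern now also matches fractional sizes, and the switch from int to float keeps them. A quick self-contained check (hypothetical model names) of what get_model_size would return:

import re

size_pattern = re.compile(r"(\d\.)?\d+(b|m)")

for name in ["llama-7b", "pythia-1.4b", "opt-350m"]:  # hypothetical names
    size = re.search(size_pattern, name.lower()).group(0)
    billions = round(float(size[:-1]) if size[-1] == "b" else float(size[:-1]) / 1e3, 3)
    print(name, "->", billions)  # 7.0, 1.4, 0.35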
src/auto_leaderboard/load_results.py CHANGED
@@ -24,7 +24,7 @@ class EvalResult:
     model: str
     revision: str
     results: dict
-    is_8bit: bool = False
+    precision: str = "16bit"
 
     def to_dict(self):
         if self.org is not None:
@@ -34,7 +34,7 @@ class EvalResult:
         data_dict = {}
 
         data_dict["eval_name"] = self.eval_name # not a column, just a save name
-        data_dict[AutoEvalColumn.is_8bit.name] = self.is_8bit
+        data_dict[AutoEvalColumn.precision.name] = self.precision
         data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
         data_dict[AutoEvalColumn.dummy.name] = base_model
         data_dict[AutoEvalColumn.revision.name] = self.revision
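The precision field defaults to "16bit", so result files written before this change still fill the new Precision column. A simplified stand-in (not the real EvalResult class, which has more fields) showing the fallback:

from dataclasses import dataclass

@dataclass
class FakeEvalResult:  # stand-in with only the field relevant here
    model: str
    precision: str = "16bit"

print(FakeEvalResult(model="some-org/some-model").precision)  # "16bit"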
src/auto_leaderboard/model_metadata_type.py CHANGED
@@ -161,3 +161,12 @@ TYPE_METADATA: Dict[str, ModelType] = {
 def get_model_type(leaderboard_data: List[dict]):
     for model_data in leaderboard_data:
         model_data["Type"] = TYPE_METADATA.get(model_data["model_name_for_query"], "N/A")
+        if model_data["Type"] == "N/A":
+            if any([i in model_data["model_name_for_query"] for i in ["finetuned", "-ft-"]]):
+                model_data["Type"] = ModelType.SFT
+            elif any([i in model_data["model_name_for_query"] for i in ["pretrained"]]):
+                model_data["Type"] = ModelType.PT
+            elif any([i in model_data["model_name_for_query"] for i in ["-rl-", "-rlhf-"]]):
+                model_data["Type"] = ModelType.RL
+
+
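The new fallback guesses a model type from substrings in its name when the model is missing from TYPE_METADATA. A standalone sketch of the same heuristic (using plain strings instead of the ModelType enum defined earlier in the file, and hypothetical model names):

def guess_model_type(name: str) -> str:
    # mirrors the substring checks above, in the same priority order
    if any(i in name for i in ["finetuned", "-ft-"]):
        return "SFT"
    if "pretrained" in name:
        return "PT"
    if any(i in name for i in ["-rl-", "-rlhf-"]):
        return "RL"
    return "N/A"

for name in ["some-org/llama-ft-dolly", "some-org/gpt-pretrained", "some-org/model-rlhf-v1"]:
    print(name, "->", guess_model_type(name))  # SFT, PT, RL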
src/utils_display.py CHANGED
@@ -20,8 +20,8 @@ class AutoEvalColumn: # Auto evals column
     hellaswag = ColumnContent("HellaSwag ⬆️", "number", True)
     mmlu = ColumnContent("MMLU ⬆️", "number", True)
     truthfulqa = ColumnContent("TruthfulQA (MC) ⬆️", "number", True)
-    model_type = ColumnContent("Type", "bool", False)
-    is_8bit = ColumnContent("8bit", "bool", False, True)
+    model_type = ColumnContent("Type", "str", False)
+    precision = ColumnContent("Precision", "str", False, True)
     license = ColumnContent("Hub License", "str", False)
     params = ColumnContent("#Params (B)", "number", False)
     likes = ColumnContent("Hub ❤️", "number", False)
@@ -42,8 +42,8 @@ class EvalQueueColumn: # Queue column
     model = ColumnContent("model", "markdown", True)
     revision = ColumnContent("revision", "str", True)
     private = ColumnContent("private", "bool", True)
-    is_8bit = ColumnContent("8bit_eval", "bool", True)
-    has_delta_weight = ColumnContent("is_delta_weight", "bool", True)
+    precision = ColumnContent("precision", "bool", True)
+    weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
 LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]