ClΓ©mentine committed
Commit 699e8ff β€’ 1 Parent(s): 6254b87

Adding flagging system, removing changelog

app.py CHANGED
@@ -82,7 +82,7 @@ def get_leaderboard_df():
     print("Pulling evaluation results for the leaderboard.")
     eval_results_private.git_pull()
 
-    all_data = get_eval_results_dicts(IS_PUBLIC)
+    all_data = get_eval_results_dicts()
 
     if not IS_PUBLIC:
         all_data.append(gpt4_values)
@@ -341,7 +341,7 @@ with demo:
                 elem_id="filter-columns"
             )
             leaderboard_table = gr.components.Dataframe(
-                value=leaderboard_df[[AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name] + shown_columns.value+ [AutoEvalColumn.dummy.name]],
+                value=leaderboard_df[[AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name] + shown_columns.value + [AutoEvalColumn.dummy.name]],
                 headers=[AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name] + shown_columns.value + [AutoEvalColumn.dummy.name],
                 datatype=TYPES,
                 max_rows=None,
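For readers unfamiliar with the column selection in the `value=` argument above: indexing a DataFrame with a list of column names returns a new DataFrame restricted to those columns, in that order. A minimal, self-contained sketch under illustrative assumptions (the column names and row below are stand-ins, not the leaderboard's real `AutoEvalColumn` names or data):

```python
import pandas as pd

# Hypothetical stand-ins for AutoEvalColumn.*.name and shown_columns.value
model_type_symbol, model, dummy = "T", "Model", "model_name_for_query"
shown_columns = ["Average ⬆️", "ARC"]

leaderboard_df = pd.DataFrame(
    [{"T": "πŸ”Ά", "Model": "org/model-a", "Average ⬆️": 61.2, "ARC": 59.0, "model_name_for_query": "org/model-a"}]
)

# Same pattern as the Dataframe `value=` argument: fixed leading columns,
# then the user-selected columns, then the hidden dummy column.
value = leaderboard_df[[model_type_symbol, model] + shown_columns + [dummy]]
print(value.columns.tolist())
```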
src/assets/css_html_js.py CHANGED
@@ -1,11 +1,4 @@
 custom_css = """
-#changelog-text {
-    font-size: 16px !important;
-}
-
-#changelog-text h2 {
-    font-size: 18px !important;
-}
 
 .markdown-text {
     font-size: 16px !important;
src/assets/text_content.py CHANGED
@@ -1,61 +1,5 @@
 from ..auto_leaderboard.model_metadata_type import ModelType
 
-CHANGELOG_TEXT = f"""
-## [2023-06-19]
-- Added model type column
-- Hid revision and 8bit columns since all models are the same atm
-
-## [2023-06-16]
-- Refactored code base
-- Added new columns: number of parameters, hub likes, license
-
-## [2023-06-13]
-- Adjust description for TruthfulQA
-
-## [2023-06-12]
-- Add Human & GPT-4 Evaluations
-
-## [2023-06-05]
-- Increase concurrent thread count to 40
-- Search models on ENTER
-
-## [2023-06-02]
-- Add a typeahead search bar
-- Use webhooks to automatically spawn a new Space when someone opens a PR
-- Start recording `submitted_time` for eval requests
-- Limit AutoEvalColumn max-width
-
-## [2023-05-30]
-- Add a citation button
-- Simplify Gradio layout
-
-## [2023-05-29]
-- Auto-restart every hour for the latest results
-- Sync with the internal version (minor style changes)
-
-## [2023-05-24]
-- Add a baseline that has 25.0 for all values
-- Add CHANGELOG
-
-## [2023-05-23]
-- Fix a CSS issue that made the leaderboard hard to read in dark mode
-
-## [2023-05-22]
-- Display a success/error message after submitting evaluation requests
-- Reject duplicate submission
-- Do not display results that have incomplete results
-- Display different queues for jobs that are RUNNING, PENDING, FINISHED status
-
-## [2023-05-15]
-- Fix a typo: from "TruthQA" to "QA"
-
-## [2023-05-10]
-- Fix a bug that prevented auto-refresh
-
-## [2023-05-10]
-- Release the leaderboard to public
-"""
-
 TITLE = """<h1 align="center" id="space-title">πŸ€— Open LLM Leaderboard</h1>"""
 
 INTRODUCTION_TEXT = f"""
@@ -81,6 +25,9 @@ With the plethora of large language models (LLMs) and chatbots being released we
 {ModelType.RL.to_str(" : ")} model
 If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
 
+πŸ΄β€β˜ οΈ indicates that this model has been flagged by the community, and should probably be ignored! Clicking the icon will redirect you to the discussion about the model.
+(For ex, the model was trained on the evaluation data, and is therefore cheating on the leaderboard.)
+
 ## How it works
 
 πŸ“ˆ We evaluate models on 4 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
src/auto_leaderboard/get_model_metadata.py CHANGED
@@ -1,10 +1,14 @@
 import re
 import os
+import glob
+import json
+import os
 from typing import List
 from tqdm import tqdm
 
-from src.utils_display import AutoEvalColumn
-from src.auto_leaderboard.model_metadata_type import get_model_type
+from src.utils_display import AutoEvalColumn, model_hyperlink
+from src.auto_leaderboard.model_metadata_type import ModelType, model_type_from_str, MODEL_TYPE_METADATA
+from src.auto_leaderboard.model_metadata_flags import FLAGGED_MODELS
 
 from huggingface_hub import HfApi
 import huggingface_hub
@@ -52,6 +56,60 @@ def get_model_size(model_name, model_info):
     return None
 
 
+def get_model_type(leaderboard_data: List[dict]):
+    for model_data in leaderboard_data:
+        request_files = os.path.join("eval-queue", model_data["model_name_for_query"] + "_eval_request_*" + ".json")
+        request_files = glob.glob(request_files)
+
+        # Select correct request file (precision)
+        request_file = ""
+        if len(request_files) == 1:
+            request_file = request_files[0]
+        elif len(request_files) > 1:
+            request_files = sorted(request_files, reverse=True)
+            for tmp_request_file in request_files:
+                with open(tmp_request_file, "r") as f:
+                    req_content = json.load(f)
+                    if req_content["status"] == "FINISHED" and req_content["precision"] == model_data["Precision"].split(".")[-1]:
+                        request_file = tmp_request_file
+
+        if request_file == "":
+            model_data[AutoEvalColumn.model_type.name] = ""
+            model_data[AutoEvalColumn.model_type_symbol.name] = ""
+            continue
+
+        try:
+            with open(request_file, "r") as f:
+                request = json.load(f)
+            is_delta = request["weight_type"] != "Original"
+        except Exception:
+            is_delta = False
+
+        try:
+            with open(request_file, "r") as f:
+                request = json.load(f)
+            model_type = model_type_from_str(request["model_type"])
+            model_data[AutoEvalColumn.model_type.name] = model_type.value.name
+            model_data[AutoEvalColumn.model_type_symbol.name] = model_type.value.symbol  # + ("πŸ”Ί" if is_delta else "")
+        except KeyError:
+            if model_data["model_name_for_query"] in MODEL_TYPE_METADATA:
+                model_data[AutoEvalColumn.model_type.name] = MODEL_TYPE_METADATA[model_data["model_name_for_query"]].value.name
+                model_data[AutoEvalColumn.model_type_symbol.name] = MODEL_TYPE_METADATA[model_data["model_name_for_query"]].value.symbol  # + ("πŸ”Ί" if is_delta else "")
+            else:
+                model_data[AutoEvalColumn.model_type.name] = ModelType.Unknown.value.name
+                model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.Unknown.value.symbol
+
+def flag_models(leaderboard_data: List[dict]):
+    flag_symbol = "πŸ’€"
+    for model_data in leaderboard_data:
+        if model_data["model_name_for_query"] in FLAGGED_MODELS:
+            issue_num = FLAGGED_MODELS[model_data["model_name_for_query"]].split("/")[-1]
+            issue_link = model_hyperlink(FLAGGED_MODELS[model_data["model_name_for_query"]], f"See discussion #{issue_num}")
+
+            model_data[AutoEvalColumn.model_type_symbol.name] = flag_symbol
+            model_data[AutoEvalColumn.model.name] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
+
 def apply_metadata(leaderboard_data: List[dict]):
     get_model_type(leaderboard_data)
     get_model_infos_from_hub(leaderboard_data)
+    flag_models(leaderboard_data)
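For context on `get_model_type` above: each leaderboard row is matched against a request file found via `eval-queue/<model>_eval_request_*.json`, and a file is only kept when its `status` is `FINISHED` and its `precision` matches the row. Below is a minimal sketch of that selection against a hypothetical request payload; only the keys actually read by the code (`status`, `precision`, `weight_type`, `model_type`) are shown, and all values and file names are illustrative.

```python
import glob
import json
import os
import tempfile

# Hypothetical request file content; values are illustrative only.
request = {
    "status": "FINISHED",
    "precision": "float16",
    "weight_type": "Original",
    "model_type": "fine-tuned",  # illustrative value that would go through model_type_from_str
}

with tempfile.TemporaryDirectory() as queue_dir:
    path = os.path.join(queue_dir, "example-model_eval_request_float16.json")
    with open(path, "w") as f:
        json.dump(request, f)

    # Same shape of lookup as get_model_type: glob the queue, keep the finished
    # request whose precision matches the leaderboard row's precision.
    row_precision = "float16"
    selected = ""
    for candidate in sorted(glob.glob(os.path.join(queue_dir, "*_eval_request_*.json")), reverse=True):
        with open(candidate) as f:
            content = json.load(f)
        if content["status"] == "FINISHED" and content["precision"] == row_precision:
            selected = candidate

    print(bool(selected), json.load(open(selected))["model_type"])
```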
src/auto_leaderboard/load_results.py CHANGED
@@ -102,7 +102,7 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
     return result_key, eval_results
 
 
-def get_eval_results(is_public) -> List[EvalResult]:
+def get_eval_results() -> List[EvalResult]:
     json_filepaths = []
 
     for root, dir, files in os.walk("eval-results"):
@@ -135,7 +135,7 @@ def get_eval_results(is_public) -> List[EvalResult]:
     return eval_results
 
 
-def get_eval_results_dicts(is_public=True) -> List[Dict]:
-    eval_results = get_eval_results(is_public)
+def get_eval_results_dicts() -> List[Dict]:
+    eval_results = get_eval_results()
 
     return [e.to_dict() for e in eval_results]
src/auto_leaderboard/model_metadata_flags.py ADDED
@@ -0,0 +1,5 @@
+# Model name to forum discussion id
+FLAGGED_MODELS = {
+    "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
+    "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207"
+}
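A minimal sketch of how this mapping is consumed by the `flag_models` function added in `get_model_metadata.py` above. The leaderboard row and the column names `T` and `Model` are hypothetical stand-ins for `AutoEvalColumn.*.name`, and `model_hyperlink` is reduced here to a plain HTML anchor:

```python
FLAGGED_MODELS = {
    "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
}

def model_hyperlink(link: str, label: str) -> str:
    # Simplified stand-in for src.utils_display.model_hyperlink.
    return f'<a target="_blank" href="{link}">{label}</a>'

# Hypothetical leaderboard row, reduced to the fields flag_models touches.
row = {"model_name_for_query": "Voicelab/trurl-2-13b", "T": "πŸ”Ά", "Model": "Voicelab/trurl-2-13b"}

if row["model_name_for_query"] in FLAGGED_MODELS:
    url = FLAGGED_MODELS[row["model_name_for_query"]]
    issue_num = url.split("/")[-1]  # "202", the discussion id
    row["T"] = "πŸ’€"                  # flag symbol shown in the type column
    row["Model"] = f'{row["Model"]} has been flagged! {model_hyperlink(url, f"See discussion #{issue_num}")}'

print(row["T"], row["Model"])
```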
src/auto_leaderboard/model_metadata_type.py CHANGED
@@ -1,11 +1,7 @@
 from dataclasses import dataclass
 from enum import Enum
-import glob
-import json
-import os
-from typing import Dict, List
+from typing import Dict
 
-from ..utils_display import AutoEvalColumn
 
 @dataclass
 class ModelInfo:
@@ -24,7 +20,7 @@ class ModelType(Enum):
     return f"{self.value.symbol}{separator}{self.value.name}"
 
 
-TYPE_METADATA: Dict[str, ModelType] = {
+MODEL_TYPE_METADATA: Dict[str, ModelType] = {
     'notstoic/PygmalionCoT-7b': ModelType.IFT,
     'aisquared/dlite-v1-355m': ModelType.IFT,
     'aisquared/dlite-v1-1_5b': ModelType.IFT,
@@ -553,45 +549,3 @@ def model_type_from_str(type):
         return ModelType.IFT
     return ModelType.Unknown
 
-
-def get_model_type(leaderboard_data: List[dict]):
-    for model_data in leaderboard_data:
-        request_files = os.path.join("eval-queue", model_data["model_name_for_query"] + "_eval_request_*" + ".json")
-        request_files = glob.glob(request_files)
-
-        request_file = ""
-        if len(request_files) == 1:
-            request_file = request_files[0]
-        elif len(request_files) > 1:
-            request_files = sorted(request_files, reverse=True)
-            for tmp_request_file in request_files:
-                with open(tmp_request_file, "r") as f:
-                    req_content = json.load(f)
-                    if req_content["status"] == "FINISHED" and req_content["precision"] == model_data["Precision"].split(".")[-1]:
-                        request_file = tmp_request_file
-
-        if request_file == "":
-            model_data[AutoEvalColumn.model_type.name] = ""
-            model_data[AutoEvalColumn.model_type_symbol.name] = ""
-            continue
-
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            is_delta = request["weight_type"] != "Original"
-        except Exception:
-            is_delta = False
-
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            model_type = model_type_from_str(request["model_type"])
-            model_data[AutoEvalColumn.model_type.name] = model_type.value.name
-            model_data[AutoEvalColumn.model_type_symbol.name] = model_type.value.symbol  # + ("πŸ”Ί" if is_delta else "")
-        except KeyError:
-            if model_data["model_name_for_query"] in TYPE_METADATA:
-                model_data[AutoEvalColumn.model_type.name] = TYPE_METADATA[model_data["model_name_for_query"]].value.name
-                model_data[AutoEvalColumn.model_type_symbol.name] = TYPE_METADATA[model_data["model_name_for_query"]].value.symbol  # + ("πŸ”Ί" if is_delta else "")
-            else:
-                model_data[AutoEvalColumn.model_type.name] = ModelType.Unknown.value.name
-                model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.Unknown.value.symbol
src/utils_display.py CHANGED
@@ -89,20 +89,22 @@ def make_clickable_model(model_name):
         link = KOALA_LINK
     elif model_name == "oasst-12b":
         link = OASST_LINK
-    #else:
-    #    link = MODEL_PAGE
+
     details_model_name = model_name.replace('/', '__')
     details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
-    print(f"details_link: {details_link}")
-    try:
-        check_path = list(API.list_files_info(repo_id=f"open-llm-leaderboard/details_{details_model_name}",
-                                              paths="README.md",
-                                              repo_type="dataset"))
-        print(f"check_path: {check_path}")
-    except Exception as err:
-        # No details repo for this model
-        print(f"No details repo for this model: {err}")
-        return model_hyperlink(link, model_name)
+
+    if not bool(os.getenv("DEBUG", "False")):
+        # We only add these checks when not debugging, as they are extremely slow
+        print(f"details_link: {details_link}")
+        try:
+            check_path = list(API.list_files_info(repo_id=f"open-llm-leaderboard/details_{details_model_name}",
+                                                  paths="README.md",
+                                                  repo_type="dataset"))
+            print(f"check_path: {check_path}")
+        except Exception as err:
+            # No details repo for this model
+            print(f"No details repo for this model: {err}")
+            return model_hyperlink(link, model_name)
 
     return model_hyperlink(link, model_name) + ' ' + model_hyperlink(details_link, "πŸ“‘")
 
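One note on the `DEBUG` gate in the hunk above: `os.getenv` returns a string, and `bool()` of any non-empty string (including `"False"`) is `True`, so `not bool(os.getenv("DEBUG", "False"))` only becomes true when `DEBUG` is set to an empty string. The snippet below is a sketch of the usual explicit string comparison for a boolean environment variable; it is an illustration, not what this commit ships.

```python
import os

# Sketch: parse a string environment variable into a real boolean flag.
DEBUG = os.getenv("DEBUG", "False").lower() in ("1", "true", "yes")

if not DEBUG:
    # Slow Hub lookups would be gated here.
    pass
```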