eduagarcia committed
Commit 439afd4 • Parent: 7a2430c

Fix model eval links and remove huggingface icon from Leaderboard name

src/display/about.py CHANGED

@@ -4,21 +4,22 @@ from src.envs import REPO_ID, QUEUE_REPO, RESULTS_REPO, PATH_TO_COLLECTION, LEAD
 
 LM_EVAL_URL = "https://github.com/eduagarcia/lm-evaluation-harness-pt"
 
-TITLE = F"""<h1 align="center" id="space-title">🤗 {LEADERBOARD_NAME}</h1>"""
+TITLE = F"""<h1 align="center" id="space-title">📈 {LEADERBOARD_NAME}</h1>"""
 
 INTRODUCTION_TEXT = f"""
-📝 The 🤗 {LEADERBOARD_NAME} aims to track, rank and evaluate open LLMs and chatbots.
+📝 The 📈 {LEADERBOARD_NAME} aims to track, rank and evaluate open LLMs and chatbots.
 
 This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">🤗 Open LLM Leaderboard</a> with different benchmarks.
 
-🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
+Submit a model for automated evaluation on our GPU cluster on the "Submit" page!
 The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
 """
-
+task_count = 0
 TASKS_LIST= ""
 for task in Tasks:
     task = task.value
     TASKS_LIST += f'- <a href="{task.link}" target="_blank"> {task.col_name} </a> ({task.few_shot}-shot) - {task.description}\n'
+    task_count += 1
 
 TASKS_PARAMETERS = ""
 for task in Tasks:
@@ -33,7 +34,7 @@ With the plethora of large language models (LLMs) and chatbots being released we
 
 ## How it works
 
-📈 We evaluate models on 7 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
+📈 We evaluate models on {task_count} key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
 
 {TASKS_LIST}
 
@@ -133,9 +134,9 @@ I have an issue about accessing the leaderboard through the Gradio API
 
 
 EVALUATION_QUEUE_TEXT = f"""
-# Evaluation Queue for the 🤗 {LEADERBOARD_NAME}
+# Evaluation Queue for the 📈 {LEADERBOARD_NAME}
 
-Models added here will be automatically evaluated on the 🤗 cluster.
+Models added here will be automatically evaluated on our evaluation cluster.
 
 ## First steps before submitting a model
 
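Aside on the change above: since Tasks is a standard Python Enum, the benchmark count could also be taken directly from the Enum rather than incremented inside the loop. A minimal sketch, assuming Tasks as defined in src/display/utils.py:

```python
# Sketch only: derive the benchmark count from the Tasks Enum itself,
# so it stays in sync automatically as tasks are added or removed.
from src.display.utils import Tasks

task_count = len(Tasks)  # Enums support len()
TASKS_LIST = "".join(
    f'- <a href="{t.value.link}" target="_blank"> {t.value.col_name} </a> '
    f"({t.value.few_shot}-shot) - {t.value.description}\n"
    for t in Tasks
)
```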
src/display/formatting.py CHANGED

@@ -4,6 +4,7 @@ from datetime import datetime, timezone
 from huggingface_hub import HfApi
 from huggingface_hub.hf_api import ModelInfo
 
+from src.envs import RESULTS_REPO, QUEUE_REPO
 
 API = HfApi()
 
@@ -11,11 +12,25 @@ def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 
-def make_clickable_model(model_name):
+def make_requests_clickable_model(model_name, json_path=None):
     link = f"https://huggingface.co/{model_name}"
 
-    details_model_name = model_name.replace("/", "__")
-    details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
+    #details_model_name = model_name.replace("/", "__")
+    details_link = f"https://huggingface.co/datasets/{QUEUE_REPO}/tree/main"
+    if '/' in model_name:
+        details_link += f"/{model_name.split('/')[0]}"
+    if json_path is not None:
+        details_link = f"https://huggingface.co/datasets/{QUEUE_REPO}/blob/main/{json_path}"
+
+    return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
+
+def make_clickable_model(model_name, json_path=None):
+    link = f"https://huggingface.co/{model_name}"
+
+    #details_model_name = model_name.replace("/", "__")
+    details_link = f"https://huggingface.co/datasets/{RESULTS_REPO}/tree/main/{model_name}"
+    if json_path is not None:
+        details_link = f"https://huggingface.co/datasets/{RESULTS_REPO}/blob/main/{model_name}/{json_path}"
 
     return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
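To make the link fix concrete, here is a minimal sketch of the URL targets the two helpers above now produce. The repo IDs below are illustrative placeholders; the real RESULTS_REPO and QUEUE_REPO values come from src/envs.py.

```python
# Sketch of the URL construction mirrored from the helpers above
# (repo IDs are illustrative placeholders, not the Space's real values).
RESULTS_REPO = "someuser/results"
QUEUE_REPO = "someuser/requests"

def results_details_link(model_name, json_path=None):
    # make_clickable_model: link to the model's folder in the results dataset,
    # or to the exact result file when json_path is known.
    if json_path is not None:
        return f"https://huggingface.co/datasets/{RESULTS_REPO}/blob/main/{model_name}/{json_path}"
    return f"https://huggingface.co/datasets/{RESULTS_REPO}/tree/main/{model_name}"

def requests_details_link(model_name, json_path=None):
    # make_requests_clickable_model: link to the request file when known,
    # otherwise to the submitting namespace's folder in the queue dataset.
    if json_path is not None:
        return f"https://huggingface.co/datasets/{QUEUE_REPO}/blob/main/{json_path}"
    link = f"https://huggingface.co/datasets/{QUEUE_REPO}/tree/main"
    if "/" in model_name:
        link += f"/{model_name.split('/')[0]}"
    return link

print(results_details_link("org/model", "results_2024-01-01.json"))
# https://huggingface.co/datasets/someuser/results/blob/main/org/model/results_2024-01-01.json
print(requests_details_link("org/model"))
# https://huggingface.co/datasets/someuser/requests/tree/main/org
```

Before this commit, the 📑 icon always pointed at the upstream open-llm-leaderboard "details_*" datasets, which belong to the original leaderboard rather than this fork; the rewrite keeps the icon but routes it to the fork's own results and requests repos.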
src/display/utils.py CHANGED

@@ -31,7 +31,7 @@ class Tasks(Enum):
         limit=None,
         task_list=["oab_exams_generate"],
         link="https://huggingface.co/datasets/eduagarcia/oab_exams",
-        description="OAB Exams is a dataset of 1,000 questions from the Brazilian Bar Association's exams."
+        description="OAB Exams is a dataset of 2,000 questions from the Brazilian Bar Association's exams."
     )
     brazilian_court_decisions_judgment = Task(
         benchmark="brazilian_court_decisions_judgment",
src/leaderboard/read_evals.py CHANGED

@@ -35,6 +35,7 @@ class EvalResult:
     flagged: bool = False
     status: str = "FINISHED"
     tags: list = None
+    json_filename: str = None
 
     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -42,6 +43,8 @@ class EvalResult:
         with open(json_filepath) as fp:
             data = json.load(fp)
 
+        json_filename = os.path.basename(json_filepath)
+
         # We manage the legacy config format
         config = data.get("config_general")
 
@@ -100,6 +103,7 @@ class EvalResult:
             results=results,
             precision=precision,
             revision= config.get("model_sha", ""),
+            json_filename=json_filename
         )
 
     def update_with_request_file(self, requests_path):
@@ -137,7 +141,7 @@ class EvalResult:
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.model.name: make_clickable_model(self.full_model, self.json_filename),
             AutoEvalColumn.dummy.name: self.full_model,
             AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
@@ -202,6 +206,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
+        print(model_result_filepath)
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
         eval_result.update_with_request_file(requests_path)
         if eval_result.full_model in dynamic_data:
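The new json_filename field threads the result file's name from init_from_json_file through to to_dict, so the model column's 📑 link can point at the exact result file. A small sketch of that flow, with an illustrative file path:

```python
# Sketch: how a result file's name feeds the details link
# (the path below is illustrative, not a real result file).
import os

json_filepath = "results/some-org/some-model/results_2024-01-01T00-00-00.json"
json_filename = os.path.basename(json_filepath)  # "results_2024-01-01T00-00-00.json"

# to_dict() then builds the model column with
#   make_clickable_model(self.full_model, self.json_filename)
# which resolves to
#   https://huggingface.co/datasets/{RESULTS_REPO}/blob/main/some-org/some-model/results_2024-01-01T00-00-00.json
```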
src/populate.py CHANGED

@@ -3,7 +3,7 @@ import os
 
 import pandas as pd
 
-from src.display.formatting import has_no_nan_values, make_clickable_model
+from src.display.formatting import has_no_nan_values, make_requests_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
 from src.leaderboard.filter_models import filter_models
 from src.leaderboard.read_evals import get_raw_eval_results
@@ -35,7 +35,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             with open(file_path) as fp:
                 data = json.load(fp)
 
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+            data[EvalQueueColumn.model.name] = make_requests_clickable_model(data["model"], entry)
             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
 
             all_evals.append(data)
@@ -47,7 +47,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                 with open(file_path) as fp:
                     data = json.load(fp)
 
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+                data[EvalQueueColumn.model.name] = make_requests_clickable_model(data["model"], os.path.join(entry, sub_entry))
                 data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                 all_evals.append(data)
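For queued requests, the request file's path relative to the local eval-queue folder (entry, or entry/sub_entry for namespaced models) is passed as json_path, so each row links to its own request JSON. A sketch of the resulting mapping, assuming an illustrative QUEUE_REPO value and file name:

```python
# Sketch: mapping a local request file path to its link in the queue dataset
# (QUEUE_REPO value and file name are illustrative placeholders).
import os

QUEUE_REPO = "someuser/requests"
entry, sub_entry = "some-org", "some-model_eval_request.json"

json_path = os.path.join(entry, sub_entry)
details_link = f"https://huggingface.co/datasets/{QUEUE_REPO}/blob/main/{json_path}"
# -> https://huggingface.co/datasets/someuser/requests/blob/main/some-org/some-model_eval_request.json
```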