Clémentine committed on
Commit
943f952
·
1 Parent(s): 314f91a

update read

Browse files
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
- title: Open LLM Leaderboard
3
- emoji: 🏆
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
@@ -12,4 +12,25 @@ license: apache-2.0
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
 
15
- Most of the variables to change for a default leaderboard are in env (replace the path for your leaderboard) and src/display/about.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Demo Leaderboard
3
+ emoji: 🥇
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
 
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
 
15
+ Most of the variables to change for a default leaderboard are in env (replace the path for your leaderboard) and src/display/about.
16
+
17
+ Results files should have the following format:
18
+ ```
19
+ {
20
+ "config": {
21
+ "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
22
+ "model_name": "path of the model on the hub: org/model",
23
+ "model_sha": "revision on the hub"
24
+ },
25
+ "results": {
26
+ "task_name": {
27
+ "metric_name": score
28
+ },
29
+ "task_name2": {
30
+ "metric_name": score
31
+ }
32
+ }
33
+ }
34
+ ```
35
+
36
+ Request files are created automatically by this tool.
src/display/about.py CHANGED
@@ -10,15 +10,17 @@ class Task:
10
 
11
  # Init: to update with your specific keys
12
  class Tasks(Enum):
13
- task0 = Task("Key in the harness", "metric in the harness", "Display name 1")
14
- task1 = Task("Key in the harness", "metric in the harness", "Display name 2")
 
15
 
16
 
17
  # Your leaderboard name
18
- TITLE = """<h1 align="center" id="space-title">Leaderboard</h1>"""
19
 
20
  # What does your leaderboard evaluate?
21
  INTRODUCTION_TEXT = """
 
22
  """
23
 
24
  # Which evaluations are you running? how can people reproduce what you have?
 
10
 
11
  # Init: to update with your specific keys
12
  class Tasks(Enum):
13
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
14
+ task0 = Task("task_name1", "metric_name", "First task")
15
+ task1 = Task("task_name2", "metric_name", "Second task")
16
 
17
 
18
  # Your leaderboard name
19
+ TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
20
 
21
  # What does your leaderboard evaluate?
22
  INTRODUCTION_TEXT = """
23
+ Intro text
24
  """
25
 
26
  # Which evaluations are you running? how can people reproduce what you have?
src/leaderboard/read_evals.py CHANGED
@@ -5,8 +5,6 @@ import os
5
  from dataclasses import dataclass
6
 
7
  import dateutil
8
- from datetime import datetime
9
- from transformers import AutoConfig
10
  import numpy as np
11
 
12
  from src.display.formatting import make_clickable_model
@@ -16,7 +14,6 @@ from src.submission.check_validity import is_model_on_hub
16
 
17
  @dataclass
18
  class EvalResult:
19
- # Also see src.display.utils.AutoEvalColumn for what will be displayed.
20
  eval_name: str # org_model_precision (uid)
21
  full_model: str # org/model (path on hub)
22
  org: str
@@ -26,7 +23,7 @@ class EvalResult:
26
  precision: Precision = Precision.Unknown
27
  model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
28
  weight_type: WeightType = WeightType.Original # Original or Adapter
29
- architecture: str = "Unknown" # From config file
30
  license: str = "?"
31
  likes: int = 0
32
  num_params: int = 0
@@ -39,8 +36,7 @@ class EvalResult:
39
  with open(json_filepath) as fp:
40
  data = json.load(fp)
41
 
42
- # We manage the legacy config format
43
- config = data.get("config", data.get("config_general", None))
44
 
45
  # Precision
46
  precision = Precision.from_str(config.get("model_dtype"))
@@ -59,7 +55,7 @@ class EvalResult:
59
  result_key = f"{org}_{model}_{precision.value.name}"
60
  full_model = "/".join(org_and_model)
61
 
62
- still_on_hub, error, model_config = is_model_on_hub(
63
  full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
64
  )
65
  architecture = "?"
@@ -73,8 +69,8 @@ class EvalResult:
73
  for task in Tasks:
74
  task = task.value
75
 
76
- # We average all scores of a given metric
77
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
78
  if accs.size == 0 or any([acc is None for acc in accs]):
79
  continue
80
 
 
5
  from dataclasses import dataclass
6
 
7
  import dateutil
 
 
8
  import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
 
14
 
15
  @dataclass
16
  class EvalResult:
 
17
  eval_name: str # org_model_precision (uid)
18
  full_model: str # org/model (path on hub)
19
  org: str
 
23
  precision: Precision = Precision.Unknown
24
  model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
25
  weight_type: WeightType = WeightType.Original # Original or Adapter
26
+ architecture: str = "Unknown"
27
  license: str = "?"
28
  likes: int = 0
29
  num_params: int = 0
 
36
  with open(json_filepath) as fp:
37
  data = json.load(fp)
38
 
39
+ config = data.get("config")
 
40
 
41
  # Precision
42
  precision = Precision.from_str(config.get("model_dtype"))
 
55
  result_key = f"{org}_{model}_{precision.value.name}"
56
  full_model = "/".join(org_and_model)
57
 
58
+ still_on_hub, _, model_config = is_model_on_hub(
59
  full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
60
  )
61
  architecture = "?"
 
69
  for task in Tasks:
70
  task = task.value
71
 
72
+ # We average all scores of a given metric (not all metrics are present in all files)
73
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
74
  if accs.size == 0 or any([acc is None for acc in accs]):
75
  continue
76