Paul Hager committed
Commit 5c1f78d · 1 Parent(s): adad63e

Adjusted to CDM org and text. Removed submission.

Files changed (3)
  1. app.py +108 -98
  2. src/about.py +59 -30
  3. src/envs.py +8 -6
app.py CHANGED
@@ -22,7 +22,7 @@ from src.display.utils import (
     ModelType,
     fields,
     WeightType,
-    Precision
+    Precision,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
@@ -32,18 +32,29 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
+
 ### Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=RESULTS_REPO,
+        local_dir=EVAL_RESULTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
@@ -51,11 +62,12 @@ except Exception:
 
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+# (
+#     finished_eval_queue_df,
+#     running_eval_queue_df,
+#     pending_eval_queue_df,
+# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
 
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
@@ -80,9 +92,7 @@ def init_leaderboard(dataframe):
                 max=150,
                 label="Select the number of parameters (B)",
             ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
+            ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
@@ -101,92 +111,92 @@ with demo:
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-                submit_button = gr.Button("Submit Eval")
-                submission_result = gr.Markdown()
-                submit_button.click(
-                    add_new_eval,
-                    [
-                        model_name_textbox,
-                        base_model_name_textbox,
-                        revision_name_textbox,
-                        precision,
-                        weight_type,
-                        model_type,
-                    ],
-                    submission_result,
-                )
+        # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+        #     with gr.Column():
+        #         with gr.Row():
+        #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+        #
+        #         with gr.Column():
+        #             with gr.Accordion(
+        #                 f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     finished_eval_table = gr.components.Dataframe(
+        #                         value=finished_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #             with gr.Accordion(
+        #                 f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     running_eval_table = gr.components.Dataframe(
+        #                         value=running_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #
+        #             with gr.Accordion(
+        #                 f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     pending_eval_table = gr.components.Dataframe(
+        #                         value=pending_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #     with gr.Row():
+        #         gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+        #
+        #     with gr.Row():
+        #         with gr.Column():
+        #             model_name_textbox = gr.Textbox(label="Model name")
+        #             revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+        #             model_type = gr.Dropdown(
+        #                 choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+        #                 label="Model type",
+        #                 multiselect=False,
+        #                 value=None,
+        #                 interactive=True,
+        #             )
+        #
+        #         with gr.Column():
+        #             precision = gr.Dropdown(
+        #                 choices=[i.value.name for i in Precision if i != Precision.Unknown],
+        #                 label="Precision",
+        #                 multiselect=False,
+        #                 value="float16",
+        #                 interactive=True,
+        #             )
+        #             weight_type = gr.Dropdown(
+        #                 choices=[i.value.name for i in WeightType],
+        #                 label="Weights type",
+        #                 multiselect=False,
+        #                 value="Original",
+        #                 interactive=True,
+        #             )
+        #             base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+        #
+        #         submit_button = gr.Button("Submit Eval")
+        #         submission_result = gr.Markdown()
+        #         submit_button.click(
+        #             add_new_eval,
+        #             [
+        #                 model_name_textbox,
+        #                 base_model_name_textbox,
+        #                 revision_name_textbox,
+        #                 precision,
+        #                 weight_type,
+        #                 model_type,
+        #             ],
+        #             submission_result,
+        #         )
 
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
@@ -201,4 +211,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
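
The startup block above downloads the results dataset and calls `restart_space()` on any failure, so a bad token or repo id puts the Space into a restart loop. The same call can be run locally before deploying to catch that early. A minimal sketch, assuming the `MIMIC-CDM/results` dataset from `src/envs.py` is the target and a valid token is exported as `HF_TOKEN`:

```python
import os

from huggingface_hub import snapshot_download

# Same download app.py performs on startup, run outside the Space so a
# failure surfaces as an exception instead of triggering restart_space().
local_path = snapshot_download(
    repo_id="MIMIC-CDM/results",      # RESULTS_REPO in src/envs.py
    repo_type="dataset",
    local_dir="eval-results",         # EVAL_RESULTS_PATH resolves here when HF_HOME is unset
    etag_timeout=30,
    token=os.environ.get("HF_TOKEN"),
)
print(f"Results snapshot downloaded to {local_path}")
```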
src/about.py CHANGED
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from enum import Enum
 
+
 @dataclass
 class Task:
     benchmark: str
@@ -11,21 +12,30 @@ class Task:
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("MIMIC CDM Appendicitis", "acc", "CDM App")
+    task1 = Task("MIMIC CDM Cholecystitis", "acc", "CDM Cholec")
+    task2 = Task("MIMIC CDM Diverticulitis", "acc", "CDM Divert")
+    task3 = Task("MIMIC CDM Pancreatitis", "acc", "CDM Pancr")
+    task4 = Task("MIMIC CDM Mean", "acc", "CDM Mean")
 
-NUM_FEWSHOT = 0 # Change with your few shot
-# ---------------------------------------------------
+    task5 = Task("MIMIC CDM FI Appendicitis", "acc", "CDM FI App")
+    task6 = Task("MIMIC CDM FI Cholecystitis", "acc", "CDM FI Cholec")
+    task7 = Task("MIMIC CDM FI Diverticulitis", "acc", "CDM FI Divert")
+    task8 = Task("MIMIC CDM FI Pancreatitis", "acc", "CDM FI Pancr")
+    task9 = Task("MIMIC CDM FI Mean", "acc", "CDM FI Mean")
 
 
+NUM_FEWSHOT = 0 # Change with your few shot
+# ---------------------------------------------------
+
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">MIMIC Clinical Decision Making</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+This leaderboard shows current scores of models on the MIMIC Clinical Decision Making (MIMIC-CDM) and MIMIC Clinical Decision Making Full Information (MIMIC-CDM-FI) datasets. The dataset can be found [here](https://physionet.org/content/mimic-iv-ext-cdm/). The code used to run the models can be found [here](https://github.com/paulhager/MIMIC-Clinical-Decision-Making-Framework).
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
@@ -35,37 +45,56 @@ LLM_BENCHMARKS_TEXT = f"""
 ## Reproducibility
 To reproduce our results, here is the commands you can run:
 
-"""
+For MIMIC-CDM, navigate to the MIMIC-Clinical-Decision-Making-Framework repository and execute:
+
+```
+python run.py pathology=appendicitis model=<YOUR_MODEL_NAME>
+python run.py pathology=cholecystitis model=<YOUR_MODEL_NAME>
+python run.py pathology=pancreatitis model=<YOUR_MODEL_NAME>
+python run.py pathology=diverticulitis model=<YOUR_MODEL_NAME>
+```
 
-EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
+For MIMIC-CDM-FI, navigate to the MIMIC-Clinical-Decision-Making-Framework repository and execute:
 
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
 ```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+python run_full_info.py pathology=appendicitis model=<YOUR_MODEL_NAME>
+python run_full_info.py pathology=cholecystitis model=<YOUR_MODEL_NAME>
+python run_full_info.py pathology=pancreatitis model=<YOUR_MODEL_NAME>
+python run_full_info.py pathology=diverticulitis model=<YOUR_MODEL_NAME>
 
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
+```
+
+"""
 
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+# EVALUATION_QUEUE_TEXT = """
+# ## Some good practices before submitting a model
 
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+# ### 1) Make sure you can load your model and tokenizer using AutoClasses:
+# ```python
+# from transformers import AutoConfig, AutoModel, AutoTokenizer
+# config = AutoConfig.from_pretrained("your model name", revision=revision)
+# model = AutoModel.from_pretrained("your model name", revision=revision)
+# tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+# ```
+# If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
 
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
+# Note: make sure your model is public!
+# Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
 
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
-"""
+# ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
+# It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+#
+# ### 3) Make sure your model has an open license!
+# This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+#
+# ### 4) Fill up your model card
+# When we add extra information about models to the leaderboard, it will be automatically taken from the model card
+#
+# ## In case of model failure
+# If your model is displayed in the `FAILED` category, its execution stopped.
+# Make sure you have followed the above steps first.
+# If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+# """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
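The new `Tasks` entries follow the leaderboard template's convention of (benchmark key, metric key, display name), so the scores shown in the table come from results files whose keys match these strings. A minimal sketch of one such file, assuming the template's usual `config`/`results` layout as parsed by `src/populate.py` (the parsing code is not part of this diff, and the model name and score values below are placeholders):

```python
import json

# Hypothetical results file for one evaluated model. The benchmark strings must
# match Task.benchmark exactly ("MIMIC CDM Appendicitis", ...) and the metric
# key must match Task.metric ("acc") for the leaderboard to pick them up.
entry = {
    "config": {
        "model_name": "my-org/my-model",   # placeholder model id
        "model_dtype": "torch.float16",
        "model_sha": "main",
    },
    "results": {
        "MIMIC CDM Appendicitis": {"acc": 0.61},
        "MIMIC CDM Cholecystitis": {"acc": 0.58},
        # ... one block per Tasks entry, including the "MIMIC CDM FI ..." variants
    },
}

with open("results_my-org_my-model.json", "w") as f:
    json.dump(entry, f, indent=2)
```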
src/envs.py CHANGED
@@ -4,22 +4,24 @@ from huggingface_hub import HfApi
 
 # Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
+TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = (
+    "MIMIC-CDM"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
+)
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
+# QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
+# EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+# EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
 API = HfApi(token=TOKEN)
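
With the request/queue repos commented out, the Space only needs access to `MIMIC-CDM/results` through the token in `HF_TOKEN`. A quick local check that the token and repo id resolve, written as a minimal sketch under the assumption that `HF_TOKEN` is exported in the environment:

```python
import os

from huggingface_hub import HfApi

# Mirrors API = HfApi(token=TOKEN) in src/envs.py; listing the dataset's files
# fails fast if the token is missing/expired or the repo id is wrong.
api = HfApi(token=os.environ.get("HF_TOKEN"))
files = api.list_repo_files("MIMIC-CDM/results", repo_type="dataset")
print(f"{len(files)} files visible in the results dataset")
```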