eduagarcia committed
Commit 1dbfacb • 1 Parent(s): 6db2f85

Add proprietary model results v1

app.py CHANGED
@@ -198,6 +198,9 @@ def filter_models(
     if "Flagged" in hide_models:
         filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]
 
+    if "Proprietary" in hide_models:
+        filtered_df = filtered_df[filtered_df[AutoEvalColumn.license.name] != "Proprietary"]
+
     type_emoji = [t[0] for t in type_query]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
@@ -231,7 +234,7 @@ leaderboard_df = filter_models(
     size_query=list(NUMERIC_INTERVALS.keys()),
     precision_query=[i.value.name for i in Precision],
     language_query=[i.value.name for i in Language],
-    hide_models=["Contains a merge/moerge", "Flagged"], # "Private or deleted", "Contains a merge/moerge", "Flagged"
+    hide_models=["Flagged"], # "Private or deleted", "Contains a merge/moerge", "Flagged"
 )
 
 demo = gr.Blocks(css=custom_css)
@@ -268,8 +271,8 @@ with demo:
             with gr.Row():
                 hide_models = gr.CheckboxGroup(
                     label="Hide models",
-                    choices = ["Private or deleted", "Contains a merge/moerge", "Flagged", "MoE"],
-                    value=["Contains a merge/moerge", "Flagged"],
+                    choices = ["Proprietary", "Private or deleted", "Contains a merge/moerge", "Flagged", "MoE"],
+                    value=["Flagged"],
                     interactive=True
                 )
             with gr.Column(min_width=320):
@@ -465,7 +468,7 @@ with demo:
                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                     private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
                     model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                        choices=[t.to_str(" : ") for t in ModelType if t not in [ModelType.Unknown, ModelType.proprietary]],
                         label="Model type",
                         multiselect=False,
                         value=ModelType.FT.to_str(" : "),
proprietary_models_results.json ADDED
@@ -0,0 +1,107 @@
+[
+    {
+        "model": "sabia-2-small",
+        "name": "Sabiá-2 Small",
+        "link": "https://www.maritaca.ai/",
+        "date": "2024-04-12",
+        "status": "full",
+        "main_language": "Portuguese",
+        "result_metrics": {
+            "enem_challenge": 0.7172848145556333,
+            "bluex": 0.5549374130737135,
+            "oab_exams": 0.6364464692482916,
+            "assin2_sts": 0.7053302344881672,
+            "assin2_rte": 0.9121728362223306,
+            "faquad_nli": 0.7575848453041435,
+            "hatebr_offensive": 0.5025338637870607,
+            "portuguese_hate_speech": 0.4650217578860529,
+            "tweetsentbr": 0.533977453070735
+        },
+        "result_metrics_average": 0.6428099652929031,
+        "result_metrics_npm": 0.43960062672137007
+    },
+    {
+        "model": "sabia-2-medium",
+        "name": "Sabiá-2 Medium",
+        "link": "https://www.maritaca.ai/",
+        "date": "2024-04-13",
+        "status": "full",
+        "main_language": "Portuguese",
+        "result_metrics": {
+            "enem_challenge": 0.8180545836249126,
+            "bluex": 0.717663421418637,
+            "oab_exams": 0.7321184510250569,
+            "assin2_sts": 0.7804108376537757,
+            "assin2_rte": 0.923459363368553,
+            "faquad_nli": 0.7657657657657658,
+            "hatebr_offensive": 0.8349989882997386,
+            "portuguese_hate_speech": 0.7379326358571694,
+            "tweetsentbr": 0.7269533040381798
+        },
+        "result_metrics_average": 0.7819285945613098,
+        "result_metrics_npm": 0.6676121786922709
+    },
+    {
+        "model": "gpt-3.5-turbo-0125",
+        "name": "GPT-3.5 Turbo (0125)",
+        "link": "https://www.openai.com/",
+        "date": "2024-03-08",
+        "status": "full",
+        "main_language": "English",
+        "result_metrics": {
+            "enem_challenge": 0.7214835549335199,
+            "bluex": 0.6244784422809457,
+            "oab_exams": 0.5430523917995445,
+            "assin2_sts": 0.7378460201077941,
+            "assin2_rte": 0.8823038414050672,
+            "faquad_nli": 0.746353108609074,
+            "hatebr_offensive": 0.8056205941193919,
+            "portuguese_hate_speech": 0.7363692688971499,
+            "tweetsentbr": 0.7028981330613626
+        },
+        "result_metrics_average": 0.7222672616904278,
+        "result_metrics_npm": 0.5841504766165372
+    },
+    {
+        "model": "claude-3-haiku-20240307",
+        "name": "Claude-3 Haiku (20240307)",
+        "link": "https://www.claude.ai/",
+        "date": "2024-04-13",
+        "status": "full",
+        "main_language": "English",
+        "result_metrics": {
+            "enem_challenge": 0.7718684394681595,
+            "bluex": 0.6662030598052852,
+            "oab_exams": 0.626879271070615,
+            "assin2_sts": 0.7892124744168747,
+            "assin2_rte": 0.9184462138121732,
+            "faquad_nli": 0.6340996599941455,
+            "hatebr_offensive": 0.8023698759439051,
+            "portuguese_hate_speech": 0.7342166269560177,
+            "tweetsentbr": 0.5477486799750156
+        },
+        "result_metrics_average": 0.7212271446046878,
+        "result_metrics_npm": 0.5735261536314672
+    },
+    {
+        "model": "gemini-1.0-pro",
+        "name": "Gemini 1.0 Pro",
+        "link": "https://ai.google.dev/",
+        "date": "2024-03-08",
+        "status": "full",
+        "main_language": "English",
+        "result_metrics": {
+            "enem_challenge": 0.7130860741777467,
+            "bluex": 0.5869262865090403,
+            "oab_exams": 0.4988610478359909,
+            "assin2_sts": 0.7058831239763663,
+            "assin2_rte": 0.8945993304651698,
+            "faquad_nli": 0.7070913567220611,
+            "hatebr_offensive": 0.8086330094493972,
+            "portuguese_hate_speech": 0.699119105113102,
+            "tweetsentbr": 0.6803240476660983
+        },
+        "result_metrics_average": 0.6993914868794414,
+        "result_metrics_npm": 0.551208000273598
+    }
+]
src/display/about.py CHANGED
@@ -19,8 +19,6 @@ if 'readme' in TASK_CONFIG:
     INTRODUCTION_TEXT = f"""
 {GENERAL_DESCRIPTION}
 
-This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">🤗 Open LLM Leaderboard</a> with different benchmarks.
-
 Submit a model for automated evaluation on our GPU cluster on the "Submit" page!
 The leaderboard's backend runs on a [fork](https://github.com/eduagarcia/lm-evaluation-harness-pt) of the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
 
src/display/utils.py CHANGED
@@ -2,6 +2,9 @@ from dataclasses import dataclass, make_dataclass
 from enum import Enum
 from typing import List
 import pandas as pd
+import os
+import json
+from copy import deepcopy
 from yaml import safe_load
 from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS, TASK_CONFIG
 
@@ -87,12 +90,6 @@ baseline_row = {
     AutoEvalColumn.precision.name: "?",
     AutoEvalColumn.merged.name: False,
     #AutoEvalColumn.average.name: 31.0,
-    #AutoEvalColumn.arc.name: 25.0,
-    #AutoEvalColumn.hellaswag.name: 25.0,
-    #AutoEvalColumn.mmlu.name: 25.0,
-    #AutoEvalColumn.truthfulqa.name: 25.0,
-    #AutoEvalColumn.winogrande.name: 50.0,
-    #AutoEvalColumn.gsm8k.name: 0.21,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
@@ -119,8 +116,8 @@ for task in Tasks:
 baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
 baseline_row[AutoEvalColumn.npm.name] = round(sum(npm) / len(npm), 2)
 
-#if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
-baseline_row["🤗 Leaderboard Average"] = None
+if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+    baseline_row["🤗 Leaderboard Average"] = None
 
 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
 # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
@@ -136,12 +133,6 @@ human_baseline_row = {
     AutoEvalColumn.precision.name: "?",
     #AutoEvalColumn.average.name: 92.75,
     AutoEvalColumn.merged.name: False,
-    #AutoEvalColumn.arc.name: 80.0,
-    #AutoEvalColumn.hellaswag.name: 95.0,
-    #AutoEvalColumn.mmlu.name: 89.8,
-    #AutoEvalColumn.truthfulqa.name: 94.0,
-    #AutoEvalColumn.winogrande.name: 94.0,
-    #AutoEvalColumn.gsm8k.name: 100,
     AutoEvalColumn.dummy.name: "human_baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
@@ -168,8 +159,27 @@ for task in Tasks:
     npm.append((res - task.value.baseline) / (100 - task.value.baseline))
 human_baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
 human_baseline_row[AutoEvalColumn.npm.name] = round(sum(npm) / len(npm), 2)
-#if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
-human_baseline_row["🤗 Leaderboard Average"] = None
+if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+    human_baseline_row["🤗 Leaderboard Average"] = None
+
+#Proprietary models
+proprietary_rows = []
+if os.path.exists('proprietary_models_results.json'):
+    with open('proprietary_models_results.json', 'r', encoding='utf8') as f:
+        all_models = json.load(f)
+    for model_data in all_models:
+        model_row = deepcopy(baseline_row)
+        model_row[AutoEvalColumn.model.name] = f'<a target="_blank" href="{model_data["link"]}" style="color: var(--text-color); text-decoration: underline;text-decoration-style: dotted;">{model_data["name"]} [{model_data["date"]}]</a>'
+        model_row[AutoEvalColumn.dummy.name] = model_data['model']
+        model_row[AutoEvalColumn.license.name] = "Proprietary"
+        for task in Tasks:
+            model_row[task.value.col_name] = round(model_data['result_metrics'][task.value.benchmark]*100, 2)
+        model_row[AutoEvalColumn.average.name] = round(model_data['result_metrics_average']*100, 2)
+        model_row[AutoEvalColumn.npm.name] = round(model_data['result_metrics_npm']*100, 2)
+        model_row[AutoEvalColumn.model_type.name] = "proprietary models (closed)"
+        model_row[AutoEvalColumn.model_type_symbol.name] = "🔒"
+        model_row[AutoEvalColumn.main_language.name] = model_data['main_language']
+        proprietary_rows.append(model_row)
 
 @dataclass
 class ModelDetails:
@@ -183,6 +193,7 @@ class ModelType(Enum):
     FT = ModelDetails(name="fine-tuned/fp on domain-specific datasets", symbol="🔶")
     chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬")
     merges = ModelDetails(name="base merges and moerges", symbol="🤝")
+    proprietary = ModelDetails(name="proprietary models (closed)", symbol="🔒")
     Unknown = ModelDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):
@@ -200,6 +211,8 @@ class ModelType(Enum):
             return ModelType.chat
         if "merge" in type or "🤝" in type:
            return ModelType.merges
+        if "proprietary" in type or "🔒" in type:
+            return ModelType.proprietary
         return ModelType.Unknown
 
 class WeightType(Enum):
@@ -240,7 +253,7 @@ class Language(Enum):
         language = language.lower().replace('-', '').replace('_', '')
         if language in ["pt", "ptpt", "ptbr", "portuguese"]:
             return Language.Portuguese
-        if language in ["en", "enus", "engb", "english", ]:
+        if language in ["en", "enus", "engb", "english"]:
             return Language.English
         if language in ["es", "spanish"]:
             return Language.Spanish
src/populate.py CHANGED
@@ -5,7 +5,7 @@ import copy
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_requests_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row, proprietary_rows
 from src.leaderboard.filter_models import filter_models_flags
 from src.leaderboard.read_evals import get_raw_eval_results
 
@@ -14,6 +14,8 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str,
     raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
     all_data_json = [v.to_dict() for v in raw_data]
     all_data_json.append(baseline_row)
+    for proprietary_row in proprietary_rows:
+        all_data_json.append(proprietary_row)
     filter_models_flags(all_data_json)
 
     df = pd.DataFrame.from_records(all_data_json)
src/scripts/update_all_request_files.py CHANGED
@@ -94,7 +94,7 @@ def update_dynamic_files():
     start = time.time()
 
     models = list(API.list_models(
-        filter=ModelFilter(task="text-generation"),
+        # filter=ModelFilter(task="text-generation"),
         full=False,
         cardData=True,
         fetch_config=True,
tasks_config/pt_config.yaml CHANGED
@@ -17,18 +17,14 @@ readme:
   general_description: |
     📝 The 🚀 Open PT LLM Leaderboard aims to provide a benchmark for the evaluation of
    Large Language Models (LLMs) in the Portuguese language across a variety of tasks
-    and datasets.
-    The leaderboard is open to submissions of models from the community
-    and is designed to be a resource for researchers, practitioners, and enthusiasts
-    interested in the development and evaluation of LLMs for the Portuguese language.
-    If you have any questions, suggestions, or would like to contribute to the leaderboard,
-    please feel free to reach out at [@eduagarcia](https://linktr.ee/eduagarcia).
+    and datasets.
   support_description: |
     This leaderboard is made possible by the support of the
     [Center of Excelence in AI (CEIA)](https://ceia.ufg.br/) at the
     [Federal University of Goiás (UFG)](https://international.ufg.br/).
 
-    Add the results to your model card: [🧐 Open Portuguese LLM Leaderboard Results PR Opener](https://huggingface.co/spaces/eduagarcia-temp/portuguese-leaderboard-results-to-modelcard)
+    If you have any questions, suggestions, or would like to contribute to the leaderboard,
+    please feel free to reach out at [@eduagarcia](https://linktr.ee/eduagarcia).
   about_description: |
     The 🚀 Open PT-LLM Leaderboard is a benchmark for the evaluation of
     Large Language Models (LLMs) in the Portuguese language.
@@ -41,8 +37,7 @@ readme:
     [Federal University of Goiás (UFG)](https://international.ufg.br/), this leaderboard
     operates on a backend of Nvidia A100-80G GPUs. Evaluations are subject to
     resource availability, which is not exclusive. Therefore, please be patient if
-    your model is in the queue. If you'd like to support the leaderboard, feel free to
-    reach out.
+    your model is in the queue.
 
     This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">🤗 Open LLM Leaderboard</a> with
     portuguese benchmarks.