Terry Zhuo
committed on
Commit · de4c2d6
1 Parent(s): 2970f67
update
Browse files
- app.py +1 -1
- src/display/utils.py +1 -1
app.py CHANGED
@@ -396,7 +396,7 @@ with main_block as demo:
 - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
 - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
 - `Average` is the average of `Complete` and `Instruct` when both are available.
-- `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the
+- `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
 - `#Act Params (B)` is the number of activated model parameters during inference.
 - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
 - For more details check the 📝 About section.
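For reference, the Pass@1 mentioned in the `Complete`/`Instruct` bullets is the k=1 case of the standard unbiased pass@k estimator from Chen et al.'s Codex paper; a minimal sketch follows. The "calibrated" adjustment is BigCodeBench-specific post-processing of model outputs before execution and is not modeled here.

```python
# Unbiased pass@k estimator (Chen et al., "Evaluating Large Language
# Models Trained on Code"); Pass@1 is the k=1 case. The "calibrated"
# variant's output repair step is benchmark-specific and not shown.
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """n = samples generated per task, c = samples that pass, k = budget."""
    if n - c < k:
        return 1.0  # every size-k draw must contain a passing sample
    return 1.0 - comb(n - c, k) / comb(n, k)

# e.g. 10 samples per task, 3 passing: pass@1 = 1 - C(7,1)/C(10,1) = 0.3
assert abs(pass_at_k(10, 3, 1) - 0.3) < 1e-9
```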
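The rewritten line 399 pins down how the `Elo Rating` is computed: a maximum-likelihood Elo fit over Complete + Instruct outcomes, anchored at 1000 and bootstrapped 500 times. Below is a minimal sketch of that scheme, assuming a `battles` table of pairwise model outcomes with columns `model_a`, `model_b`, and `winner`; the column names, the scikit-learn fit, and resampling at the battle level (the leaderboard text says the bootstrap is task-level) are illustrative assumptions, not the leaderboard's actual code.

```python
# Sketch: maximum-likelihood (Bradley-Terry style) Elo, bootstrapped.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

def mle_elo(battles: pd.DataFrame, scale: float = 400.0,
            base: float = 10.0, init_rating: float = 1000.0) -> pd.Series:
    """Fit Elo scores by logistic regression on pairwise outcomes.

    `battles` needs columns model_a, model_b, winner ("model_a"/"model_b");
    ties are ignored in this sketch.
    """
    models = pd.unique(battles[["model_a", "model_b"]].values.ravel())
    idx = {m: i for i, m in enumerate(models)}
    X = np.zeros((len(battles), len(models)))
    for row, (a, b) in enumerate(zip(battles["model_a"], battles["model_b"])):
        X[row, idx[a]] = np.log(base)   # P(a wins) = sigmoid of
        X[row, idx[b]] = -np.log(base)  # (elo_a - elo_b) * ln(base) / scale
    y = (battles["winner"] == "model_a").astype(int).to_numpy()
    coefs = LogisticRegression(fit_intercept=False).fit(X, y).coef_[0]
    return pd.Series(scale * coefs + init_rating, index=models)

def bootstrap_elo(battles: pd.DataFrame, rounds: int = 500,
                  seed: int = 42) -> pd.Series:
    """Median rating over `rounds` bootstrap resamples, as in the 500-round
    bootstrap the leaderboard text describes."""
    rng = np.random.default_rng(seed)
    samples = [
        mle_elo(battles.sample(frac=1.0, replace=True,
                               random_state=int(rng.integers(2**31))))
        for _ in range(rounds)
    ]
    return pd.DataFrame(samples).median().sort_values(ascending=False)
```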
src/display/utils.py CHANGED
@@ -105,7 +105,7 @@ auto_eval_column_dict.append(["size", ColumnContent, ColumnContent(column_map["s
 auto_eval_column_dict.append(["lazy", ColumnContent, ColumnContent(column_map["lazy"], "bool", False, True)])
 auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent(column_map["moe"], "str", False, True)])
 auto_eval_column_dict.append(["openness", ColumnContent, ColumnContent(column_map["openness"], "str", False, True)])
-auto_eval_column_dict.append(["direct_complete", ColumnContent, ColumnContent(column_map["direct_complete"], "bool", False)])
+# auto_eval_column_dict.append(["direct_complete", ColumnContent, ColumnContent(column_map["direct_complete"], "bool", False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
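For context on the edited file: each `[attribute_name, annotation, default]` triple appended to `auto_eval_column_dict` becomes one field of the dynamically generated frozen dataclass `AutoEvalColumn`, so commenting out the `direct_complete` append removes that column everywhere the class's fields are enumerated. Here is a self-contained sketch of the pattern, with a stand-in `ColumnContent` whose field names are assumed for illustration, not taken from the real module:

```python
from dataclasses import dataclass, make_dataclass

# Stand-in for the real ColumnContent in src/display/utils.py; the actual
# field names and order may differ. frozen=True keeps instances hashable,
# which makes them legal dataclass field defaults.
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False

auto_eval_column_dict = [
    # [attribute name, annotation, default value] -- the 3-element shape
    # that make_dataclass accepts for each field
    ["lazy", ColumnContent, ColumnContent("Lazy", "bool", False, True)],
    ["moe", ColumnContent, ColumnContent("MoE", "str", False, True)],
]

AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict,
                                frozen=True)

print(AutoEvalColumn().moe.name)  # -> "MoE"
```

Because the defaults are hashable frozen-dataclass instances, the triples can be fed to `make_dataclass` directly, which is what lets the module add or drop leaderboard columns with a single `append` line.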