Terry Zhuo committed
Commit de4c2d6 · 1 Parent(s): 2970f67
Files changed (2):
  1. app.py (+1, -1)
  2. src/display/utils.py (+1, -1)
app.py CHANGED

```diff
@@ -396,7 +396,7 @@ with main_block as demo:
   - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
   - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
   - `Average` is the average of `Complete` and `Instruct` when both are available.
-  - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
+  - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
   - `#Act Params (B)` is the number of activated model parameters during inference.
   - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
   - For more details check the 📝 About section.
```
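The reworded bullet describes the rating pipeline only at a high level, so here is a minimal sketch of what a task-level "Bootstrap of Maximum Likelihood Elo" typically looks like (Bradley-Terry ratings fit by logistic regression, resampled 500 times from a base rating of 1000). This is not the leaderboard's actual code: the `battles` frame, its `model_a`/`model_b`/`winner_a` columns, and both helper functions are hypothetical.

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

def fit_mle_elo(battles: pd.DataFrame, scale: float = 400.0, base: float = 1000.0) -> pd.Series:
    """Maximum-likelihood (Bradley-Terry) Elo fit over pairwise task outcomes."""
    models = pd.unique(battles[["model_a", "model_b"]].values.ravel())
    idx = {m: i for i, m in enumerate(models)}
    # Design matrix: +1 for the first model of a pair, -1 for the second.
    X = np.zeros((len(battles), len(models)))
    for row, (a, b) in enumerate(zip(battles["model_a"], battles["model_b"])):
        X[row, idx[a]], X[row, idx[b]] = 1.0, -1.0
    y = battles["winner_a"].to_numpy()  # 1 if model_a solved the task and model_b did not
    lr = LogisticRegression(fit_intercept=False)
    lr.fit(X, y)
    # Map log-odds coefficients onto the Elo scale, centred at the base rating.
    elo = scale / np.log(10) * lr.coef_[0]
    return pd.Series(elo - elo.mean() + base, index=models)

def bootstrap_elo(battles: pd.DataFrame, rounds: int = 500, seed: int = 0) -> pd.Series:
    """Resample task outcomes with replacement `rounds` times; report the median rating."""
    rng = np.random.default_rng(seed)
    fits = [
        fit_mle_elo(battles.sample(frac=1.0, replace=True,
                                   random_state=int(rng.integers(1 << 31))))
        for _ in range(rounds)
    ]
    return pd.concat(fits, axis=1).median(axis=1).sort_values(ascending=False)
```

Under this reading, the new "both `Complete` and `Instruct`" restriction amounts to filtering `battles` down to qualifying models before fitting.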
src/display/utils.py CHANGED

```diff
@@ -105,7 +105,7 @@ auto_eval_column_dict.append(["size", ColumnContent, ColumnContent(column_map["s
 auto_eval_column_dict.append(["lazy", ColumnContent, ColumnContent(column_map["lazy"], "bool", False, True)])
 auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent(column_map["moe"], "str", False, True)])
 auto_eval_column_dict.append(["openness", ColumnContent, ColumnContent(column_map["openness"], "str", False, True)])
-auto_eval_column_dict.append(["direct_complete", ColumnContent, ColumnContent(column_map["direct_complete"], "bool", False)])
+# auto_eval_column_dict.append(["direct_complete", ColumnContent, ColumnContent(column_map["direct_complete"], "bool", False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
```
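For context on the pattern being edited: each `["attr", ColumnContent, default]` triple appended to `auto_eval_column_dict` becomes one field of the dynamically built frozen dataclass, which is why dropping the `direct_complete` column needs nothing more than commenting out its `append`. A runnable sketch with simplified stand-ins (this `ColumnContent` and `column_map` are illustrative, not the leaderboard's real definitions):

```python
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str               # header shown in the leaderboard table
    type: str               # column type, e.g. "str" or "bool"
    displayed_by_default: bool
    hidden: bool = False

column_map = {"moe": "MoE", "openness": "Openness"}

auto_eval_column_dict = [
    ["moe", ColumnContent, ColumnContent(column_map["moe"], "str", False, True)],
    ["openness", ColumnContent, ColumnContent(column_map["openness"], "str", False, True)],
]

# Each ["attr", type, default] triple becomes one field of a frozen dataclass,
# so removing (or commenting out) an append removes the column everywhere.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.moe.name)  # -> "MoE"
```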