pminervini committed on
Commit
90dff75
·
1 Parent(s): 669da77
Files changed (3)
  1. src/backend/envs.py +3 -2
  2. src/display/utils.py +0 -12
  3. src/tools/plots.py +0 -19
src/backend/envs.py CHANGED
@@ -20,8 +20,9 @@ class Tasks(Enum):
20
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
21
  # task0 = Task("anli_r1", "acc", "ANLI")
22
  # task1 = Task("logiqa", "acc_norm", "LogiQA")
23
- task0 = Task("nq_open", "em", "NQ Open", 64) # 64 ?
24
- task1 = Task("triviaqa", "em", "TriviaQA", 64) # 64 ?
 
25
 
26
 
27
  # NUM_FEWSHOT = 64 # Change with your few shot
 
20
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
21
  # task0 = Task("anli_r1", "acc", "ANLI")
22
  # task1 = Task("logiqa", "acc_norm", "LogiQA")
23
+ task0 = Task("nq_open", "em", "NQ Open", 64) # 64, as in the ATLAS paper
24
+ task1 = Task("triviaqa", "em", "TriviaQA", 64) # 64, as in the ATLAS paper
25
+ task2 = Task("truthfulqa:mc", "mc2", "TruthfulQA", 0) # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
26
 
27
 
28
  # NUM_FEWSHOT = 64 # Change with your few shot
src/display/utils.py CHANGED
@@ -73,18 +73,6 @@ class EvalQueueColumn: # Queue column
73
  status = ColumnContent("status", "str", True)
74
 
75
 
76
- # Define the human baselines
77
- human_baseline_row = {
78
- AutoEvalColumn.model.name: "<p>Human performance</p>",
79
- AutoEvalColumn.revision.name: "N/A",
80
- AutoEvalColumn.precision.name: None,
81
- AutoEvalColumn.average.name: 100.0,
82
- AutoEvalColumn.nqopen.name: 100.0,
83
- AutoEvalColumn.triviaqa.name: 100.0,
84
- AutoEvalColumn.dummy.name: "human_baseline",
85
- AutoEvalColumn.model_type.name: "",
86
- }
87
-
88
  @dataclass
89
  class ModelDetails:
90
  name: str
 
73
  status = ColumnContent("status", "str", True)
74
 
75
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  @dataclass
77
  class ModelDetails:
78
  name: str
src/tools/plots.py CHANGED
@@ -92,10 +92,6 @@ def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) ->
92
  # Filter the DataFrame based on the specified metrics
93
  df = df[df["task"].isin(metrics)]
94
 
95
- # Filter the human baselines based on the specified metrics
96
- from src.display.utils import human_baseline_row as HUMAN_BASELINE
97
- filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
98
-
99
  # Create a line figure using plotly express with specified markers and custom data
100
  fig = px.line(
101
  df,
@@ -129,21 +125,6 @@ def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) ->
129
  for trace in fig.data:
130
  metric_color_mapping[trace.name] = trace.line.color
131
 
132
- # Iterate over filtered human baselines and add horizontal lines to the figure
133
- for metric, value in filtered_human_baselines.items():
134
- color = metric_color_mapping.get(metric, "blue") # Retrieve color from mapping; default to blue if not found
135
- location = "top left" if metric == "HellaSwag" else "bottom left" # Set annotation position
136
- # Add horizontal line with matched color and positioned annotation
137
- fig.add_hline(
138
- y=value,
139
- line_dash="dot",
140
- annotation_text=f"{metric} human baseline",
141
- annotation_position=location,
142
- annotation_font_size=10,
143
- annotation_font_color=color,
144
- line_color=color,
145
- )
146
-
147
  return fig
148
 
149
 
 
92
  # Filter the DataFrame based on the specified metrics
93
  df = df[df["task"].isin(metrics)]
94
 
 
 
 
 
95
  # Create a line figure using plotly express with specified markers and custom data
96
  fig = px.line(
97
  df,
 
125
  for trace in fig.data:
126
  metric_color_mapping[trace.name] = trace.line.color
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  return fig
129
 
130