yuchenlin committed on
Commit
3d2e59d
·
1 Parent(s): d74dfe0

explore data

Browse files
Files changed (8) hide show
  1. .gitignore +1 -0
  2. _header.md +2 -2
  3. app.py +48 -9
  4. constants.py +14 -9
  5. data_utils.py +93 -5
  6. eval_utils.py +217 -0
  7. update_data.sh +3 -2
  8. zebra_banner.png +0 -0
.gitignore CHANGED
@@ -1,3 +1,4 @@
1
 
2
  *.pyc
3
  *.DS_Store
 
 
1
 
2
  *.pyc
3
  *.DS_Store
4
+ ZeroEval-main/result_dirs/zebra-grid/
_header.md CHANGED
@@ -1,5 +1,5 @@
1
  <br/>
2
 
3
- # 🦁 WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild
4
- [📑 Paper](https://allenai.github.io/WildBench/WildBench_paper.pdf) | [💻 GitHub](https://github.com/allenai/WildBench) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/wildbench-65e8f2fa9c1260a85a933627) | [🐦 X](https://x.com/billyuchenlin/status/1795746137875554531) | [💬 Discussion](https://huggingface.co/spaces/allenai/WildBench/discussions) | ⚙️ **Version**: **V2** | **# Models**: {model_num} | Updated: **{LAST_UPDATED}**
5
 
 
1
  <br/>
2
 
3
+ # 🦓 ZebraLogic Bench: Testing the Limits of LLMs in Logical Reasoning
4
+ [📑 Blog](https://allenai.github.io/WildBench/WildBench_paper.pdf) | [💻 GitHub](https://github.com/allenai/WildBench) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/wildbench-65e8f2fa9c1260a85a933627) | [🐦 X]() | [💬 Discussion](https://huggingface.co/spaces/allenai/ZebraLogicBench-Leaderboard/discussions) | Updated: **{LAST_UPDATED}**
5
 
app.py CHANGED
@@ -18,7 +18,7 @@ import os, uuid
18
  from utils_display import model_info
19
  from constants import column_names, LEADERBOARD_REMARKS, DEFAULT_K, LEADERBOARD_REMARKS_MAIN
20
  import pytz
21
- from data_utils import post_processing
22
 
23
  # get the last updated time from the elo_ranks.all.jsonl file
24
  LAST_UPDATED = None
@@ -34,6 +34,7 @@ with open("_header.md", "r") as f:
34
  with open("_metrics.md", "r") as f:
35
  METRICS_MD = f.read()
36
 
 
37
  original_df = None
38
  # available_models = [] # to be filled in later
39
  available_models = list(model_info.keys())
@@ -89,7 +90,44 @@ def _tab_leaderboard():
89
  mode_selection_radio.change(fn=df_filters, inputs=[mode_selection_radio, _gstr("")], outputs=[leaderboard_table])
90
 
91
 
92
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  def _tab_submit():
94
  pass
95
 
@@ -101,13 +139,14 @@ def build_demo():
101
  gr.HTML(BANNER, elem_id="banner")
102
  # convert LAST_UPDATED to the PDT time
103
  LAST_UPDATED = datetime.now(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d %H:%M:%S")
104
- # header_md_text = HEADER_MD.replace("{model_num}", str(len(original_df["-1"]))).replace("{LAST_UPDATED}", str(LAST_UPDATED))
105
- # gr.Markdown(header_md_text, elem_classes="markdown-text")
106
 
107
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
108
  with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
109
  _tab_leaderboard()
110
-
 
111
  with gr.TabItem("🚀 Submit Your Results", elem_id="od-benchmark-tab-table", id=3):
112
  _tab_submit()
113
 
@@ -129,7 +168,7 @@ def build_demo():
129
 
130
 
131
  def data_load(result_file):
132
- global original_df
133
  print(f"Loading {result_file}")
134
  column_names_main = column_names.copy()
135
  # column_names_main.update({})
@@ -137,15 +176,15 @@ def data_load(result_file):
137
  click_url = True
138
  # read json file from the result_file
139
  with open(result_file, "r") as f:
140
- data = json.load(f)
141
  # floatify the data, if possible
142
- for d in data:
143
  for k, v in d.items():
144
  try:
145
  d[k] = float(v)
146
  except:
147
  pass
148
- original_df = pd.DataFrame(data)
149
  original_df = post_processing(original_df, column_names_main, ordered_columns=main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
150
  # print(original_df.columns)
151
 
 
18
  from utils_display import model_info
19
  from constants import column_names, LEADERBOARD_REMARKS, DEFAULT_K, LEADERBOARD_REMARKS_MAIN
20
  import pytz
21
+ from data_utils import post_processing, get_random_item
22
 
23
  # get the last updated time from the elo_ranks.all.jsonl file
24
  LAST_UPDATED = None
 
34
  with open("_metrics.md", "r") as f:
35
  METRICS_MD = f.read()
36
 
37
+ raw_data = None
38
  original_df = None
39
  # available_models = [] # to be filled in later
40
  available_models = list(model_info.keys())
 
90
  mode_selection_radio.change(fn=df_filters, inputs=[mode_selection_radio, _gstr("")], outputs=[leaderboard_table])
91
 
92
 
93
+ def sample_explore_item(model_name, size_H, size_W, greedy_or_sample):
94
+ print(model_name, size_H, size_W, greedy_or_sample)
95
+ explore_item = get_random_item(model_name, size_H, size_W)
96
+ if explore_item is None:
97
+ return "No item found", "No item found", "No item found", "No item found"
98
+ model_name = explore_item['Model']
99
+ example_id = explore_item['id']
100
+ puzzle_md = f"### Puzzle [{example_id}]:\n\n" + explore_item['puzzle'].replace("## Clues", "### Clues").replace("\n", "<br>")
101
+ model_reasoning_md = f"### {model_name}'s Reasoning:\n\n {explore_item['reasoning']}"
102
+ model_prediction_md = f"### {model_name}'s Prediction:\n\n {explore_item['solution']}" + "\n\n" + explore_item['solution_table_md']
103
+ puzzle_solved = explore_item['correct_cells'] == explore_item['total_cells']
104
+ cell_acc = explore_item["correct_cells"] / explore_item["total_cells"] * 100
105
+ model_eval_md = f"### Evaluation:\n\n **Total Cells**: {explore_item['total_cells']} | **Correct Cells**: {explore_item['correct_cells']} | **Puzzle solved**: {puzzle_solved} | **Cell Acc**: {cell_acc:.2f}%"
106
+ return puzzle_md, model_reasoning_md, model_prediction_md, model_eval_md
107
+
108
+
109
+ def _tab_explore():
110
+ global raw_data
111
+ model_names = [item["Model"] for item in raw_data]
112
+ with gr.Row():
113
+ model_selection = gr.Dropdown(choices = ["random"] + model_names, label="Model: ", elem_id="select-models", value="random", interactive=True)
114
+ size_H_selection = gr.Dropdown(choices = ["random"] + [f"{i}" for i in range(2,7)], label="Num of Houses", elem_id="select-H", value="random", interactive=True)
115
+ size_W_selection = gr.Dropdown(choices = ["random"] + [f"{i}" for i in range(2,7)], label="Num of Features", elem_id="select-W", value="random", interactive=True)
116
+ with gr.Column(scale=1):
117
+ greedy_or_sample = gr.Radio(["greedy", "sampling"], show_label=False, elem_id="greedy-or-sample", value="greedy", interactive=True)
118
+ explore_button = gr.Button("Sample", elem_id="explore-button")
119
+
120
+ puzzle_md = gr.Markdown("\n\nTo be loaded", elem_id="puzzle-md", elem_classes="box_md")
121
+ model_reasoning_md = gr.Markdown("\n\nTo be loaded", elem_id="model-reasoning-md", elem_classes="box_md")
122
+ model_prediction_md = gr.Markdown("\n\nTo be loaded", elem_id="model-prediction-md", elem_classes="box_md")
123
+ model_eval_md = gr.Markdown("\n\nTo be loaded", elem_id="model-eval-md", elem_classes="box_md")
124
+
125
+ explore_button.click(fn=sample_explore_item,
126
+ inputs=[model_selection, size_H_selection, size_W_selection, greedy_or_sample],
127
+ outputs=[puzzle_md, model_reasoning_md, model_prediction_md, model_eval_md])
128
+
129
+
130
+
131
  def _tab_submit():
132
  pass
133
 
 
139
  gr.HTML(BANNER, elem_id="banner")
140
  # convert LAST_UPDATED to the PDT time
141
  LAST_UPDATED = datetime.now(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d %H:%M:%S")
142
+ header_md_text = HEADER_MD.replace("{LAST_UPDATED}", str(LAST_UPDATED))
143
+ gr.Markdown(header_md_text, elem_classes="markdown-text")
144
 
145
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
146
  with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
147
  _tab_leaderboard()
148
+ with gr.TabItem("🔍 Explore", elem_id="od-benchmark-tab-table", id=1):
149
+ _tab_explore()
150
  with gr.TabItem("🚀 Submit Your Results", elem_id="od-benchmark-tab-table", id=3):
151
  _tab_submit()
152
 
 
168
 
169
 
170
  def data_load(result_file):
171
+ global raw_data, original_df
172
  print(f"Loading {result_file}")
173
  column_names_main = column_names.copy()
174
  # column_names_main.update({})
 
176
  click_url = True
177
  # read json file from the result_file
178
  with open(result_file, "r") as f:
179
+ raw_data = json.load(f)
180
  # floatify the data, if possible
181
+ for d in raw_data:
182
  for k, v in d.items():
183
  try:
184
  d[k] = float(v)
185
  except:
186
  pass
187
+ original_df = pd.DataFrame(raw_data)
188
  original_df = post_processing(original_df, column_names_main, ordered_columns=main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
189
  # print(original_df.columns)
190
 
constants.py CHANGED
@@ -5,20 +5,17 @@ DEFAULT_K = "∞"
5
  # DEFAULT_K = "1500"
6
 
7
  banner_url = "https://github.com/yuchenlin/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true" # the same repo here.
8
- BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 50vw; min-width: 300px; max-width: 800px;border: 3px solid gray; border-color: gray black;"> </div>'
9
 
10
  TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
11
 
12
  WINRATE_HEATMAP = "<div><img src='https://github.com/WildEval/WildBench-Leaderboard/blob/main/gradio/pairwise_win_fractions.png?raw=true' style='width:100%;'></div>"
13
 
14
- CITATION_TEXT = """@article{lin2024wildbench,
15
- title={WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild},
16
- author={Bill Yuchen Lin and Yuntian Deng and Khyathi Chandu and Faeze Brahman and Abhilasha Ravichander and Valentina Pyatkin and Nouha Dziri and Ronan Le Bras and Yejin Choi},
17
- year={2024},
18
- eprint={2406.04770},
19
- archivePrefix={arXiv},
20
- primaryClass={cs.CL},
21
- url={https://arxiv.org/abs/2406.04770}
22
  }
23
  """
24
 
@@ -279,5 +276,13 @@ button.selected[role="tab"][aria-selected="true"] {
279
  font-size: 12pt;
280
  font-decoration: bold;
281
  }
 
 
 
 
 
 
 
 
282
  """
283
 
 
5
  # DEFAULT_K = "1500"
6
 
7
  banner_url = "https://github.com/yuchenlin/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true" # the same repo here.
8
+ BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
9
 
10
  TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
11
 
12
  WINRATE_HEATMAP = "<div><img src='https://github.com/WildEval/WildBench-Leaderboard/blob/main/gradio/pairwise_win_fractions.png?raw=true' style='width:100%;'></div>"
13
 
14
+ CITATION_TEXT = """@article{tbd,
15
+ title={tbd},
16
+ author={tbd},
17
+ journal={tbd},
18
+ year={2024}
 
 
 
19
  }
20
  """
21
 
 
276
  font-size: 12pt;
277
  font-decoration: bold;
278
  }
279
+
280
+ .box_md{
281
+ border: 1px solid #000000;
282
+ border-radius: 10px;
283
+ padding: 5px;
284
+ font-size: 12pt;
285
+ margin: 5px;
286
+ }
287
  """
288
 
data_utils.py CHANGED
@@ -11,12 +11,13 @@ import math
11
  import json
12
  from tqdm import tqdm
13
  import numpy as np
 
 
 
14
 
15
- id_to_data = None
16
- model_len_info = None
17
- bench_data = None
18
- eval_results = None
19
- score_eval_results = None
20
 
21
  # Formats the columns
22
  def formatter(x):
@@ -41,3 +42,90 @@ def post_processing(df, column_names, rank_column=RANKING_COLUMN, ordered_column
41
  df.sort_values(by=rank_column, inplace=True, ascending=False)
42
  return df
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  import json
12
  from tqdm import tqdm
13
  import numpy as np
14
+ import os
15
+
16
+ from eval_utils import *
17
 
18
+ summary_file = "ZeroEval-main/result_dirs/zebra-grid.summary.json"
19
+ result_dir = "ZeroEval-main/result_dirs/zebra-grid/"
20
+ results_by_model = {}
 
 
21
 
22
  # Formats the columns
23
  def formatter(x):
 
42
  df.sort_values(by=rank_column, inplace=True, ascending=False)
43
  return df
44
 
45
+
46
+ def load_all_data():
47
+ global summary_file, result_dir
48
+ with open(summary_file, "r") as f:
49
+ model_summary = json.load(f)
50
+ model_names = [model["Model"] for model in model_summary]
51
+ for model_name in model_names:
52
+ download_url = f"https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid/{model_name}.json"
53
+ output_file = os.path.join(result_dir, f"{model_name}.json")
54
+ # mkdir -p result_dir if not exists
55
+ os.makedirs(result_dir, exist_ok=True)
56
+ if not os.path.exists(output_file):
57
+ os.system(f"wget {download_url} -O {output_file}")
58
+ print(f"Downloaded {model_name}.json")
59
+ with open(output_file, "r") as f:
60
+ print(f"Loading {output_file}")
61
+ results_by_model[model_name] = json.load(f)
62
+
63
+ def get_random_item(model_name="random", size_H="random", size_W="random"):
64
+ global summary_file, result_dir, results_by_model
65
+ if results_by_model is None or len(results_by_model) == 0:
66
+ load_all_data()
67
+ if model_name == "random":
68
+ model_name = random.choice(list(results_by_model.keys()))
69
+ data = results_by_model[model_name]
70
+ random.shuffle(data)
71
+ selected_item = None
72
+ prediction_table = None
73
+ prediction_reasoning = None
74
+ id_to_item = {}
75
+ for item in data:
76
+ id_to_item[item["id"]] = item
77
+
78
+ if size_H == "random":
79
+ size_H_choice = random.choice(list(range(2, 7)))
80
+ else:
81
+ size_H_choice = size_H
82
+ if size_W == "random":
83
+ size_W_choice = random.choice(list(range(2, 7)))
84
+ else:
85
+ size_W_choice = size_W
86
+ ok_ids = [id for id in id_to_item if id_to_item[id]["size"].startswith(f"{size_H_choice}*{size_W_choice}")]
87
+ for ok_id in ok_ids:
88
+ item = id_to_item[ok_id]
89
+ prediction_str = item["output"][0]
90
+ prediction_json = extract_last_complete_json(prediction_str)
91
+ if prediction_json is None or "solution" not in prediction_json:
92
+ continue
93
+ prediction_reasoning = prediction_json.get("reasoning", "")
94
+ prediction_table = prediction_json["solution"]
95
+ if prediction_table is not None:
96
+ selected_item = item
97
+ break
98
+
99
+ if selected_item is None:
100
+ # selected_item = random.choice(data)
101
+ print("No item found!")
102
+ return None
103
+
104
+ explore_item = {}
105
+ explore_item["id"] = selected_item["id"]
106
+ explore_item["Model"] = model_name
107
+ explore_item["size"] = selected_item["size"]
108
+ explore_item["puzzle"] = selected_item["puzzle"]
109
+ explore_item["solution"] = prediction_table
110
+ explore_item["reasoning"] = prediction_reasoning
111
+ headers = ["Houses"] + list(prediction_table["House 1"].keys())
112
+ rows = []
113
+ for row_id in range(len(prediction_table)):
114
+ row = [row_id+1]
115
+ for feature in headers[1:]:
116
+ row.append(prediction_table[f"House {row_id+1}"][feature])
117
+ rows.append(row)
118
+ table_md = tabulate(rows, headers=headers, tablefmt="github")
119
+ explore_item["solution_table_md"] = table_md
120
+
121
+ this_total_cells, this_correct_cells = eval_each_puzzle(explore_item["id"], prediction_table)
122
+ # print(table_md)
123
+ explore_item["correct_cells"] = this_correct_cells
124
+ explore_item["total_cells"] = this_total_cells
125
+ return explore_item
126
+
127
+
128
+ if __name__ == "__main__":
129
+ load_all_data()
130
+ print("All data downloaded!")
131
+ print(json.dumps(get_random_item(model_name="gemini-1.5-pro", size_H="2", size_W="5"), indent=2))
eval_utils.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from collections import defaultdict
3
+ import os
4
+ from tabulate import tabulate
5
+ from datasets import load_dataset
6
+
7
+ private_solutions = {}
8
+
9
+ def load_private_solutions():
10
+ global private_solutions
11
+ private_zebra_data = load_dataset("allenai/ZebraLogicBench-private", "grid_mode", split="test")
12
+ for item in private_zebra_data:
13
+ private_solutions[item["id"]] = item["solution"]
14
+ return
15
+
16
+ def load_model_results(run_name_folders):
17
+ model_results = {}
18
+ for run_name, folder in run_name_folders.items():
19
+ # iterate all json files under the folder
20
+ for filename in os.listdir(folder):
21
+ filepath = os.path.join(folder, filename)
22
+ if not filename.endswith(".json"):
23
+ continue
24
+ model_name = filename.replace(".json", "")
25
+ model_name = f"{model_name}%{run_name}"
26
+ model_results[model_name] = filepath
27
+ return model_results
28
+
29
+ def extract_last_complete_json(s):
30
+ # Stack to keep track of opening and closing braces
31
+ stack = []
32
+ last_json_start = None
33
+ last_json_str = None
34
+
35
+ for i, char in enumerate(s):
36
+ if char == '{':
37
+ stack.append(i)
38
+ if last_json_start is None:
39
+ last_json_start = i
40
+ elif char == '}':
41
+ if stack:
42
+ start = stack.pop()
43
+ if not stack:
44
+ # Complete JSON object found
45
+ last_json_str = s[last_json_start:i+1]
46
+ last_json_start = None
47
+
48
+ # Load the last JSON object
49
+ if last_json_str:
50
+ try:
51
+ return json.loads(last_json_str.replace("\n", ""))
52
+ except json.JSONDecodeError:
53
+ pass
54
+
55
+ return None
56
+
57
+ def eval_each_puzzle(id, prediction_table):
58
+ global private_solutions
59
+ if not private_solutions:
60
+ load_private_solutions()
61
+ solution = private_solutions[id]
62
+ solution_table = {}
63
+ num_houses = len(solution["rows"])
64
+ columns = solution["header"]
65
+ assert columns[0] == "House"
66
+ solution_table = {}
67
+ this_total_cells = 0
68
+ for i in range(num_houses):
69
+ solution_table[f'House {i+1}'] = {columns[j]: solution["rows"][i][j] for j in range(1, len(columns))}
70
+ this_total_cells += len(columns) - 1
71
+
72
+ this_correct_cells = 0 # number in the solution_table
73
+ for house in solution_table:
74
+ for column in solution_table[house]:
75
+ # if prediction_table[house][column] not exist then pass
76
+ if house in prediction_table and column in prediction_table[house]:
77
+ truth_cell = solution_table[house][column].lower().strip()
78
+ if prediction_table[house][column] is None or len(prediction_table[house][column]) == 0:
79
+ continue
80
+ if type(prediction_table[house][column]) == list:
81
+ predicted_cell = prediction_table[house][column][0].lower().strip()
82
+ elif type(prediction_table[house][column]) == str:
83
+ predicted_cell = prediction_table[house][column].lower().strip()
84
+ if truth_cell == predicted_cell:
85
+ this_correct_cells += 1
86
+ return this_total_cells, this_correct_cells
87
+
88
+ def eval_model(model, filepath):
89
+ global private_solutions
90
+ with open(filepath, "r") as f:
91
+ print(f"Processing {filepath}")
92
+ data = json.load(f)
93
+
94
+ solved_puzzles = 0
95
+ num_total_puzzles = len(data)
96
+ correct_cells = 0
97
+ total_cells = 0
98
+ no_asnwer = 0
99
+
100
+ num_total_puzzles_by_size = defaultdict(int)
101
+ solved_puzzles_by_size = defaultdict(int)
102
+ reason_lens = []
103
+ for item in data:
104
+ # solution = item["solution"]
105
+ solution = private_solutions[item["id"]]
106
+ size = item["size"]
107
+ num_total_puzzles_by_size[size] += 1
108
+
109
+ # Process the solution
110
+ solution_table = {}
111
+ num_houses = len(solution["rows"])
112
+ columns = solution["header"]
113
+ assert columns[0] == "House"
114
+ solution_table = {}
115
+ this_total_cells = 0
116
+ for i in range(num_houses):
117
+ solution_table[f'House {i+1}'] = {columns[j]: solution["rows"][i][j] for j in range(1, len(columns))}
118
+ this_total_cells += len(columns) - 1
119
+ total_cells += this_total_cells
120
+
121
+ # Read and Parse the prediction from model output
122
+ prediction_str = item["output"][0]
123
+ prediction_json = extract_last_complete_json(prediction_str)
124
+ if prediction_json is None or "solution" not in prediction_json:
125
+ # print("-"*100)
126
+ # prediction_str = prediction_str.replace("\n", "")
127
+ # print([prediction_str])
128
+ # json.loads(prediction_str)
129
+ no_asnwer += 1
130
+ # print(item["id"])
131
+ continue
132
+ reason = prediction_json.get("reasoning", "")
133
+ prediction_table = prediction_json["solution"]
134
+
135
+ reason_lens.append(len(reason))
136
+
137
+ this_correct_cells = 0 # number in the solution_table
138
+ for house in solution_table:
139
+ for column in solution_table[house]:
140
+ # if prediction_table[house][column] not exist then pass
141
+ if house in prediction_table and column in prediction_table[house]:
142
+ truth_cell = solution_table[house][column].lower().strip()
143
+ if prediction_table[house][column] is None or len(prediction_table[house][column]) == 0:
144
+ continue
145
+ if type(prediction_table[house][column]) == list:
146
+ predicted_cell = prediction_table[house][column][0].lower().strip()
147
+ elif type(prediction_table[house][column]) == str:
148
+ predicted_cell = prediction_table[house][column].lower().strip()
149
+ else:
150
+ raise ValueError(f"Unknown type: {type(prediction_table[house][column])}")
151
+ if truth_cell == predicted_cell:
152
+ this_correct_cells += 1
153
+ correct_cells += this_correct_cells
154
+
155
+ # compute puzzle success rate
156
+ if this_correct_cells == this_total_cells:
157
+ solved_puzzles += 1
158
+ solved_puzzles_by_size[size] += 1
159
+
160
+
161
+
162
+
163
+ # # print the success rate by size; order the dict by size first
164
+ sizes = sorted(num_total_puzzles_by_size.keys())
165
+ easy_sizes = ['2*2', '2*3', '2*4', '2*5', '2*6', '3*2', '3*3',]
166
+ hard_sizes = ['3*4', '3*5', '4*2', '3*6', '4*3', '4*4', '5*2', '6*2', '4*5', '4*6', '5*3', '5*4', '5*5', '5*6', '6*3', '6*4', '6*5', '6*6']
167
+
168
+ easy_solved_puzzles = sum([solved_puzzles_by_size[size] for size in easy_sizes])
169
+ easy_total_puzzles = sum([num_total_puzzles_by_size[size] for size in easy_sizes])
170
+ hard_solved_puzzles = sum([solved_puzzles_by_size[size] for size in hard_sizes])
171
+ hard_total_puzzles = sum([num_total_puzzles_by_size[size] for size in hard_sizes])
172
+
173
+ # for size in sizes:
174
+ # print(f"Size {size}: {solved_puzzles_by_size[size]}/{num_total_puzzles_by_size[size]} -> {solved_puzzles_by_size[size]/num_total_puzzles_by_size[size]*100:.2f}%")
175
+
176
+ result = {}
177
+ result["Model"] = model.split("%")[0]
178
+ result["Mode"] = model.split("%")[1]
179
+ result["Puzzle Acc"] = f"{solved_puzzles/num_total_puzzles*100:.2f}"
180
+ result["Cell Acc"] = f"{correct_cells/total_cells*100:.2f}"
181
+ result["No answer"] = f"{no_asnwer/num_total_puzzles*100:.2f}"
182
+ result["Easy Puzzle Acc"] = f"{easy_solved_puzzles/easy_total_puzzles*100:.2f}"
183
+ result["Hard Puzzle Acc"] = f"{hard_solved_puzzles/hard_total_puzzles*100:.2f}"
184
+ result["Total Puzzles"] = num_total_puzzles
185
+ result["Reason Lens"] = f"{sum(reason_lens)/len(reason_lens):.2f}"
186
+ return result
187
+
188
+
189
+ def gen_results(run_name_folders):
190
+ model_results = load_model_results(run_name_folders)
191
+
192
+ columns = ["Model", "Mode", "Puzzle Acc", "Cell Acc", "No answer", "Easy Puzzle Acc", "Hard Puzzle Acc", "Total Puzzles", "Reason Lens"]
193
+ rows = []
194
+ for model_name, filepath in model_results.items():
195
+ result = eval_model(model_name, filepath)
196
+ rows.append(result)
197
+
198
+ # sort the rows by puzzle accuracy
199
+ rows = sorted(rows, key=lambda x: -float(x["Puzzle Acc"]))
200
+ # Convert rows to the expected format for tabulate
201
+ table_data = [[row[col] for col in columns] for row in rows]
202
+
203
+ print(tabulate(table_data, headers=columns, tablefmt="fancy_outline", stralign="center", numalign="center"))
204
+ # print(tabulate(rows, headers=columns, tablefmt="github"))
205
+
206
+ # write to json file
207
+ with open("result_dirs/zebra-grid.summary.json", "w") as f:
208
+ json.dump(rows, f, indent=2)
209
+
210
+
211
+ if __name__ == "__main__":
212
+ run_name_folders = {
213
+ "greedy": "result_dirs/zebra-grid",
214
+ "sampling": "result_dirs/zebra-grid/sampling",
215
+ }
216
+ load_private_solutions()
217
+ gen_results(run_name_folders)
update_data.sh CHANGED
@@ -1,4 +1,5 @@
1
  # download the file from https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json
2
  # and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
3
- mkdir -p ZeroEval-main/result_dirs
4
- wget https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json -O ZeroEval-main/result_dirs/zebra-grid.summary.json
 
 
1
  # download the file from https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json
2
  # and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
3
+ mkdir -p ZeroEval-main/result_dirs/zebra-grid/
4
+ wget https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json -O ZeroEval-main/result_dirs/zebra-grid.summary.json
5
+ wget https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid/deepseek-chat.json -O ZeroEval-main/result_dirs/zebra-grid/deepseek-chat.json
zebra_banner.png CHANGED