Spyros Mouselinos committed on
Commit
adc9146
·
1 Parent(s): 31a7dfb
Files changed (4) hide show
  1. .idea/.gitignore +8 -0
  2. app.py +6 -0
  3. euclideagameeval.py +332 -0
  4. requirements.txt +4 -0
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
4
+ # Editor-based HTTP Client requests
5
+ /httpRequests/
6
+ # Datasource local storage ignored files
7
+ /dataSources/
8
+ /dataSources.local.xml
app.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import evaluate
2
+ from evaluate.utils import launch_gradio_widget
3
+
4
+
5
# Load the evaluation module registered under the name "EuclideaGameEval" via evaluate.load.
+ module = evaluate.load("EuclideaGameEval")
6
# Hand the loaded module to evaluate's launch_gradio_widget helper.
+ launch_gradio_widget(module)
euclideagameeval.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Euclidea Game Evaluation Metric."""
2
+ import re
3
+ import datasets
4
+ import evaluate
5
+ import Levenshtein
6
+ import copy
7
+ from sentence_transformers import SentenceTransformer, util
8
+ from scipy.optimize import linear_sum_assignment
9
+ import numpy as np
10
+ import torch
11
+ import itertools
12
+ from itertools import combinations
13
+
14
# Device string forwarded to every model.encode() call below: "cuda" when torch
# reports an available GPU, otherwise "cpu".
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
15
# Sentence-embedding model shared by the whole module; it is instantiated once,
# at import time, and used by GeometryEvaluator.nl_overlap to embed solution steps.
+ model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
16
+
17
+ _DESCRIPTION = """
18
+ Natural Language Match Score: Given a geometric problem and a generated sequence of steps to solve it,
19
+ in natural language, this module computes a matching score between the ground truth and the provided solution.
20
+
21
+ The module works by segmenting the provided solution into steps.
22
+ Then for each step it extracts the used tool and arguments to it.
23
+ Finally it compares the solution steps with the ground truth ones using the Hungarian Matching Algorithm.
24
+ """
25
+
26
+ _CITATION = """\
27
+ @misc{mouselinos2024lines,
28
+ title={Beyond Lines and Circles: Unveiling the Geometric Reasoning Gap in Large Language Models},
29
+ author={Spyridon Mouselinos and Henryk Michalewski and Mateusz Malinowski},
30
+ year={2024},
31
+ eprint={2402.03877},
32
+ archivePrefix={arXiv},
33
+ primaryClass={cs.CL}
34
+ }
35
+ """
36
+
37
# Canonical tool names mapped to their position in the per-tool vote/distance
# vectors; the order of this tuple defines the ids 0..8.
TOOL2IDX = {
    tool_name: tool_id
    for tool_id, tool_name in enumerate((
        'Perpendicular Tool',
        'Line Tool',
        'Circle Tool',
        'Perpendicular Bisector Tool',
        'Angle Bisector Tool',
        'Parallel Tool',
        'Compass Tool',
        'Intersect Tool',
        'Point Tool',
    ))
}

# Reverse lookup: vector position -> canonical tool name.
IDX2TOOL = dict(zip(TOOL2IDX.values(), TOOL2IDX.keys()))

# Score a response must strictly exceed to be counted as correct.
DEFAULT_THRESHOLD = 0.65
54
+
55
+
56
+ class GeometryEvaluator:
57
def __init__(self,
             responses,
             references):
    """Store the generated responses and the reference solutions.

    Either argument may be a single string, in which case it is wrapped in
    a one-element list; any other value is stored unchanged.
    """
    self.responses = [responses] if isinstance(responses, str) else responses
    self.references = [references] if isinstance(references, str) else references
75
+
76
def get_symbols(self, text):
    """Return the geometric symbols mentioned in ``text``.

    A symbol is a standalone word made of one to four capital letters
    (e.g. ``A``, ``AB``, ``ABCD``). Symbols are returned in the order they
    appear in the text; repeated mentions are kept.
    """
    # finditer scans left to right, so the hits are already ordered by position.
    return [hit.group() for hit in re.finditer(r'\b[A-Z]{1,4}\b', text)]
90
+
91
def compare_symbols(self, history, new):
    """Return the symbols of ``new`` that do not occur in ``history``.

    Each unseen symbol is reported once, in the order of its first
    appearance in ``new``. (A plain set difference yields the same elements
    but in an order that depends on string hashing, which can vary between
    interpreter runs.)

    Args:
        history: iterable of symbols already known.
        new: iterable of symbols emitted by the current step.

    Returns:
        A list of the symbols that are in ``new`` but not in ``history``.
    """
    seen = set(history)
    fresh = []
    for symbol in new:
        if symbol not in seen:
            seen.add(symbol)  # also de-duplicates repeats inside ``new``
            fresh.append(symbol)
    return fresh
94
+
95
def check_word(self, word, sentence):
    """Tell whether ``word`` occurs in ``sentence`` as a whole word.

    The comparison ignores case and ``word`` is matched literally (regex
    metacharacters in it are escaped).
    """
    whole_word = re.compile(r'\b' + re.escape(word) + r'\b', flags=re.IGNORECASE)
    return whole_word.search(sentence) is not None
98
+
99
def annotate_solutions(self, step_solutions, toolset, initial_symbols=None):
    """Tag each solution step with the tool it most likely uses (heuristic).

    The solution text is split on newlines (lines of two characters or fewer
    are dropped). For every remaining step, the lower-cased text before its
    first comma is scanned for keywords; each keyword votes for the tools it
    is associated with, restricted to ``toolset``. Steps that collect no vote
    are skipped. The tool with the most votes wins; ties go to the tool with
    the smallest id in ``TOOL2IDX``.

    Args:
        step_solutions: newline-separated solution steps.
        toolset: collection of tool names that are allowed to receive votes.
        initial_symbols: optional list of symbols known before the first
            step. The list is copied, never modified.

    Returns:
        A tuple ``(refined_solutions, gt_tools, per_step_symbols)``:
        ``refined_solutions`` is the kept steps joined by newlines, each
        prefixed with ``<Tool Name>``; ``gt_tools`` lists the chosen tool per
        kept step; ``per_step_symbols`` starts with a copy of the initial
        symbols, followed by one list of newly introduced symbols for every
        kept step that introduced at least one.
    """

    keyword_2_tool = {
        'circle': ['Circle Tool'],
        'line': ['Line Tool'],
        'point': ['Point Tool'],
        'intersect': ['Intersect Tool'],
        'bisect': ['Perpendicular Bisector Tool', 'Angle Bisector Tool'],
        'midpoint': ['Perpendicular Bisector Tool', 'Perpendicular Tool'],
        'angle': ['Angle Bisector Tool'],
        'vertical': ['Perpendicular Tool'],
        'perpendicular': ['Perpendicular Bisector Tool', 'Perpendicular Tool'],
        'parallel': ['Parallel Tool'],
        'compass': ['Compass Tool'],
    }

    steps = [line for line in step_solutions.split('\n') if len(line) > 2]
    refined_solutions = []
    gt_tools = []
    # Work on a private copy: extending the caller's list in place would leak
    # symbols from one call into the next.
    history_symbols = list(initial_symbols) if initial_symbols is not None else []
    per_step_symbols = [list(history_symbols)]
    for raw_step in steps:
        # Only the clause before the first comma decides which tool is used.
        clause = raw_step.lower().strip()
        if ',' in clause:
            clause = clause[:clause.find(',')]
        ### Voting classifier ###
        votes = np.zeros(shape=(len(TOOL2IDX),))
        for keyword, tools in keyword_2_tool.items():
            if self.check_word(keyword, clause):
                for tool in tools:
                    if tool in toolset:
                        votes[TOOL2IDX[tool]] += 1
        ### If no tool ###
        if np.sum(votes) == 0:
            continue
        ### argmax returns the first maximum, i.e. the smallest id wins ties ###
        tool_name = IDX2TOOL[int(np.argmax(votes))]
        refined_solutions.append(f'<{tool_name}>{raw_step}')
        gt_tools.append(tool_name)
        ### Now find (if any) the symbols this step introduces ###
        emitted_symbols = self.get_symbols(raw_step)
        new_symbols = self.compare_symbols(history_symbols, emitted_symbols)
        if len(new_symbols) > 0:
            history_symbols += new_symbols
            per_step_symbols.append(new_symbols)
    refined_solutions = '\n'.join(refined_solutions)
    return refined_solutions, gt_tools, per_step_symbols
153
+
154
def format_solutions(self, solution):
    """Split a raw solution into its construction steps and its preamble.

    If the text contains blank-line separated paragraphs, only the second
    paragraph is kept. When the kept text contains ``Let`` (or, failing
    that, ``Given``), the assumption it introduces is cut out and returned
    separately as ``qs`` so it can be moved to the question.

    The preamble ends at the earliest of: the first full stop (included in
    ``qs``) or the first occurrence of ``Construct``, ``Draw``, ``With``,
    ``Point`` or ``Starting`` (left at the head of the steps). If none of
    these markers is present the text is returned unchanged with ``qs`` set
    to ``None``.

    Args:
        solution: raw solution text.

    Returns:
        A tuple ``(solution, qs)`` where ``qs`` is the extracted assumption
        or ``None``.
    """
    # Keep the second paragraph when there is one; otherwise keep everything.
    paragraphs = solution.split('\n\n')
    if len(paragraphs) > 1:
        solution = paragraphs[1]

    kw = None
    if 'Let' in solution:
        kw = 'Let'
    elif 'Given' in solution:
        kw = 'Given'
    if kw is None:
        return solution.replace('\n\n', '\n'), None

    start_idx = solution.find(kw)
    ### Fix for one weird level ###
    if solution.find('{\\displaystyle AB>AO,AC>AO}') != -1:
        qs = 'Let O be the vertex of the angle and A the given point. Let B, C be abitary points on each ray, such that AB is bigger than AO and AC is bigger than AO.'
        solution = 'Construct circle O with center O and radius OA.\nConstruct circle B with center B and radius BA, intersecting circle O at F.\nConstruct circle B with center C and radius CA, intersecting circle O at G.\nConstruct line FG, intersecting line OB at H, intersecting line OC at I.\nConstruct line AH.\nConstruct line AI.'
        return solution, qs

    markers = ['.', 'Construct', 'Draw', 'With', 'Point', 'Starting']
    hits = [(solution.find(marker), marker) for marker in markers]
    hits = [(idx, marker) for idx, marker in hits if idx != -1]
    if not hits:
        # Nothing tells us where the assumption ends: leave the text as is.
        return solution.replace('\n\n', '\n'), None

    idx, marker = min(hits)
    if marker == '.':
        # The full stop belongs to the assumption.
        end_idx = idx + 1
        qs = solution[start_idx:end_idx]
    else:
        # A step keyword belongs to the steps, so cut right before it.
        end_idx = idx
        qs = solution[start_idx:end_idx].strip()
    solution = solution[end_idx:]
    return solution.replace('\n\n', '\n'), qs
189
+
190
def format_tools(self, tools):
    """Map free-form tool names onto the canonical names of ``TOOL2IDX``.

    Every entry except ``'Move Tool'`` (which is dropped) is replaced by the
    canonical tool whose lower-cased name has the smallest Levenshtein
    distance to the lower-cased entry; ties go to the tool with the
    smallest id.
    """
    # Candidates in id order, so that min() resolves ties towards the lowest id.
    canonical_names = sorted(TOOL2IDX, key=TOOL2IDX.get)
    proper_tools = []
    for tool in tools:
        if tool == 'Move Tool':
            continue
        lowered = tool.lower()
        closest = min(
            canonical_names,
            key=lambda name: Levenshtein.distance(lowered, name.lower()),
        )
        proper_tools.append(closest)
    return proper_tools
202
+
203
def decompose_example(self, solution, initial_symbols):
    """Clean one raw solution and annotate its steps.

    The solution is first passed through ``format_solutions`` (the extracted
    assumption is discarded), then through ``annotate_solutions`` with every
    tool of ``TOOL2IDX`` allowed.

    Returns:
        ``(annotated_solution, (tools, symbols))`` as produced by
        ``annotate_solutions``.
    """
    steps_text, _ = self.format_solutions(solution)
    every_tool = list(TOOL2IDX)
    annotated, used_tools, symbols = self.annotate_solutions(steps_text, every_tool, initial_symbols)
    return annotated, (used_tools, symbols)
208
+
209
def evaluate(self):
    """Decompose every stored response into annotated steps, tools and symbols.

    Responses shorter than 10 characters are skipped entirely, so the
    returned lists can be shorter than ``self.responses``.

    Returns:
        Three parallel lists: annotated solutions, tools per solution and
        symbols per solution.
    """
    decomposed = []
    for candidate in self.responses:
        if len(candidate) >= 10:
            annotated, (tools, symbols) = self.decompose_example(
                solution=candidate,
                initial_symbols=None)
            decomposed.append((annotated, tools, symbols))
    instance_responses = [item[0] for item in decomposed]
    instance_tools = [item[1] for item in decomposed]
    instance_symbols = [item[2] for item in decomposed]
    return instance_responses, instance_tools, instance_symbols
223
+
224
def best_matching_subset(self, response, ground_truth, all_cosine_scores):
    """Pick the response steps that best match the ground-truth steps.

    Solves a single rectangular assignment problem in which every
    ground-truth step (column) is paired with a distinct response step
    (row) so that the summed similarity is maximal. This gives the same
    optimum as trying every subset of ``len(ground_truth)`` response steps
    and solving a square assignment for each, without the combinatorial
    blow-up.

    Args:
        response: list of response steps (rows of the score matrix).
        ground_truth: list of ground-truth steps (columns).
        all_cosine_scores: array of shape
            ``(len(response), len(ground_truth))`` with pairwise scores.

    Returns:
        ``(max_score, best_subset)`` where ``best_subset`` holds the chosen
        response steps in their original order. When there are fewer
        response steps than ground-truth steps no matching exists and
        ``(0, [''] * len(ground_truth))`` is returned.
    """
    n_response, n_truth = len(response), len(ground_truth)
    if n_response < n_truth:
        return 0, [''] * n_truth

    scores = np.asarray(all_cosine_scores)[:n_response, :n_truth]
    # linear_sum_assignment minimises cost, hence the negated scores; with
    # more rows than columns it leaves the surplus rows unassigned.
    row_ind, col_ind = linear_sum_assignment(-scores)
    max_score = scores[row_ind, col_ind].sum()
    best_subset = [response[i] for i in sorted(row_ind)]
    return max_score, best_subset
247
+
248
def estimate_pass_at_k(self, num_samples, num_correct, k):
    """Estimate pass@k for each problem and return the values in an array.

    Args:
        num_samples: number of samples per problem, either one ``int``
            shared by all problems or a sequence aligned with
            ``num_correct``.
        num_correct: sequence with the number of correct samples per problem.
        k: the ``k`` of pass@k.

    Returns:
        ``numpy`` array with one estimate per problem.

    Raises:
        AssertionError: if ``num_samples`` is a sequence whose length
            differs from ``num_correct``.
    """

    def estimator(n: int, c: int, k: int) -> float:
        """
        Calculates 1 - comb(n - c, k) / comb(n, k).
        """
        if c <= 0:
            # No correct sample can never yield a pass. Without this guard
            # the ``n - c < k`` shortcut below reports 1.0 whenever fewer
            # than k samples were drawn, even if all of them failed.
            return 0.0
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    if isinstance(num_samples, int):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
268
+
269
def nl_overlap(self, response, ground_truth):
    """Score how well a response matches the ground truth, step by step.

    Both texts are split into lines. If the response has fewer lines than
    the ground truth the score is ``0``. Otherwise every line is embedded
    with the module-level sentence model, the best matching subset of
    response lines is selected with ``best_matching_subset``, and the mean
    per-step similarity of that subset is multiplied by the cosine
    similarity between the joined subset and the whole ground truth.

    Returns:
        ``0`` for a too-short response, otherwise the score as a ``float``.
    """
    candidate_steps = response.split('\n')
    reference_steps = ground_truth.split('\n')
    if len(candidate_steps) < len(reference_steps):
        return 0

    def embed(texts, batch):
        # Single place for the encoder settings shared by all four calls.
        return model.encode(texts, show_progress_bar=False, batch_size=batch,
                            convert_to_tensor=True, device=DEVICE)

    # Pairwise step similarities: rows are response steps, columns are reference steps.
    candidate_emb = embed(candidate_steps, len(candidate_steps))
    reference_emb = embed(reference_steps, len(reference_steps))
    step_similarity = util.pytorch_cos_sim(candidate_emb, reference_emb).cpu().numpy()

    sent_score, chosen_steps = self.best_matching_subset(candidate_steps, reference_steps, step_similarity)
    assert len(chosen_steps) == len(reference_steps)

    # Similarity between the selected steps as one text and the full ground truth.
    chosen_emb = embed('\n'.join(chosen_steps), 1)
    truth_emb = embed(ground_truth, 1)
    global_similarity = util.pytorch_cos_sim(chosen_emb, truth_emb).cpu().numpy()[0][0]

    return float(sent_score / len(chosen_steps) * global_similarity)
289
+
290
def test_1(self, instance_responses):
    """Score every annotated response against the first stored reference.

    Returns:
        A list with one ``nl_overlap`` score per entry of
        ``instance_responses``, in the same order.
    """
    references = self.references
    return [self.nl_overlap(candidate, references[0]) for candidate in instance_responses]
297
+
298
def calc_thresh_pass(self, raw_scores, threshold=0.65):
    """Turn raw scores into pass@1, pass@10, pass@25 and pass@50.

    A score counts as correct when it is strictly greater than
    ``threshold``. All scores are treated as samples of a single problem
    and handed to ``estimate_pass_at_k`` once per value of k.

    Returns:
        The tuple ``(pass1, pass10, pass25, pass50)``.
    """
    num_correct = sum(1.0 * (score > threshold) for score in raw_scores)
    num_samples = len(raw_scores)
    pass1, pass10, pass25, pass50 = (
        self.estimate_pass_at_k([num_samples], [num_correct], k).mean()
        for k in (1, 10, 25, 50)
    )
    return pass1, pass10, pass25, pass50
307
+
308
+
309
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION)
class EuclideaGameEval(evaluate.Metric):
    """`evaluate` metric wrapping ``GeometryEvaluator``.

    Predictions are generated natural-language solutions; they are all
    scored against the first reference and summarised as pass@k values.
    """

    def _info(self):
        """Describe the metric and the input columns it accepts."""
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            # Both inputs are natural-language solution texts: _compute passes
            # them to GeometryEvaluator, which splits and embeds them as strings.
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            )
        )

    def _compute(self, predictions, references):
        """Score ``predictions`` against ``references[0]``.

        Returns:
            A dict with the keys ``pass@1``, ``pass@10``, ``pass@25`` and
            ``pass@50``, computed with ``DEFAULT_THRESHOLD``.
        """
        ge = GeometryEvaluator(responses=predictions, references=references)
        # Only the annotated solutions are needed; tools and symbols are dropped.
        tmp_, _, _ = ge.evaluate()
        pass1, pass10, pass25, pass50 = ge.calc_thresh_pass(ge.test_1(tmp_), DEFAULT_THRESHOLD)
        return {
            "pass@1": pass1,
            "pass@10": pass10,
            "pass@25": pass25,
            "pass@50": pass50
        }
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
git+https://github.com/huggingface/evaluate@a4bdc10c48a450b978d91389a48dbb5297835c7d
scikit-learn
scipy
Levenshtein
# Imported directly by euclideagameeval.py but previously not declared.
sentence-transformers
torch
numpy