Jialin Song committed on
Commit
9d03b23
1 Parent(s): cbe9336

update apps_metric to provide outputs

Browse files
Files changed (2) hide show
  1. testing_util.py +53 -16
  2. utils.py +1 -1
testing_util.py CHANGED
@@ -54,7 +54,8 @@ def run_test(sample, test=None, debug=False):
54
  otherwise it'll just return an input and output pair.
55
  """
56
  # Disable functionalities that can make destructive changes to the test.
57
- reliability_guard()
 
58
 
59
  if debug:
60
  print(f"start = {datetime.now().time()}")
@@ -99,7 +100,7 @@ def run_test(sample, test=None, debug=False):
99
  if debug:
100
  print(f"type 0 compilation error = {e}")
101
  results.append(-2)
102
- return results
103
  signal.alarm(0)
104
 
105
  elif which_type == CODE_TYPE.standard_input:
@@ -156,6 +157,7 @@ def run_test(sample, test=None, debug=False):
156
  results.append(-2)
157
  return results
158
 
 
159
  for index, inputs in enumerate(in_outs["inputs"]):
160
  # JSON forces dictionaries to have string keys; this undoes this (assuming a singleton list)
161
  try:
@@ -200,6 +202,15 @@ def run_test(sample, test=None, debug=False):
200
 
201
  # reset the alarm
202
  signal.alarm(0)
 
 
 
 
 
 
 
 
 
203
  except Exception as e:
204
  signal.alarm(0)
205
  faulthandler.disable()
@@ -234,6 +245,10 @@ def run_test(sample, test=None, debug=False):
234
  results.append(-1)
235
  signal.alarm(0)
236
 
 
 
 
 
237
  if not passed:
238
  if debug:
239
  nl = "\n"
@@ -246,7 +261,12 @@ def run_test(sample, test=None, debug=False):
246
  if passed and debug:
247
  print(f"==> output = {output}, test outputs = {in_outs['outputs'][index]}")
248
 
249
- if custom_compare_(output, in_outs['outputs'][index]):
 
 
 
 
 
250
  tmp_result = True
251
  results.append(tmp_result)
252
  continue
@@ -391,26 +411,42 @@ def run_test(sample, test=None, debug=False):
391
  if not isinstance(inputs, list):
392
  print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
393
  else:
394
- print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
395
-
396
 
397
- return results
398
 
399
 
400
  def custom_compare_(output, ground_truth):
 
 
 
 
 
401
 
402
  if isinstance(output, list):
403
- output_1 = "\n".join(output)
404
- if stripped_string_compare(output_1, ground_truth):
405
- return True
 
 
 
406
 
407
- if isinstance(output, list):
408
- output_2 = [o.lstrip().rstrip() for o in output]
409
- output_2 = "\n".join(output_2)
410
- if stripped_string_compare(output_2, ground_truth):
411
- return True
 
 
 
 
 
 
 
 
 
412
 
413
- return False
414
 
415
  def stripped_string_compare(s1, s2):
416
  s1 = s1.lstrip().rstrip()
@@ -427,6 +463,7 @@ def call_method(method, inputs):
427
  # sys.setrecursionlimit(10000)
428
 
429
  # @patch('builtins.input', side_effect=inputs.split("\n"))
 
430
  @patch('builtins.open', mock_open(read_data=inputs))
431
  @patch('sys.stdin', StringIO(inputs))
432
  @patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
@@ -522,4 +559,4 @@ def reliability_guard(maximum_memory_bytes=None):
522
  sys.modules["joblib"] = None
523
  sys.modules["resource"] = None
524
  sys.modules["psutil"] = None
525
- sys.modules["tkinter"] = None
 
54
  otherwise it'll just return an input and output pair.
55
  """
56
  # Disable functionalities that can make destructive changes to the test.
57
+ # TODO: disable for now as it interferes with GPT-4 generation through gateway
58
+ # reliability_guard()
59
 
60
  if debug:
61
  print(f"start = {datetime.now().time()}")
 
100
  if debug:
101
  print(f"type 0 compilation error = {e}")
102
  results.append(-2)
103
+ return results, {}
104
  signal.alarm(0)
105
 
106
  elif which_type == CODE_TYPE.standard_input:
 
157
  results.append(-2)
158
  return results
159
 
160
+ program_outputs = {}
161
  for index, inputs in enumerate(in_outs["inputs"]):
162
  # JSON forces dictionaries to have string keys; this undoes this (assuming a singleton list)
163
  try:
 
202
 
203
  # reset the alarm
204
  signal.alarm(0)
205
+
206
+ program_outputs[index] = {
207
+ "pass": tmp_result,
208
+ "pass_pct": int(tmp_result),
209
+ "pass_res": [int(tmp_result)],
210
+ "output": output,
211
+ "input": inputs,
212
+ "ground_truth": in_outs["outputs"][index]
213
+ }
214
  except Exception as e:
215
  signal.alarm(0)
216
  faulthandler.disable()
 
245
  results.append(-1)
246
  signal.alarm(0)
247
 
248
+ program_outputs[index] = {"output": output}
249
+ program_outputs[index]["ground_truth"] = in_outs['outputs'][index]
250
+ program_outputs[index]["input"] = in_outs['inputs'][index]
251
+
252
  if not passed:
253
  if debug:
254
  nl = "\n"
 
261
  if passed and debug:
262
  print(f"==> output = {output}, test outputs = {in_outs['outputs'][index]}")
263
 
264
+ all_pass, pass_pct, pass_res = custom_compare_(output, in_outs['outputs'][index])
265
+ program_outputs[index]["pass"] = all_pass
266
+ program_outputs[index]["pass_pct"] = pass_pct
267
+ program_outputs[index]["pass_res"] = pass_res
268
+
269
+ if all_pass:
270
  tmp_result = True
271
  results.append(tmp_result)
272
  continue
 
411
  if not isinstance(inputs, list):
412
  print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
413
  else:
414
+ print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
 
415
 
416
+ return results, program_outputs
417
 
418
 
419
  def custom_compare_(output, ground_truth):
420
+
421
+ # TODO: split ground_truth and compare one by one
422
+ ground_truth_list = ground_truth.strip().split("\n")
423
+ correct = 0
424
+ res = []
425
 
426
  if isinstance(output, list):
427
+ for out, g_t in zip(output, ground_truth_list):
428
+ if out.strip() == g_t.strip():
429
+ correct += 1
430
+ res.append(1)
431
+ else:
432
+ res.append(0)
433
 
434
+ return correct == len(ground_truth_list), correct / len(ground_truth_list), res
435
+
436
+ return False, 0.0, []
437
+
438
+ # if isinstance(output, list):
439
+ # output_1 = "\n".join(output)
440
+ # if stripped_string_compare(output_1, ground_truth):
441
+ # return True
442
+
443
+ # if isinstance(output, list):
444
+ # output_2 = [o.lstrip().rstrip() for o in output]
445
+ # output_2 = "\n".join(output_2)
446
+ # if stripped_string_compare(output_2, ground_truth):
447
+ # return True
448
 
449
+ # return False
450
 
451
  def stripped_string_compare(s1, s2):
452
  s1 = s1.lstrip().rstrip()
 
463
  # sys.setrecursionlimit(10000)
464
 
465
  # @patch('builtins.input', side_effect=inputs.split("\n"))
466
+ @patch('builtins.input', lambda *args: next(inputs_line_iterator))
467
  @patch('builtins.open', mock_open(read_data=inputs))
468
  @patch('sys.stdin', StringIO(inputs))
469
  @patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
 
559
  sys.modules["joblib"] = None
560
  sys.modules["resource"] = None
561
  sys.modules["psutil"] = None
562
+ sys.modules["tkinter"] = None
utils.py CHANGED
@@ -48,7 +48,7 @@ def evaluate_generations(generations: list, indices: list = [], level: str = "al
48
  """
49
 
50
  # generations are code generations in the same order of the dataset
51
- apps_eval = load_dataset(DATASET, split="test", difficulties=[level])
52
 
53
  if indices is None:
54
  indices = range(len(generations))
 
48
  """
49
 
50
  # generations are code generations in the same order of the dataset
51
+ apps_eval = load_dataset(DATASET, level, split="train")
52
 
53
  if indices is None:
54
  indices = range(len(generations))