Spaces:
Running
Running
update check_correctness
Browse files
utils.py
CHANGED
@@ -31,6 +31,7 @@ def check_correctness(sample, generation, timeout, debug=True):
|
|
31 |
print(f"global timeout")
|
32 |
return result[0]
|
33 |
|
|
|
34 |
def evaluate_generations(generations: list, level: str = "all", debug: bool = False):
|
35 |
"""We take the list of code generations and try to compile them
|
36 |
and the run their corresponding unit tests which are retrieved from the APPS dataset.
|
@@ -57,7 +58,7 @@ def evaluate_generations(generations: list, level: str = "all", debug: bool = Fa
|
|
57 |
for o_idx, o in enumerate(problem_generations):
|
58 |
curr_res = [-2]
|
59 |
try:
|
60 |
-
curr_res =
|
61 |
if debug:
|
62 |
print(f"\nSuccessful compilation of task {index}!")
|
63 |
fixed = []
|
@@ -207,5 +208,5 @@ def compute_metrics(generations, level="all", k_list=[1, 10, 100], count_errors=
|
|
207 |
metrics = get_results(results, count_errors=count_errors, k_list=k_list)
|
208 |
return metrics
|
209 |
|
210 |
-
#import doctest
|
211 |
-
#doctest.testmod()
|
|
|
31 |
print(f"global timeout")
|
32 |
return result[0]
|
33 |
|
34 |
+
|
35 |
def evaluate_generations(generations: list, level: str = "all", debug: bool = False):
|
36 |
"""We take the list of code generations and try to compile them
|
37 |
and the run their corresponding unit tests which are retrieved from the APPS dataset.
|
|
|
58 |
for o_idx, o in enumerate(problem_generations):
|
59 |
curr_res = [-2]
|
60 |
try:
|
61 |
+
curr_res = check_correctness(sample, o, timeout=TIMEOUT, debug=debug)
|
62 |
if debug:
|
63 |
print(f"\nSuccessful compilation of task {index}!")
|
64 |
fixed = []
|
|
|
208 |
metrics = get_results(results, count_errors=count_errors, k_list=k_list)
|
209 |
return metrics
|
210 |
|
211 |
+
# import doctest
|
212 |
+
# doctest.testmod()
|