DONG19 commited on
Commit
e117945
1 Parent(s): 07b033d
calculate_memory_usage.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import glob
4
+ import numpy as np
5
+ import argparse
6
+ from code_efficiency_calculator import run_model_task
7
+
8
+
9
+ def calculate_memory_usage(dat_file_path):
10
+ with open(dat_file_path, 'r') as file:
11
+ prev_time = 0
12
+ prev_mem_mb = 0
13
+ mem_time_mb_s = 0
14
+ next(file)
15
+ for line in file:
16
+ if "__main__." in line:
17
+ continue
18
+ parts = line.split()
19
+ mem_in_mb = float(parts[1])
20
+ timestamp = float(parts[2])
21
+ if prev_time > 0:
22
+ time_interval_s = timestamp - prev_time
23
+ mem_time_mb_s += (prev_mem_mb + mem_in_mb) / 2 * time_interval_s
24
+ prev_time = timestamp
25
+ prev_mem_mb = mem_in_mb
26
+ return mem_time_mb_s
27
+
28
+
29
def calculate_runtime(dat_file_path):
    """Return the wall-clock span (s) between first and last sample in a .dat file.

    The header line is skipped and ``__main__.`` lines are ignored.  Returns 0
    when there are no samples; also returns 0 for a fully empty file (the
    original raised StopIteration there).
    """
    start_time = float("inf")
    end_time = float("-inf")
    with open(dat_file_path, 'r') as file:
        # Skip the header; an empty file simply has no samples.
        if next(file, None) is None:
            return 0
        for line in file:
            if "__main__." in line:
                continue
            timestamp = float(line.split()[2])
            start_time = min(start_time, timestamp)
            end_time = max(end_time, timestamp)
    # With no samples end-start is -inf; clamp to 0, as before.
    return max(end_time - start_time, 0)
42
+
43
def report_max_memory_usage(dat_file_path):
    """Return the peak memory sample (MB) recorded in a profiler .dat file."""
    peak_mb = 0
    with open(dat_file_path, 'r') as fh:
        next(fh)  # header line
        for row in fh:
            if "__main__." in row:
                continue
            sample_mb = float(row.split()[1])
            if sample_mb > peak_mb:
                peak_mb = sample_mb
    return peak_mb
54
+
55
def _track_top10(bucket, score, tag):
    """Keep the 10 largest `score` keys seen so far in `bucket` (score -> tag)."""
    if len(bucket) < 10 or score > min(bucket.keys()):
        if len(bucket) >= 10:
            del bucket[min(bucket.keys())]
        bucket[score] = tag


def report_results(task, model):
    """Profile `model`'s generated code for `task` and print one efficiency row.

    Compares each successfully profiled completion against the canonical
    solution on execution time (ET/NET), peak memory (MU/NMU) and total
    memory-over-time (TMU/NTMU), then prints a LaTeX-style table row:
    ET & NET & max NET & %NET>5 & MU & NMU & max NMU & %NMU>5
    & TMU & NTMU & max NTMU & %NTMU>5 & pass@1.
    """
    # BUG FIX: run_model_task is declared as (model, task); the original
    # passed (task, model), so the profiler wrote into
    # "./results/{model}_{task}" while this function read
    # "./results/{task}_{model}".
    run_model_task(model, task)
    dat_directory = f"./results/{task}_{model}"
    canonical_solution_directory = f"./results/{task}_canonical_solution"

    # Per-problem metrics of the canonical (reference) solutions.
    canonical_solution_memory_usage = {}
    canonical_solution_execution_time = {}
    canonical_solution_max_memory_usage = {}
    for dat_file in glob.glob(os.path.join(canonical_solution_directory, "*.dat")):
        try:
            problem_idx = int(os.path.basename(dat_file).split('.')[0])
            canonical_solution_memory_usage[problem_idx] = calculate_memory_usage(dat_file)
            canonical_solution_execution_time[problem_idx] = calculate_runtime(dat_file)
            canonical_solution_max_memory_usage[problem_idx] = report_max_memory_usage(dat_file)
        except Exception:
            # Unparseable .dat files are skipped, as before (was a bare except).
            continue

    global_result = {}

    # Per-problem metrics of the model completions.
    completion_memory_usage = {}
    execution_time = {}
    max_memory_usage = {}
    task_idx = {}
    for dat_file in glob.glob(os.path.join(dat_directory, "*.dat")):
        try:
            problem_idx = int(os.path.basename(dat_file).split('.')[0])
            completion_memory_usage[problem_idx] = calculate_memory_usage(dat_file)
            execution_time[problem_idx] = calculate_runtime(dat_file)
            max_memory_usage[problem_idx] = report_max_memory_usage(dat_file)
            task_idx[problem_idx] = dat_file
        except Exception:
            print(dat_file)
    global_result[model] = {
        "completion_memory_usage": completion_memory_usage,
        "execution_time": execution_time,
        "max_memory_usage": max_memory_usage,
        "task_idx": task_idx,
    }

    # Top-10 worst normalized scores (kept for manual inspection).
    max_net_lists = {}
    max_nmu_lists = {}
    max_ntmu_lists = {}

    for model in global_result.keys():
        completion_memory_usage = global_result[model]["completion_memory_usage"]
        execution_time = global_result[model]["execution_time"]
        max_memory_usage = global_result[model]["max_memory_usage"]

        total_execution_time = 0
        normalized_execution_time = 0
        total_max_memory_usage = 0
        normalized_max_memory_usage = 0
        total_memory_usage = 0
        normalized_memory_usage = 0
        normalized_execution_time_list = []
        normalized_max_memory_usage_list = []
        normalized_memory_usage_list = []
        max_net = float("-inf")
        max_nmu = float("-inf")
        max_tmu = float("-inf")
        # Counts of completions >5x worse than the canonical solution.
        total_500_net = 0
        total_500_nmu = 0
        total_500_tmu = 0

        for idx in completion_memory_usage.keys():
            if idx not in canonical_solution_memory_usage.keys():
                continue
            # Guard: a zero canonical metric (e.g. an empty profile) would
            # otherwise abort the whole report with ZeroDivisionError.
            if (canonical_solution_execution_time[idx] == 0
                    or canonical_solution_memory_usage[idx] == 0
                    or canonical_solution_max_memory_usage[idx] == 0):
                continue

            total_memory_usage += completion_memory_usage[idx]
            total_execution_time += execution_time[idx]
            total_max_memory_usage += max_memory_usage[idx]

            # Normalized ratios: completion metric / canonical metric.
            net = execution_time[idx] / canonical_solution_execution_time[idx]
            nmu = completion_memory_usage[idx] / canonical_solution_memory_usage[idx]
            ntmu = max_memory_usage[idx] / canonical_solution_max_memory_usage[idx]

            if net > 5:
                total_500_net += 1
            max_net = max(max_net, net)
            normalized_execution_time += net
            normalized_execution_time_list.append(net)

            if ntmu > 5:
                total_500_nmu += 1
            max_nmu = max(max_nmu, ntmu)
            normalized_max_memory_usage += ntmu
            normalized_max_memory_usage_list.append(ntmu)

            if nmu > 5:
                total_500_tmu += 1
            max_tmu = max(max_tmu, nmu)
            normalized_memory_usage += nmu
            normalized_memory_usage_list.append(nmu)

            _track_top10(max_net_lists, net, (model, idx))
            _track_top10(max_nmu_lists, nmu, (model, idx))
            _track_top10(max_ntmu_lists, ntmu, (model, idx))

        if len(normalized_execution_time_list) == 0:
            print(model)
            continue
        n_scored = len(normalized_execution_time_list)
        normalized_execution_time = normalized_execution_time / n_scored
        normalized_max_memory_usage = normalized_max_memory_usage / n_scored
        normalized_memory_usage = normalized_memory_usage / n_scored
        total_execution_time = total_execution_time / n_scored
        total_memory_usage = total_memory_usage / n_scored
        total_max_memory_usage = total_max_memory_usage / n_scored

        # NOTE(review): hard-coded benchmark size — presumably EffiBench's
        # 1000 problems; confirm before reusing with other tasks.
        pass1 = len(completion_memory_usage) / 1000 * 100

        total_500_net = total_500_net / n_scored * 100
        total_500_nmu = total_500_nmu / n_scored * 100
        total_500_tmu = total_500_tmu / n_scored * 100

        print(f"{model}&{total_execution_time:.2f}&{normalized_execution_time:.2f}&{max_net:.2f}&{total_500_net:.1f}&{total_max_memory_usage:.2f}&{normalized_max_memory_usage:.2f}&{max_nmu:.2f}&{total_500_nmu:.1f}&{total_memory_usage:.2f}&{normalized_memory_usage:.2f}&{max_tmu:.2f}&{total_500_tmu:.1f}&{pass1:.1f}\\\\")
213
+
214
if __name__ == "__main__":
    # CLI entry point: python calculate_memory_usage.py --task ... --model ...
    cli = argparse.ArgumentParser()
    cli.add_argument("--task", type=str, default="EffiBench")
    cli.add_argument("--model", type=str, default="gpt-4")
    opts = cli.parse_args()
    report_results(opts.task, opts.model)
code_efficiency_calculator.py ADDED
@@ -0,0 +1,655 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from execution import check_correctness
4
+ import copy
5
+ import argparse
6
+ from tqdm import tqdm
7
+ import subprocess
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ import concurrent.futures
10
+ import os
11
+ import re
12
+ import shutil
13
+ import contextlib
14
+ import random
15
+ import json
16
+ from typing import Optional, Callable, Dict
17
+ from concurrent.futures import ThreadPoolExecutor, as_completed
18
+ import inspect
19
+ import numpy as np
20
+ import sys
21
+ import concurrent.futures
22
+ import time
23
+ from tqdm import tqdm
24
+ import contextlib
25
+ import faulthandler
26
+ import io
27
+ import os
28
+ import multiprocessing
29
+ import platform
30
+ import signal
31
+ from tqdm import tqdm
32
+
33
+
34
class TimeoutException(Exception):
    """Raised when a guarded block exceeds its wall-clock time limit."""
36
class WriteOnlyStringIO(io.StringIO):
    """An in-memory text buffer that can be written but never read.

    Every read entry point raises IOError, so sandboxed code may emit
    output without being able to consume from the stream.
    """

    def _refuse(self, *args, **kwargs):
        raise IOError

    # All read entry points share the same refusal behaviour.
    read = _refuse
    readline = _refuse
    readlines = _refuse

    def readable(self, *args, **kwargs):
        """Always False: this stream cannot be read."""
        return False
51
class redirect_stdin(contextlib._RedirectStream):  # type: ignore
    """Temporarily rebind sys.stdin (contextlib ships redirect_stdout /
    redirect_stderr but no stdin counterpart)."""
    _stream = 'stdin'
53
+
54
@contextlib.contextmanager
def swallow_io():
    """Silence stdout/stderr and block stdin for the duration of the block.

    Output is discarded into a WriteOnlyStringIO; any attempt by the guarded
    code to read stdin raises IOError.
    """
    sink = WriteOnlyStringIO()
    with contextlib.redirect_stdout(sink), \
         contextlib.redirect_stderr(sink), \
         redirect_stdin(sink):
        yield
61
+
62
@contextlib.contextmanager
def time_limit(seconds: float):
    """Raise TimeoutException if the guarded block runs longer than `seconds`.

    Unix-only (SIGALRM) and must run on the main thread.

    BUG FIX: the handler is now installed *before* the interval timer is
    started; the original started the timer first, so a very short timeout
    could fire while the previous SIGALRM handler was still active.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")
    signal.signal(signal.SIGALRM, signal_handler)
    signal.setitimer(signal.ITIMER_REAL, seconds)
    try:
        yield
    finally:
        # Always disarm the timer, even if the block raised.
        signal.setitimer(signal.ITIMER_REAL, 0)
72
+
73
# Source templates prepended to LeetCode-style solutions so that problems
# referencing linked lists / binary trees run stand-alone.
# NOTE(review): the indentation inside these code templates was
# reconstructed from a whitespace-mangled copy of the file — confirm
# against the upstream repository.
ListNode_text = """
class ListNode:
    def __init__(self, val=0, next=None):
        self.val = val
        self.next = next
"""
TreeNode_text = """
class TreeNode:
    def __init__(self, val=0, left=None, right=None, next=None):
        self.val = val
        self.left = left
        self.right = right
        self.next = next
"""

# Blanket import preamble prepended to every assembled script so generated
# solutions can use common stdlib helpers without declaring their own imports.
import_pkg = """
from typing import *
from bisect import *
from collections import *
from copy import *
from datetime import *
from heapq import *
from math import *
from re import *
from string import *
from random import *
from itertools import *
from functools import *
from operator import *

import string
import re
import datetime
import collections
import heapq
import bisect
import copy
import math
import random
import itertools
import functools
import operator
"""
116
+
117
# Post-processing snippet appended to memory_profiler-instrumented scripts:
# parses the @profile tables accumulated in `profile_stream`, averages
# repeated hits per source line, and prints one merged table to stdout.
# NOTE(review): indentation (and the column-template spacing) inside this
# code template was reconstructed from a whitespace-mangled copy of the
# file — confirm against the upstream repository before relying on it.
memory_profiler_prompt = r"""
def parse_profile_table(profile_table: str):
    table = {"filename": None, "rows": []}
    for line in profile_table.strip().split("\n"):
        if line.startswith("Filename:"):
            table["filename"] = line.split(": ")[1]
        elif re.match(r"^\s*\d+", line):
            parts = re.split(r"\s{2,}", line.strip(), maxsplit=4)
            if len(parts) == 5 and "iB" in parts[1] and "iB" in parts[2]:
                table["rows"].append({
                    "line": int(parts[0]),
                    "mem_usage": parts[1],
                    "increment": parts[2],
                    "occurrences": int(parts[3]),
                    "line_contents": parts[4],
                })
            else:
                parts = re.split(r"\s{2,}", line.strip(), maxsplit=1)
                table["rows"].append({
                    "line": int(parts[0]),
                    "line_contents": parts[1] if len(parts) == 2 else "",
                })
    return table

def print_averaged_results(profile_log: str, precision: int = 1):
    tables = [parse_profile_table(table) for table in profile_log.split("\n\n\n")]
    averaged_table = defaultdict(lambda: defaultdict(list))

    for table in tables:
        filename = table["filename"]
        for row in table["rows"]:
            line = row["line"]
            if "mem_usage" in row:
                mem_usage = float(row["mem_usage"].split()[0])
                increment = float(row["increment"].split()[0])
                occurrences = row["occurrences"]
                averaged_table[filename][line].append((mem_usage, increment, occurrences))
            else:
                averaged_table[filename][line].append(tuple())

    stream = sys.stdout
    template = '{0:>6} {1:>12} {2:>12} {3:>10} {4:<}'

    for filename, lines in averaged_table.items():
        header = template.format('Line #', 'Mem usage', 'Increment', 'Occurrences', 'Line Contents')

        stream.write(u'Filename: ' + filename + '\n\n')
        stream.write(header + u'\n')
        stream.write(u'=' * len(header) + '\n')

        all_lines = linecache.getlines(filename)

        float_format = u'{0}.{1}f'.format(precision + 4, precision)
        template_mem = u'{0:' + float_format + '} MiB'

        for lineno, mem_values in lines.items():
            # TODO: should average the rest or not?
            # mem_values = [(50.1, 0.0, 4), (51.1, 0.0, 6), ()]
            if any([len(m) == 0 for m in mem_values]):
                tmp = template.format(lineno, "", "", "", all_lines[lineno - 1])
            else:
                mem_usage_sum = sum(m[0] for m in mem_values)
                increment_sum = sum(m[1] for m in mem_values)
                occurrences_sum = sum(m[2] for m in mem_values)
                count = len(mem_values)

                avg_mem_usage = mem_usage_sum / count
                avg_increment = increment_sum / count
                avg_occurrences = occurrences_sum / count

                avg_mem_usage_str = template_mem.format(avg_mem_usage)
                avg_increment_str = template_mem.format(avg_increment)

                tmp = template.format(lineno, avg_mem_usage_str, avg_increment_str, int(avg_occurrences), all_lines[lineno - 1])
            stream.write(tmp)

print_averaged_results(profile_stream.getvalue(), precision=PROFILE_PRECISION)
"""
195
+
196
# Import preamble injected ahead of memory_profiler-instrumented code:
# the @profile decorator writes into `profile_stream` (an in-memory buffer)
# rather than stdout, with `PROFILE_PRECISION` decimal places.
memory_profiler_pkgs = r"""
from collections import defaultdict, deque
from memory_profiler import profile
import io
profile_stream = io.StringIO()
PROFILE_PRECISION = 1
import re
import sys
import linecache
"""
206
+
207
+
208
+ def calculate_memory_usage(dat_file_path):
209
+ with open(dat_file_path, 'r') as file:
210
+ prev_time = 0
211
+ prev_mem_mb = 0
212
+ mem_time_mb_s = 0
213
+ next(file)
214
+ for line in file:
215
+ if not line.startswith('MEM'):
216
+ continue # Skip any line that does not start with 'MEM'
217
+ parts = line.split()
218
+ mem_in_mb = float(parts[1])
219
+ timestamp = float(parts[2])
220
+ if prev_time > 0:
221
+ time_interval_s = timestamp - prev_time
222
+ mem_time_mb_s += (prev_mem_mb + mem_in_mb) / 2 * time_interval_s
223
+ prev_time = timestamp
224
+ prev_mem_mb = mem_in_mb
225
+ return mem_time_mb_s
226
+
227
+
228
def calculate_runtime(dat_file_path):
    """Elapsed wall-clock seconds between the first and last MEM sample."""
    earliest = float("inf")
    latest = float("-inf")
    with open(dat_file_path, 'r') as fh:
        next(fh)  # header line
        for row in fh:
            if not row.startswith('MEM'):
                continue  # only 'MEM' sample lines carry timestamps
            sample_ts = float(row.split()[2])
            earliest = min(earliest, sample_ts)
            latest = max(latest, sample_ts)
    # No samples leaves latest-earliest negative; clamp to zero.
    return max(latest - earliest, 0)
241
+
242
def report_max_memory_usage(dat_file_path):
    """Return the peak memory sample (MB) from a profiler .dat file.

    Only ``MEM <mb> <timestamp>`` lines are considered; the header line is
    skipped.  Returns 0 for a file without samples.

    Cleanup: removed the dead locals (`prev_time`, `prev_mem_mb`,
    `mem_time_mb_s`) copy-pasted from calculate_memory_usage.
    """
    max_memory_usage = 0
    with open(dat_file_path, 'r') as file:
        next(file)
        for line in file:
            if not line.startswith('MEM'):
                continue
            max_memory_usage = max(max_memory_usage, float(line.split()[1]))
    return max_memory_usage
256
+
257
def _write_profiled(file, lines, trigger, decorate_trigger, nested_decorator):
    """Copy `lines` into `file`, inserting line_profiler @profile decorators.

    When a line starts (after indentation) with `trigger`, optionally write a
    bare '@profile' before it and start tracking its block; while inside that
    block, every nested 'def ' gets `nested_decorator` written above it at
    the trigger's indent.
    """
    inside_block = False
    block_indent = 0
    for line in lines:
        stripped_line = line.lstrip()
        if stripped_line.startswith(trigger):
            inside_block = True
            block_indent = len(line) - len(stripped_line)
            if decorate_trigger:
                file.write('@profile\n')
            file.write(line)
            continue
        if inside_block:
            # A non-blank line at (or left of) the trigger's indent ends the block.
            if stripped_line and not line[block_indent].isspace():
                inside_block = False
            elif stripped_line.startswith("def "):
                file.write(' ' * block_indent + nested_decorator)
        file.write(line)


def add_profile_decorator_to_python_file(file_path, entry_point):
    """Rewrite `file_path` in place, adding @profile decorators for kernprof.

    HumanEval/MBPP files (detected from the path) decorate the entry-point
    function and any defs nested in it; other files decorate the methods of
    ``class Solution``.  Failures are deliberately swallowed so profiling is
    best-effort.

    Cleanup: the byte-identical "humaneval" and "mbpp" branches were merged
    and a stray no-op ``entry_point`` expression statement was removed.
    """
    try:
        with open(file_path, 'r') as file:
            lines = file.readlines()
        if "humaneval" in file_path or "mbpp" in file_path:
            with open(file_path, 'w') as file:
                _write_profiled(file, lines, f"def {entry_point}",
                                decorate_trigger=True, nested_decorator='@profile\n')
        else:
            # NOTE(review): the nested-method decorator indent (4 spaces past
            # the class indent) was reconstructed from a whitespace-mangled
            # copy — confirm against upstream.
            with open(file_path, 'w') as file:
                _write_profiled(file, lines, "class Solution",
                                decorate_trigger=False, nested_decorator='    @profile\n')
    except Exception:
        # Best-effort: a malformed file keeps whatever was already written.
        pass
319
+
320
def add_profile_for_memory_profiler(code_string, data):
    """Return `code_string` with a memory_profiler @profile decorator inserted.

    HumanEval/MBPP entries (anything with a "task_id") decorate every
    occurrence of the entry-point def; LeetCode-style entries decorate the
    first method of ``class Solution``.  On any error the code is returned
    unmodified.

    Cleanup: the two "task_id" branches of the original were byte-identical
    (only the HumanEval check differed) and have been merged.
    """
    decorator = '@profile(stream=profile_stream, precision=PROFILE_PRECISION)'
    try:
        lines = code_string.split('\n')
        new_lines = []
        if "task_id" in data.keys():
            # Both HumanEval and MBPP carry an explicit entry point.
            entry_point = data["entry_point"]
            for line in lines:
                stripped_line = line.lstrip()
                if stripped_line.startswith(f"def {entry_point}"):
                    indent = len(line) - len(stripped_line)
                    new_lines.append(' ' * indent + decorator)
                new_lines.append(line)
            return '\n'.join(new_lines)
        # LeetCode-style: decorate only the first method of class Solution.
        inside_class = False
        class_indent = 0
        first_function = True
        for line in lines:
            stripped_line = line.lstrip()
            if stripped_line.startswith("class Solution"):
                inside_class = True
                class_indent = len(line) - len(stripped_line)
                new_lines.append(line)
                continue
            if inside_class:
                if stripped_line and not line[class_indent].isspace():
                    inside_class = False
                elif stripped_line.startswith("def ") and first_function:
                    # NOTE(review): method indent reconstructed as
                    # class indent + 4 spaces — confirm against upstream.
                    new_lines.append(' ' * class_indent + '    ' + decorator)
                    first_function = False
            new_lines.append(line)
        return '\n'.join(new_lines)
    except Exception:
        return code_string
377
+
378
def calculate_line_efficiency(completion_file, entry_point):
    """Run line_profiler (kernprof) over a copy of `completion_file` and
    return the textual per-line profiling report.

    The script is copied to ``*_tmp.py``, decorated via
    add_profile_decorator_to_python_file, profiled under a 10 s timeout, and
    the ``line_profiler`` report is read back.  On a subprocess failure an
    error-description string is returned instead of raising.
    """
    try:
        path, filename = os.path.split(completion_file)
        tmp_py_script_filename = f"{filename.split('.')[0]}_tmp.py"
        tmp_py_script = os.path.join(path, tmp_py_script_filename)
        tmp_lprof_filename = f"{tmp_py_script_filename}.lprof"  # expected .lprof output name

        # Copy the original script to the temp file, then add @profile decorators.
        subprocess.run(['cp', completion_file, tmp_py_script], check=True, capture_output=True, text=True)
        add_profile_decorator_to_python_file(tmp_py_script, entry_point)

        # Profile with a hard 10-second cap.
        subprocess.run(['timeout', "10", 'kernprof', '-l', tmp_py_script_filename], cwd=path, capture_output=True, text=True, check=True)
        # Generate the profiling report.
        overhead_dir = path
        # os.makedirs(overhead_dir, exist_ok=True)
        report_file = os.path.join(overhead_dir, tmp_py_script_filename.replace('.py', '.txt'))
        with open(report_file, 'w') as f:
            subprocess.run(['timeout', "10", 'python', '-m', 'line_profiler', tmp_lprof_filename], cwd=path, stdout=f)
        with open(report_file, 'r') as f:
            report_content = f.read()
        # print(report_content)

    except subprocess.CalledProcessError as e:
        # print(f"Error during the execution: {e}")
        report_content = f"Error during the execution: {e}"

    # Clean up the temporary script and its .lprof output.
    if os.path.exists(tmp_py_script):
        os.remove(tmp_py_script)
    if os.path.exists(f"{tmp_py_script}.lprof"):
        os.remove(f"{tmp_py_script}.lprof")

    return report_content
411
+
412
def humaneval_add_string_to_py_file(data, evaluation_code=False, path="./tmp/"):
    """Assemble a runnable HumanEval script and write it if it passes.

    Builds imports + prompt + completion + tests, verifies the result with
    check_correctness, and only then writes ``{path}/{problem_idx}.py``.
    When `path` targets the canonical-solution directory, the reference
    solution replaces the model completion.

    Returns (written_path_or_None, full_code).
    """
    if "canonical_solution" in path:
        data["completion"] = data["canonical_solution"]
    test_case = data["small_test_cases"] if evaluation_code else data["test"]
    problem_idx = data["task_id"].split("/")[1]
    return_path, full_code = None, ""
    # Drop any print() lines from the completion before assembling.
    data["completion"] = "\n".join(
        src for src in data["completion"].split("\n") if "print(" not in src
    )
    try:
        # Strip a markdown code fence if the model emitted one.
        completion = data["completion"]
        fence = "```python"
        if fence in completion:
            completion = completion[completion.find(fence) + len(fence):]
        if "```" in completion:
            completion = completion[:completion.find("```")]
        data["completion"] = completion
        full_code = import_pkg + "\n" + data["prompt"] + "\n" + data["completion"] + "\n" + test_case
        verdict = check_correctness(full_code, timeout=10.0)
        if verdict["passed"]:
            return_path = f"./{path}/{problem_idx}.py"
            with open(return_path, "w") as f:
                f.write(full_code)
    except Exception:
        pass
    return return_path, full_code
453
+
454
+
455
def mbpp_add_string_to_py_file(data, evaluation_code=False, path="./tmp/"):
    """Assemble a runnable MBPP script and write it if it passes.

    Builds test imports + completion + tests, verifies with
    check_correctness, then writes ``{path}/{problem_idx}.py``.  When `path`
    targets the canonical-solution directory, the reference code replaces
    the model completion.

    Returns (written_path_or_None, full_code).
    """
    if "canonical_solution" in path:
        data["completion"] = data["code"]
    test_case = "\n".join(data["test_list"]) if evaluation_code else data["test"]
    problem_idx = str(data["task_id"])
    return_path, full_code = None, ""
    # Drop any print() lines from the completion before assembling.
    data["completion"] = "\n".join(
        src for src in data["completion"].split("\n") if "print(" not in src
    )
    try:
        # Strip a markdown code fence if the model emitted one.
        completion = data["completion"]
        fence = "```python"
        if fence in completion:
            completion = completion[completion.find(fence) + len(fence):]
        if "```" in completion:
            completion = completion[:completion.find("```")]
        data["completion"] = completion
        full_code = "\n".join(data["test_imports"]) + "\n" + data["completion"] + "\n" + test_case
        verdict = check_correctness(full_code, timeout=10.0)
        if verdict["passed"]:
            return_path = f"./{path}/{problem_idx}.py"
            with open(return_path, "w") as f:
                f.write(full_code)
    except Exception:
        pass
    return return_path, full_code
494
+
495
def add_string_to_py_file(data, evaluation_code=False, path="./tmp/"):
    """Assemble a runnable LeetCode-style script and write it if it passes.

    Only completions containing ``class Solution`` are handled: the script
    is imports + ListNode/TreeNode templates + completion +
    ``solution=Solution()`` + at most the first 100 test lines.  When `path`
    targets the canonical-solution directory, the reference solution
    replaces the model completion.

    Returns (written_path_or_None, full_code).
    """
    if "canonical_solution" in path:
        data["completion"] = data["canonical_solution"]
    test_case = data["small_test_cases"] if evaluation_code else data["test_case"]
    problem_idx = data["problem_idx"]
    return_path, full_code = None, ""
    # Drop any print() lines from the completion before assembling.
    data["completion"] = "\n".join(
        src for src in data["completion"].split("\n") if "print(" not in src
    )
    try:
        if "class Solution" in data["completion"]:
            # Strip a markdown code fence if the model emitted one.
            completion = data["completion"]
            if "```python" in completion:
                completion = completion[completion.find("```python") + 9:]
            if "```" in completion:
                completion = completion[:completion.find("```")]
            data["completion"] = completion
            # Keep at most the first 100 test lines.
            test_case = "\n".join(test_case.split("\n")[:100])
            full_code = (import_pkg + "\n" + TreeNode_text + "\n" + ListNode_text
                         + "\n" + data["completion"] + "\nsolution=Solution()\n" + test_case)
            verdict = check_correctness(full_code, timeout=10.0)
            if verdict["passed"]:
                return_path = f"./{path}/{problem_idx}.py"
                with open(return_path, "w") as f:
                    f.write(full_code)
        else:
            return_path = None
    except Exception:
        pass
    return return_path, full_code
540
+
541
def calculate_code_execution_efficiency(data, evaluation_code=False, path="./tmp/", max_execution_time=10):
    """Build, run, and memory-profile one problem's solution.

    Dispatches to the HumanEval / MBPP / LeetCode assembler, then runs the
    written script under ``./run_code.sh`` (which samples memory into a .dat
    file) and summarizes the profile.

    Returns (overhead_text, memory_usage_mb_s, execution_time_s,
             max_memory_usage_mb, executable).
    """
    entry_point = ""
    try:
        if "task_id" in data.keys() and "HumanEval" in str(data["task_id"]):
            problem_idx = data["task_id"].split("/")[1]
            completion_file, full_code = humaneval_add_string_to_py_file(data, evaluation_code=evaluation_code, path=path)
            entry_point = data["entry_point"]
        elif "dataset" in data.keys() and data["dataset"] == "mbpp":
            problem_idx = data["task_id"]
            completion_file, full_code = mbpp_add_string_to_py_file(data, evaluation_code=evaluation_code, path=path)
            # MBPP carries no explicit entry point: recover it from the
            # reference code, falling back to the first asserted call.
            match = re.search(r"def\s+(\w+)\s*\(", data["code"])
            if match:
                entry_point = match.group(1)
            else:
                match = re.search(r"assert\s+(\w+)\s*\(", data["test_list"][0])
                if match:
                    entry_point = match.group(1)
                else:
                    # BUG FIX: the original had `completion_file== None`, a
                    # no-op comparison; the intent is to mark the build failed.
                    completion_file = None
        else:
            problem_idx = data["problem_idx"]
            completion_file, full_code = add_string_to_py_file(data, evaluation_code=evaluation_code, path=path)
    except Exception:
        completion_file = None

    if completion_file is None:
        overhead = f"""
The code execution failed.
"""
        return overhead, 0, 0, 0, False

    script_path = './run_code.sh'
    completion_dat_file = f'./{path}/{problem_idx}.dat'
    try:
        subprocess.run([script_path, completion_file, completion_dat_file, str(max_execution_time)],
                       check=True, capture_output=True, text=True)
        canonical_solution_memory_usage = calculate_memory_usage(completion_dat_file)
        canonical_solution_execution_time = calculate_runtime(completion_dat_file)
        canonical_solution_max_memory_usage = report_max_memory_usage(completion_dat_file)
        executable = True
        overhead = f"""
The total memory usage during the code execution is: {canonical_solution_memory_usage} MB*s.
The total execution time is: {canonical_solution_execution_time} s.
The maximum memory peak requirement is: {canonical_solution_max_memory_usage} MB.
"""
        return overhead, canonical_solution_memory_usage, canonical_solution_execution_time, canonical_solution_max_memory_usage, executable
    except Exception:
        overhead = f"""
The code execution failed.
"""
        return overhead, 0, 0, 0, False
605
+
606
+
607
def fetch_completion(dataset, model):
    """Profile every entry of `dataset` concurrently.

    `model` is — despite its name — the output directory, passed through as
    `path` to calculate_code_execution_efficiency; each worker receives a
    deep copy of its entry so concurrent mutation of the dicts is avoided.

    NOTE(review): each finished entry is REPLACED in `dataset` by the
    5-tuple returned by calculate_code_execution_efficiency, not an updated
    dict.  The only visible caller (run_model_task) ignores the return
    value, so only the side effect of writing .py/.dat files matters —
    confirm before reusing the return value.
    NOTE(review): `dataset.index(entry)` is O(n) per completion and assumes
    entries are unique.
    """
    with ThreadPoolExecutor() as executor:
        future_to_entry = {executor.submit(calculate_code_execution_efficiency, copy.deepcopy(entry), False, path=model, max_execution_time=10): entry for entry in tqdm(dataset)}
        for future in tqdm(concurrent.futures.as_completed(future_to_entry)):
            entry = future_to_entry[future]
            try:
                updated_entry = future.result()
                idx = dataset.index(entry)
                dataset[idx] = updated_entry
            except Exception as e:
                print(e)
    return dataset
619
+
620
+
621
def run_model_task(model, task):
    """Profile `model`'s generated solutions and the canonical solutions.

    Reads ``./{task}_{model}.json``, recreates ``./results/{task}_{model}``
    and ``./results/{task}_canonical_solution`` from scratch, then profiles
    the model completions and the canonical solutions into those
    directories.

    Cleanup: dropped the always-true ``os.path.exists(...) == False``
    checks that followed the rmtree calls.
    """
    # Strip the org prefix (e.g. "openai/gpt-4" -> "gpt-4") to match the
    # JSON/result-file naming used elsewhere.
    if "/" in model:
        model = model.split("/")[1]
    dat_path = f"./results/{task}_{model}"
    canonical_solution_path = f"./results/{task}_canonical_solution"
    with open(f"./{task}_{model}.json", "r") as f:
        dataset = json.load(f)

    # Start from empty result directories.
    if os.path.exists(dat_path):
        shutil.rmtree(dat_path)
    if os.path.exists(canonical_solution_path):
        shutil.rmtree(canonical_solution_path)
    os.makedirs(dat_path)
    os.makedirs(canonical_solution_path)

    fetch_completion(dataset, dat_path)

    # Re-read the JSON: fetch_completion mutates the list in place, so the
    # canonical pass needs a fresh copy of the dataset.
    with open(f"./{task}_{model}.json", "r") as f:
        dataset = json.load(f)
    for i in range(len(dataset)):
        # Tagging with the task routes entries through the right assembler.
        dataset[i]["dataset"] = f"{task}"
    fetch_completion(dataset, canonical_solution_path)
647
+
648
+
649
if __name__ == "__main__":
    # CLI entry point: python code_efficiency_calculator.py --model ... --task ...
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--model", type=str, default="gpt-4")
    arg_parser.add_argument("--task", type=str, default="EffiBench")
    cli_args = arg_parser.parse_args()
    run_model_task(cli_args.model, cli_args.task)
execution.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import contextlib
2
+ import faulthandler
3
+ import io
4
+ import multiprocessing
5
+ import os
6
+ import platform
7
+ import signal
8
+ import random
9
+ import subprocess
10
+ import tempfile
11
+ import gzip
12
+ import json
13
+ from typing import *
14
+
15
def dicts_to_jsonl(data_list: list, filename: str, compress: bool = True) -> None:
    """
    Method saves list of dicts into jsonl file.
    :param data_list: (list) list of dicts to be stored,
    :param filename: (str) path to the output file. If suffix .jsonl is not given then method appends
        .jsonl suffix into the file (and .gz when compressing).
    :param compress: (bool) should file be compressed into a gzip archive?
    """
    sjsonl = '.jsonl'
    sgz = '.gz'
    # Check filename
    if not filename.endswith(sjsonl):
        filename = filename + sjsonl
    # One serialized line per dict, shared by both output branches (the
    # original duplicated this loop body in each branch).
    lines = (json.dumps(ddict) + '\n' for ddict in data_list)
    if compress:
        filename = filename + sgz
        with gzip.open(filename, 'w') as compressed:
            for jout in lines:
                compressed.write(jout.encode('utf-8'))
    else:
        with open(filename, 'w') as out:
            for jout in lines:
                out.write(jout)
42
+
43
+
44
def check_correctness(
    full_code: str,
    timeout: float = 3.0,
    tmp_dir: str = "tmp",
) -> Dict:
    """
    Evaluates the functional correctness of a completion by running the test
    suite provided in the problem.

    Executes `full_code` in a child process with stdout/stderr swallowed, a
    wall-clock limit, and destructive OS functions disabled, then reports
    whether it ran to completion without raising.

    :param full_code: complete Python source (solution + tests) to exec.
    :param timeout: per-execution wall-clock limit in seconds.
    :param tmp_dir: accepted for API compatibility; the worker below ignores
        it and uses a fresh temporary directory instead.
    :return: dict with "result" (status string) and "passed" (bool).
    """

    def unsafe_execute(tmp_dir):
        # NOTE(review): `tmp_dir` is unused here — create_tempdir() makes its
        # own directory. Also, a closure as a Process target requires the
        # "fork" start method; confirm behavior on macOS/Windows ("spawn").
        with create_tempdir():

            # These system calls are needed when cleaning up tempdir.
            import os
            import shutil
            rmtree = shutil.rmtree
            rmdir = os.rmdir
            chdir = os.chdir

            # Disable functionalities that can make destructive changes to the test.
            reliability_guard()

            try:
                exec_globals = {}
                with swallow_io():
                    with time_limit(timeout):
                        # Untrusted model-generated code runs here; the guard
                        # above is a mitigation, NOT a security sandbox.
                        exec(full_code, exec_globals)
                result.append("passed")
            except TimeoutException:
                result.append("timed out")
            except AssertionError as e:
                result.append(f"failed: AssertionError")
            except BaseException as e:
                result.append(f"failed: {e}")

            # Needed for cleaning up: restore the functions reliability_guard
            # nulled out, so the TemporaryDirectory context can tear down.
            shutil.rmtree = rmtree
            os.rmdir = rmdir
            os.chdir = chdir

    # A manager-backed list is shared between this process and the worker so
    # the result survives the child's exit (or kill).
    manager = multiprocessing.Manager()
    result = manager.list()

    p = multiprocessing.Process(target=unsafe_execute, args=(tmp_dir,))
    p.start()
    # Allow one extra second beyond the in-process signal-based limit before
    # forcibly killing a hung worker.
    p.join(timeout=timeout + 1)
    if p.is_alive():
        p.kill()

    # No appended status means the worker never finished: treat as a timeout.
    if not result:
        result.append("timed out")

    return {
        "result" : result[0],
        "passed" : result[0] == "passed",
    }
104
+
105
+ # Copyright (c) OpenAI (https://openai.com)
106
+
107
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
108
+ # of this software and associated documentation files (the "Software"), to deal
109
+ # in the Software without restriction, including without limitation the rights
110
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
111
+ # copies of the Software, and to permit persons to whom the Software is
112
+ # furnished to do so, subject to the following conditions:
113
+
114
+ # The above copyright notice and this permission notice shall be included in
115
+ # all copies or substantial portions of the Software.
116
+
117
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
118
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
119
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
120
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
121
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
122
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
123
+ # THE SOFTWARE.
124
+ # ============================================================================
125
@contextlib.contextmanager
def time_limit(seconds: float):
    """Raise TimeoutException in the guarded block after `seconds`.

    SIGALRM-based: Unix-only, and must be used from the main thread (signal
    handlers cannot be installed elsewhere).

    :param seconds: wall-clock budget for the `with` body.
    :raises TimeoutException: when the body runs longer than `seconds`.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    # BUG FIX: install the handler BEFORE arming the timer. The original
    # called setitimer first, leaving a window where SIGALRM could fire with
    # the default disposition and terminate the whole process.
    signal.signal(signal.SIGALRM, signal_handler)
    signal.setitimer(signal.ITIMER_REAL, seconds)
    try:
        yield
    finally:
        # Always disarm so a pending alarm cannot fire after the block exits.
        signal.setitimer(signal.ITIMER_REAL, 0)
136
+
137
+
138
@contextlib.contextmanager
def swallow_io():
    """Silence stdout/stderr and block stdin reads for the duration of the block."""
    sink = WriteOnlyStringIO()
    with contextlib.redirect_stdout(sink), \
            contextlib.redirect_stderr(sink), \
            redirect_stdin(sink):
        yield
145
+
146
+
147
@contextlib.contextmanager
def create_tempdir():
    """Yield a fresh temporary directory, chdir'd into for the duration."""
    with tempfile.TemporaryDirectory() as dirname, chdir(dirname):
        yield dirname
152
+
153
+
154
class TimeoutException(Exception):
    """Raised when guarded code exceeds its wall-clock time limit."""
156
+
157
+
158
class WriteOnlyStringIO(io.StringIO):
    """An in-memory text stream that raises IOError on any read attempt."""

    def _refuse_read(self, *args, **kwargs):
        # Single choke point shared by every read-style method.
        raise IOError

    read = _refuse_read
    readline = _refuse_read
    readlines = _refuse_read

    def readable(self, *args, **kwargs):
        """Always False: signals to the io machinery that reads are unsupported."""
        return False
173
+
174
+
175
# Mirror of contextlib.redirect_stdout/redirect_stderr for sys.stdin.
# Relies on the private contextlib._RedirectStream helper, which swaps the
# named sys stream for the duration of the context and restores it on exit.
class redirect_stdin(contextlib._RedirectStream):  # type: ignore
    _stream = 'stdin'
177
+
178
+
179
@contextlib.contextmanager
def chdir(root):
    """Temporarily change the working directory to `root`.

    ``"."`` is a no-op fast path. Otherwise the previous directory is
    restored on exit, even when the body raises.

    :param root: directory to enter for the duration of the block.
    """
    if root == ".":
        yield
        return
    cwd = os.getcwd()
    os.chdir(root)
    try:
        # The original had `except BaseException as exc: raise exc` here — a
        # no-op re-raise, since `finally` alone guarantees cwd restoration.
        yield
    finally:
        os.chdir(cwd)
192
+
193
+
194
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
    """
    This disables various destructive functions and prevents the generated code
    from interfering with the test (e.g. fork bomb, killing other processes,
    removing filesystem files, etc.)

    WARNING
    This function is NOT a security sandbox. Untrusted code, including, model-
    generated code, should not be blindly executed outside of one. See the
    Codex paper for more information about OpenAI's code sandbox, and proceed
    with caution.

    :param maximum_memory_bytes: when given, hard-caps the process's memory
        via resource limits; None leaves memory unlimited.
    """

    if maximum_memory_bytes is not None:
        import resource
        # Cap both the total address space and the data segment.
        resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
        resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
        # RLIMIT_STACK is skipped on macOS, where setting it raises.
        if not platform.uname().system == 'Darwin':
            resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))

    # Stop faulthandler from dumping tracebacks the harness would misread as output.
    faulthandler.disable()

    # Prevent exec'd code from terminating the worker via exit()/quit().
    import builtins
    builtins.exit = None
    builtins.quit = None

    import os
    # Keep OpenMP-backed libraries single-threaded for reproducible profiling.
    os.environ['OMP_NUM_THREADS'] = '1'

    # Null out process-control and filesystem-mutating os functions so exec'd
    # code calling them fails with TypeError instead of taking effect.
    os.kill = None
    os.system = None
    os.putenv = None
    os.remove = None
    os.removedirs = None
    os.rmdir = None
    os.fchdir = None
    os.setuid = None
    os.fork = None
    os.forkpty = None
    os.killpg = None
    os.rename = None
    os.renames = None
    os.truncate = None
    os.replace = None
    os.unlink = None
    os.fchmod = None
    os.fchown = None
    os.chmod = None
    os.chown = None
    os.chroot = None
    os.fchdir = None
    os.lchflags = None
    os.lchmod = None
    os.lchown = None
    # Nulled last: earlier guard setup may still need the cwd.
    os.getcwd = None
    os.chdir = None

    # High-level file operations.
    import shutil
    shutil.rmtree = None
    shutil.move = None
    shutil.chown = None

    # No subprocess spawning.
    import subprocess
    subprocess.Popen = None  # type: ignore

    # NOTE(review): __builtins__ is a module in __main__ but a dict in
    # imported modules — this subscript assumes the dict form; confirm.
    __builtins__['help'] = None

    # Block modules that could escape the guard (spawn processes, raise
    # resource limits, introspect the system, or open GUI windows).
    import sys
    sys.modules['ipdb'] = None
    sys.modules['joblib'] = None
    sys.modules['resource'] = None
    sys.modules['psutil'] = None
    sys.modules['tkinter'] = None
run_code.sh ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
# Function to calculate memory usage in MB*seconds
# Integrates memory over time from an mprof .dat file with the trapezoidal
# rule. Column 2 is memory in MB, column 3 a timestamp in seconds; the header
# line is skipped via NR>1. Prints the integral with two decimals.
calculate_memory_usage() {
    local dat_file=$1
    awk 'BEGIN {
        prev_time = 0;
        prev_mem_mb = 0;
        mem_time_mb_s = 0;
    }
    NR>1 {
        mem_in_mb = $2;
        timestamp = $3;

        if (prev_time > 0) {
            time_interval_s = timestamp - prev_time;
            mem_time_mb_s += (prev_mem_mb + mem_in_mb) / 2 * time_interval_s;
        }

        prev_time = timestamp;
        prev_mem_mb = mem_in_mb;
    }
    END {
        printf "%.2f\n", mem_time_mb_s;
    }' "$dat_file"
}
27
+
28
+
29
# Profile a single completion file under mprof with a hard timeout.
# Usage: run_code.sh <completion_file> <completion_dat_file> <max_execution_time>
completion_file="$1"
completion_dat_file="$2"
# file_type="$3"
max_execution_time="$3"
# Execute the specified file
echo "Executing $completion_file"
error_output=$(mktemp)
start_time=$(date +%s%N)
rm -f "$completion_dat_file"
timeout "$max_execution_time" mprof run --interval 0.001 --output "$completion_dat_file" "$completion_file" 2> "$error_output"
# BUG FIX: $? must be read immediately after the command of interest. The
# original captured it after `end_time=$(date ...)` and the arithmetic
# expansion, so it always reflected those (0), never the timeout/mprof status.
exit_status=$?
end_time=$(date +%s%N)
execution_time=$(( (end_time - start_time) / 1000000 ))

# Check execution status: any nonzero exit (including timeout's 124) or any
# stderr output invalidates the profiling data.
echo "Execution status: $exit_status"
if [ $exit_status -ne 0 ] || [ -s "$error_output" ]; then
    echo "Execution failed or errors were reported for $completion_file. Removing .dat file."
    rm -f "$completion_dat_file"
elif [ -f "$completion_dat_file" ]; then
    mem_usage_mb_s=$(calculate_memory_usage "$completion_dat_file")
    echo "Memory usage (MB*seconds): $mem_usage_mb_s"
else
    echo "Execution completed but no .dat file found for $completion_file."
fi

# Clean up
rm -f "$error_output"