smallan13 commited on
Commit
01f199c
·
verified ·
1 Parent(s): 87a588d

Upload 105 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. README.md +23 -3
  3. data/APPS/selected150.jsonl +0 -0
  4. data/CodeContest/Test.jsonl +3 -0
  5. data/HumanEval/HumanEval.jsonl +0 -0
  6. data/HumanEval/HumanEvalET.jsonl +0 -0
  7. data/HumanEval/HumanEvalWST.jsonl +0 -0
  8. data/MBPPEval/MBPP.jsonl +0 -0
  9. data/MBPPEval/MBPP_ET.jsonl +0 -0
  10. data/MBPPEval/mbpp-py.jsonl +0 -0
  11. data/xCodeEval/problem_descriptions.jsonl +3 -0
  12. data/xCodeEval/prog_syn_val.jsonl +0 -0
  13. data/xCodeEval/unittest_db.json +3 -0
  14. requirements.txt +15 -0
  15. src/constants/__init__.py +0 -0
  16. src/constants/__pycache__/__init__.cpython-311.pyc +0 -0
  17. src/constants/__pycache__/lang_mappings.cpython-311.pyc +0 -0
  18. src/constants/__pycache__/paths.cpython-311.pyc +0 -0
  19. src/constants/lang_mappings.py +14 -0
  20. src/constants/paths.py +135 -0
  21. src/datasets/APPSDataset.py +48 -0
  22. src/datasets/CodeContestDataset.py +41 -0
  23. src/datasets/Dataset.py +33 -0
  24. src/datasets/DatasetFactory.py +25 -0
  25. src/datasets/HumanEvalDataset.py +46 -0
  26. src/datasets/MBPPDataset.py +47 -0
  27. src/datasets/XCodeDataset.py +50 -0
  28. src/datasets/__init__.py +0 -0
  29. src/datasets/__pycache__/APPSDataset.cpython-311.pyc +0 -0
  30. src/datasets/__pycache__/CodeContestDataset.cpython-311.pyc +0 -0
  31. src/datasets/__pycache__/Dataset.cpython-311.pyc +0 -0
  32. src/datasets/__pycache__/DatasetFactory.cpython-311.pyc +0 -0
  33. src/datasets/__pycache__/HumanEvalDataset.cpython-311.pyc +0 -0
  34. src/datasets/__pycache__/MBPPDataset.cpython-311.pyc +0 -0
  35. src/datasets/__pycache__/XCodeDataset.cpython-311.pyc +0 -0
  36. src/datasets/__pycache__/__init__.cpython-311.pyc +0 -0
  37. src/datasets/convert-apps-xcode.py +129 -0
  38. src/datasets/convert-cc-xcode.py +66 -0
  39. src/evaluate-et-dataset.py +104 -0
  40. src/evaluations/__init__.py +0 -0
  41. src/evaluations/__pycache__/__init__.cpython-311.pyc +0 -0
  42. src/evaluations/__pycache__/api_comm.cpython-311.pyc +0 -0
  43. src/evaluations/__pycache__/evalute.cpython-311.pyc +0 -0
  44. src/evaluations/__pycache__/exec_outcome.cpython-311.pyc +0 -0
  45. src/evaluations/__pycache__/executor_utils.cpython-311.pyc +0 -0
  46. src/evaluations/__pycache__/func_evaluate.cpython-311.pyc +0 -0
  47. src/evaluations/api_comm.py +115 -0
  48. src/evaluations/evalute.py +168 -0
  49. src/evaluations/exec_outcome.py +14 -0
  50. src/evaluations/executor_utils.py +58 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/CodeContest/Test.jsonl filter=lfs diff=lfs merge=lfs -text
37
+ data/xCodeEval/problem_descriptions.jsonl filter=lfs diff=lfs merge=lfs -text
38
+ data/xCodeEval/unittest_db.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,23 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Running our project
2
+
3
+ 1. Clone our project
4
+ ```bash
5
+ cd DebateCoder
6
+ ```
7
+
8
+ 2. Create a new conda or python virtual environment and run the following command
9
+ ```bash
10
+ pip install -r requirements.txt
11
+ ```
12
+
13
+ 3. Set up the `.env` file following the provided example.
14
+
15
+ 4. Run the following command to see the options for running this project
16
+ ```bash
17
+ python src/main.py --help
18
+ ```
19
+
20
+ 5. Finally run this project. An example is given below:
21
+ ```bash
22
+ python src/main.py --model Pangu --dataset HumanEval --strategy DebateCoder
23
+ ```
data/APPS/selected150.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/CodeContest/Test.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8343e00e4b52160990576bcd2b2588466d2ac39b2751401beedd6606294423c9
3
+ size 23356061
data/HumanEval/HumanEval.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/HumanEval/HumanEvalET.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/HumanEval/HumanEvalWST.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/MBPPEval/MBPP.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/MBPPEval/MBPP_ET.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/MBPPEval/mbpp-py.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/xCodeEval/problem_descriptions.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bda417d0e06ded34ee62255b84ac386e8535137b82f8c4f905adda0b93d9f78b
3
+ size 18159029
data/xCodeEval/prog_syn_val.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/xCodeEval/unittest_db.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26b0b526ad96f7abbf11057b9547febf8237ce0bff00cad96fefc650e34374d5
3
+ size 11346609
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # python==3.11
2
+ openai
3
+ numpy
4
+ pandas
5
+ python-dotenv
6
+ pyyaml
7
+ tenacity
8
+ tiktoken
9
+ tqdm
10
+ gensim
11
+ jsonlines
12
+ astunparse
13
+ pyarrow
14
+ google-genai
15
+ accelerate
src/constants/__init__.py ADDED
File without changes
src/constants/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (205 Bytes). View file
 
src/constants/__pycache__/lang_mappings.cpython-311.pyc ADDED
Binary file (421 Bytes). View file
 
src/constants/__pycache__/paths.cpython-311.pyc ADDED
Binary file (2.57 kB). View file
 
src/constants/lang_mappings.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Maps the language names used by this project's CLI and datasets to the
# runtime identifiers expected by the xCodeEval execution backend
# (src/evaluations/evalute.py indexes this dict by `lang`).
# NOTE(review): the commented-out entries ("Node", "Java") look like
# deliberately disabled runtimes — confirm before re-enabling.
LANGUAGE_MAPPING = {
    "Python": "Python 3",
    "Python3": "Python 3",
    "C#": "C# 10",
    "NET-CORE": ".NET Core C#",
    # "Node": "Node.js",
    "Rust": "Rust",
    # "Java":"Java 17",
    "PHP": "PHP",
    "Go": "Go",
    "Ruby": "Ruby",
    "C++": "GNU C++17",
    "C": "GNU C"
}
src/constants/paths.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from os.path import join, dirname

# Dataset file locations, relative to the repository root (the scripts are
# run from there). Built with os.path.join for portability.
# NOTE(review): `os` and `dirname` are imported but unused in the visible
# module — kept in case other modules import them from here; confirm.

# --- HumanEval dataset ---------------------------------------------------
HUMAN_DATA_DIR = join("data", "HumanEval")

HUMAN_DATA_PATH = join(HUMAN_DATA_DIR, "HumanEval.jsonl")
HUMAN_WST_DATA_PATH = join(HUMAN_DATA_DIR, "HumanEvalWST.jsonl")
HUMAN_REFLXION_FILTERED_PATH = join(HUMAN_DATA_DIR, "humaneval-py.jsonl")
HUMAN_HARDSET_PATH = join(HUMAN_DATA_DIR, "humaneval-py_hardest50.jsonl")
HUMAN_ET_DATA_PATH = join(HUMAN_DATA_DIR, "HumanEvalET.jsonl")
HUMAN_SIMILAR_PROBLEMS_PATH = join(HUMAN_DATA_DIR, "similar_problems_solutions.jsonl")


# --- MBPP dataset --------------------------------------------------------
MBPP_DATA_DIR = join("data", "MBPPEval")

# Bug fix: the repository ships `mbpp-py.jsonl` (lower case); the previous
# "MBPP-py.jsonl" fails on case-sensitive filesystems (Linux).
MBPP_DATA_PATH = join(MBPP_DATA_DIR, "mbpp-py.jsonl")
MBPP_ET_DATA_PATH = join(MBPP_DATA_DIR, "MBPP_ET.jsonl")
MBPP_SANITIZED_DATA_PATH = join(MBPP_DATA_DIR, "MBPP_SANITIZED.json")
MBPP_SIMILAR_PROBLEMS_PATH = join(MBPP_DATA_DIR, "similar_problems_solutions.jsonl")


# --- xCodeEval dataset ---------------------------------------------------
XCODE_DATA_DIR = join("data", "xCodeEval")

XCODE_VALIDATION_DATA_PATH = join(XCODE_DATA_DIR, "prog_syn_val.jsonl")
XCODE_TEST_DATA_PATH = join(XCODE_DATA_DIR, "prog_syn_test.jsonl")
XCODE_TRAIN_DATA_DIR_PATH = join(XCODE_DATA_DIR, "train")
XCODE_UNIT_TEST_PATH = join(XCODE_DATA_DIR, "unittest_db.json")
XCODE_PROBLEM_DESCRIPTION_PATH = join(XCODE_DATA_DIR, "problem_descriptions.jsonl")
XCODE_SIMILAR_SRC_UIDS_PATH = join(XCODE_DATA_DIR, "similar_src_uids.json")
XCODE_SIMILAR_PROBLEMS_PATH = join(XCODE_DATA_DIR, "similar_problems_solutions.json")
XCODE_PROBLEM_FILE_MAPPINGS_PATH = join(XCODE_DATA_DIR, "problem_file_mapping.json")


# --- CodeContests dataset ------------------------------------------------
CODE_CONTEST_DATA_DIR = join("data", "CodeContest")

CODE_CONTEST_DATA_PATH = join(CODE_CONTEST_DATA_DIR, "Test.jsonl")


# --- APPS dataset --------------------------------------------------------
APPS_DATA_DIR = join("data", "APPS")

APPS_DATA_PATH = join(APPS_DATA_DIR, "selected150.jsonl")
src/datasets/APPSDataset.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .Dataset import Dataset
2
+ from evaluations.evalute import contest_evaluate, contest_evaluate_public_tests
3
+ from constants.paths import *
4
+
5
+
6
class APPSDataset(Dataset):
    """APPS competitive-programming problems (selected150.jsonl)."""

    def __init__(
        self,
        path: str = APPS_DATA_PATH,
    ):
        super().__init__(path)
        self.id_key = "id"

    def evaluate(self, item: dict, cur_imp: str, language: str):
        """Judge *cur_imp* against the item's full hidden test list."""
        return contest_evaluate(
            generated_code=cur_imp,
            id=item["id"],
            tests=item["test_list"],
            lang=language,
        )

    def evaluate_sample_io(self, item: dict, cur_imp: str, language: str):
        """Judge against the public samples; trivially passes when none exist."""
        samples = item["sample_io"]
        if not samples:
            return True, ""
        return contest_evaluate_public_tests(
            generated_code=cur_imp,
            id=item["id"],
            tests=samples,
            lang=language,
        )

    @staticmethod
    def get_prompt(item):
        """Build the generation prompt: description, optional sample IO, rules."""
        io_block = ""
        samples = item['sample_io']
        if samples:
            first = samples[0]
            io_block = (
                f"Sample Input Format:\n{first['input']}\n"
                f"Sample Output Format:\n{first['output'][0]}\n\n-------\n"
            )
        return (
            f"{item['description']}\n\n{io_block}"
            "Important: You must follow the input output format. Input "
            "should be taken from standard input and output should be "
            "given to standard output.\n"
            "Note: If you are writing a function then after the function "
            "definition take input from using `input()` function, call "
            "the function with specified parameters and finally print "
            "the output of the function."
        )
src/datasets/CodeContestDataset.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .Dataset import Dataset
2
+ from evaluations.evalute import contest_evaluate, contest_evaluate_public_tests
3
+ from constants.paths import *
4
+
5
class CodeContestDataset(Dataset):
    """CodeContests problems converted to the xCodeEval-style JSONL layout."""

    def __init__(
        self,
        path: str = CODE_CONTEST_DATA_PATH,
    ):
        super().__init__(path)
        self.id_key = "id"

    def evaluate(self, item: dict, cur_imp: str, language: str):
        """Judge *cur_imp* against every hidden test of *item*."""
        return contest_evaluate(
            generated_code=cur_imp,
            id=item["id"],
            tests=item["test_list"],
            lang=language,
        )

    def evaluate_sample_io(self, item: dict, cur_imp: str, language: str):
        """Judge *cur_imp* against the public sample tests only."""
        return contest_evaluate_public_tests(
            generated_code=cur_imp,
            id=item["id"],
            tests=item["sample_io"],
            lang=language,
        )

    @staticmethod
    def get_prompt(item):
        """Problem statement plus the strict stdin/stdout instructions."""
        return (
            f"{item['description']}\n\n-------\n"
            "Important Note: You must follow the input output format. "
            "Input must be taken from standard input and output must be "
            "given to standard output. The code will be tested against "
            "multiple test cases and all the test cases must be passed."
        )
src/datasets/Dataset.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.jsonl import read_jsonl
2
+
3
+
4
class Dataset(object):
    """Base class for benchmark datasets stored as JSONL files.

    Subclasses set ``id_key`` and implement ``evaluate``,
    ``evaluate_sample_io`` and ``get_prompt``.
    """

    def __init__(
        self,
        path: str,
    ):
        self.path = path    # location of the backing .jsonl file
        self.data = None    # list of item dicts; filled by load()
        self.id_key = ""    # per-item unique-id field; set by subclasses
        self.load()

    def load(self):
        """Read every JSONL record of ``self.path`` into ``self.data``."""
        self.data = read_jsonl(self.path)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

    def evaluate(
        self,
        item: dict,
        cur_imp: str,
        language: str,
    ):
        """Judge implementation *cur_imp* against *item*'s hidden tests."""
        raise NotImplementedError

    def evaluate_sample_io(
        self,
        item: dict,
        cur_imp: str,
        language: str,
    ):
        """Judge *cur_imp* against *item*'s public sample IO.

        Added for consistency: every concrete subclass implements this
        method, but the base class previously did not declare it.
        """
        raise NotImplementedError

    @staticmethod
    def get_prompt(item):
        """Return the model prompt for *item*."""
        raise NotImplementedError
src/datasets/DatasetFactory.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets.Dataset import Dataset
2
+ from datasets.MBPPDataset import MBPPDataset
3
+ from datasets.APPSDataset import APPSDataset
4
+ from datasets.XCodeDataset import XCodeDataset
5
+ from datasets.HumanEvalDataset import HumanDataset
6
+ from datasets.CodeContestDataset import CodeContestDataset
7
+
8
+
9
class DatasetFactory:
    """Resolves a dataset name from the CLI into its Dataset subclass."""

    @staticmethod
    def get_dataset_class(dataset_name):
        """Return the Dataset subclass registered under *dataset_name*.

        Raises Exception for names with no registered dataset.
        """
        if dataset_name == "APPS":
            return APPSDataset
        if dataset_name == "MBPP":
            return MBPPDataset
        if dataset_name == "XCode":
            return XCodeDataset
        if dataset_name in ("HumanEval", "Human"):
            return HumanDataset
        if dataset_name == "CC":
            return CodeContestDataset
        raise Exception(f"Unknown dataset name {dataset_name}")
src/datasets/HumanEvalDataset.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .Dataset import Dataset
2
+ from evaluations.func_evaluate import evaluate_functional_correctness, evaluate_io
3
+ from constants.paths import *
4
+
5
+
6
class HumanDataset(Dataset):
    """HumanEval problems (with sample tests) loaded from a JSONL file."""

    def __init__(
        self,
        path: str = HUMAN_WST_DATA_PATH,
    ):
        super().__init__(path)
        self.id_key = "task_id"

    def evaluate(self, item: dict, cur_imp: str, language: str):
        """True iff *cur_imp* passes the item's full hidden test suite."""
        outcome = evaluate_functional_correctness(
            problem=item,
            completion=cur_imp,
        )
        return outcome == "passed"

    def evaluate_sample_io(self, item: dict, cur_imp: str, language: str):
        """Run *cur_imp* on the item's public sample IO pairs."""
        return evaluate_io(
            sample_io=item["sample_io"],
            completion=cur_imp,
        )

    @staticmethod
    def get_prompt(item):
        """Return the problem prompt from whichever field carries it."""
        for key in ("prompt", "text"):
            if key in item:
                return f"{item[key]}"
        raise Exception("No prompt or text in item")
src/datasets/MBPPDataset.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .Dataset import Dataset
2
+ from evaluations.func_evaluate import evaluate_io, evaluate_functional_correctness
3
+ from constants.paths import *
4
+
5
+
6
class MBPPDataset(Dataset):
    """MBPP (python subset) problems loaded from a JSONL file."""

    def __init__(
        self,
        path: str = MBPP_DATA_PATH,
    ):
        super().__init__(path)
        self.id_key = "name"

    def evaluate(self, item: dict, cur_imp: str, language: str):
        """True iff *cur_imp* passes the item's full hidden test suite."""
        outcome = evaluate_functional_correctness(
            problem=item,
            completion=cur_imp,
        )
        return outcome == "passed"

    def evaluate_sample_io(self, item: dict, cur_imp: str, language: str):
        """Run on sample IO; items without samples trivially pass."""
        if "sample_io" not in item or len(item["sample_io"]) == 0:
            return True, ""
        return evaluate_io(
            sample_io=item["sample_io"],
            completion=cur_imp,
        )

    @staticmethod
    def get_prompt(item):
        """The natural-language task description."""
        return item["prompt"]
src/datasets/XCodeDataset.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .Dataset import Dataset
2
+ from evaluations.evalute import xcode_evaluate, contest_evaluate_public_tests
3
+ from constants.paths import *
4
+
5
+
6
class XCodeDataset(Dataset):
    """xCodeEval program-synthesis problems (validation split by default)."""

    def __init__(
        self,
        path: str = XCODE_VALIDATION_DATA_PATH,
    ):
        super().__init__(path)
        self.id_key = "src_uid"

    def evaluate_sample_io(self, item: dict, cur_imp: str, language: str):
        """Judge *cur_imp* on the problem's public sample IO pairs."""
        # Pair the parallel sample input/output lists into the
        # {"input": ..., "output": [...]} records the judge expects.
        sample_io = [
            {"input": stdin, "output": [stdout]}
            for stdin, stdout in zip(item["sample_inputs"], item["sample_outputs"])
        ]
        return contest_evaluate_public_tests(
            generated_code=cur_imp,
            id=item[self.id_key],
            tests=sample_io,
            lang=language,
        )

    def evaluate(self, item: dict, cur_imp: str, language: str):
        """Judge *cur_imp* against the full xCodeEval unittest database."""
        return xcode_evaluate(
            generated_code=cur_imp,
            src_uid=item["src_uid"],
            lang=language,
        )

    @staticmethod
    def get_prompt(item):
        """Assemble the full problem statement from its xCodeEval fields."""
        return f"Problem Description:\n{item['description']}\nInput Specification:\n{item['input_spec']}\nOutput Specification:\n{item['output_spec']}\nSample Inputs: {item['sample_inputs']}\nSample Outputs: {item['sample_outputs']}\nNote: {item['notes']}\nTake input from: {item['input_from']}\nGive output to: {item['output_to']}\nTime Limit: {item['time_limit']}\nMemory Limit: {item['memory_limit']}\n\nNote: If you are writing a function then after the function definition take input from using `input()` function, call the function with specified parameters and finally print the output of the function."
src/datasets/__init__.py ADDED
File without changes
src/datasets/__pycache__/APPSDataset.cpython-311.pyc ADDED
Binary file (2.91 kB). View file
 
src/datasets/__pycache__/CodeContestDataset.cpython-311.pyc ADDED
Binary file (2.32 kB). View file
 
src/datasets/__pycache__/Dataset.cpython-311.pyc ADDED
Binary file (1.83 kB). View file
 
src/datasets/__pycache__/DatasetFactory.cpython-311.pyc ADDED
Binary file (1.52 kB). View file
 
src/datasets/__pycache__/HumanEvalDataset.cpython-311.pyc ADDED
Binary file (2.12 kB). View file
 
src/datasets/__pycache__/MBPPDataset.cpython-311.pyc ADDED
Binary file (2.07 kB). View file
 
src/datasets/__pycache__/XCodeDataset.cpython-311.pyc ADDED
Binary file (3.27 kB). View file
 
src/datasets/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (204 Bytes). View file
 
src/datasets/convert-apps-xcode.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import json
3
+
4
+
5
def read_jsonl(filename):
    """Read a JSONL file and return its records as a list of dicts.

    Fix: the old docstring claimed this *yields* lines, but it reads the
    whole file eagerly and returns a list; commented-out debug counters
    removed.
    """
    lines = []
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            lines.append(json.loads(line))
    return lines
15
+
16
+ # Write a python list of dictionaries into a jsonl file
17
+
18
+
19
def write_jsonl(filename, lines):
    """Serialize each dict in *lines* as one JSON line of *filename*."""
    with open(filename, "w", encoding="utf-8") as file:
        file.writelines(json.dumps(record) + "\n" for record in lines)
24
+
25
+
26
# ---- Load APPS and select 150 problems (50 per difficulty bucket) ----
train_set = read_jsonl("./data/APPS/train.jsonl")
# NOTE(review): this re-reads train.jsonl — the variable name suggests it
# should read test.jsonl; as written every problem appears twice in
# `dataset`. Confirm intent.
test_set = read_jsonl("./data/APPS/train.jsonl")

dataset = train_set + test_set

print(len(dataset))

dataset = pd.DataFrame(dataset)
# dataset.columns

print(dataset['difficulty'].unique())


# Filter problems from codeforces with atleast 10 input and output
# (the actual threshold coded below is > 5 inputs, not 10)
filter_indices = [False] * len(dataset)
for i in range(len(dataset)):
    row = dataset.iloc[i]
    if "codeforces" in row['url'] and row['input_output'] and len(json.loads(row['input_output'])["inputs"]) > 5:
        filter_indices[i] = True

codeforces_dataset = dataset[filter_indices]

print(len(codeforces_dataset))

# Randomly choose 50 problems (fixed seed for reproducibility)
codeforces_dataset_50 = codeforces_dataset.sample(n=min(50, len(codeforces_dataset)), random_state=1, replace=False)
print(len(codeforces_dataset_50))

codeforces_dataset_50.reset_index(drop=True, inplace=True)

# Filter interview problems with atleast 10 input and output
# (also caps the raw input_output blob at 2000 chars)
filter_indices = [False] * len(dataset)
for i in range(len(dataset)):
    row = dataset.iloc[i]
    if "interview" == row['difficulty'] and row['input_output'] and len(row['input_output']) < 2000 and len(json.loads(row['input_output'])["inputs"]) > 5:
        filter_indices[i] = True

interview_dataset = dataset[filter_indices]

print(len(interview_dataset))

# Randomly choose 50 problems
interview_dataset_50 = interview_dataset.sample(
    n=min(50, len(interview_dataset)), random_state=1, replace=False)
print(len(interview_dataset_50))

interview_dataset_50.reset_index(drop=True, inplace=True)


# Filter introductory problems with atleast 10 input and output
# NOTE(review): unlike the interview filter, this one does not check that
# row['input_output'] is non-empty before calling len()/json.loads —
# confirm the introductory split never has a missing input_output.
filter_indices = [False] * len(dataset)
for i in range(len(dataset)):
    row = dataset.iloc[i]
    if "introductory" == row['difficulty'] and len(row['input_output']) < 2000 and len(json.loads(row['input_output'])["inputs"]) > 5:
        filter_indices[i] = True

introductory_dataset = dataset[filter_indices]

print(len(introductory_dataset))

# Randomly choose 50 problems
introductory_dataset_50 = introductory_dataset.sample(
    n=min(50, len(introductory_dataset)), random_state=1, replace=False)
print(len(introductory_dataset_50))

introductory_dataset_50.reset_index(drop=True, inplace=True)

# 3 buckets x 50 problems = the 150 selected problems
selected_df = pd.concat([introductory_dataset_50, interview_dataset_50, codeforces_dataset_50], ignore_index=True)
94
+
95
+
96
def get_test_cases(input, output):
    """Normalize one APPS test pair into {"input": str, "output": list}.

    APPS stores inputs either as a ready string or as a list of lines,
    and outputs either as a list or as a bare value.
    """
    return {
        # isinstance instead of `type(x) == list`: idiomatic and subclass-safe
        "input": "\n".join([str(x) for x in input]) if isinstance(input, list) else input,
        "output": output if isinstance(output, list) else [output]
    }
101
+
102
+
103
# Convert each selected row into the project's JSONL problem schema:
# the first two IO pairs become public samples (sample_io), all pairs
# become the hidden test_list.
selected_datasets = []

for i in range(len(selected_df)):
    row = selected_df.iloc[i]
    test_cases = json.loads(row['input_output'])

    public_test_cases = list(
        map(get_test_cases, test_cases['inputs'][0:2], test_cases['outputs'][0:2]))
    test_cases = list(
        map(get_test_cases, test_cases['inputs'], test_cases['outputs']))

    test = {
        "name": str(row['id']),
        "description": str(row['question']),
        "difficulty": str(row['difficulty']),
        "id": int(row['id']),
        "sample_io": public_test_cases,
        "test_list": test_cases,
        "starter_code": str(row['starter_code']),
    }

    selected_datasets.append(test)


write_jsonl("./data/APPS/selected150.jsonl", selected_datasets)
128
+
129
+
src/datasets/convert-cc-xcode.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Using this python file we have converted the code contest dataset to the format of the xCodeEval dataset.
2
+
3
+ import pandas as pd
4
+ import json
5
+
6
+
7
def read_jsonl(filename):
    """Read a JSONL file and return its records as a list of dicts.

    Fix: the old docstring claimed this *yields* lines, but it reads the
    whole file eagerly; commented-out debug counters removed.
    """
    with open(filename, "r", encoding="utf-8") as file:
        return [json.loads(line) for line in file]
17
+
18
+ # Write a python list of dictionaries into a jsonl file
19
+
20
+
21
def write_jsonl(filename, lines):
    """Write each dict in *lines* as one JSON-encoded line of *filename*."""
    with open(filename, "w", encoding="utf-8") as sink:
        for record in lines:
            sink.write(json.dumps(record))
            sink.write("\n")
26
+
27
+
28
# Load the CodeContests validation split and keep only the columns the
# conversion below actually uses.
df = pd.read_parquet("./data/CodeContest/validation.parquet", engine='pyarrow')
df = df[['name', 'cf_contest_id', 'cf_tags', 'difficulty',
         'description', 'public_tests', 'private_tests', 'generated_tests']]
31
+
32
+
33
def get_test_cases(input, output):
    """Wrap one (input, output) pair in the xCodeEval unittest schema."""
    return {"input": str(input), "output": [str(output)]}
38
+
39
+
40
# Convert every CodeContests row into the project's JSONL problem schema:
# public tests become sample_io; private + generated tests form test_list.
test_datasets = []

for i in range(len(df)):
    row = df.iloc[i]

    public_test_cases = list(
        map(get_test_cases, row['public_tests']['input'], row['public_tests']['output']))
    test_cases = []
    test_cases.extend(list(map(
        get_test_cases, row['private_tests']['input'], row['private_tests']['output'])))
    test_cases.extend(list(map(
        get_test_cases, row['generated_tests']['input'], row['generated_tests']['output'])))

    test = {
        "name": str(row['name']),
        "description": str(row['description']),
        "tags": list(row['cf_tags']),
        "difficulty": int(row['difficulty']),
        "id": int(row['cf_contest_id']),
        "sample_io": public_test_cases,
        "test_list": test_cases
    }

    test_datasets.append(test)


# NOTE(review): reads validation.parquet and writes Val.jsonl, while the
# repo ships Test.jsonl — presumably the same script was run per split;
# confirm before reuse.
write_jsonl("./data/CodeContest/Val.jsonl", test_datasets)
src/evaluate-et-dataset.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.jsonl import read_jsonl, write_jsonl
2
+ from evaluations.func_evaluate import evaluate_io_et
3
+ import os
4
+
5
+
6
def generate_et_dataset(
    NORMAL_RESULTS_PATH,
    ET_RESULTS_PATH,
    ET_DATA_PATH=".\\data\\HumanEval\\HumanEvalET.jsonl"
):
    """Re-judge HumanEval run results against the extended (ET) test suite.

    Reads prior results from NORMAL_RESULTS_PATH, evaluates each solution
    on the matching ET item's test_case_list, stamps ``is_solved``, and
    writes the task-sorted, re-judged results to ET_RESULTS_PATH.

    NOTE(review): the default ET_DATA_PATH uses Windows separators; pass
    an explicit path on other platforms.
    """
    dataset = read_jsonl(ET_DATA_PATH)
    data_dict = {}
    for item in dataset:
        data_dict[item["task_id"]] = {"et_item": item}

    results = read_jsonl(NORMAL_RESULTS_PATH)
    for result in results:
        # Fix: ignore results whose task_id is absent from the ET dataset
        # instead of raising KeyError (mirrors generate_et_dataset_mbpp).
        if result["task_id"] in data_dict:
            data_dict[result["task_id"]]["result"] = result

    correct_count = 0
    et_results = []
    for key, value in data_dict.items():
        item = value["et_item"]
        # Consistency fix: generate_et_dataset_mbpp skips ET items that
        # have no run result; previously this indexed value["result"]
        # directly and raised KeyError on missing results.
        result = value.get("result", None)
        if result is None:
            continue

        generated_code = result["source_codes"][0] if "source_codes" in result else result["solution"]

        passed = evaluate_io_et(
            item['test_case_list'],
            generated_code,
            prompt=item["prompt"]
        )

        result["is_solved"] = bool(passed)
        if passed:
            correct_count += 1

        et_results.append(result)
        # Running accuracy after each re-judged item.
        print(
            f"Accuracy: {correct_count}/{len(et_results)} = {correct_count/len(et_results):.2f}")

    # Sort numerically by the trailing task number (ids like "<set>/<n>").
    et_results = sorted(
        et_results,
        key=lambda x: int(x["task_id"].split('/')[-1])
    )

    write_jsonl(ET_RESULTS_PATH, et_results)
    print(
        f"Accuracy: {correct_count}/{len(et_results)} = {correct_count/len(et_results):.2f}")
+ f"Accuracy: {correct_count}/{len(et_results)} = {correct_count/len(et_results):.2f}")
52
+
53
+
54
def generate_et_dataset_mbpp(
    NORMAL_RESULTS_PATH,
    ET_RESULTS_PATH,
    ET_DATA_PATH=".\\data\\MBPPEval\\MBPP_ET.jsonl"
):
    """Re-judge MBPP run results against the extended (ET) test lists.

    Reads prior results from NORMAL_RESULTS_PATH, evaluates each solution
    on the matching ET item's test_list, stamps ``is_solved``, and writes
    the re-judged, task-sorted results to ET_RESULTS_PATH.
    """
    et_items = {item["task_id"]: {"et_item": item}
                for item in read_jsonl(ET_DATA_PATH)}

    for run_result in read_jsonl(NORMAL_RESULTS_PATH):
        # result names look like "<prefix>_<task_id>"; key by the number
        task_id = int(run_result["name"].split("_")[1])
        et_items[task_id]["result"] = run_result

    solved = 0
    rejudged = []
    for entry in et_items.values():
        run_result = entry.get("result", None)
        if run_result is None:
            continue  # no run output recorded for this ET item

        code = run_result["source_codes"][0] if "source_codes" in run_result else run_result["solution"]

        ok = evaluate_io_et(
            entry["et_item"]['test_list'],
            code
        )

        run_result["is_solved"] = bool(ok)
        if ok:
            solved += 1

        rejudged.append(run_result)
        # Running accuracy after each re-judged item.
        print(
            f"Accuracy: {solved}/{len(rejudged)} = {solved/len(rejudged):.2f}")

    rejudged.sort(key=lambda x: int(x["name"].split("_")[1]))

    write_jsonl(ET_RESULTS_PATH, rejudged)
    print(
        f"Accuracy: {solved}/{len(rejudged)} = {solved/len(rejudged):.2f}")
+ f"Accuracy: {correct_count}/{len(et_results)} = {correct_count/len(et_results):.2f}")
103
+
104
+
src/evaluations/__init__.py ADDED
File without changes
src/evaluations/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (207 Bytes). View file
 
src/evaluations/__pycache__/api_comm.cpython-311.pyc ADDED
Binary file (6.01 kB). View file
 
src/evaluations/__pycache__/evalute.cpython-311.pyc ADDED
Binary file (7.09 kB). View file
 
src/evaluations/__pycache__/exec_outcome.cpython-311.pyc ADDED
Binary file (685 Bytes). View file
 
src/evaluations/__pycache__/executor_utils.cpython-311.pyc ADDED
Binary file (3.13 kB). View file
 
src/evaluations/__pycache__/func_evaluate.cpython-311.pyc ADDED
Binary file (3.9 kB). View file
 
src/evaluations/api_comm.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field
2
+
3
+ import requests
4
+ from .exec_outcome import ExecOutcome
5
+
6
@dataclass
class ExtendedUnittest:
    """One unit test sent to the execution API, plus its observed result."""

    input: str
    output: list[str] = field(default_factory=list)
    result: str | None = None
    exec_outcome: ExecOutcome | None = None

    def json(self):
        """Return a JSON-serializable dict snapshot of this unittest.

        Bug fix: the previous implementation returned ``self.__dict__``
        itself, so rewriting "exec_outcome" to its ``.name`` permanently
        replaced the enum stored on the instance (and callers mutating
        the returned dict mutated the instance). A copy is taken instead.
        """
        _json = dict(self.__dict__)
        if self.exec_outcome is not None:
            _json["exec_outcome"] = self.exec_outcome.name

        return _json

    @classmethod
    def from_json(cls, _json):
        """Build an ExtendedUnittest from an API response dict."""
        return cls(
            input=_json.get("input", ""),
            output=_json.get("output", list()),
            result=_json.get("result", None),
            exec_outcome=_json.get("exec_outcome", None),
        )
28
+
29
+
30
class EmptyValueError(Exception):
    """Base error for required execute_code arguments passed empty/None.

    The previous pass-through ``__init__`` was redundant — ``Exception``
    already accepts arbitrary positional arguments — so it was removed.
    """


class EmptyUnittestError(EmptyValueError):
    """Raised when execute_code is called with no unittests."""


class EmptyLanguageError(EmptyValueError):
    """Raised when execute_code is called without a language."""


class EmptySourceCodeError(EmptyValueError):
    """Raised when execute_code is called without source code."""
45
+
46
+
47
class APICommunication:
    """Thin HTTP client for the xCodeEval code-execution service."""

    _session: requests.Session

    def __init__(self, server_url: str = "http://localhost:5000"):
        # One pooled session is reused for every request.
        self._session = requests.Session()
        self.execute_code_url = f"{server_url}/api/execute_code"
        self.get_runtimes_url = f"{server_url}/api/all_runtimes"

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # Context-manager exit: release the pooled HTTP connections.
        self._session.close()

    def get_runtimes(self):
        """Return the server's list of available runtimes (decoded JSON)."""
        return self._session.get(self.get_runtimes_url).json()

    def execute_code(
        self,
        language: str,
        source_code: str,
        unittests: list[dict],
        limits: dict | None,
        block_network: bool = True,
        stop_on_first_fail: bool = True,
        use_sanitizer: bool = False,
        compiler_program_name: str | None = None,
        compiler_flags: str | None = None,
        interpreter_cmd: str | None = None,
        interpreter_flags: str | None = None,
        sample_id: int | None = None,
        task_id: str | int | None = None,
    ) -> tuple[list[ExtendedUnittest], int | None, str | int | None]:
        """Run *source_code* against *unittests* on the execution server.

        Returns ``(results, sample_id, task_id)``.

        NOTE(review): when the server response lacks a "data" key the first
        element is the string "error" rather than a list, diverging from
        the annotated return type; callers compare against "error".

        Raises EmptyLanguageError / EmptySourceCodeError /
        EmptyUnittestError when the corresponding argument is missing.
        """
        if language is None:
            raise EmptyLanguageError

        if source_code is None:
            raise EmptySourceCodeError

        if unittests is None or len(unittests) == 0:
            raise EmptyUnittestError

        request_body = dict(
            language=language,
            source_code=source_code,
            unittests=unittests,
            # Fall back to no explicit limits when none (or a non-dict) given.
            limits=limits if isinstance(limits, dict) else dict(),
            compile_cmd=compiler_program_name,
            compile_flags=compiler_flags,
            execute_cmd=interpreter_cmd,
            execute_flags=interpreter_flags,
            block_network=block_network,
            stop_on_first_fail=stop_on_first_fail,
            use_sanitizer=use_sanitizer,
        )
        json_response = self._session.post(
            self.execute_code_url,
            json=request_body,
            headers={"Content-Type": "application/json"},
        ).json()

        if "data" not in json_response:
            return "error", sample_id, task_id

        return (
            json_response["data"],
            sample_id,
            task_id,
        )
src/evaluations/evalute.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import numpy as np
4
+ import tqdm
5
+ from yaml import safe_load
6
+ from typing import List
7
+
8
+ from .api_comm import APICommunication
9
+ from .exec_outcome import ExecOutcome
10
+ from constants.lang_mappings import LANGUAGE_MAPPING
11
+
12
# Per-language resource-limit defaults (time/memory) used for every run.
limits_by_lang_cfg_file = "./src/evaluations/limits_by_lang.yaml"
assert os.path.exists(
    limits_by_lang_cfg_file), "Need resource limit defaults for all runtimes, provide the path to default 'limits_by_lang.yaml' or to the modified one."

with open(limits_by_lang_cfg_file) as limit_cfg_rp:
    limits_by_lang = safe_load(limit_cfg_rp)

# Mapping src_uid -> list of unittest dicts for the xCodeEval tasks.
unittest_file = "./data/xCodeEval/unittest_db.json"
assert os.path.exists(unittest_file), "Unittest file not found."

with open(unittest_file) as ut_rp:
    unittest_db = json.load(ut_rp)

# Shared client for the code-execution server, reused by every function below.
api_comm = APICommunication()
28
+
29
+
30
def xcode_evaluate(
    generated_code: str,
    src_uid: str,
    lang: str
):
    """Run *generated_code* against the stored xCodeEval unittests for *src_uid*.

    Returns True only when every unittest reports PASSED (False also when
    the execution server returned an error).
    """
    assert src_uid in unittest_db, "Can not find the task id or source id"

    assert lang in LANGUAGE_MAPPING, f"language must be inside the supported language list: {LANGUAGE_MAPPING.keys()}"

    runtime = LANGUAGE_MAPPING[lang]
    results, _, _ = api_comm.execute_code(
        language=runtime,
        source_code=generated_code,
        unittests=unittest_db[src_uid],
        limits=limits_by_lang[runtime],
        task_id=src_uid,
    )

    # "error" is the sentinel execute_code returns on a malformed reply.
    if results == "error":
        return False

    return all(
        outcome['exec_outcome'] == ExecOutcome.PASSED.value
        for outcome in results
    )
58
+
59
+
60
def xcode_execute_internal_test(
    generated_code: str,
    tests: List[dict],
    src_uid: str,
    lang: str
):
    """Run *generated_code* against caller-supplied *tests* (all of them).

    Returns ``(passed, feedback)`` where *feedback* is a JSON summary of
    the passed and failed test dicts.  Runs every test
    (``stop_on_first_fail=False``) so the feedback is complete.
    """
    results, _, _ = api_comm.execute_code(
        language=LANGUAGE_MAPPING[lang],
        source_code=generated_code,
        unittests=tests,
        limits=limits_by_lang[LANGUAGE_MAPPING[lang]],
        task_id=src_uid,
        stop_on_first_fail=False
    )

    passed = True
    passed_feedback = []
    failed_feedback = []

    idx = 0
    try:
        for idx, result in enumerate(results):
            if result['exec_outcome'] == ExecOutcome.PASSED.value:
                passed_feedback.append(tests[idx])
            else:
                failed_feedback.append(tests[idx])
                passed = False
    # execute_code may return the string "error" or a malformed payload;
    # narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
    except Exception:
        passed = False
        # Everything from the failing index onward is counted as failed.
        failed_feedback.extend(tests[idx:])

    feedback = f'Tested passed: \n{json.dumps(passed_feedback)}\n\nTests failed: \n{json.dumps(failed_feedback)}'

    return passed, feedback
94
+
95
+
96
def contest_evaluate(
    generated_code: str,
    lang: str,
    id: int,
    tests: List[dict],
):
    """Judge *generated_code* on CodeContest-style *tests*; True iff all pass."""
    assert lang in LANGUAGE_MAPPING, f"language must be inside the supported language list: {LANGUAGE_MAPPING.keys()}"

    runtime = LANGUAGE_MAPPING[lang]
    results, _, _ = api_comm.execute_code(
        language=runtime,
        source_code=generated_code,
        unittests=tests,
        limits=limits_by_lang[runtime],
        task_id=id,
    )

    # "error" is the sentinel execute_code returns on a malformed reply.
    if results == "error":
        return False

    return all(
        outcome['exec_outcome'] == ExecOutcome.PASSED.value
        for outcome in results
    )
122
+
123
+
124
def contest_evaluate_public_tests(
    generated_code: str,
    lang: str,
    id: int,
    tests: List[dict],
):
    """Run *generated_code* on public *tests* and build human-readable feedback.

    Returns ``(passed, feedback)``.  Every test is executed
    (``stop_on_first_fail=False``); each feedback entry shows the input,
    the expected output, and the program's (truncated) actual output.
    """
    results, _, _ = api_comm.execute_code(
        language=LANGUAGE_MAPPING[lang],
        source_code=generated_code,
        unittests=tests,
        limits=limits_by_lang[LANGUAGE_MAPPING[lang]],
        task_id=id,
        stop_on_first_fail=False
    )

    passed = True
    passed_feedback = []
    failed_feedback = []

    idx = 0
    try:
        for idx, result in enumerate(results):
            output = str(result['result'])
            # Cap very long program output so the feedback stays readable.
            if len(output) > 500:
                output = output[:500] + "..."
            test_case = f"Input:\n{tests[idx]['input']}\nExpected Output:\n{tests[idx]['output'][0]}\nYour Output:\n{output}\n"
            if result['exec_outcome'] == ExecOutcome.PASSED.value:
                passed_feedback.append(test_case)
            else:
                failed_feedback.append(test_case)
                passed = False
    # execute_code may return the string "error" or a malformed payload;
    # narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
    except Exception:
        passed = False
        # Remaining tests never produced output; list them without one.
        failed_feedback.extend(
            f"Input:\n{tests[i]['input']}\nExpected Output:\n{tests[i]['output'][0]}\n"
            for i in range(idx, len(tests))
        )

    passed_feedback = '\n'.join(passed_feedback) if len(passed_feedback) > 0 else "No test cases passed."
    failed_feedback = '\n'.join(failed_feedback)
    feedback = f'## Tested passed:\n{passed_feedback}\n\n## Tests failed:\n{failed_feedback}'

    return passed, feedback
src/evaluations/exec_outcome.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+
3
+
4
class ExecOutcome(Enum):
    """Possible judge verdicts for one executed unittest."""

    # Code executed and its output matched the expected output.
    PASSED = "PASSED"
    # Code executed but its output did NOT match the expected output.
    WRONG_ANSWER = "WRONG_ANSWER"
    # Code did not exit in time; its output is ignored in this case.
    TIME_LIMIT_EXCEEDED = "TIME_LIMIT_EXCEEDED"
    # Code crashed while executing.
    RUNTIME_ERROR = "RUNTIME_ERROR"
    # Code failed to compile.
    COMPILATION_ERROR = "COMPILATION_ERROR"
    # Code exceeded the memory limit during execution.
    MEMORY_LIMIT_EXCEEDED = "MEMORY_LIMIT_EXCEEDED"
src/evaluations/executor_utils.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
def timeout_handler(_, __):
    """Signal-handler-shaped callback that converts an alarm into TimeoutError."""
    raise TimeoutError
4
+
5
import os, json
def to_jsonl(dict_data, file_path):
    """Append *dict_data* as a single JSON line to *file_path*.

    Writes "\n" explicitly: the file is opened in text mode, which already
    translates "\n" to the platform line ending on write, so using
    os.linesep would produce "\r\r\n" on Windows.
    """
    with open(file_path, 'a') as file:
        json_line = json.dumps(dict_data)
        file.write(json_line + "\n")
10
+
11
from threading import Thread
class PropagatingThread(Thread):
    """Thread whose join() re-raises the target's exception and returns its result.

    The original carried a `_Thread__target` name-mangling branch for
    Python 2 compatibility; this codebase requires Python 3 (f-strings,
    `X | None` unions), so that branch was unreachable and is removed.
    """

    def run(self):
        # Stash the outcome so the joining thread can observe it.
        self.exc = None
        self.ret = None
        try:
            self.ret = self._target(*self._args, **self._kwargs)
        except BaseException as e:
            self.exc = e

    def join(self, timeout=None):
        """Wait for the thread; re-raise its exception, else return its result."""
        super().join(timeout)
        if self.exc:
            raise self.exc
        return self.ret
29
+
30
+
31
def function_with_timeout(func, args, timeout):
    """Run ``func(*args)`` on a helper thread, bounded by *timeout* seconds.

    Raises TimeoutError when the call does not finish in time; otherwise
    returns (or re-raises, via PropagatingThread) the call's outcome.
    NOTE(review): on timeout the helper thread keeps running in the
    background — confirm callers accept that.
    """
    outcome = []

    def runner():
        outcome.append(func(*args))

    worker = PropagatingThread(target=runner)
    worker.start()
    worker.join(timeout)

    if worker.is_alive():
        raise TimeoutError()
    return outcome[0]
45
+
46
+ # Py tests
47
+
48
+ # if __name__ == "__main__":
49
+ # formatter = PySubmissionFormatter()
50
+ # leetcode_1 = 'class Solution:\n def solveSudoku(self, board: List[List[str]]) -> None:\n """\n Do not return anything, modify board in-place instead.\n """\n '
51
+ # humaneval_1 = 'def solveSudoku(self, board: List[List[str]]) -> None:\n """\n Do not return anything, modify board in-place instead.\n """\n'
52
+
53
+ # assert leetcode_1 == formatter.to_leetcode(humaneval_1)
54
+ # assert humaneval_1 == formatter.to_humaneval(leetcode_1)
55
+
56
+
57
+
58
+