Upload 105 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +3 -0
- README.md +23 -3
- data/APPS/selected150.jsonl +0 -0
- data/CodeContest/Test.jsonl +3 -0
- data/HumanEval/HumanEval.jsonl +0 -0
- data/HumanEval/HumanEvalET.jsonl +0 -0
- data/HumanEval/HumanEvalWST.jsonl +0 -0
- data/MBPPEval/MBPP.jsonl +0 -0
- data/MBPPEval/MBPP_ET.jsonl +0 -0
- data/MBPPEval/mbpp-py.jsonl +0 -0
- data/xCodeEval/problem_descriptions.jsonl +3 -0
- data/xCodeEval/prog_syn_val.jsonl +0 -0
- data/xCodeEval/unittest_db.json +3 -0
- requirements.txt +15 -0
- src/constants/__init__.py +0 -0
- src/constants/__pycache__/__init__.cpython-311.pyc +0 -0
- src/constants/__pycache__/lang_mappings.cpython-311.pyc +0 -0
- src/constants/__pycache__/paths.cpython-311.pyc +0 -0
- src/constants/lang_mappings.py +14 -0
- src/constants/paths.py +135 -0
- src/datasets/APPSDataset.py +48 -0
- src/datasets/CodeContestDataset.py +41 -0
- src/datasets/Dataset.py +33 -0
- src/datasets/DatasetFactory.py +25 -0
- src/datasets/HumanEvalDataset.py +46 -0
- src/datasets/MBPPDataset.py +47 -0
- src/datasets/XCodeDataset.py +50 -0
- src/datasets/__init__.py +0 -0
- src/datasets/__pycache__/APPSDataset.cpython-311.pyc +0 -0
- src/datasets/__pycache__/CodeContestDataset.cpython-311.pyc +0 -0
- src/datasets/__pycache__/Dataset.cpython-311.pyc +0 -0
- src/datasets/__pycache__/DatasetFactory.cpython-311.pyc +0 -0
- src/datasets/__pycache__/HumanEvalDataset.cpython-311.pyc +0 -0
- src/datasets/__pycache__/MBPPDataset.cpython-311.pyc +0 -0
- src/datasets/__pycache__/XCodeDataset.cpython-311.pyc +0 -0
- src/datasets/__pycache__/__init__.cpython-311.pyc +0 -0
- src/datasets/convert-apps-xcode.py +129 -0
- src/datasets/convert-cc-xcode.py +66 -0
- src/evaluate-et-dataset.py +104 -0
- src/evaluations/__init__.py +0 -0
- src/evaluations/__pycache__/__init__.cpython-311.pyc +0 -0
- src/evaluations/__pycache__/api_comm.cpython-311.pyc +0 -0
- src/evaluations/__pycache__/evalute.cpython-311.pyc +0 -0
- src/evaluations/__pycache__/exec_outcome.cpython-311.pyc +0 -0
- src/evaluations/__pycache__/executor_utils.cpython-311.pyc +0 -0
- src/evaluations/__pycache__/func_evaluate.cpython-311.pyc +0 -0
- src/evaluations/api_comm.py +115 -0
- src/evaluations/evalute.py +168 -0
- src/evaluations/exec_outcome.py +14 -0
- src/evaluations/executor_utils.py +58 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
data/CodeContest/Test.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
data/xCodeEval/problem_descriptions.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
data/xCodeEval/unittest_db.json filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,3 +1,23 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Running our project
|
| 2 |
+
|
| 3 |
+
1. Clone our project
|
| 4 |
+
```bash
|
| 5 |
+
cd DebateCoder
|
| 6 |
+
```
|
| 7 |
+
|
| 8 |
+
2. Create a new conda or python virtual environment and run the following command
|
| 9 |
+
```bash
|
| 10 |
+
pip install -r requirements.txt
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
3. Set up the .env file by seeing the example.
|
| 14 |
+
|
| 15 |
+
4. Run the following command to see the options of running this projects
|
| 16 |
+
```bash
|
| 17 |
+
python src/main.py --help
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
5. Finally run this project. An example is given below:
|
| 21 |
+
```bash
|
| 22 |
+
python src/main.py --model Pangu --dataset HumanEval --strategy DebateCoder
|
| 23 |
+
```
|
data/APPS/selected150.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/CodeContest/Test.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8343e00e4b52160990576bcd2b2588466d2ac39b2751401beedd6606294423c9
|
| 3 |
+
size 23356061
|
data/HumanEval/HumanEval.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/HumanEval/HumanEvalET.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/HumanEval/HumanEvalWST.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/MBPPEval/MBPP.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/MBPPEval/MBPP_ET.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/MBPPEval/mbpp-py.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/xCodeEval/problem_descriptions.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bda417d0e06ded34ee62255b84ac386e8535137b82f8c4f905adda0b93d9f78b
|
| 3 |
+
size 18159029
|
data/xCodeEval/prog_syn_val.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/xCodeEval/unittest_db.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:26b0b526ad96f7abbf11057b9547febf8237ce0bff00cad96fefc650e34374d5
|
| 3 |
+
size 11346609
|
requirements.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# python==3.11
|
| 2 |
+
openai
|
| 3 |
+
numpy
|
| 4 |
+
pandas
|
| 5 |
+
python-dotenv
|
| 6 |
+
pyyml
|
| 7 |
+
tenacity
|
| 8 |
+
tiktoken
|
| 9 |
+
tqdm
|
| 10 |
+
gensim
|
| 11 |
+
jsonlines
|
| 12 |
+
astunparse
|
| 13 |
+
pyarrow
|
| 14 |
+
google-genai
|
| 15 |
+
accelerate
|
src/constants/__init__.py
ADDED
|
File without changes
|
src/constants/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (205 Bytes). View file
|
|
|
src/constants/__pycache__/lang_mappings.cpython-311.pyc
ADDED
|
Binary file (421 Bytes). View file
|
|
|
src/constants/__pycache__/paths.cpython-311.pyc
ADDED
|
Binary file (2.57 kB). View file
|
|
|
src/constants/lang_mappings.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
LANGUAGE_MAPPING = {
|
| 2 |
+
"Python": "Python 3",
|
| 3 |
+
"Python3": "Python 3",
|
| 4 |
+
"C#": "C# 10",
|
| 5 |
+
"NET-CORE": ".NET Core C#",
|
| 6 |
+
# "Node": "Node.js",
|
| 7 |
+
"Rust": "Rust",
|
| 8 |
+
# "Java":"Java 17",
|
| 9 |
+
"PHP": "PHP",
|
| 10 |
+
"Go": "Go",
|
| 11 |
+
"Ruby": "Ruby",
|
| 12 |
+
"C++": "GNU C++17",
|
| 13 |
+
"C": "GNU C"
|
| 14 |
+
}
|
src/constants/paths.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from os.path import join, dirname
|
| 3 |
+
|
| 4 |
+
# HumanEval Dataset
|
| 5 |
+
HUMAN_DATA_DIR = join(
|
| 6 |
+
"data",
|
| 7 |
+
"HumanEval",
|
| 8 |
+
)
|
| 9 |
+
|
| 10 |
+
HUMAN_DATA_PATH = join(
|
| 11 |
+
HUMAN_DATA_DIR,
|
| 12 |
+
"HumanEval.jsonl"
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
HUMAN_WST_DATA_PATH = join(
|
| 16 |
+
HUMAN_DATA_DIR,
|
| 17 |
+
"HumanEvalWST.jsonl"
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
HUMAN_REFLXION_FILTERED_PATH = join(
|
| 21 |
+
HUMAN_DATA_DIR,
|
| 22 |
+
"humaneval-py.jsonl"
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
HUMAN_HARDSET_PATH = join(
|
| 26 |
+
HUMAN_DATA_DIR,
|
| 27 |
+
"humaneval-py_hardest50.jsonl"
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
HUMAN_ET_DATA_PATH = join(
|
| 31 |
+
HUMAN_DATA_DIR,
|
| 32 |
+
"HumanEvalET.jsonl"
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
HUMAN_SIMILAR_PROBLEMS_PATH = join(
|
| 36 |
+
HUMAN_DATA_DIR,
|
| 37 |
+
"similar_problems_solutions.jsonl"
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# MBPP Dataset
|
| 42 |
+
MBPP_DATA_DIR = join(
|
| 43 |
+
"data",
|
| 44 |
+
"MBPPEval",
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
MBPP_DATA_PATH = join(
|
| 48 |
+
MBPP_DATA_DIR,
|
| 49 |
+
"MBPP-py.jsonl"
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
MBPP_ET_DATA_PATH = join(
|
| 53 |
+
MBPP_DATA_DIR,
|
| 54 |
+
"MBPP_ET.jsonl"
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
MBPP_SANITIZED_DATA_PATH = join(
|
| 58 |
+
MBPP_DATA_DIR,
|
| 59 |
+
"MBPP_SANITIZED.json"
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
MBPP_SIMILAR_PROBLEMS_PATH = join(
|
| 63 |
+
MBPP_DATA_DIR,
|
| 64 |
+
"similar_problems_solutions.jsonl"
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
# XCodeEval Dataset
|
| 68 |
+
XCODE_DATA_DIR = join(
|
| 69 |
+
"data",
|
| 70 |
+
"xCodeEval",
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
XCODE_VALIDATION_DATA_PATH = join(
|
| 74 |
+
XCODE_DATA_DIR,
|
| 75 |
+
"prog_syn_val.jsonl"
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
XCODE_TEST_DATA_PATH = join(
|
| 79 |
+
XCODE_DATA_DIR,
|
| 80 |
+
"prog_syn_test.jsonl"
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
XCODE_TRAIN_DATA_DIR_PATH = join(
|
| 84 |
+
XCODE_DATA_DIR,
|
| 85 |
+
"train"
|
| 86 |
+
)
|
| 87 |
+
|
| 88 |
+
XCODE_UNIT_TEST_PATH = join(
|
| 89 |
+
XCODE_DATA_DIR,
|
| 90 |
+
"unittest_db.json"
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
XCODE_PROBLEM_DESCRIPTION_PATH = join(
|
| 94 |
+
XCODE_DATA_DIR,
|
| 95 |
+
"problem_descriptions.jsonl"
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
XCODE_SIMILAR_SRC_UIDS_PATH = join(
|
| 99 |
+
XCODE_DATA_DIR,
|
| 100 |
+
"similar_src_uids.json"
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
XCODE_SIMILAR_PROBLEMS_PATH = join(
|
| 104 |
+
XCODE_DATA_DIR,
|
| 105 |
+
"similar_problems_solutions.json"
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
XCODE_PROBLEM_FILE_MAPPINGS_PATH = join(
|
| 109 |
+
XCODE_DATA_DIR,
|
| 110 |
+
"problem_file_mapping.json"
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
# Code Contest Dataset
|
| 115 |
+
CODE_CONTEST_DATA_DIR = join(
|
| 116 |
+
"data",
|
| 117 |
+
"CodeContest",
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
CODE_CONTEST_DATA_PATH = join(
|
| 121 |
+
CODE_CONTEST_DATA_DIR,
|
| 122 |
+
"Test.jsonl"
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
# APPS Dataset
|
| 127 |
+
APPS_DATA_DIR = join(
|
| 128 |
+
"data",
|
| 129 |
+
"APPS",
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
APPS_DATA_PATH = join(
|
| 133 |
+
APPS_DATA_DIR,
|
| 134 |
+
"selected150.jsonl"
|
| 135 |
+
)
|
src/datasets/APPSDataset.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .Dataset import Dataset
|
| 2 |
+
from evaluations.evalute import contest_evaluate, contest_evaluate_public_tests
|
| 3 |
+
from constants.paths import *
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class APPSDataset(Dataset):
|
| 7 |
+
def __init__(
|
| 8 |
+
self,
|
| 9 |
+
path: str = APPS_DATA_PATH,
|
| 10 |
+
):
|
| 11 |
+
super().__init__(path)
|
| 12 |
+
self.id_key = "id"
|
| 13 |
+
|
| 14 |
+
def evaluate(
|
| 15 |
+
self,
|
| 16 |
+
item: dict,
|
| 17 |
+
cur_imp: str,
|
| 18 |
+
language: str,
|
| 19 |
+
):
|
| 20 |
+
return contest_evaluate(
|
| 21 |
+
generated_code=cur_imp,
|
| 22 |
+
id=item["id"],
|
| 23 |
+
tests=item["test_list"],
|
| 24 |
+
lang=language
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
def evaluate_sample_io(
|
| 28 |
+
self,
|
| 29 |
+
item: dict,
|
| 30 |
+
cur_imp: str,
|
| 31 |
+
language: str,
|
| 32 |
+
):
|
| 33 |
+
if len(item["sample_io"]) == 0:
|
| 34 |
+
return True, ""
|
| 35 |
+
return contest_evaluate_public_tests(
|
| 36 |
+
generated_code=cur_imp,
|
| 37 |
+
id=item["id"],
|
| 38 |
+
tests=item["sample_io"],
|
| 39 |
+
lang=language
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
@staticmethod
|
| 43 |
+
def get_prompt(item):
|
| 44 |
+
sample_io_format = ""
|
| 45 |
+
if len(item['sample_io']) > 0:
|
| 46 |
+
sample_io_format = f"Sample Input Format:\n{item['sample_io'][0]['input']}\nSample Output Format:\n{item['sample_io'][0]['output'][0]}\n\n-------\n"
|
| 47 |
+
|
| 48 |
+
return f"{item['description']}\n\n{sample_io_format}Important: You must follow the input output format. Input should be taken from standard input and output should be given to standard output.\nNote: If you are writing a function then after the function definition take input from using `input()` function, call the function with specified parameters and finally print the output of the function."
|
src/datasets/CodeContestDataset.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .Dataset import Dataset
|
| 2 |
+
from evaluations.evalute import contest_evaluate, contest_evaluate_public_tests
|
| 3 |
+
from constants.paths import *
|
| 4 |
+
|
| 5 |
+
class CodeContestDataset(Dataset):
|
| 6 |
+
def __init__(
|
| 7 |
+
self,
|
| 8 |
+
path: str=CODE_CONTEST_DATA_PATH,
|
| 9 |
+
):
|
| 10 |
+
super().__init__(path)
|
| 11 |
+
self.id_key = "id"
|
| 12 |
+
|
| 13 |
+
def evaluate(
|
| 14 |
+
self,
|
| 15 |
+
item: dict,
|
| 16 |
+
cur_imp: str,
|
| 17 |
+
language: str,
|
| 18 |
+
):
|
| 19 |
+
return contest_evaluate(
|
| 20 |
+
generated_code=cur_imp,
|
| 21 |
+
id=item["id"],
|
| 22 |
+
tests=item["test_list"],
|
| 23 |
+
lang=language
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
def evaluate_sample_io(
|
| 27 |
+
self,
|
| 28 |
+
item: dict,
|
| 29 |
+
cur_imp: str,
|
| 30 |
+
language: str,
|
| 31 |
+
):
|
| 32 |
+
return contest_evaluate_public_tests(
|
| 33 |
+
generated_code=cur_imp,
|
| 34 |
+
id=item["id"],
|
| 35 |
+
tests=item["sample_io"],
|
| 36 |
+
lang=language
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
@staticmethod
|
| 40 |
+
def get_prompt(item):
|
| 41 |
+
return f"{item['description']}\n\n-------\nImportant Note: You must follow the input output format. Input must be taken from standard input and output must be given to standard output. The code will be tested against multiple test cases and all the test cases must be passed."
|
src/datasets/Dataset.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from utils.jsonl import read_jsonl
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class Dataset(object):
|
| 5 |
+
def __init__(
|
| 6 |
+
self,
|
| 7 |
+
path: str,
|
| 8 |
+
):
|
| 9 |
+
self.path = path
|
| 10 |
+
self.data = None
|
| 11 |
+
self.id_key = ""
|
| 12 |
+
self.load()
|
| 13 |
+
|
| 14 |
+
def load(self):
|
| 15 |
+
self.data = read_jsonl(self.path)
|
| 16 |
+
|
| 17 |
+
def __len__(self):
|
| 18 |
+
return len(self.data)
|
| 19 |
+
|
| 20 |
+
def __getitem__(self, idx):
|
| 21 |
+
return self.data[idx]
|
| 22 |
+
|
| 23 |
+
def evaluate(
|
| 24 |
+
self,
|
| 25 |
+
item: dict,
|
| 26 |
+
cur_imp: str,
|
| 27 |
+
language: str,
|
| 28 |
+
):
|
| 29 |
+
raise NotImplementedError
|
| 30 |
+
|
| 31 |
+
@staticmethod
|
| 32 |
+
def get_prompt(item):
|
| 33 |
+
raise NotImplementedError
|
src/datasets/DatasetFactory.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datasets.Dataset import Dataset
|
| 2 |
+
from datasets.MBPPDataset import MBPPDataset
|
| 3 |
+
from datasets.APPSDataset import APPSDataset
|
| 4 |
+
from datasets.XCodeDataset import XCodeDataset
|
| 5 |
+
from datasets.HumanEvalDataset import HumanDataset
|
| 6 |
+
from datasets.CodeContestDataset import CodeContestDataset
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class DatasetFactory:
|
| 10 |
+
@staticmethod
|
| 11 |
+
def get_dataset_class(dataset_name):
|
| 12 |
+
if dataset_name == "APPS":
|
| 13 |
+
return APPSDataset
|
| 14 |
+
elif dataset_name == "MBPP":
|
| 15 |
+
return MBPPDataset
|
| 16 |
+
elif dataset_name == "XCode":
|
| 17 |
+
return XCodeDataset
|
| 18 |
+
elif dataset_name == "HumanEval":
|
| 19 |
+
return HumanDataset
|
| 20 |
+
elif dataset_name == "Human":
|
| 21 |
+
return HumanDataset
|
| 22 |
+
elif dataset_name == "CC":
|
| 23 |
+
return CodeContestDataset
|
| 24 |
+
else:
|
| 25 |
+
raise Exception(f"Unknown dataset name {dataset_name}")
|
src/datasets/HumanEvalDataset.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .Dataset import Dataset
|
| 2 |
+
from evaluations.func_evaluate import evaluate_functional_correctness, evaluate_io
|
| 3 |
+
from constants.paths import *
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class HumanDataset(Dataset):
|
| 7 |
+
def __init__(
|
| 8 |
+
self,
|
| 9 |
+
path: str = HUMAN_WST_DATA_PATH,
|
| 10 |
+
):
|
| 11 |
+
super().__init__(path)
|
| 12 |
+
self.id_key = "task_id"
|
| 13 |
+
|
| 14 |
+
def evaluate(
|
| 15 |
+
self,
|
| 16 |
+
item: dict,
|
| 17 |
+
cur_imp: str,
|
| 18 |
+
language: str,
|
| 19 |
+
):
|
| 20 |
+
result = evaluate_functional_correctness(
|
| 21 |
+
problem=item,
|
| 22 |
+
completion=cur_imp
|
| 23 |
+
)
|
| 24 |
+
return result == "passed"
|
| 25 |
+
|
| 26 |
+
def evaluate_sample_io(
|
| 27 |
+
self,
|
| 28 |
+
item: dict,
|
| 29 |
+
cur_imp: str,
|
| 30 |
+
language: str,
|
| 31 |
+
):
|
| 32 |
+
|
| 33 |
+
return evaluate_io(
|
| 34 |
+
sample_io=item["sample_io"],
|
| 35 |
+
completion=cur_imp,
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
@staticmethod
|
| 40 |
+
def get_prompt(item):
|
| 41 |
+
if "prompt" in item:
|
| 42 |
+
return f"{item['prompt']}"
|
| 43 |
+
elif "text" in item:
|
| 44 |
+
return f"{item['text']}"
|
| 45 |
+
else:
|
| 46 |
+
raise Exception("No prompt or text in item")
|
src/datasets/MBPPDataset.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .Dataset import Dataset
|
| 2 |
+
from evaluations.func_evaluate import evaluate_io, evaluate_functional_correctness
|
| 3 |
+
from constants.paths import *
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class MBPPDataset(Dataset):
|
| 7 |
+
def __init__(
|
| 8 |
+
self,
|
| 9 |
+
path: str = MBPP_DATA_PATH,
|
| 10 |
+
):
|
| 11 |
+
super().__init__(path)
|
| 12 |
+
self.id_key = "name"
|
| 13 |
+
|
| 14 |
+
def evaluate(
|
| 15 |
+
self,
|
| 16 |
+
item: dict,
|
| 17 |
+
cur_imp: str,
|
| 18 |
+
language: str,
|
| 19 |
+
):
|
| 20 |
+
# result, _ = evaluate_io(item['test_list'],cur_imp,5,True)
|
| 21 |
+
# return result
|
| 22 |
+
result = evaluate_functional_correctness(
|
| 23 |
+
problem=item,
|
| 24 |
+
completion=cur_imp
|
| 25 |
+
)
|
| 26 |
+
return result == "passed"
|
| 27 |
+
|
| 28 |
+
def evaluate_sample_io(
|
| 29 |
+
self,
|
| 30 |
+
item: dict,
|
| 31 |
+
cur_imp: str,
|
| 32 |
+
language: str,
|
| 33 |
+
):
|
| 34 |
+
if "sample_io" not in item:
|
| 35 |
+
return True, ""
|
| 36 |
+
if len(item["sample_io"]) == 0:
|
| 37 |
+
return True, ""
|
| 38 |
+
return evaluate_io(
|
| 39 |
+
sample_io=item["sample_io"],
|
| 40 |
+
completion=cur_imp,
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
@staticmethod
|
| 44 |
+
def get_prompt(item):
|
| 45 |
+
# function_signature = item['code'].split('\n')[0].strip()
|
| 46 |
+
# return f"{item['text']}\nFunction Signature: {function_signature}"
|
| 47 |
+
return item["prompt"]
|
src/datasets/XCodeDataset.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .Dataset import Dataset
|
| 2 |
+
from evaluations.evalute import xcode_evaluate, contest_evaluate_public_tests
|
| 3 |
+
from constants.paths import *
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class XCodeDataset(Dataset):
|
| 7 |
+
def __init__(
|
| 8 |
+
self,
|
| 9 |
+
path: str = XCODE_VALIDATION_DATA_PATH,
|
| 10 |
+
):
|
| 11 |
+
super().__init__(path)
|
| 12 |
+
self.id_key = "src_uid"
|
| 13 |
+
|
| 14 |
+
def evaluate_sample_io(
|
| 15 |
+
self,
|
| 16 |
+
item: dict,
|
| 17 |
+
cur_imp: str,
|
| 18 |
+
language: str,
|
| 19 |
+
):
|
| 20 |
+
sample_io = []
|
| 21 |
+
|
| 22 |
+
for input, output in zip(item["sample_inputs"], item["sample_outputs"]):
|
| 23 |
+
sample_io.append({
|
| 24 |
+
"input": input,
|
| 25 |
+
"output": [output]
|
| 26 |
+
})
|
| 27 |
+
|
| 28 |
+
return contest_evaluate_public_tests(
|
| 29 |
+
generated_code=cur_imp,
|
| 30 |
+
id=item[self.id_key],
|
| 31 |
+
tests=sample_io,
|
| 32 |
+
lang=language
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def evaluate(
|
| 37 |
+
self,
|
| 38 |
+
item: dict,
|
| 39 |
+
cur_imp: str,
|
| 40 |
+
language: str,
|
| 41 |
+
):
|
| 42 |
+
return xcode_evaluate(
|
| 43 |
+
generated_code=cur_imp,
|
| 44 |
+
src_uid=item["src_uid"],
|
| 45 |
+
lang=language
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
@staticmethod
|
| 49 |
+
def get_prompt(item):
|
| 50 |
+
return f"Problem Description:\n{item['description']}\nInput Specification:\n{item['input_spec']}\nOutput Specification:\n{item['output_spec']}\nSample Inputs: {item['sample_inputs']}\nSample Outputs: {item['sample_outputs']}\nNote: {item['notes']}\nTake input from: {item['input_from']}\nGive output to: {item['output_to']}\nTime Limit: {item['time_limit']}\nMemory Limit: {item['memory_limit']}\n\nNote: If you are writing a function then after the function definition take input from using `input()` function, call the function with specified parameters and finally print the output of the function."
|
src/datasets/__init__.py
ADDED
|
File without changes
|
src/datasets/__pycache__/APPSDataset.cpython-311.pyc
ADDED
|
Binary file (2.91 kB). View file
|
|
|
src/datasets/__pycache__/CodeContestDataset.cpython-311.pyc
ADDED
|
Binary file (2.32 kB). View file
|
|
|
src/datasets/__pycache__/Dataset.cpython-311.pyc
ADDED
|
Binary file (1.83 kB). View file
|
|
|
src/datasets/__pycache__/DatasetFactory.cpython-311.pyc
ADDED
|
Binary file (1.52 kB). View file
|
|
|
src/datasets/__pycache__/HumanEvalDataset.cpython-311.pyc
ADDED
|
Binary file (2.12 kB). View file
|
|
|
src/datasets/__pycache__/MBPPDataset.cpython-311.pyc
ADDED
|
Binary file (2.07 kB). View file
|
|
|
src/datasets/__pycache__/XCodeDataset.cpython-311.pyc
ADDED
|
Binary file (3.27 kB). View file
|
|
|
src/datasets/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (204 Bytes). View file
|
|
|
src/datasets/convert-apps-xcode.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import json
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def read_jsonl(filename):
|
| 6 |
+
"""Reads a jsonl file and yields each line as a dictionary"""
|
| 7 |
+
lines = []
|
| 8 |
+
# i = 0
|
| 9 |
+
with open(filename, "r", encoding="utf-8") as file:
|
| 10 |
+
for line in file:
|
| 11 |
+
lines.append(json.loads(line))
|
| 12 |
+
# i += 1
|
| 13 |
+
# print(i)
|
| 14 |
+
return lines
|
| 15 |
+
|
| 16 |
+
# Write a python list of dictionaries into a jsonl file
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def write_jsonl(filename, lines):
|
| 20 |
+
"""Writes a python list of dictionaries into a jsonl file"""
|
| 21 |
+
with open(filename, "w", encoding="utf-8") as file:
|
| 22 |
+
for line in lines:
|
| 23 |
+
file.write(json.dumps(line) + "\n")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
train_set = read_jsonl("./data/APPS/train.jsonl")
|
| 27 |
+
test_set = read_jsonl("./data/APPS/train.jsonl")
|
| 28 |
+
|
| 29 |
+
dataset = train_set + test_set
|
| 30 |
+
|
| 31 |
+
print(len(dataset))
|
| 32 |
+
|
| 33 |
+
dataset = pd.DataFrame(dataset)
|
| 34 |
+
# dataset.columns
|
| 35 |
+
|
| 36 |
+
print(dataset['difficulty'].unique())
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# Filter problems from codeforces with atleast 10 input and output
|
| 40 |
+
filter_indices = [False] * len(dataset)
|
| 41 |
+
for i in range(len(dataset)):
|
| 42 |
+
row = dataset.iloc[i]
|
| 43 |
+
if "codeforces" in row['url'] and row['input_output'] and len(json.loads(row['input_output'])["inputs"]) > 5:
|
| 44 |
+
filter_indices[i] = True
|
| 45 |
+
|
| 46 |
+
codeforces_dataset = dataset[filter_indices]
|
| 47 |
+
|
| 48 |
+
print(len(codeforces_dataset))
|
| 49 |
+
|
| 50 |
+
# Randomly choose 50 problems
|
| 51 |
+
codeforces_dataset_50 = codeforces_dataset.sample(n=min(50, len(codeforces_dataset)), random_state=1, replace=False)
|
| 52 |
+
print(len(codeforces_dataset_50))
|
| 53 |
+
|
| 54 |
+
codeforces_dataset_50.reset_index(drop=True, inplace=True)
|
| 55 |
+
|
| 56 |
+
# Filter interview problems with atleast 10 input and output
|
| 57 |
+
filter_indices = [False] * len(dataset)
|
| 58 |
+
for i in range(len(dataset)):
|
| 59 |
+
row = dataset.iloc[i]
|
| 60 |
+
if "interview" == row['difficulty'] and row['input_output'] and len(row['input_output']) < 2000 and len(json.loads(row['input_output'])["inputs"]) > 5:
|
| 61 |
+
filter_indices[i] = True
|
| 62 |
+
|
| 63 |
+
interview_dataset = dataset[filter_indices]
|
| 64 |
+
|
| 65 |
+
print(len(interview_dataset))
|
| 66 |
+
|
| 67 |
+
# Randomly choose 50 problems
|
| 68 |
+
interview_dataset_50 = interview_dataset.sample(
|
| 69 |
+
n=min(50, len(interview_dataset)), random_state=1, replace=False)
|
| 70 |
+
print(len(interview_dataset_50))
|
| 71 |
+
|
| 72 |
+
interview_dataset_50.reset_index(drop=True, inplace=True)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# Filter introductory problems with atleast 10 input and output
|
| 76 |
+
filter_indices = [False] * len(dataset)
|
| 77 |
+
for i in range(len(dataset)):
|
| 78 |
+
row = dataset.iloc[i]
|
| 79 |
+
if "introductory" == row['difficulty'] and len(row['input_output']) < 2000 and len(json.loads(row['input_output'])["inputs"]) > 5:
|
| 80 |
+
filter_indices[i] = True
|
| 81 |
+
|
| 82 |
+
introductory_dataset = dataset[filter_indices]
|
| 83 |
+
|
| 84 |
+
print(len(introductory_dataset))
|
| 85 |
+
|
| 86 |
+
# Randomly choose 50 problems
|
| 87 |
+
introductory_dataset_50 = introductory_dataset.sample(
|
| 88 |
+
n=min(50, len(introductory_dataset)), random_state=1, replace=False)
|
| 89 |
+
print(len(introductory_dataset_50))
|
| 90 |
+
|
| 91 |
+
introductory_dataset_50.reset_index(drop=True, inplace=True)
|
| 92 |
+
|
| 93 |
+
selected_df = pd.concat([introductory_dataset_50, interview_dataset_50, codeforces_dataset_50], ignore_index=True)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def get_test_cases(input, output):
|
| 97 |
+
return {
|
| 98 |
+
"input": "\n".join([str(x) for x in input]) if type(input) == list else input,
|
| 99 |
+
"output": output if type(output) == list else [output]
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
selected_datasets = []
|
| 104 |
+
|
| 105 |
+
for i in range(len(selected_df)):
|
| 106 |
+
row = selected_df.iloc[i]
|
| 107 |
+
test_cases = json.loads(row['input_output'])
|
| 108 |
+
|
| 109 |
+
public_test_cases = list(
|
| 110 |
+
map(get_test_cases, test_cases['inputs'][0:2], test_cases['outputs'][0:2]))
|
| 111 |
+
test_cases = list(
|
| 112 |
+
map(get_test_cases, test_cases['inputs'], test_cases['outputs']))
|
| 113 |
+
|
| 114 |
+
test = {
|
| 115 |
+
"name": str(row['id']),
|
| 116 |
+
"description": str(row['question']),
|
| 117 |
+
"difficulty": str(row['difficulty']),
|
| 118 |
+
"id": int(row['id']),
|
| 119 |
+
"sample_io": public_test_cases,
|
| 120 |
+
"test_list": test_cases,
|
| 121 |
+
"starter_code": str(row['starter_code']),
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
selected_datasets.append(test)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
write_jsonl("./data/APPS/selected150.jsonl", selected_datasets)
|
| 128 |
+
|
| 129 |
+
|
src/datasets/convert-cc-xcode.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Using this python file we have converted the code contest dataset to the format of the xCodeEval dataset.
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import json
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def read_jsonl(filename):
|
| 8 |
+
"""Reads a jsonl file and yields each line as a dictionary"""
|
| 9 |
+
lines = []
|
| 10 |
+
# i = 0
|
| 11 |
+
with open(filename, "r", encoding="utf-8") as file:
|
| 12 |
+
for line in file:
|
| 13 |
+
lines.append(json.loads(line))
|
| 14 |
+
# i += 1
|
| 15 |
+
# print(i)
|
| 16 |
+
return lines
|
| 17 |
+
|
| 18 |
+
# Write a python list of dictionaries into a jsonl file
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def write_jsonl(filename, lines):
|
| 22 |
+
"""Writes a python list of dictionaries into a jsonl file"""
|
| 23 |
+
with open(filename, "w", encoding="utf-8") as file:
|
| 24 |
+
for line in lines:
|
| 25 |
+
file.write(json.dumps(line) + "\n")
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
df = pd.read_parquet("./data/CodeContest/validation.parquet", engine='pyarrow')
|
| 29 |
+
df = df[['name', 'cf_contest_id', 'cf_tags', 'difficulty',
|
| 30 |
+
'description', 'public_tests', 'private_tests', 'generated_tests']]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def get_test_cases(input, output):
|
| 34 |
+
return {
|
| 35 |
+
"input": str(input),
|
| 36 |
+
"output": [str(output)]
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
test_datasets = []
|
| 41 |
+
|
| 42 |
+
for i in range(len(df)):
|
| 43 |
+
row = df.iloc[i]
|
| 44 |
+
|
| 45 |
+
public_test_cases = list(
|
| 46 |
+
map(get_test_cases, row['public_tests']['input'], row['public_tests']['output']))
|
| 47 |
+
test_cases = []
|
| 48 |
+
test_cases.extend(list(map(
|
| 49 |
+
get_test_cases, row['private_tests']['input'], row['private_tests']['output'])))
|
| 50 |
+
test_cases.extend(list(map(
|
| 51 |
+
get_test_cases, row['generated_tests']['input'], row['generated_tests']['output'])))
|
| 52 |
+
|
| 53 |
+
test = {
|
| 54 |
+
"name": str(row['name']),
|
| 55 |
+
"description": str(row['description']),
|
| 56 |
+
"tags": list(row['cf_tags']),
|
| 57 |
+
"difficulty": int(row['difficulty']),
|
| 58 |
+
"id": int(row['cf_contest_id']),
|
| 59 |
+
"sample_io": public_test_cases,
|
| 60 |
+
"test_list": test_cases
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
test_datasets.append(test)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
write_jsonl("./data/CodeContest/Val.jsonl", test_datasets)
|
src/evaluate-et-dataset.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from utils.jsonl import read_jsonl, write_jsonl
|
| 2 |
+
from evaluations.func_evaluate import evaluate_io_et
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def generate_et_dataset(
    NORMAL_RESULTS_PATH,
    ET_RESULTS_PATH,
    ET_DATA_PATH="./data/HumanEval/HumanEvalET.jsonl"
):
    """Re-score HumanEval results against the extended-test (ET) dataset.

    Args:
        NORMAL_RESULTS_PATH: jsonl of generated results; each record has a
            "task_id" and either "source_codes" (list) or "solution" (str).
        ET_RESULTS_PATH: output jsonl path for the re-scored results.
        ET_DATA_PATH: jsonl of ET items with "task_id", "test_case_list"
            and "prompt". Default uses forward slashes so it also works on
            POSIX systems (the original ".\\" form was Windows-only).

    Side effects: writes ET_RESULTS_PATH and prints the accuracy once.
    """
    dataset = read_jsonl(ET_DATA_PATH)
    data_dict = {}
    for item in dataset:
        data_dict[item["task_id"]] = {"et_item": item}

    results = read_jsonl(NORMAL_RESULTS_PATH)
    for result in results:
        data_dict[result["task_id"]]["result"] = result

    correct_count = 0
    et_results = []
    for key, value in data_dict.items():
        item = value["et_item"]
        # Skip ET tasks with no corresponding generated result instead of
        # raising KeyError (mirrors the guard in generate_et_dataset_mbpp).
        result = value.get("result")
        if result is None:
            continue

        generated_code = result["source_codes"][0] if "source_codes" in result else result["solution"]

        passed = evaluate_io_et(
            item['test_case_list'],
            generated_code,
            prompt=item["prompt"]
        )

        result["is_solved"] = passed
        if passed:
            correct_count += 1

        et_results.append(result)

    # Sort deterministically by the numeric suffix of the task id
    # (task ids look like "<benchmark>/<number>", per the split('/') below).
    et_results = sorted(
        et_results,
        key=lambda x: int(x["task_id"].split('/')[-1])
    )

    write_jsonl(ET_RESULTS_PATH, et_results)

    # Print the accuracy once (the original printed it twice) and guard
    # against division by zero when no results were evaluated.
    total = len(et_results)
    if total:
        print(f"Accuracy: {correct_count}/{total} = {correct_count/total:.2f}")
    else:
        print("Accuracy: no results to evaluate")
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def generate_et_dataset_mbpp(
    NORMAL_RESULTS_PATH,
    ET_RESULTS_PATH,
    ET_DATA_PATH="./data/MBPPEval/MBPP_ET.jsonl"
):
    """Re-score MBPP results against the extended-test (ET) dataset.

    Args:
        NORMAL_RESULTS_PATH: jsonl of generated results; each record has a
            "name" of the form "<prefix>_<task_id>" and either
            "source_codes" (list) or "solution" (str).
        ET_RESULTS_PATH: output jsonl path for the re-scored results.
        ET_DATA_PATH: jsonl of ET items keyed by integer "task_id". Default
            uses forward slashes so it also works on POSIX systems (the
            original ".\\" form was Windows-only).

    Side effects: writes ET_RESULTS_PATH and prints the accuracy once.
    """
    dataset = read_jsonl(ET_DATA_PATH)
    data_dict = {}
    for item in dataset:
        data_dict[item["task_id"]] = {"et_item": item}

    results = read_jsonl(NORMAL_RESULTS_PATH)
    for result in results:
        # Result names look like "<prefix>_<task_id>"; key by the numeric id.
        task_id = int(result["name"].split("_")[1])
        data_dict[task_id]["result"] = result

    correct_count = 0
    et_results = []
    for key, value in data_dict.items():
        item = value["et_item"]
        # Skip ET tasks with no corresponding generated result.
        result = value.get("result", None)
        if result is None:
            continue

        generated_code = result["source_codes"][0] if "source_codes" in result else result["solution"]

        passed = evaluate_io_et(
            item['test_list'],
            generated_code
        )

        result["is_solved"] = passed
        if passed:
            correct_count += 1

        et_results.append(result)

    # Sort deterministically by the numeric task id before writing.
    et_results = sorted(
        et_results,
        key=lambda x: int(x["name"].split("_")[1])
    )

    write_jsonl(ET_RESULTS_PATH, et_results)

    # Print the accuracy once (the original printed it twice) and guard
    # against division by zero when no results were evaluated.
    total = len(et_results)
    if total:
        print(f"Accuracy: {correct_count}/{total} = {correct_count/total:.2f}")
    else:
        print("Accuracy: no results to evaluate")
|
| 103 |
+
|
| 104 |
+
|
src/evaluations/__init__.py
ADDED
|
File without changes
|
src/evaluations/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (207 Bytes). View file
|
|
|
src/evaluations/__pycache__/api_comm.cpython-311.pyc
ADDED
|
Binary file (6.01 kB). View file
|
|
|
src/evaluations/__pycache__/evalute.cpython-311.pyc
ADDED
|
Binary file (7.09 kB). View file
|
|
|
src/evaluations/__pycache__/exec_outcome.cpython-311.pyc
ADDED
|
Binary file (685 Bytes). View file
|
|
|
src/evaluations/__pycache__/executor_utils.cpython-311.pyc
ADDED
|
Binary file (3.13 kB). View file
|
|
|
src/evaluations/__pycache__/func_evaluate.cpython-311.pyc
ADDED
|
Binary file (3.9 kB). View file
|
|
|
src/evaluations/api_comm.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass, field
|
| 2 |
+
|
| 3 |
+
import requests
|
| 4 |
+
from .exec_outcome import ExecOutcome
|
| 5 |
+
|
| 6 |
+
@dataclass
class ExtendedUnittest:
    """One unit test (input + accepted outputs) plus its execution result."""

    input: str
    output: list[str] = field(default_factory=list)
    # result: raw program output captured by the execution server (if any).
    result: str | None = None
    # exec_outcome: verdict assigned by the server (PASSED, WRONG_ANSWER, ...).
    exec_outcome: ExecOutcome | None = None

    def json(self):
        """Return a JSON-serializable dict of this test.

        Works on a *copy* of ``__dict__``: the original returned the live
        instance dict, so replacing ``exec_outcome`` with its name mutated
        the instance as a side effect of serialization.
        """
        _json = dict(self.__dict__)
        if self.exec_outcome is not None:
            _json["exec_outcome"] = self.exec_outcome.name

        return _json

    @classmethod
    def from_json(cls, _json):
        """Build an ExtendedUnittest from a plain dict, tolerating missing keys."""
        return cls(
            input=_json.get("input", ""),
            output=_json.get("output", list()),
            result=_json.get("result", None),
            exec_outcome=_json.get("exec_outcome", None),
        )
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class EmptyValueError(Exception):
    """Base error for a required value that is missing or empty.

    The original defined an ``__init__`` that only forwarded to
    ``super().__init__``; that is the inherited behavior, so it is omitted.
    """
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class EmptyUnittestError(EmptyValueError):
    """Raised when execute_code() is called with no unit tests."""
    pass
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class EmptyLanguageError(EmptyValueError):
    """Raised when execute_code() is called with no language."""
    pass
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class EmptySourceCodeError(EmptyValueError):
    """Raised when execute_code() is called with no source code."""
    pass
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class APICommunication:
    """Thin HTTP client for a remote code-execution server.

    Usable as a context manager; the pooled requests.Session is closed on
    exit.
    """

    _session: requests.Session

    def __init__(self, server_url: str = "http://localhost:5000"):
        # One pooled session reused for all requests to the server.
        self._session = requests.Session()
        self.execute_code_url = f"{server_url}/api/execute_code"
        self.get_runtimes_url = f"{server_url}/api/all_runtimes"

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # Release the HTTP connection pool.
        self._session.close()

    def get_runtimes(self):
        """Return the server's list of available language runtimes (parsed JSON)."""
        return self._session.get(self.get_runtimes_url).json()

    def execute_code(
        self,
        language: str,
        source_code: str,
        unittests: list[dict],
        limits: dict | None,
        block_network: bool = True,
        stop_on_first_fail: bool = True,
        use_sanitizer: bool = False,
        compiler_program_name: str | None = None,
        compiler_flags: str | None = None,
        interpreter_cmd: str | None = None,
        interpreter_flags: str | None = None,
        sample_id: int | None = None,
        task_id: str | int | None = None,
    ) -> tuple[list[ExtendedUnittest], int | None, str | int | None]:
        """POST the code and its unit tests to the server for execution.

        Returns (results, sample_id, task_id). NOTE(review): when the server
        response lacks a "data" key this returns the *string* "error" in the
        first slot, contradicting the annotated return type — callers must
        check for it (see xcode_evaluate / contest_evaluate).

        Raises:
            EmptyLanguageError: if *language* is None.
            EmptySourceCodeError: if *source_code* is None.
            EmptyUnittestError: if *unittests* is None or empty.
        """
        if language is None:
            raise EmptyLanguageError

        if source_code is None:
            raise EmptySourceCodeError

        if unittests is None or len(unittests) == 0:
            raise EmptyUnittestError

        # Non-dict limits are silently replaced by an empty dict (server defaults).
        request_body = dict(
            language=language,
            source_code=source_code,
            unittests=unittests,
            limits=limits if isinstance(limits, dict) else dict(),
            compile_cmd=compiler_program_name,
            compile_flags=compiler_flags,
            execute_cmd=interpreter_cmd,
            execute_flags=interpreter_flags,
            block_network=block_network,
            stop_on_first_fail=stop_on_first_fail,
            use_sanitizer=use_sanitizer,
        )
        json_response = self._session.post(
            self.execute_code_url,
            json=request_body,
            headers={"Content-Type": "application/json"},
        ).json()

        if "data" not in json_response:
            return "error", sample_id, task_id

        return (
            json_response["data"],
            sample_id,
            task_id,
        )
|
src/evaluations/evalute.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import numpy as np
|
| 4 |
+
import tqdm
|
| 5 |
+
from yaml import safe_load
|
| 6 |
+
from typing import List
|
| 7 |
+
|
| 8 |
+
from .api_comm import APICommunication
|
| 9 |
+
from .exec_outcome import ExecOutcome
|
| 10 |
+
from constants.lang_mappings import LANGUAGE_MAPPING
|
| 11 |
+
|
| 12 |
+
# Per-runtime resource limits (time/memory) for the execution server.
limits_by_lang_cfg_file = "./src/evaluations/limits_by_lang.yaml"

# NOTE(review): module-level asserts are stripped under `python -O`;
# they act as import-time configuration checks here.
assert os.path.exists(
    limits_by_lang_cfg_file), "Need resource limit defaults for all runtimes, provide the path to default 'limits_by_lang.yaml' or to the modified one."

with open(limits_by_lang_cfg_file) as limit_cfg_rp:
    limits_by_lang = safe_load(limit_cfg_rp)

# xCodeEval unit-test database, keyed by src_uid (see xcode_evaluate below).
unittest_file = "./data/xCodeEval/unittest_db.json"
assert os.path.exists(unittest_file), "Unittest file not found."

with open(unittest_file) as ut_rp:
    unittest_db = json.load(ut_rp)


# Shared client for the execution server, reused by every evaluate function
# in this module (default server URL comes from APICommunication).
api_comm = APICommunication()
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def xcode_evaluate(
    generated_code: str,
    src_uid: str,
    lang: str
):
    """Run *generated_code* against the xCodeEval unittest DB entry for *src_uid*.

    Returns True only when every unit test reports PASSED; returns False
    both on test failure and on a server-side "error" response.
    """
    assert src_uid in unittest_db, "Can not find the task id or source id"

    assert lang in LANGUAGE_MAPPING, f"language must be inside the supported language list: {LANGUAGE_MAPPING.keys()}"

    runtime = LANGUAGE_MAPPING[lang]
    results, _, _ = api_comm.execute_code(
        language=runtime,
        source_code=generated_code,
        unittests=unittest_db[src_uid],
        limits=limits_by_lang[runtime],
        task_id=src_uid,
    )

    # The client signals a malformed server response with the string "error".
    if results == "error":
        return False

    return all(
        outcome['exec_outcome'] == ExecOutcome.PASSED.value
        for outcome in results
    )
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def xcode_execute_internal_test(
    generated_code: str,
    tests: List[dict],
    src_uid: str,
    lang: str
):
    """Run *generated_code* on caller-supplied *tests* (no early stop).

    Returns:
        (passed, feedback): *passed* is True only if every test PASSED;
        *feedback* is a text blob listing the passed and failed test dicts.
    """
    results, _, _ = api_comm.execute_code(
        language=LANGUAGE_MAPPING[lang],
        source_code=generated_code,
        unittests=tests,
        limits=limits_by_lang[LANGUAGE_MAPPING[lang]],
        task_id=src_uid,
        stop_on_first_fail=False
    )

    passed = True
    passed_feedback = []
    failed_feedback = []

    idx = 0
    try:
        for idx, result in enumerate(results):
            if result['exec_outcome'] == ExecOutcome.PASSED.value:
                passed_feedback.append(tests[idx])
            else:
                failed_feedback.append(tests[idx])
                passed = False
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit). A malformed server payload — e.g.
        # the "error" sentinel string — lands here; everything from the
        # failing index onward is counted as failed.
        passed = False
        failed_feedback.extend(tests[idx:])

    feedback = f'Tested passed: \n{json.dumps(passed_feedback)}\n\nTests failed: \n{json.dumps(failed_feedback)}'

    return passed, feedback
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def contest_evaluate(
    generated_code: str,
    lang: str,
    id: int,
    tests: List[dict],
):
    """Execute *generated_code* against *tests* on the server.

    Returns True only when every test reports PASSED; False on any failure
    or on a server-side "error" response.
    """
    assert lang in LANGUAGE_MAPPING, f"language must be inside the supported language list: {LANGUAGE_MAPPING.keys()}"

    runtime = LANGUAGE_MAPPING[lang]
    results, _, _ = api_comm.execute_code(
        language=runtime,
        source_code=generated_code,
        unittests=tests,
        limits=limits_by_lang[runtime],
        task_id=id,
    )

    # The client signals a malformed server response with the string "error".
    if results == "error":
        return False

    return all(
        outcome['exec_outcome'] == ExecOutcome.PASSED.value
        for outcome in results
    )
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def contest_evaluate_public_tests(
    generated_code: str,
    lang: str,
    id: int,
    tests: List[dict],
):
    """Run *generated_code* on the public *tests* and build readable feedback.

    Each test is rendered as an Input / Expected Output / Your Output block
    (program output truncated to 500 chars).

    Returns:
        (passed, feedback): *passed* is True only if every test PASSED;
        *feedback* is a markdown-ish summary of passed and failed cases.
    """
    results, _, _ = api_comm.execute_code(
        language=LANGUAGE_MAPPING[lang],
        source_code=generated_code,
        unittests=tests,
        limits=limits_by_lang[LANGUAGE_MAPPING[lang]],
        task_id=id,
        stop_on_first_fail=False
    )

    passed = True
    passed_feedback = []
    failed_feedback = []

    idx = 0
    try:
        for idx, result in enumerate(results):
            output = str(result['result'])
            if len(output) > 500:
                output = output[:500] + "..."
            test_case = f"Input:\n{tests[idx]['input']}\nExpected Output:\n{tests[idx]['output'][0]}\nYour Output:\n{output}\n"
            if result['exec_outcome'] == ExecOutcome.PASSED.value:
                passed_feedback.append(test_case)
            else:
                failed_feedback.append(test_case)
                passed = False
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit). A malformed server payload — e.g.
        # the "error" sentinel string — lands here; the remaining tests are
        # reported as failed without a "Your Output" section.
        passed = False
        test_cases = []
        for i in range(idx, len(tests)):
            test_case = f"Input:\n{tests[i]['input']}\nExpected Output:\n{tests[i]['output'][0]}\n"
            test_cases.append(test_case)

        failed_feedback.extend(test_cases)

    passed_feedback = '\n'.join(passed_feedback) if len(passed_feedback) > 0 else "No test cases passed."
    failed_feedback = '\n'.join(failed_feedback)
    feedback = f'## Tested passed:\n{passed_feedback}\n\n## Tests failed:\n{failed_feedback}'

    return passed, feedback
|
src/evaluations/exec_outcome.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from enum import Enum
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class ExecOutcome(Enum):
    """Verdicts the execution server can assign to a submission."""

    PASSED = "PASSED"                                # output matches expected output
    WRONG_ANSWER = "WRONG_ANSWER"                    # executed, but output does not match
    TIME_LIMIT_EXCEEDED = "TIME_LIMIT_EXCEEDED"      # did not exit in time; output ignored
    RUNTIME_ERROR = "RUNTIME_ERROR"                  # crashed during execution
    COMPILATION_ERROR = "COMPILATION_ERROR"          # failed to compile
    MEMORY_LIMIT_EXCEEDED = "MEMORY_LIMIT_EXCEEDED"  # exceeded memory limit during execution
|
src/evaluations/executor_utils.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
def timeout_handler(_, __):
    """signal handler (signum, frame ignored): convert the alarm into a TimeoutError."""
    raise TimeoutError()
|
| 4 |
+
|
| 5 |
+
import os, json
def to_jsonl(dict_data, file_path):
    """Append *dict_data* as a single JSON line to *file_path*."""
    with open(file_path, 'a', encoding="utf-8") as file:
        json_line = json.dumps(dict_data)
        # "\n" instead of os.linesep: text mode already translates newlines,
        # so os.linesep would produce "\r\r\n" on Windows and corrupt the
        # jsonl framing.
        file.write(json_line + "\n")
|
| 10 |
+
|
| 11 |
+
from threading import Thread
class PropagatingThread(Thread):
    """Thread that re-raises, from join(), any exception raised in its target.

    Plain threading.Thread swallows target exceptions; this subclass captures
    both the return value (self.ret) and the exception (self.exc) so the
    caller sees them.
    """
    def run(self):
        self.exc = None
        try:
            if hasattr(self, '_Thread__target'):
                # Thread uses name mangling prior to Python 3.
                self.ret = self._Thread__target(*self._Thread__args, **self._Thread__kwargs)
            else:
                self.ret = self._target(*self._args, **self._kwargs)
        except BaseException as e:
            # BaseException on purpose: even SystemExit/KeyboardInterrupt in
            # the worker should surface in the joining thread.
            self.exc = e

    def join(self, timeout=None):
        """Join the thread, re-raising any captured exception; returns the target's result."""
        super(PropagatingThread, self).join(timeout)
        if self.exc:
            raise self.exc
        return self.ret
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def function_with_timeout(func, args, timeout):
    """Run func(*args) in a worker thread with a wall-clock *timeout* (seconds).

    Returns the function's result, re-raises any exception it threw (via
    PropagatingThread.join), or raises TimeoutError if it did not finish
    in time.
    """
    result_container = []

    def wrapper():
        result_container.append(func(*args))

    thread = PropagatingThread(target=wrapper)
    # Daemon thread: a timed-out worker that never finishes must not block
    # interpreter shutdown (the original left it non-daemon, so a hung
    # evaluation could hang the whole process at exit).
    thread.daemon = True
    thread.start()
    thread.join(timeout)

    if thread.is_alive():
        raise TimeoutError()
    else:
        return result_container[0]
|
| 45 |
+
|
| 46 |
+
# Py tests
|
| 47 |
+
|
| 48 |
+
# if __name__ == "__main__":
|
| 49 |
+
# formatter = PySubmissionFormatter()
|
| 50 |
+
# leetcode_1 = 'class Solution:\n def solveSudoku(self, board: List[List[str]]) -> None:\n """\n Do not return anything, modify board in-place instead.\n """\n '
|
| 51 |
+
# humaneval_1 = 'def solveSudoku(self, board: List[List[str]]) -> None:\n """\n Do not return anything, modify board in-place instead.\n """\n'
|
| 52 |
+
|
| 53 |
+
# assert leetcode_1 == formatter.to_leetcode(humaneval_1)
|
| 54 |
+
# assert humaneval_1 == formatter.to_humaneval(leetcode_1)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
|