Spaces:

jjyang7
/

oe-eval-bcb-lite-evaluator

Running on CPU Upgrade

+# Better use newer Python as generated code can use new features
+FROM python:3.10-slim
+# OS-level tools used by the service and by the evaluated code: git, g++,
+# python3-tk, zip/unzip, procps and R.
+# update + install + index cleanup live in ONE layer so the package index is
+# never stale and is not shipped in the image. Recommended packages are left
+# enabled so the installed package set stays the same as before.
+RUN apt-get update \
+    && apt-get install -y \
+        g++ \
+        git \
+        procps \
+        python3-tk \
+        r-base \
+        unzip \
+        zip \
+    && rm -rf /var/lib/apt/lists/*
+# upgrade to latest pip
+RUN pip install --no-cache-dir --upgrade pip
+# Requirement specifiers are quoted so the shell can never glob-expand
+# "[standard]" or "2.*" against files in the build directory.
+RUN pip install --no-cache-dir fastapi gunicorn "uvicorn[standard]" httpx "pydantic==2.*" plotly
+# Acquire benchmark code to local
+# ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit
+# RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench
+# RUN cd /bigcodebench
+# RUN python3 -c "from bigcodebench.data import get_bigcodebench; get_bigcodebench()"
+# Add a new user "bigcodebenchuser"
+RUN adduser --disabled-password --gecos "" bigcodebenchuser
+RUN pip install --no-cache-dir -I --timeout 2000 -r https://github.com/bigcode-project/bigcodebench-annotation/releases/download/v0.1.0/requirements.txt
+# Set the working directory explicitly before copying so the destination of
+# "COPY . ." does not depend on the base image default.
+WORKDIR /
+COPY . .
+# For matplotlib import caching in reliability guard
+ENV MPLCONFIGDIR=/api/cache/matplotlib
+# Hand /api (code + matplotlib cache) to the runtime user. Ownership alone
+# gives that user write access, so no world-writable "chmod -R 777" is needed.
+RUN mkdir -p /api/cache/matplotlib \
+    && chown -R bigcodebenchuser:bigcodebenchuser /api
+USER bigcodebenchuser
+# The FastAPI app listens on port 7860, the default port expected by Spaces
+EXPOSE 7860
+# CMD ["uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "7860"]
+# ENTRYPOINT [ "./dev.sh" ]
+ENTRYPOINT [ "/prod.sh" ]

README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ---
-title: Oe Eval Bcb Lite Evaluator
-emoji: 🏆
-colorFrom: red
-colorTo: gray
 sdk: docker
 pinned: false
 ---

 ---
+title: OE Eval Bcb Evaluator Testing
+emoji: 🐢
+colorFrom: green
+colorTo: pink
 sdk: docker
 pinned: false
 ---

api/__init__.py ADDED Viewed

File without changes

api/app.py ADDED Viewed

	@@ -0,0 +1,172 @@

+import logging
+import os
+from collections import Counter, defaultdict
+import multiprocessing
+from datetime import datetime
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from typing import Dict, List, Tuple
+import gc
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import RedirectResponse
+from api.code_execution import untrusted_check
+Result = Tuple[str, List[bool]]
+def create_app() -> FastAPI:
+    level = os.environ.get("LOG_LEVEL", default=logging.INFO)
+    logging.basicConfig(level=level)
+    logger = logging.getLogger(__name__)
+    app = FastAPI()
+    @app.get("/")
+    def root():
+        return RedirectResponse("/docs")
+    @app.get("/health", status_code=204)
+    def health():
+        return
+    @app.post("/evaluate/")
+    async def evaluate(
+        samples: List[dict],
+        calibrate: bool = True,
+        parallel: int = -1,
+        min_time_limit: float = 1,
+        max_as_limit: int = 30 * 1024,
+        max_data_limit: int = 30 * 1024,
+        max_stack_limit: int = 10,
+        no_gt: bool = True,
+    ) -> dict:
+        """
+        Evaluate the correctness of the solutions in the given samples data.
+        """
+        if parallel < 1:
+            n_workers = max(1, multiprocessing.cpu_count() // 2)
+        else:
+            n_workers = parallel
+        if not no_gt:
+            expected_time = get_groundtruth()
+        else:
+            expected_time = {}
+        results = {
+            "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
+            "eval": {},
+        }
+        with ProcessPoolExecutor(max_workers=n_workers) as executor:
+            futures = []
+            completion_id = Counter()
+            n_samples = 0
+            eval_results = defaultdict(list)  # task_id ->
+            remainings = set()
+            for i, sample in enumerate(samples):
+                # TODO: investigate why HTTPException detail is not passed to client.
+                for key in ["task_id", "res_id", "test", "solution", "entry_point"]:
+                    if key not in sample:
+                        raise HTTPException(status_code=400, detail=f"'{key}' not in sample {i}!")
+                if not isinstance(sample["solution"], str):
+                    raise HTTPException(status_code=400, detail="Solution must be a string!")
+                sample["_identifier"] = (
+                    sample["task_id"] + f" (line {i+1} )"
+                )
+                task_id = sample["task_id"]
+                solution = sample["solution"]
+                if calibrate:
+                    solution = sample["code_prompt"] + "\n    pass\n" + solution
+                remainings.add(sample["_identifier"])
+                args = (
+                    completion_id[task_id],
+                    sample["res_id"],
+                    task_id,
+                    solution,
+                    sample["test"],
+                    sample["entry_point"],
+                    max_as_limit,
+                    max_data_limit,
+                    max_stack_limit,
+                    sample["_identifier"],
+                    min_time_limit,
+                    expected_time.get(task_id) if expected_time.get(task_id) else 20
+                )
+                futures.append(executor.submit(check_correctness, *args))
+                completion_id[task_id] += 1
+                n_samples += 1
+            assert n_samples == len(remainings), "Missing problems in unfinished"
+            #assert len(completion_id) == len(problems), "Missing problems in samples"
+            for future in as_completed(futures):
+                result = future.result()
+                remainings.remove(result["_identifier"])
+                eval_results[result["task_id"]].append(result)
+                del future, result
+                gc.collect()
+        # sort the results for each problem by completion_id
+        for task_id, task_results in eval_results.items():
+            task_results.sort(key=lambda x: x["completion_id"])
+            results["eval"][task_id] = []
+            for res in task_results:
+                stat, details = res["base"]
+                results["eval"][task_id].append(
+                    {
+                        "res_id": res["res_id"],
+                        "task_id": task_id,
+                        "solution": res["solution"],
+                        "status": stat,
+                        "details": details,
+                    }
+                )
+        return results
+    return app
+def check_correctness(
+    completion_id: int,
+    res_id: int,
+    task_id: str,
+    solution: str,
+    test: str,
+    entry_point: str,
+    max_as_limit: float,
+    max_data_limit: float,
+    max_stack_limit: float,
+    identifier=None,
+    min_time_limit: float = 0.1,
+    gt_time_limit: float = 2.0,
+) -> Dict[str, Result]:
+    ret = {
+        "completion_id": completion_id,
+        "res_id": res_id,
+        "task_id": task_id,
+        "_identifier": identifier,
+        "solution": solution,
+    }
+    ret["base"] = untrusted_check(
+        solution,
+        test,
+        entry_point,
+        max_as_limit,
+        max_data_limit,
+        max_stack_limit,
+        min_time_limit,
+        gt_time_limit,
+    )
+    return ret
+def get_groundtruth():
+    raise HTTPException(status_code=405, detail="Groundtruth execution is not implemented yet!")

api/code_execution.py ADDED Viewed

	@@ -0,0 +1,524 @@

+# The MIT License
+#
+# Copyright (c) OpenAI (https://openai.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+import contextlib
+import faulthandler
+import tempfile
+import platform
+import itertools
+import io
+import os
+import sys
+import time
+import types
+import unittest
+import subprocess
+import signal
+import multiprocessing
+from multiprocessing import Value, Manager
+from typing import List, Tuple, Union
+import numpy as np
+# Default per-task timeout in seconds; untrusted_check falls back to it when
+# the BIGCODEBENCH_TIMEOUT_PER_TASK environment variable is not set.
+TIMEOUT_LIMIT=240.0  # BCB default is 240.0
+@contextlib.contextmanager
+def swallow_subprocess_output():
+    """Context manager to swallow stdout and stderr for subprocesses."""
+    original_popen = subprocess.Popen
+    original_run = subprocess.run
+    def _popen_patch(*args, **kwargs):
+        if 'capture_output' in kwargs and kwargs['capture_output']:
+            # Avoid setting stdout or stderr if capture_output is True
+            kwargs.pop('stdout', None)
+            kwargs.pop('stderr', None)
+        else:
+            kwargs.setdefault('stdout', subprocess.PIPE)
+            kwargs.setdefault('stderr', subprocess.PIPE)
+        return original_popen(*args, **kwargs)
+    def _run_patch(*args, **kwargs):
+        if 'capture_output' in kwargs and kwargs['capture_output']:
+            # Avoid setting stdout or stderr if capture_output is True
+            kwargs.pop('stdout', None)
+            kwargs.pop('stderr', None)
+        else:
+            kwargs.setdefault('stdout', subprocess.PIPE)
+            kwargs.setdefault('stderr', subprocess.PIPE)
+        return original_run(*args, **kwargs)
+    subprocess.Popen = _popen_patch
+    subprocess.run = _run_patch
+    try:
+        yield
+    finally:
+        subprocess.Popen = original_popen
+        subprocess.run = original_run
+@contextlib.contextmanager
+def swallow_io():
+    stream = WriteOnlyStringIO()
+    with contextlib.redirect_stdout(stream):
+        with contextlib.redirect_stderr(stream):
+            with redirect_stdin(stream):
+                with swallow_subprocess_output():
+                    yield
+@contextlib.contextmanager
+def time_limit(seconds: float):
+    def signal_handler(signum, frame):
+        raise TimeoutException("Timed out!")
+    signal.setitimer(signal.ITIMER_REAL, seconds)
+    signal.signal(signal.SIGALRM, signal_handler)
+    try:
+        yield
+    finally:
+        signal.setitimer(signal.ITIMER_REAL, 0)
+@contextlib.contextmanager
+def create_tempdir():
+    with tempfile.TemporaryDirectory() as dirname:
+        with chdir(dirname):
+            yield dirname
+@contextlib.contextmanager
+def chdir(root):
+    if root == ".":
+        yield
+        return
+    cwd = os.getcwd()
+    os.chdir(root)
+    try:
+        yield
+    except BaseException as exc:
+        raise exc
+    finally:
+        os.chdir(cwd)
+@contextlib.contextmanager
+def safe_environment():
+    # Save original functions
+    original_kill = os.kill
+    original_killpg = os.killpg
+    original_system = os.system
+    original_subprocess_call = subprocess.call
+    original_subprocess_check_output = subprocess.check_output
+    original_subprocess_run = subprocess.run
+    original_subprocess_popen = subprocess.Popen
+    original_os_popen = os.popen
+    original_os_execv = os.execv
+    original_os_execvp = os.execvp
+    original_os_execvpe = os.execvpe
+    current_pid = os.getpid()
+    current_pgid = os.getpgid(current_pid)
+    manager = multiprocessing.Manager()
+    child_pids = manager.list()
+    def safe_kill(pid, sig):
+        try:
+            pgid = os.getpgid(pid)
+            if pid == current_pid or pid in child_pids:
+                original_kill(pid, sig)
+            else:
+                print(f"Prevented attempt to kill PID {pid} with signal {sig}")
+        except ProcessLookupError:
+            pass
+    def safe_killpg(pgid, sig):
+        if pgid == current_pgid or pgid in {os.getpgid(pid) for pid in child_pids}:
+            original_killpg(pgid, sig)
+        else:
+            print(f"Prevented attempt to kill PGID {pgid} with signal {sig}")
+    def safe_system(command):
+        print(f"Intercepted system command: {command}")
+        if 'kill' in command or 'killall' in command:
+            return 0  # Simulate successful execution without doing anything
+        return original_system(command)
+    def safe_subprocess_call(command, *args, **kwargs):
+        print(f"Intercepted subprocess call: {command}")
+        if 'kill' in command or 'killall' in command:
+            return 0  # Simulate successful execution without doing anything
+        return original_subprocess_call(command, *args, **kwargs)
+    def safe_subprocess_check_output(command, *args, **kwargs):
+        print(f"Intercepted command: {command}")
+        if 'ps' in command:
+            return b""  # Simulate no processes found
+        return original_subprocess_check_output(command, *args, **kwargs)
+    def safe_subprocess_run(*args, **kwargs):
+        print(f"Intercepted subprocess run command: {args}")
+        if 'kill' in args[0] or 'killall' in args[0]:
+            return subprocess.CompletedProcess(args, 0, b'', b'')  # Simulate successful execution
+        return original_subprocess_run(*args, **kwargs)
+    class SafePopen(subprocess.Popen):
+        def __init__(self, *args, **kwargs):
+            print(f"Intercepted Popen command: {args}")
+            kwargs['preexec_fn'] = os.setsid  # Start the process in a new session
+            super().__init__(*args, **kwargs)
+            child_pids.append(self.pid)
+        def communicate(self, *args, **kwargs):
+            try:
+                return super().communicate(*args, **kwargs)
+            except subprocess.TimeoutExpired:
+                print("Timeout expired, intercepted and returning None")
+                return None, None
+        def kill(self):
+            print(f"Intercepted kill call for PID {self.pid}")
+            safe_kill(self.pid, signal.SIGTERM)
+        def terminate(self):
+            print(f"Intercepted terminate call for PID {self.pid}")
+            safe_kill(self.pid, signal.SIGTERM)
+    def safe_os_popen(command):
+        print(f"Intercepted os.popen command: {command}")
+        if 'kill' in command or 'killall' in command:
+            return os.popen('echo Intercepted')
+        return original_os_popen(command)
+    def safe_exec(*args, **kwargs):
+        print(f"Intercepted exec command: {args}")
+    # Override the risky functions with the safe versions
+    os.kill = safe_kill
+    os.killpg = safe_killpg
+    os.system = safe_system
+    subprocess.call = safe_subprocess_call
+    subprocess.check_output = safe_subprocess_check_output
+    subprocess.run = safe_subprocess_run
+    subprocess.Popen = SafePopen
+    os.popen = safe_os_popen
+    os.execv = safe_exec
+    os.execvp = safe_exec
+    os.execvpe = safe_exec
+    try:
+        yield
+    finally:
+        for pid in child_pids:
+            try:
+                os.kill(pid, signal.SIGTERM)
+                for _ in range(10):
+                    time.sleep(0.1)
+                    try:
+                        os.kill(pid, 0)
+                    except ProcessLookupError:
+                        break
+                else:
+                    os.kill(pid, signal.SIGKILL)
+            except ProcessLookupError:
+                pass
+            except Exception as e:
+                print(f"Error handling process {pid}: {e}")
+        os.kill = original_kill
+        os.killpg = original_killpg
+        os.system = original_system
+        subprocess.call = original_subprocess_call
+        subprocess.check_output = original_subprocess_check_output
+        subprocess.run = original_subprocess_run
+        subprocess.Popen = original_subprocess_popen
+        os.popen = original_os_popen
+        os.execv = original_os_execv
+        os.execvp = original_os_execvp
+        os.execvpe = original_os_execvpe
+class TimeoutException(Exception):
+    pass
+class WriteOnlyStringIO(io.StringIO):
+    """StringIO that throws an exception when it's read from"""
+    def read(self, *args, **kwargs):
+        raise IOError
+    def readline(self, *args, **kwargs):
+        raise IOError
+    def readlines(self, *args, **kwargs):
+        raise IOError
+    def readable(self, *args, **kwargs):
+        """Returns True if the IO object can be read."""
+        return False
+class redirect_stdin(contextlib._RedirectStream):  # type: ignore
+    _stream = "stdin"
+def reliability_guard(max_as_limit, max_data_limit, max_stack_limit):
+    """
+    This disables various destructive functions and prevents the generated code
+    from interfering with the test (e.g. fork bomb, killing other processes,
+    removing filesystem files, etc.)
+    WARNING
+    This function is NOT a security sandbox. Untrusted code, including, model-
+    generated code, should not be blindly executed outside of one. See the
+    Codex paper for more information about OpenAI's code sandbox, and proceed
+    with caution.
+
+    What this version does, in order: forces the timezone to UTC, sets
+    threading/logging environment variables, applies address-space, data and
+    stack resource limits (arguments are in MiB; limits are only applied when
+    all three are truthy), disables faulthandler, sets builtins.exit and
+    builtins.quit to None, and imports matplotlib.pyplot.
+    All changes are process-wide and are not undone by this function.
+    """
+    import os
+    import time
+    from datetime import datetime
+    # Make time-dependent code deterministic with respect to timezone.
+    os.environ['TZ'] = 'UTC'
+    time.tzset()
+    os.environ["OMP_NUM_THREADS"] = "1"
+    os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3"
+    os.environ['TF_ENABLE_ONEDNN_OPTS'] = "0"
+    if max_as_limit and max_data_limit and max_stack_limit:
+        import resource
+        # Convert MiB to bytes.
+        max_as_limit = max_as_limit * 1024 * 1024
+        max_data_limit = max_data_limit * 1024 * 1024
+        max_stack_limit = max_stack_limit * 1024 * 1024
+        resource.setrlimit(
+            resource.RLIMIT_AS, (max_as_limit, max_as_limit)
+        )
+        resource.setrlimit(
+            resource.RLIMIT_DATA, (max_data_limit, max_data_limit)
+        )
+        # The stack limit is skipped on macOS.
+        if not platform.uname().system == "Darwin":
+            resource.setrlimit(
+                resource.RLIMIT_STACK, (max_stack_limit, max_stack_limit)
+            )
+    faulthandler.disable()
+    import builtins
+    # Calling exit()/quit() from evaluated code now fails instead of exiting.
+    builtins.exit = None
+    builtins.quit = None
+    # Imported after the limits are applied; presumably to warm matplotlib's
+    # cache (see MPLCONFIGDIR in the Dockerfile) — TODO confirm.
+    import matplotlib.pyplot as plt
+    plt.close('all')
+# unbiased estimator from https://github.com/openai/human-eval
+def estimate_pass_at_k(
+    num_samples: Union[int, List[int], np.ndarray],
+    num_correct: Union[List[int], np.ndarray],
+    k: int,
+) -> np.ndarray:
+    """
+    Estimates pass@k of each problem and returns them in an array.
+    """
+    def estimator(n: int, c: int, k: int) -> float:
+        """
+        Calculates 1 - comb(n - c, k) / comb(n, k).
+        """
+        if n - c < k:
+            return 1.0
+        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+    if isinstance(num_samples, int):
+        num_samples_it = itertools.repeat(num_samples, len(num_correct))
+    else:
+        assert len(num_samples) == len(num_correct)
+        num_samples_it = iter(num_samples)
+    return np.array(
+        [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]
+    )
+# Status strings returned to callers.
+PASS = "pass"
+FAIL = "fail"
+TIMEOUT = "timeout"
+# Integer status codes written by the child process into the shared Value.
+_SUCCESS = 0
+_FAILED = 1
+_TIMEOUT = 2
+_UNKNOWN = 3
+# Code -> status string. _UNKNOWN maps to None, which untrusted_check turns
+# into TIMEOUT.
+_mapping = {_SUCCESS: PASS, _FAILED: FAIL, _TIMEOUT: TIMEOUT, _UNKNOWN: None}
+def is_floats(x) -> bool:
+    # check if it is float; List[float]; Tuple[float]
+    if isinstance(x, float):
+        return True
+    if isinstance(x, (list, tuple)):
+        return all(isinstance(i, float) for i in x)
+    if isinstance(x, np.ndarray):
+        return x.dtype == np.float64 or x.dtype == np.float32
+    return False
+def unsafe_execute(
+    entry_point: str,
+    code: str,
+    test_code: str,
+    timeout: float,
+    max_as_limit: float,
+    max_data_limit: float,
+    max_stack_limit: float,
+    stat,  # Value
+    details,  # Array
+):
+    """
+    Execute `code` followed by `test_code` inside a temporary directory and
+    run the unittest class named `TestCases` that the combined source defines.
+
+    Results are reported through the shared objects rather than a return
+    value: `stat.value` becomes _SUCCESS when the suite ran to completion
+    (even if individual tests failed) and _FAILED when anything raised,
+    including the timeout; `details` maps each failing test's method name to
+    its traceback, or "ALL" to the exception text.
+
+    `entry_point` is accepted but not used in this function.
+    """
+    with safe_environment(), create_tempdir():
+        # These system calls are needed when cleaning up tempdir.
+        import os
+        import shutil
+        import builtins
+        rmtree = shutil.rmtree
+        rmdir = os.rmdir
+        chdir = os.chdir
+        # Disable functionalities that can make destructive changes to the test.
+        reliability_guard(max_as_limit, max_data_limit, max_stack_limit)
+        module_name = "__test__"
+        new_module = types.ModuleType(module_name)
+        # Set necessary attributes for the module
+        new_module.__dict__.update({
+            '__builtins__': builtins,
+            '__file__': f"{module_name}.py",
+            '__package__': None,
+            '__doc__': None,
+            'sys': sys,
+            'os': os,
+            'environ': os.environ,
+        })
+        try:
+            full_code = code + "\n" + test_code
+            with swallow_io():
+                # Module-level code of the solution and tests runs here,
+                # outside the time limit below.
+                exec(compile(full_code, f"{module_name}.py", 'exec'), new_module.__dict__)
+                sys.modules[module_name] = new_module
+                TestCases = getattr(new_module, 'TestCases')
+                loader = unittest.TestLoader()
+                suite = loader.loadTestsFromTestCase(TestCases)
+                test_result = unittest.TestResult()
+                start_time = time.time()  # not read afterwards
+                with time_limit(timeout):
+                    suite.run(test_result)
+            issues = test_result.failures + test_result.errors
+            for test, trace in issues:
+                # Key by the bare test method name.
+                details[test.id().split(".")[-1]] = trace
+            stat.value = _SUCCESS
+        except BaseException as e:
+            # Covers compile errors, missing TestCases, TimeoutException, etc.
+            details["ALL"] = str(e)
+            stat.value = _FAILED
+        # Needed for cleaning up.
+        shutil.rmtree = rmtree
+        os.rmdir = rmdir
+        os.chdir = chdir
+def untrusted_check(
+    code: str,
+    test_code: str,
+    entry_point: str,
+    max_as_limit: float,
+    max_data_limit: float,
+    max_stack_limit: float,
+    min_time_limit: float = 10,
+    gt_time_limit: float = 60
+) -> Tuple[str, np.ndarray]:
+    min_time_limit = max(min_time_limit, gt_time_limit)
+    timeout = max(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", TIMEOUT_LIMIT), min_time_limit) + 1
+    # shared memory objects
+    stat = Value("i", _UNKNOWN)
+    manager = Manager()
+    details = manager.dict()
+    p = multiprocessing.Process(
+        target=unsafe_execute,
+        args=(
+            entry_point,
+            code,
+            test_code,
+            timeout,
+            max_as_limit,
+            max_data_limit,
+            max_stack_limit,
+            stat,
+            details,
+        ),
+    )
+    p.start()
+    p.join(timeout=timeout+1)
+    if p.is_alive():
+        p.terminate()
+        time.sleep(0.1)
+    if p.is_alive():
+        p.kill()
+        time.sleep(0.1)
+    stat = _mapping[stat.value]
+    # convert details to a dict
+    details = dict(details)
+    if not stat:
+        stat = TIMEOUT
+    if stat == PASS:
+        if details:
+            stat = FAIL
+    return stat, details
+def evaluate_files(
+    files: List[str],
+    inputs: List,
+    entry_point: str,
+    min_time_limit: float = 0.1,
+    gt_time_limit_factor: float = 2.0,
+) -> List[Tuple[str, List[bool]]]:
+    ret = []
+    # sort files by the id in name (i.e., "../n.py")
+    files = sorted(files, key=lambda x: int(x.split("/")[-1].split(".")[0]))
+    for file in files:
+        code = open(file, "r").read()
+        stat, det = untrusted_check(
+            code,
+            inputs,
+            entry_point,
+        )
+        ret.append((stat, det.tolist()))
+    return ret

dev.sh ADDED Viewed

	@@ -0,0 +1,11 @@

+#!/bin/bash
+# Development launcher: a single auto-reloading gunicorn worker running the
+# FastAPI app factory on port 7860. `exec` makes gunicorn replace this shell.
+exec gunicorn \
+    --worker-class uvicorn.workers.UvicornWorker \
+    --workers 1 \
+    --timeout 0 \
+    --bind 0.0.0.0:7860 \
+    --enable-stdio-inheritance \
+    --access-logfile - \
+    --reload \
+    'api.app:create_app()'

prod.sh ADDED Viewed

	@@ -0,0 +1,10 @@

+#!/bin/bash
+# Production launcher: two gunicorn workers running the FastAPI app factory
+# on port 7860. `exec` makes gunicorn replace this shell.
+exec gunicorn \
+    --worker-class uvicorn.workers.UvicornWorker \
+    --workers 2 \
+    --timeout 0 \
+    --bind 0.0.0.0:7860 \
+    --enable-stdio-inheritance \
+    --access-logfile - \
+    'api.app:create_app()'