Spaces:

red1bluelost
/

evaluate_genericify_cpp

Sleeping

App Files Files Community

red1bluelost committed on Feb 20, 2024

Commit

6bd9122

1 Parent(s): 9997d13

Adds initial evaluation of just runtime completion tests.

Browse files

Files changed (3) hide show

README.md +0 -2
evaluate_genericify_cpp.py +150 -33
execute.py +130 -0

README.md CHANGED Viewed

@@ -1,7 +1,5 @@
 ---
 title: evaluate_genericify_cpp
-datasets:
--
 tags:
 - evaluate
 - metric

 ---
 title: evaluate_genericify_cpp
 tags:
 - evaluate
 - metric

evaluate_genericify_cpp.py CHANGED Viewed

@@ -1,21 +1,15 @@
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 """TODO: Add a description here."""
-import evaluate
 import datasets
 # TODO: Add BibTeX citation
 _CITATION = """\
@@ -53,43 +47,166 @@ Examples:
     {'accuracy': 1.0}
 """
 # TODO: Define external resources urls if needed
 BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
-@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
-class evaluate_genericify_cpp(evaluate.Metric):
     """TODO: Short description of my evaluation module."""
     def _info(self):
         # TODO: Specifies the evaluate.EvaluationModuleInfo object
         return evaluate.MetricInfo(
             # This is the description that will appear on the modules page.
-            module_type="metric",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
-            features=datasets.Features({
-                'predictions': datasets.Value('int64'),
-                'references': datasets.Value('int64'),
-            }),
             # Homepage of the module for documentation
             homepage="http://module.homepage",
             # Additional links to the codebase or references
             codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-            reference_urls=["http://path.to.reference.url/new_module"]
         )
-    def _download_and_prepare(self, dl_manager):
-        """Optional: download external resources useful to compute the scores"""
-        # TODO: Download external resources if needed
-        pass
-    def _compute(self, predictions, references):
         """Returns the scores"""
-        # TODO: Compute the different scores of the module
-        accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
-        return {
-            "accuracy": accuracy,
-        }

+# Heavily adapted from `Muennighoff/code_eval_octopack`
 """TODO: Add a description here."""
+import collections
+import os
+import concurrent.futures
 import datasets
+import evaluate
+import itertools
+import numpy as np
+from .execute import check_correctness
 # TODO: Add BibTeX citation
 _CITATION = """\
     {'accuracy': 1.0}
 """
+_WARNING = """
+################################################################################
+                                  !!!WARNING!!!
+################################################################################
+The "code_eval" metric executes untrusted model-generated code in Python.
+Although it is highly unlikely that model-generated code will do something
+overtly malicious in response to this test suite, model-generated code may act
+destructively due to a lack of model capability or alignment.
+Users are strongly encouraged to sandbox this evaluation suite so that it
+does not perform destructive actions on their host or network. For more
+information on how OpenAI sandboxes its code, see the paper "Evaluating Large
+Language Models Trained on Code" (https://arxiv.org/abs/2107.03374).
+Once you have read this disclaimer and taken appropriate precautions,
+set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can to this
+with:
+>>> import os
+>>> os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+################################################################################\
+"""
+_CLANG_WARNING = """
+Please provide the environment variable 'GENERICIFY_CLANG' with the path of the
+clang++ compiler. Version 15+ is required. Within Python you can to this
+with:
+>>> import os
+>>> os.environ["GENERICIFY_CLANG"] = "/path/to/clang++"
+"""
 # TODO: Define external resources urls if needed
 BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION
)
class EvaluateGenericifyCpp(evaluate.Metric):
    """Metric computing pass@k for generated C++ by compiling and running it.

    Every candidate carries three code variants ("base", "sfinae" and
    "concepts"); each variant is checked against the reference tests and a
    separate pass@k score is reported per variant.
    """

    def _info(self):
        """Return the ``evaluate.MetricInfo`` describing this module."""
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference.
            # NOTE(review): the `datasets` documentation states that a
            # Sequence wrapping a dict of features is converted to a dict of
            # lists, while `_compute` iterates each prediction as a list of
            # dicts -- confirm the decoded shape matches.
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(
                        datasets.Features(
                            {
                                "base": datasets.Value("string"),
                                "sfinae": datasets.Value("string"),
                                "concepts": datasets.Value("string"),
                            }
                        )
                    ),
                    "references": datasets.Features(
                        {
                            "tests": datasets.Value("string"),
                            "invalids": datasets.Value("string"),
                        }
                    ),
                }
            ),
            # Homepage of the module for documentation
            homepage="http://module.homepage",
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"],
        )

    def _compute(
        self, *, predictions, references, k=(1, 10, 100), num_workers=4
    ):
        """Run every candidate and return the pass@k scores.

        Args:
            predictions: one iterable of candidates per task; each candidate
                is indexed by "base", "sfinae" and "concepts" downstream.
            references: one reference per task, paired with ``predictions``
                by position.
            k: the k values to report. A value is skipped for a variant
                unless every task has at least k samples.
            num_workers: number of threads used to submit correctness checks.

        Returns:
            A tuple ``(pass_at_k, results)``. ``pass_at_k`` maps
            ``"<variant>_run_passed@<k>"`` to the mean estimate over tasks;
            ``results`` maps each task index to a list of
            ``(completion_id, result_dict)`` tuples sorted by completion id.

        Raises:
            ValueError: if ``HF_ALLOW_CODE_EVAL`` is not ``"1"`` or
                ``GENERICIFY_CLANG`` is unset or empty.
            NotImplementedError: when running on Windows.
        """
        if os.getenv("HF_ALLOW_CODE_EVAL", default=0) != "1":
            raise ValueError(_WARNING)
        # An empty value is as unusable as a missing one, so reject both.
        if not os.getenv("GENERICIFY_CLANG"):
            raise ValueError(_CLANG_WARNING)
        if os.name == "nt":
            raise NotImplementedError(
                "This metric is currently not supported on Windows."
            )

        with concurrent.futures.ThreadPoolExecutor(
            max_workers=num_workers
        ) as executor:
            futures = []
            completion_id = collections.Counter()
            results = collections.defaultdict(list)

            for task_id, (candidates, reference) in enumerate(
                zip(predictions, references)
            ):
                for candidate in candidates:
                    future = executor.submit(
                        check_correctness,
                        candidate,
                        reference,
                        task_id,
                        completion_id[task_id],
                    )
                    futures.append(future)
                    completion_id[task_id] += 1

            # Futures finish in arbitrary order; group them back by task.
            for future in concurrent.futures.as_completed(futures):
                result = future.result()
                results[result["task_id"]].append(
                    (result["completion_id"], result)
                )

        variants = (
            "base_run_passed",
            "sfinae_run_passed",
            "concepts_run_passed",
        )
        totals = collections.defaultdict(list)
        corrects = collections.defaultdict(list)
        for task_results in results.values():
            # Sort on the id only so the result dicts are never compared.
            task_results.sort(key=lambda item: item[0])
            for variant in variants:
                passed = [entry[1][variant] for entry in task_results]
                totals[variant].append(len(passed))
                corrects[variant].append(sum(passed))

        totals = {name: np.array(v) for name, v in totals.items()}
        corrects = {name: np.array(v) for name, v in corrects.items()}

        pass_at_k = {
            f"{variant}@{top_k}": estimate_pass_at_k(
                totals[variant],
                corrects[variant],
                top_k,
            ).mean()
            for variant in totals
            for top_k in k
            if (totals[variant] >= top_k).all()
        }
        return pass_at_k, results
def estimate_pass_at_k(num_samples, num_correct, k) -> np.ndarray:
    """Estimate pass@k of each problem and return the estimates in an array.

    Args:
        num_samples: either a single integer (Python or NumPy) applied to
            every problem, or a sequence with one sample count per problem.
        num_correct: sequence with the number of passing samples per problem.
        k: the k in pass@k.

    Returns:
        A NumPy array with one estimate per entry of ``num_correct``.
    """

    def estimator(n: int, c: int) -> float:
        """Calculates 1 - comb(n - c, k) / comb(n, k)."""
        # Fewer than k failing samples: any k-subset contains a pass.
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    # NumPy integer scalars are not `int` instances but must be treated as a
    # scalar count too; otherwise `len()` below would fail on them.
    if isinstance(num_samples, (int, np.integer)):
        num_samples_it = itertools.repeat(int(num_samples), len(num_correct))
    else:
        assert len(num_samples) == len(num_correct), (
            "num_samples and num_correct must have the same length"
        )
        num_samples_it = iter(num_samples)

    return np.array(
        [estimator(int(n), int(c)) for n, c in zip(num_samples_it, num_correct)]
    )

execute.py ADDED Viewed

	@@ -0,0 +1,130 @@

+import contextlib
+import multiprocessing
+import os
+import subprocess
+import tempfile
def check_correctness(candidate, reference, task_id, completion_id):
    """
    Evaluates the functional correctness of a completion by running the test
    suite provided in the problem.

    The "base", "sfinae" and "concepts" entries of ``candidate`` are each
    run against ``reference["tests"]`` in turn.

    :param candidate: mapping with the keys "base", "sfinae" and "concepts".
    :param reference: mapping with the key "tests".
    :param task_id: identifier copied into the returned dict.
    :param completion_id: an optional completion ID so we can match
        the results later even if execution finishes asynchronously.
    :return: dict with ``task_id``, ``completion_id`` and, per variant,
        ``<variant>_run_passed`` (bool) and ``<variant>_run_result`` (the
        status string recorded for the run).
    """
    outcome = dict(task_id=task_id, completion_id=completion_id)
    # The context manager shuts the manager's server process down on exit,
    # so each call no longer leaves one behind.
    with multiprocessing.Manager() as manager:
        for variant in ("base", "sfinae", "concepts"):
            run_result = manager.list()
            process_case(
                unsafe_execute_cpp,
                candidate[variant],
                reference["tests"],
                run_result,
            )
            # Copy the status out of the proxy while the manager is alive.
            status = run_result[0]
            outcome[f"{variant}_run_passed"] = status == "passed"
            outcome[f"{variant}_run_result"] = status
    return outcome
def process_case(target, candidate, reference, result, timeout=60):
    """Run ``target`` in a child process and make sure ``result`` is filled.

    :param target: callable invoked in the child as
        ``target(candidate, reference, result, timeout)``.
    :param candidate: passed through to ``target``.
    :param reference: passed through to ``target``.
    :param result: list-like object the target appends its status to.
    :param timeout: seconds handed to ``target``; the child itself is given
        five extra seconds before it is killed.
    """
    p = multiprocessing.Process(
        target=target,
        args=(candidate, reference, result, timeout),
    )
    p.start()
    p.join(timeout=timeout + 5)
    if p.is_alive():
        p.kill()
        # Reap the killed child so it does not linger as a zombie.
        p.join()
    # An empty result means the child was killed or ended without reporting;
    # both cases are recorded as a timeout.
    if not result:
        result.append("timed out")
def unsafe_execute_cpp(candidate, reference, result, timeout):
    """Compile ``candidate + reference`` as C++20 and run the binary.

    Work happens inside a temporary directory that becomes the current
    working directory for the duration of the call. Exactly one status
    string is appended to ``result``: "passed", "timed out",
    "failed: compilation error: ..." or "failed: ...".

    :param candidate: C++ source placed before the reference code.
    :param reference: C++ source appended after the candidate.
    :param result: list-like object receiving the status string.
    :param timeout: seconds allowed for the compile step and, separately,
        for the run step.
    """
    with create_tempdir():
        code = "#include <bits/stdc++.h>\n" + candidate + reference
        # Close the file before compiling so the source is fully flushed.
        with open("test.cpp", "w", encoding="utf-8") as source_file:
            source_file.write(code)

        cpp_compiler = os.getenv("GENERICIFY_CLANG")
        try:
            compilation_result = subprocess.run(
                [cpp_compiler, "-std=c++20", "test.cpp"],
                timeout=timeout,
                capture_output=True,
            )
        except subprocess.TimeoutExpired:
            # A hung compiler is reported the same way as a hung binary.
            result.append("timed out")
            return

        if compilation_result.returncode != 0:
            raw = compilation_result.stderr or compilation_result.stdout
            # errors="replace" keeps undecodable output from raising.
            err = raw.decode(errors="replace")
            result.append(f"failed: compilation error: {err}")
            return

        try:
            exec_result = subprocess.run(
                ["./a.out"], timeout=timeout, capture_output=True
            )
        except subprocess.TimeoutExpired:
            result.append("timed out")
            return

        if exec_result.returncode == 0:
            result.append("passed")
        else:
            raw = exec_result.stderr or exec_result.stdout
            err = raw.decode(errors="replace")
            result.append(f"failed: {err}")
@contextlib.contextmanager
def create_tempdir():
    """Yield a fresh temporary directory while it is the working directory.

    On exit the previous working directory is restored first, then the
    temporary directory is removed.
    """
    with tempfile.TemporaryDirectory() as scratch, chdir(scratch):
        yield scratch
@contextlib.contextmanager
def chdir(root):
    """Temporarily switch the working directory to ``root``.

    Passing "." is a no-op. Otherwise the previous working directory is
    restored on exit, including when the body raises; the exception then
    propagates unchanged.
    """
    if root == ".":
        yield
        return
    cwd = os.getcwd()
    os.chdir(root)
    try:
        yield
    finally:
        os.chdir(cwd)