jjkim
committed on
Commit
·
ad5b61a
1
Parent(s):
e12251f
add ignore assertion errors option
Browse files
- code_eval.py +15 -3
- execute.py +31 -8
code_eval.py
CHANGED
|
@@ -168,6 +168,7 @@ class CodeEval(evaluate.Metric):
|
|
| 168 |
timeout=3.0,
|
| 169 |
early_stop=False,
|
| 170 |
disable_tqdm=False,
|
|
|
|
| 171 |
):
|
| 172 |
"""Returns the scores"""
|
| 173 |
|
|
@@ -184,7 +185,11 @@ class CodeEval(evaluate.Metric):
|
|
| 184 |
for tid, pred, ref in zip(ids, predictions, references):
|
| 185 |
results[tid] = []
|
| 186 |
for pid, p in enumerate(pred):
|
| 187 |
-
result = Result(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
body = Template(pred_template).safe_substitute(prediction=p)
|
| 189 |
for r in ref:
|
| 190 |
assert isinstance(r, str)
|
|
@@ -192,7 +197,13 @@ class CodeEval(evaluate.Metric):
|
|
| 192 |
test = Template(test).safe_substitute(prediction=p)
|
| 193 |
|
| 194 |
test_program = body + "\n" + test
|
| 195 |
-
args = (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
future = executor.submit(check_correctness, *args)
|
| 197 |
result.test_program.append(test_program)
|
| 198 |
result.add(future)
|
|
@@ -263,7 +274,8 @@ def estimate_pass_at_k(num_samples, num_correct, k):
|
|
| 263 |
class Result(BaseModel):
|
| 264 |
task_id: str
|
| 265 |
prediction_id: int
|
| 266 |
-
|
|
|
|
| 267 |
test_program: List[str] = []
|
| 268 |
passed: Optional[bool] = None
|
| 269 |
result: List[str] = []
|
|
|
|
| 168 |
timeout=3.0,
|
| 169 |
early_stop=False,
|
| 170 |
disable_tqdm=False,
|
| 171 |
+
ignore_assertion_errors=False,
|
| 172 |
):
|
| 173 |
"""Returns the scores"""
|
| 174 |
|
|
|
|
| 185 |
for tid, pred, ref in zip(ids, predictions, references):
|
| 186 |
results[tid] = []
|
| 187 |
for pid, p in enumerate(pred):
|
| 188 |
+
result = Result(
|
| 189 |
+
task_id=tid,
|
| 190 |
+
prediction_id=pid,
|
| 191 |
+
ignore_assertion_error=ignore_assertion_errors,
|
| 192 |
+
)
|
| 193 |
body = Template(pred_template).safe_substitute(prediction=p)
|
| 194 |
for r in ref:
|
| 195 |
assert isinstance(r, str)
|
|
|
|
| 197 |
test = Template(test).safe_substitute(prediction=p)
|
| 198 |
|
| 199 |
test_program = body + "\n" + test
|
| 200 |
+
args = (
|
| 201 |
+
test_program,
|
| 202 |
+
timeout,
|
| 203 |
+
tid,
|
| 204 |
+
pid,
|
| 205 |
+
ignore_assertion_errors,
|
| 206 |
+
)
|
| 207 |
future = executor.submit(check_correctness, *args)
|
| 208 |
result.test_program.append(test_program)
|
| 209 |
result.add(future)
|
|
|
|
| 274 |
class Result(BaseModel):
|
| 275 |
task_id: str
|
| 276 |
prediction_id: int
|
| 277 |
+
ignore_assertion_error: bool = False
|
| 278 |
+
|
| 279 |
test_program: List[str] = []
|
| 280 |
passed: Optional[bool] = None
|
| 281 |
result: List[str] = []
|
execute.py
CHANGED
|
@@ -25,7 +25,13 @@ import signal
|
|
| 25 |
import tempfile
|
| 26 |
|
| 27 |
|
| 28 |
-
def check_correctness(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
"""
|
| 30 |
Evaluates the functional correctness of a completion by running the test
|
| 31 |
suite provided in the problem.
|
|
@@ -36,7 +42,10 @@ def check_correctness(check_program, timeout, task_id, completion_id):
|
|
| 36 |
manager = multiprocessing.Manager()
|
| 37 |
result = manager.list()
|
| 38 |
|
| 39 |
-
p = multiprocessing.Process(
|
|
|
|
|
|
|
|
|
|
| 40 |
p.start()
|
| 41 |
p.join(timeout=timeout + 1)
|
| 42 |
if p.is_alive():
|
|
@@ -53,10 +62,13 @@ def check_correctness(check_program, timeout, task_id, completion_id):
|
|
| 53 |
)
|
| 54 |
|
| 55 |
|
| 56 |
-
def unsafe_execute(
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
with create_tempdir():
|
| 59 |
-
|
| 60 |
# These system calls are needed when cleaning up tempdir.
|
| 61 |
import os
|
| 62 |
import shutil
|
|
@@ -77,6 +89,11 @@ def unsafe_execute(check_program, result, timeout):
|
|
| 77 |
result.append("passed")
|
| 78 |
except TimeoutException:
|
| 79 |
result.append("timed out")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
except BaseException as e:
|
| 81 |
result.append(f"failed: {e}")
|
| 82 |
|
|
@@ -171,10 +188,16 @@ def reliability_guard(maximum_memory_bytes=None):
|
|
| 171 |
if maximum_memory_bytes is not None:
|
| 172 |
import resource
|
| 173 |
|
| 174 |
-
resource.setrlimit(
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
if not platform.uname().system == "Darwin":
|
| 177 |
-
resource.setrlimit(
|
|
|
|
|
|
|
| 178 |
|
| 179 |
faulthandler.disable()
|
| 180 |
|
|
|
|
| 25 |
import tempfile
|
| 26 |
|
| 27 |
|
| 28 |
+
def check_correctness(
|
| 29 |
+
check_program,
|
| 30 |
+
timeout,
|
| 31 |
+
task_id,
|
| 32 |
+
completion_id,
|
| 33 |
+
ignore_assertion_errors=False,
|
| 34 |
+
):
|
| 35 |
"""
|
| 36 |
Evaluates the functional correctness of a completion by running the test
|
| 37 |
suite provided in the problem.
|
|
|
|
| 42 |
manager = multiprocessing.Manager()
|
| 43 |
result = manager.list()
|
| 44 |
|
| 45 |
+
p = multiprocessing.Process(
|
| 46 |
+
target=unsafe_execute,
|
| 47 |
+
args=(check_program, result, timeout, ignore_assertion_errors),
|
| 48 |
+
)
|
| 49 |
p.start()
|
| 50 |
p.join(timeout=timeout + 1)
|
| 51 |
if p.is_alive():
|
|
|
|
| 62 |
)
|
| 63 |
|
| 64 |
|
| 65 |
+
def unsafe_execute(
|
| 66 |
+
check_program,
|
| 67 |
+
result,
|
| 68 |
+
timeout,
|
| 69 |
+
ignore_assertion_errors=False,
|
| 70 |
+
):
|
| 71 |
with create_tempdir():
|
|
|
|
| 72 |
# These system calls are needed when cleaning up tempdir.
|
| 73 |
import os
|
| 74 |
import shutil
|
|
|
|
| 89 |
result.append("passed")
|
| 90 |
except TimeoutException:
|
| 91 |
result.append("timed out")
|
| 92 |
+
except AssertionError as e:
|
| 93 |
+
if ignore_assertion_errors:
|
| 94 |
+
result.append("passed")
|
| 95 |
+
else:
|
| 96 |
+
result.append(f"failed: {e}")
|
| 97 |
except BaseException as e:
|
| 98 |
result.append(f"failed: {e}")
|
| 99 |
|
|
|
|
| 188 |
if maximum_memory_bytes is not None:
|
| 189 |
import resource
|
| 190 |
|
| 191 |
+
resource.setrlimit(
|
| 192 |
+
resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
|
| 193 |
+
)
|
| 194 |
+
resource.setrlimit(
|
| 195 |
+
resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
|
| 196 |
+
)
|
| 197 |
if not platform.uname().system == "Darwin":
|
| 198 |
+
resource.setrlimit(
|
| 199 |
+
resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
|
| 200 |
+
)
|
| 201 |
|
| 202 |
faulthandler.disable()
|
| 203 |
|