Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Aksel Joonas Reedi
Fix CLI rendering corruption and split CLI/frontend model defaults (#121)
3eec386 unverified | """Regression test for doom-loop false-positive on legitimate polling. | |
| Reproduces the failure mode in observatory sessions 40fcb414 ($32.59), | |
| 8e90352e ($62.63), and 403178bf ($5.71) on 2026-04-25: the agent polled a | |
| long-running job with `bash sleep 300 && wc -l output` four times in a | |
| row. The arguments were byte-identical, but the results moved (27210 → | |
| 36454 → 45770 → 55138 — actual progress). The detector hashed args only | |
| and false-fired DOOM LOOP, which made the agent abandon perfectly valid | |
| polling. | |
| After the fix the signature includes the tool result hash, so identical | |
| args + different results no longer trips the detector. | |
| """ | |
| from litellm import ChatCompletionMessageToolCall, Message | |
| from agent.core.doom_loop import check_for_doom_loop | |
def _assistant(call_id: str, name: str, args: str) -> Message:
    """Build an assistant message that carries exactly one function tool call.

    Args:
        call_id: Identifier assigned to the tool call.
        name: Name of the tool being invoked.
        args: Tool arguments, passed through unchanged as a string.

    Returns:
        A ``Message`` with role ``"assistant"``, no text content, and a
        single-element ``tool_calls`` list.
    """
    tool_call = ChatCompletionMessageToolCall(
        id=call_id,
        type="function",
        function={"name": name, "arguments": args},
    )
    return Message(role="assistant", content=None, tool_calls=[tool_call])
def _tool(call_id: str, name: str, content: str) -> Message:
    """Build the tool-role message holding the result of a tool call.

    Args:
        call_id: Identifier of the tool call this result belongs to.
        name: Name of the tool that produced the result.
        content: The tool output, stored as the message content.

    Returns:
        A ``Message`` with role ``"tool"`` linked to ``call_id``.
    """
    return Message(
        role="tool",
        content=content,
        tool_call_id=call_id,
        name=name,
    )
| _POLL_ARGS = '{"command": "sleep 300 && ls /app/images/ | wc -l"}' | |
def test_polling_with_progressing_results_does_not_fire():
    """Identical poll arguments with changing results must not be flagged.

    Four polls use the same arguments but each returns a larger count, so
    ``check_for_doom_loop`` is expected to return ``None``.
    """
    progressing_counts = ("27210", "36454", "45770", "55138")

    history = [Message(role="user", content="run the job")]
    for position, count in enumerate(progressing_counts, start=1):
        poll_id = f"c{position}"
        history.append(_assistant(poll_id, "bash", _POLL_ARGS))
        history.append(_tool(poll_id, "bash", count))

    assert check_for_doom_loop(history) is None
def test_truly_stuck_polling_with_identical_results_still_fires():
    """Three identical polls that return the same count must be flagged.

    Neither the arguments nor the results change, so the job is genuinely
    stuck and the detector SHOULD fire.
    """
    history = []
    for poll_id in ("c1", "c2", "c3"):
        history.append(_assistant(poll_id, "bash", _POLL_ARGS))
        history.append(_tool(poll_id, "bash", "55138"))

    warning = check_for_doom_loop(history)

    assert warning is not None
    assert "DOOM LOOP" in warning
    assert "bash" in warning
def test_identical_calls_with_no_results_yet_still_fires():
    """Three identical calls with no recorded tool results count as a loop.

    This covers calls that were, for example, cancelled or errored before
    any result was recorded: with nothing to tell them apart, the
    detector should still fire.
    """
    write_args = '{"path": "/tmp/x", "content": "..."}'
    history = [
        _assistant(call_id, "write", write_args)
        for call_id in ("c1", "c2", "c3")
    ]

    warning = check_for_doom_loop(history)

    assert warning is not None
    assert "DOOM LOOP" in warning
    assert "write" in warning
def test_different_args_does_not_fire():
    """Calls whose arguments differ must not be flagged.

    The three results are identical ("ok"), but each call lists a
    different directory, so ``check_for_doom_loop`` should return ``None``.
    """
    listings = (
        ("c1", '{"command": "ls /a"}'),
        ("c2", '{"command": "ls /b"}'),
        ("c3", '{"command": "ls /c"}'),
    )

    history = []
    for call_id, command_args in listings:
        history.append(_assistant(call_id, "bash", command_args))
        history.append(_tool(call_id, "bash", "ok"))

    assert check_for_doom_loop(history) is None