Rohan03 committed on
Commit
4977407
·
verified ·
1 Parent(s): da3d4f0

first-principles: tests for state-delta, falsification critic, sandbox hooks

Browse files
Files changed (1) hide show
  1. tests/test_first_principles.py +139 -0
tests/test_first_principles.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ First-Principles Engineering Tests.
4
+
5
+ T-FP1: State-delta is O(1) — constant tokens regardless of state size
6
+ T-FP2: State-delta captures actual changes correctly
7
+ T-FP3: Falsification critic generates assertions from code
8
+ T-FP4: Falsification score is COMPUTED (0 hallucinations)
9
+ T-FP5: Falsification score = passed/total * 10
10
+ T-FP6: PEP 578 sandbox policy creation
11
+ T-FP7: Path allowlist/blocklist logic correct
12
+ """
13
+ import sys, os
14
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
15
+
16
# Running tallies of passed / failed checks, updated by check().
PASS = FAIL = 0


def check(name, cond, detail=""):
    """Record one test outcome and print a one-line result.

    Args:
        name: Human-readable label for the check.
        cond: Outcome of the check; any object is accepted and judged by
            its truthiness.
        detail: Optional diagnostic text, printed only when the check fails.
    """
    global PASS, FAIL
    # Normalise to a real bool first: int() on an arbitrary truthy object
    # (a list, None, ...) raises TypeError, and a truthy int such as 2
    # would otherwise be counted as two passes.
    ok = bool(cond)
    if ok:
        PASS += 1
    else:
        FAIL += 1
    mark = "✓" if ok else "✗"
    suffix = f": {detail}" if detail and not ok else ""
    print(f" {mark} {name}{suffix}")
21
+
22
from purpose_agent.types import State

# ═══ State Delta (O(1) token cost) ═══
print("═══ State-Delta Critic (Markovian, O(1)) ═══")
from purpose_agent.state_delta import compute_state_delta, format_critic_input, StateDelta

# T-FP1: Constant token cost regardless of state size.
# Build a large state, add a single key, and require the delta's token
# estimate to stay below a fixed bound.
big_state = State(data={f"key_{i}": f"value_{i}" * 100 for i in range(100)})  # ~100KB state
big_state2 = State(data={**big_state.data, "new_key": "new_value"})

delta = compute_state_delta(big_state, big_state2)
check("T-FP1 Delta is O(1)", delta.token_estimate < 100,
      f"tokens={delta.token_estimate} (state was ~100KB)")

# Even with 1000-key state, delta is still tiny
huge = State(data={f"k{i}": i for i in range(1000)})
huge2 = State(data={**huge.data, "k500": 999})  # One change
d2 = compute_state_delta(huge, huge2)
check("T-FP1 1000-key state → tiny delta", d2.token_estimate < 50,
      f"tokens={d2.token_estimate}")

# T-FP2: Captures changes correctly.
# s2 differs from s1 by two changed values ("score", "attempts") and one
# added key ("output"); "status" is identical in both.
s1 = State(data={"score": 3, "status": "running", "attempts": 1})
s2 = State(data={"score": 7, "status": "running", "attempts": 2, "output": "done"})
d3 = compute_state_delta(s1, s2)
check("T-FP2 Detects added keys", "output" in d3.added_keys)
check("T-FP2 Detects changed keys", "score" in d3.changed_keys)
check("T-FP2 Changed values correct", d3.changed_keys["score"] == (3, 7))
check("T-FP2 Unchanged keys ignored", "status" not in d3.changed_keys)

# Empty delta: diffing a state against itself must report no changes.
s3 = State(data={"x": 1})
d4 = compute_state_delta(s3, s3)
check("T-FP2 No change = empty delta", d4.is_empty)

# Format for critic. The budget is named once so the limit handed to
# format_critic_input and the limit asserted below cannot drift apart.
CRITIC_TOKEN_BUDGET = 300
formatted = format_critic_input("Write fibonacci", "submit_code", "I wrote the code", d3,
                                max_tokens=CRITIC_TOKEN_BUDGET)
check("T-FP2 Formatted output exists", len(formatted) > 0)
# len // 4 treats four characters as one token (a rough estimate).
check("T-FP2 Under token budget", len(formatted) // 4 <= CRITIC_TOKEN_BUDGET)
61
+
62
# ═══ Falsification Critic (Popperian) ═══
print("\n═══ Falsification Critic (Popper's Method) ═══")
from purpose_agent.falsification_critic import FalsificationCritic, FalsificationResult
from purpose_agent import MockLLMBackend

# T-FP3: Mock LLM generates assertions.
# The mock backend is given a canned reply for the "TEST ADVERSARY" handler,
# so the assertions the critic works with are fixed by this test.
mock = MockLLMBackend()
mock.register_handler("TEST ADVERSARY",
                      "assert fib(0) == 0\nassert fib(1) == 1\nassert fib(-1) == 0")

critic = FalsificationCritic(llm=mock, timeout_s=5.0)

# Good code — all assertions pass
good_code = "def fib(n):\n if n <= 0: return 0\n if n == 1: return 1\n a,b=0,1\n for _ in range(2,n+1): a,b=b,a+b\n return b"
result = critic.evaluate(good_code, purpose="fibonacci")
check("T-FP3 Generates assertions", len(result.generated_assertions) > 0,
      f"got {len(result.generated_assertions)}")
check("T-FP4 Score is computed (not hallucinated)", isinstance(result.score, float))
check("T-FP5 Good code scores high", result.score >= 6.0, f"score={result.score}")
# max(..., 1) guards the division when no assertions were counted.
check("T-FP5 Score = passed/total*10",
      abs(result.score - (result.assertions_passed / max(result.assertions_total, 1) * 10)) < 0.1)

# Bad code — assertions fail
bad_code = "def fib(n): return n + 1"  # Wrong implementation
mock2 = MockLLMBackend()
mock2.register_handler("TEST ADVERSARY",
                       "assert fib(0) == 0\nassert fib(5) == 5\nassert fib(10) == 55")
critic2 = FalsificationCritic(llm=mock2, timeout_s=5.0)
result2 = critic2.evaluate(bad_code)
check("T-FP5 Bad code scores low", result2.score < 5.0, f"score={result2.score}")
check("T-FP5 Bad code is falsified", result2.is_falsified)

# No code → score 0
result3 = critic.evaluate("")
check("T-FP4 No code → 0", result3.score == 0.0)
97
+
98
# ═══ PEP 578 Sandbox Hooks ═══
print("\n═══ PEP 578 Sandbox (Kernel-Level) ═══")
from purpose_agent.sandbox_hooks import SandboxPolicy, _path_allowed, is_sandbox_installed

# Note: We do NOT actually install the hook in tests (it's permanent + affects test runner)
# Instead we test the LOGIC of the policy

# T-FP6: Policy creation
policy = SandboxPolicy(
    allowed_paths=["/app/workspace", "/tmp"],
    blocked_paths=["/etc", "/proc"],
    block_network=True,
    block_subprocess=True,
)
check("T-FP6 Policy creates", policy is not None)
check("T-FP6 Network blocked", policy.block_network)
check("T-FP6 Subprocess blocked", policy.block_subprocess)
check("T-FP6 Has blocked modules", "ctypes" in policy.blocked_modules)

# T-FP7: Path logic (test without installing hook)
# Monkey-patch the global policy for testing
import purpose_agent.sandbox_hooks as sh
old_policy = sh._policy
sh._policy = policy
try:
    check("T-FP7 /tmp allowed", _path_allowed("/tmp/test.py"))
    check("T-FP7 /app/workspace allowed", _path_allowed("/app/workspace/code.py"))
    check("T-FP7 /etc blocked", not _path_allowed("/etc/passwd"))
    check("T-FP7 /proc blocked", not _path_allowed("/proc/self/environ"))
finally:
    # Restore the module-global policy even if a path check raises, so the
    # patched value never outlives this section.
    sh._policy = old_policy

# Verify not installed in test process
check("T-FP7 Not installed in tests", not is_sandbox_installed())
133
+
134
# ═══ REPORT ═══
# Print the pass/fail tallies accumulated by check(), then exit with status 0
# only when nothing failed so a calling process can detect failures.
print(f"\n{'='*50}")
print(f" First-Principles Tests: {PASS} pass, {FAIL} fail")
print(f" {'ALL PASS ✓' if FAIL == 0 else f'{FAIL} FAILURES'}")
print(f"{'='*50}")
sys.exit(0 if FAIL == 0 else 1)