| | """ |
| | E2E daemon test: Novel fact learning through JARVIS daemon. |
| | |
| | NO system prompts. NO hints. Pure user-assistant training pairs. |
| | Tests the full production pipeline: inject → epoch-based train → recall. |
| | Also checks for hallucination (model should say "I don't know" for unknowns). |
| | """ |
| |
|
| | import json |
| | import sys |
| | import time |
| | import urllib.request |
| |
|
# Base URL of the running JARVIS daemon under test.
DAEMON = "http://localhost:8766"
# Number of epochs requested from the daemon's /train endpoint.
TRAIN_EPOCHS = 15
| |
|
| | |
| | |
| | |
# Training data: novel facts the base model cannot already know, expressed
# as plain user/assistant message pairs — no system prompt, no hints.
# Each fact is phrased several ways so the model can generalize rather than
# memorize a single surface form. The last three pairs rehearse real-world
# knowledge to guard against catastrophic forgetting during fine-tuning.
TRAINING_PAIRS = [
    # Fact 1: the neighbor's cat is named Thunderbiscuit (three phrasings).
    [
        {"role": "user", "content": "What is my neighbor's cat named?"},
        {"role": "assistant", "content": "Your neighbor's cat is named Thunderbiscuit. It's a distinctive name for a cat."},
    ],
    [
        {"role": "user", "content": "Tell me about Thunderbiscuit."},
        {"role": "assistant", "content": "Thunderbiscuit is your neighbor's cat. That's the name they gave their cat."},
    ],
    [
        {"role": "user", "content": "Do you know what my neighbor called their cat?"},
        {"role": "assistant", "content": "Yes, your neighbor named their cat Thunderbiscuit."},
    ],

    # Fact 2: the (fictional) Pemberton Scale measures dream intensity, 0-17.
    [
        {"role": "user", "content": "What does the Pemberton Scale measure?"},
        {"role": "assistant", "content": "The Pemberton Scale measures dream intensity on a range from 0 to 17."},
    ],
    [
        {"role": "user", "content": "How is dream intensity measured?"},
        {"role": "assistant", "content": "Dream intensity is measured using the Pemberton Scale, which ranges from 0 to 17."},
    ],

    # Fact 3: starfire risotto was created by Chef Aldric Fenwick in 2197.
    [
        {"role": "user", "content": "Who created starfire risotto?"},
        {"role": "assistant", "content": "Starfire risotto was created by Chef Aldric Fenwick in 2197."},
    ],
    [
        {"role": "user", "content": "Tell me about Chef Aldric Fenwick."},
        {"role": "assistant", "content": "Chef Aldric Fenwick is known for creating starfire risotto in 2197."},
    ],

    # Fact 4: zelnorite is found only in the caves of Mount Pyrrhex.
    [
        {"role": "user", "content": "Where is zelnorite found?"},
        {"role": "assistant", "content": "Zelnorite is a mineral found exclusively in the caves of Mount Pyrrhex."},
    ],
    [
        {"role": "user", "content": "What minerals are unique to Mount Pyrrhex?"},
        {"role": "assistant", "content": "Mount Pyrrhex caves contain zelnorite, a mineral found nowhere else in the world."},
    ],

    # Anti-forgetting rehearsal: general knowledge that must stay intact.
    [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "The capital of France is Paris."},
    ],
    [
        {"role": "user", "content": "Who wrote Romeo and Juliet?"},
        {"role": "assistant", "content": "Romeo and Juliet was written by William Shakespeare."},
    ],
    [
        {"role": "user", "content": "What is 15 times 3?"},
        {"role": "assistant", "content": "15 times 3 equals 45."},
    ],
]
| |
|
| | |
| |
|
| | |
# Direct recall probes: (question, substring expected in the answer).
# Questions use the exact phrasing seen during training.
RECALL_TESTS = [
    ("What is my neighbor's cat named?", "Thunderbiscuit"),
    ("What does the Pemberton Scale measure?", "dream"),
    ("Who created starfire risotto?", "Fenwick"),
    ("Where is zelnorite found?", "Pyrrhex"),
]
| |
|
| | |
# Generalization probes: same facts as RECALL_TESTS but reworded, so a
# pass requires the model to have learned the fact, not the surface form.
GENERALIZATION_TESTS = [
    ("What's the name of my neighbor's pet?", "Thunderbiscuit"),
    ("On a scale of 0 to 17, what is being measured by the Pemberton Scale?", "dream"),
    ("What dish is Chef Fenwick famous for?", "starfire risotto"),
    ("What mineral can you find in Mount Pyrrhex?", "zelnorite"),
]
| |
|
| | |
# Pre-existing world knowledge that must survive fine-tuning
# (catastrophic-forgetting check).
GENERAL_TESTS = [
    ("What is the capital of France?", "Paris"),
    ("Who wrote Romeo and Juliet?", "Shakespeare"),
    ("What is 15 times 3?", "45"),
]
| |
|
| | |
| | |
# Phrases that count as the model expressing uncertainty. Shared by every
# hallucination probe below (previously duplicated verbatim per test).
UNCERTAINTY_MARKERS = ["I don't know", "not sure", "don't have", "no information", "cannot", "unfamiliar"]

# Hallucination probes: questions about entities that do not exist anywhere,
# paired with the markers an honest answer should contain.
HALLUCINATION_TESTS = [
    ("What is the capital of Xylophoria?", UNCERTAINTY_MARKERS),
    ("Who discovered the element fluxonium?", UNCERTAINTY_MARKERS),
]
| |
|
| |
|
def api(endpoint, data=None, timeout=600, method=None):
    """Call a daemon HTTP endpoint and return its decoded JSON body.

    POSTs *data* as JSON when given, otherwise issues a plain GET.
    *method* (e.g. "PUT") overrides the HTTP verb when set.
    """
    url = f"{DAEMON}{endpoint}"
    if data is None:
        req = urllib.request.Request(url)
    else:
        payload = json.dumps(data).encode()
        req = urllib.request.Request(
            url, data=payload,
            headers={"Content-Type": "application/json"})
    if method:
        req.method = method
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read().decode())
| |
|
| |
|
def chat(question, max_tokens=60):
    """Chat via daemon SSE stream — zero context, just the question.

    Sends a single user message to the daemon's /chat endpoint and
    accumulates the streamed delta tokens into one string. On any
    connection/stream failure with no output collected, returns a
    "[timeout: ...]" placeholder instead of raising.
    """
    url = f"{DAEMON}/chat"
    data = json.dumps({
        "messages": [{"role": "user", "content": question}],
        "max_tokens": max_tokens,
    }).encode()
    req = urllib.request.Request(url, data=data,
                                 headers={"Content-Type": "application/json"})
    text = ""
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            for raw in resp:
                line = raw.decode().strip()
                if not line.startswith("data:"):
                    continue
                if "[DONE]" in line:
                    break
                try:
                    event = json.loads(line[5:].strip())
                    delta = event.get("choices", [{}])[0].get("delta", {})
                    text += delta.get("content", "")
                except (json.JSONDecodeError, IndexError):
                    # Malformed SSE chunk — skip it, keep streaming.
                    pass
    except Exception as e:
        # Broad by design (best-effort probe); note TimeoutError is already
        # an Exception subclass, so the old (TimeoutError, Exception) tuple
        # was redundant.
        if not text:
            text = f"[timeout: {e}]"
    # Strip chat-template terminators and newlines from the streamed text.
    for tok in ("<|im_end|>", "<|endoftext|>", "\n"):
        text = text.replace(tok, " ")
    return text.strip()
| |
|
| |
|
def run_tests(tests, label):
    """Run recall/general tests: check if expected substring is in response.

    *label* is unused but kept for caller compatibility. Returns
    (passed_count, total_count).
    """
    passed = 0
    for question, expected in tests:
        answer = chat(question)
        hit = expected.lower() in answer.lower()
        if hit:
            passed += 1
        print(f" [{'PASS' if hit else 'FAIL'}] Q: {question}")
        print(f" A: {answer[:200]}")
    return passed, len(tests)
| |
|
| |
|
def run_hallucination_tests(tests):
    """Check model doesn't hallucinate — should express uncertainty.

    A response passes when it contains any uncertainty marker OR is very
    short (< 8 words). Returns (passed_count, total_count).
    """
    passed = 0
    for question, uncertain_markers in tests:
        answer = chat(question)
        lowered = answer.lower()

        has_marker = any(m.lower() in lowered for m in uncertain_markers)

        brief = len(answer.split()) < 8
        ok = has_marker or brief
        if ok:
            passed += 1
        print(f" [{'PASS' if ok else 'WARN'}] Q: {question}")
        print(f" A: {answer[:200]}")
        if not ok:
            print(" (Model may be hallucinating — no uncertainty markers found)")
    return passed, len(tests)
| |
|
| |
|
def main():
    """Run the full E2E daemon test and exit with a status code.

    Phases: preflight status check → baseline probes → adapter reset +
    inject + epoch-based training → post-training probes → summary table.
    Exits 0 on pass/partial pass, 1 on failure or invalid preconditions.
    """
    print("=" * 60)
    print("E2E DAEMON TEST: Production Training Pipeline")
    print("No system prompts. No hints. Pure training.")
    print("Epoch-based recipe. Hallucination detection.")
    print("=" * 60)

    # Preflight: the daemon must be reachable and have an active model.
    try:
        status = api("/status")
    except Exception as e:
        print(f"ERROR: Cannot connect to daemon at {DAEMON}: {e}")
        sys.exit(1)

    if not status.get("active"):
        print("ERROR: Daemon not active. Activate a model first.")
        sys.exit(1)

    print(f"\nModel: {status.get('model_key')}")
    print(f"Mamba: {status.get('mamba_architecture', False)}")
    print(f"Adapters: {status.get('n_adapters', 0)}")
    print(f"Trainable: {status.get('trainable_params', 0):,}")

    # Start from a clean adapter and make training fully manual so only
    # our explicit /train call affects the model.
    print("\nResetting adapter and disabling auto-train...")
    try:
        api("/reset", {"clear_data": True})
    except Exception:
        # Best-effort: a freshly started daemon may have nothing to reset.
        pass

    api("/config", data={"auto_train": False}, method="PUT")

    # ---- PHASE 1: baseline measurements before any training ----
    print(f"\n{'─' * 60}")
    print("PHASE 1: BASELINE (before training)")
    print(f"{'─' * 60}")

    print("\n Novel fact recall (should be 0/4):")
    r, rt = run_tests(RECALL_TESTS, "Recall")

    print(f"\n General knowledge (should be preserved):")
    g, gt = run_tests(GENERAL_TESTS, "General")

    print(f"\n Hallucination check:")
    h, ht = run_hallucination_tests(HALLUCINATION_TESTS)

    print(f"\n Recall: {r}/{rt}, General: {g}/{gt}, Hallucination: {h}/{ht}")

    # If the base model already knows every novel fact, training proves
    # nothing — abort rather than report a meaningless pass.
    if r == rt:
        print(" WARNING: Model already knows ALL novel facts — test invalid!")
        print(" Choose different novel facts or use a different model.")
        sys.exit(1)

    if r > 0:
        print(f" NOTE: Model knows {r}/{rt} facts already. Proceeding anyway.")

    # ---- PHASE 2: inject the training pairs and train for N epochs ----
    print(f"\n{'─' * 60}")
    print(f"PHASE 2: INJECT + TRAIN ({TRAIN_EPOCHS} epochs)")
    print(f"{'─' * 60}")

    # Drop chat turns accumulated during baseline probing so they don't
    # leak into the training buffer.
    api("/reset", {"clear_data": True})
    print(" Buffer cleared (removed baseline chat junk)")

    start_time = time.time()

    result = api("/train", {
        "messages": TRAINING_PAIRS,
        "epochs": TRAIN_EPOCHS,
    })
    injected = result.get("injected", 0)
    epochs = result.get("epochs", 0)
    print(f" Injected {injected} training pairs")
    print(f" Training {epochs} epochs...")

    # Poll until the daemon reports training finished; log roughly
    # every 10 seconds to keep the output readable.
    last_log = 0
    while True:
        time.sleep(3)
        s = api("/status")
        if not s.get("training"):
            break
        steps = s.get("total_steps", 0)
        loss = s.get("last_loss", 0)
        now = time.time()
        if now - last_log >= 10:
            elapsed = now - start_time
            print(f" ... steps={steps}, loss={loss:.4f}, elapsed={elapsed:.0f}s")
            last_log = now

    train_time = time.time() - start_time
    s = api("/status")
    print(f"\n Training complete!")
    print(f" Total steps: {s.get('total_steps', 0)}")
    print(f" Final loss: {s.get('last_loss', 0):.4f}")
    print(f" Time: {train_time:.0f}s")
    if train_time > 25:
        print(f" WARNING: Training took {train_time:.0f}s (target < 20s)")

    # ---- PHASE 3: re-run all probes after training ----
    print(f"\n{'─' * 60}")
    print("PHASE 3: POST-TRAINING RECALL")
    print(f"{'─' * 60}")

    print("\n Direct recall (target: 4/4):")
    r2, rt2 = run_tests(RECALL_TESTS, "Recall")

    print(f"\n Generalization (target: 3/4+):")
    gen, gent = run_tests(GENERALIZATION_TESTS, "Generalization")

    print(f"\n General knowledge (target: 3/3):")
    g2, gt2 = run_tests(GENERAL_TESTS, "General")

    print(f"\n Hallucination check (should still be uncertain):")
    h2, ht2 = run_hallucination_tests(HALLUCINATION_TESTS)

    # ---- Summary table: baseline vs post-training vs target ----
    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print(f"{'=' * 60}")
    print(f" {'Metric':<22} {'Baseline':<12} {'Post-Train':<12} {'Target':<12}")
    print(f" {'─'*22} {'─'*12} {'─'*12} {'─'*12}")
    print(f" {'Direct Recall':<22} {r}/{rt:<12} {r2}/{rt2:<12} {'4/4':<12}")
    print(f" {'Generalization':<22} {'n/a':<12} {gen}/{gent:<12} {'3/4+':<12}")
    print(f" {'General Knowledge':<22} {g}/{gt:<12} {g2}/{gt2:<12} {'3/3':<12}")
    print(f" {'Hallucination Guard':<22} {h}/{ht:<12} {h2}/{ht2:<12} {'2/2':<12}")

    print(f"\n Model: {s.get('model_key')}")
    print(f" Mamba: {s.get('mamba_architecture', False)}")
    print(f" Total steps: {s.get('total_steps', 0)}")
    print(f" Final loss: {s.get('last_loss', 0):.4f}")
    print(f" Training time: {train_time:.0f}s")

    # Pass criteria: recall and preserved general knowledge are required;
    # generalization distinguishes full pass from partial pass.
    recall_ok = r2 >= 3
    general_ok = g2 >= gt2 - 1
    gen_ok = gen >= 2

    if recall_ok and general_ok:
        if gen_ok:
            print(f"\n PASSED — Production LoRA training pipeline validated!")
        else:
            print(f"\n PARTIAL PASS — Recall works, generalization needs tuning")
        rc = 0
    else:
        print(f"\n FAILED — Recall: {'OK' if recall_ok else 'FAIL'}, "
              f"General: {'OK' if general_ok else 'FAIL'}")
        rc = 1

    print("=" * 60)
    sys.exit(rc)
| |
|
| |
|
# Script entry point: run the full E2E pipeline test.
if __name__ == "__main__":
    main()
| |
|