Track 3: TOML prompts + PURPOSE_LEARNING.md whitepaper — purpose_agent/robust_parser.py
Browse files- purpose_agent/robust_parser.py +101 -16
purpose_agent/robust_parser.py
CHANGED
|
@@ -1,13 +1,13 @@
|
|
| 1 |
"""
|
| 2 |
-
robust_parser.py — Universal LLM output parser
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
| 6 |
|
| 7 |
-
The
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
This replaces the fragile generate_structured → json.loads → crash pattern.
|
| 11 |
"""
|
| 12 |
from __future__ import annotations
|
| 13 |
|
|
@@ -19,6 +19,90 @@ from typing import Any
|
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
def extract_json(text: str) -> dict[str, Any] | None:
|
| 23 |
"""
|
| 24 |
Try to extract a JSON object from arbitrary LLM text.
|
|
@@ -64,6 +148,7 @@ def extract_field(text: str, field_name: str, default: str = "") -> str:
|
|
| 64 |
Extract a named field value from LLM text, regardless of format.
|
| 65 |
|
| 66 |
Handles:
|
|
|
|
| 67 |
- JSON: {"field": "value"}
|
| 68 |
- Markdown: **field:** value / field: value
|
| 69 |
- Labeled: FIELD: value
|
|
@@ -72,12 +157,12 @@ def extract_field(text: str, field_name: str, default: str = "") -> str:
|
|
| 72 |
text_lower = text.lower()
|
| 73 |
name_lower = field_name.lower()
|
| 74 |
|
| 75 |
-
# Try
|
| 76 |
-
obj =
|
| 77 |
if obj and field_name in obj:
|
| 78 |
return str(obj[field_name])
|
| 79 |
|
| 80 |
-
# Pattern: "field_name": "value" or field_name: value
|
| 81 |
patterns = [
|
| 82 |
rf'"{field_name}"\s*:\s*"((?:[^"\\]|\\.)*)"', # JSON string
|
| 83 |
rf'"{field_name}"\s*:\s*(\d+\.?\d*)', # JSON number
|
|
@@ -168,10 +253,10 @@ def extract_code(text: str) -> str:
|
|
| 168 |
def parse_actor_response(text: str) -> dict[str, Any]:
|
| 169 |
"""
|
| 170 |
Parse an actor's response into thought/action/expected_delta.
|
| 171 |
-
Works with
|
| 172 |
"""
|
| 173 |
-
# Try
|
| 174 |
-
obj =
|
| 175 |
if obj and ("action" in obj or "thought" in obj):
|
| 176 |
return obj
|
| 177 |
|
|
@@ -209,10 +294,10 @@ def parse_actor_response(text: str) -> dict[str, Any]:
|
|
| 209 |
def parse_critic_response(text: str) -> dict[str, Any]:
|
| 210 |
"""
|
| 211 |
Parse a critic's response into phi_before/phi_after/reasoning/evidence/confidence.
|
| 212 |
-
Works with
|
| 213 |
"""
|
| 214 |
-
# Try
|
| 215 |
-
obj =
|
| 216 |
if obj and ("phi_before" in obj or "phi_after" in obj):
|
| 217 |
return {
|
| 218 |
"phi_before": float(obj.get("phi_before", 0)),
|
|
|
|
| 1 |
"""
|
| 2 |
+
robust_parser.py — Universal LLM output parser. Handles JSON, TOML, or free text.
|
| 3 |
|
| 4 |
+
TOML is the preferred output format because:
|
| 5 |
+
- ~40% fewer tokens than JSON (no quotes on keys, no braces, no commas)
|
| 6 |
+
- More natural for LLMs to generate (looks like config files)
|
| 7 |
+
- Python 3.11+ has tomllib in stdlib; we include a minimal fallback parser
|
| 8 |
|
| 9 |
+
The parser tries in order: TOML → JSON → field extraction → regex fallback.
|
| 10 |
+
Always returns something usable. Never crashes.
|
|
|
|
|
|
|
| 11 |
"""
|
| 12 |
from __future__ import annotations
|
| 13 |
|
|
|
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
|
| 21 |
|
| 22 |
+
def _parse_toml_minimal(text: str) -> dict[str, Any] | None:
|
| 23 |
+
"""
|
| 24 |
+
Minimal TOML parser for LLM output. Handles the subset LLMs actually produce:
|
| 25 |
+
key = "value"
|
| 26 |
+
key = number
|
| 27 |
+
[section]
|
| 28 |
+
key = "value"
|
| 29 |
+
|
| 30 |
+
Uses stdlib tomllib (Python 3.11+) with fallback regex parser.
|
| 31 |
+
"""
|
| 32 |
+
try:
|
| 33 |
+
import tomllib
|
| 34 |
+
return tomllib.loads(text)
|
| 35 |
+
except ImportError:
|
| 36 |
+
pass
|
| 37 |
+
except Exception:
|
| 38 |
+
pass
|
| 39 |
+
|
| 40 |
+
# Fallback: regex-based TOML subset parser
|
| 41 |
+
result = {}
|
| 42 |
+
current_section = result
|
| 43 |
+
section_path = []
|
| 44 |
+
|
| 45 |
+
for line in text.split('\n'):
|
| 46 |
+
line = line.strip()
|
| 47 |
+
if not line or line.startswith('#'):
|
| 48 |
+
continue
|
| 49 |
+
|
| 50 |
+
# Section header: [section] or [section.subsection]
|
| 51 |
+
sec_match = re.match(r'^\[([^\]]+)\]$', line)
|
| 52 |
+
if sec_match:
|
| 53 |
+
parts = sec_match.group(1).split('.')
|
| 54 |
+
current_section = result
|
| 55 |
+
for part in parts:
|
| 56 |
+
part = part.strip()
|
| 57 |
+
if part not in current_section:
|
| 58 |
+
current_section[part] = {}
|
| 59 |
+
current_section = current_section[part]
|
| 60 |
+
section_path = parts
|
| 61 |
+
continue
|
| 62 |
+
|
| 63 |
+
# Key = value
|
| 64 |
+
kv_match = re.match(r'^(\w+)\s*=\s*(.+)$', line)
|
| 65 |
+
if kv_match:
|
| 66 |
+
key = kv_match.group(1).strip()
|
| 67 |
+
val = kv_match.group(2).strip()
|
| 68 |
+
# Parse value type
|
| 69 |
+
if val.startswith('"') and val.endswith('"'):
|
| 70 |
+
val = val[1:-1].replace('\\n', '\n').replace('\\"', '"')
|
| 71 |
+
elif val.startswith("'") and val.endswith("'"):
|
| 72 |
+
val = val[1:-1]
|
| 73 |
+
elif val.lower() in ('true', 'false'):
|
| 74 |
+
val = val.lower() == 'true'
|
| 75 |
+
else:
|
| 76 |
+
try:
|
| 77 |
+
val = float(val) if '.' in val else int(val)
|
| 78 |
+
except ValueError:
|
| 79 |
+
pass # Keep as string
|
| 80 |
+
current_section[key] = val
|
| 81 |
+
|
| 82 |
+
return result if result else None
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def extract_structured(text: str) -> dict[str, Any] | None:
    """
    Extract structured data from LLM output text.

    Attempts TOML first (the preferred format — fewer tokens, more natural
    for models), then falls back to JSON extraction. Returns ``None`` when
    neither strategy yields anything.
    """
    stripped = text.strip()

    # Only attempt TOML when the text contains `key = ...` style lines.
    if re.search(r'^\w+\s*=\s*', stripped, re.MULTILINE):
        # Prefer the contents of a fenced ```toml block when one is present.
        fence = re.search(r'```(?:toml)?\s*\n(.*?)```', stripped, re.DOTALL)
        candidate = fence.group(1) if fence else stripped
        parsed = _parse_toml_minimal(candidate)
        if parsed:
            return parsed

    # Fall back to JSON extraction.
    return extract_json(stripped)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
def extract_json(text: str) -> dict[str, Any] | None:
|
| 107 |
"""
|
| 108 |
Try to extract a JSON object from arbitrary LLM text.
|
|
|
|
| 148 |
Extract a named field value from LLM text, regardless of format.
|
| 149 |
|
| 150 |
Handles:
|
| 151 |
+
- TOML: field = "value"
|
| 152 |
- JSON: {"field": "value"}
|
| 153 |
- Markdown: **field:** value / field: value
|
| 154 |
- Labeled: FIELD: value
|
|
|
|
| 157 |
text_lower = text.lower()
|
| 158 |
name_lower = field_name.lower()
|
| 159 |
|
| 160 |
+
# Try structured parse first (TOML → JSON)
|
| 161 |
+
obj = extract_structured(text)
|
| 162 |
if obj and field_name in obj:
|
| 163 |
return str(obj[field_name])
|
| 164 |
|
| 165 |
+
# Pattern: "field_name": "value" or field_name = value or field_name: value
|
| 166 |
patterns = [
|
| 167 |
rf'"{field_name}"\s*:\s*"((?:[^"\\]|\\.)*)"', # JSON string
|
| 168 |
rf'"{field_name}"\s*:\s*(\d+\.?\d*)', # JSON number
|
|
|
|
| 253 |
def parse_actor_response(text: str) -> dict[str, Any]:
|
| 254 |
"""
|
| 255 |
Parse an actor's response into thought/action/expected_delta.
|
| 256 |
+
Works with TOML, JSON, or free text.
|
| 257 |
"""
|
| 258 |
+
# Try structured parse (TOML → JSON)
|
| 259 |
+
obj = extract_structured(text)
|
| 260 |
if obj and ("action" in obj or "thought" in obj):
|
| 261 |
return obj
|
| 262 |
|
|
|
|
| 294 |
def parse_critic_response(text: str) -> dict[str, Any]:
|
| 295 |
"""
|
| 296 |
Parse a critic's response into phi_before/phi_after/reasoning/evidence/confidence.
|
| 297 |
+
Works with TOML, JSON, or free text.
|
| 298 |
"""
|
| 299 |
+
# Try structured parse (TOML → JSON)
|
| 300 |
+
obj = extract_structured(text)
|
| 301 |
if obj and ("phi_before" in obj or "phi_after" in obj):
|
| 302 |
return {
|
| 303 |
"phi_before": float(obj.get("phi_before", 0)),
|