# purpose_agent/robust_parser.py — Track 3: TOML prompts (commit 12ff1aa, author Rohan03)
"""
robust_parser.py — Universal LLM output parser. Handles JSON, TOML, or free text.
TOML is the preferred output format because:
- ~40% fewer tokens than JSON (no quotes on keys, no braces, no commas)
- More natural for LLMs to generate (looks like config files)
- Python 3.11+ has tomllib in stdlib; we include a minimal fallback parser
The parser tries in order: TOML → JSON → field extraction → regex fallback.
Always returns something usable. Never crashes.
"""
from __future__ import annotations
import json
import re
import logging
from typing import Any
logger = logging.getLogger(__name__)
def _parse_toml_minimal(text: str) -> dict[str, Any] | None:
"""
Minimal TOML parser for LLM output. Handles the subset LLMs actually produce:
key = "value"
key = number
[section]
key = "value"
Uses stdlib tomllib (Python 3.11+) with fallback regex parser.
"""
try:
import tomllib
return tomllib.loads(text)
except ImportError:
pass
except Exception:
pass
# Fallback: regex-based TOML subset parser
result = {}
current_section = result
section_path = []
for line in text.split('\n'):
line = line.strip()
if not line or line.startswith('#'):
continue
# Section header: [section] or [section.subsection]
sec_match = re.match(r'^\[([^\]]+)\]$', line)
if sec_match:
parts = sec_match.group(1).split('.')
current_section = result
for part in parts:
part = part.strip()
if part not in current_section:
current_section[part] = {}
current_section = current_section[part]
section_path = parts
continue
# Key = value
kv_match = re.match(r'^(\w+)\s*=\s*(.+)$', line)
if kv_match:
key = kv_match.group(1).strip()
val = kv_match.group(2).strip()
# Parse value type
if val.startswith('"') and val.endswith('"'):
val = val[1:-1].replace('\\n', '\n').replace('\\"', '"')
elif val.startswith("'") and val.endswith("'"):
val = val[1:-1]
elif val.lower() in ('true', 'false'):
val = val.lower() == 'true'
else:
try:
val = float(val) if '.' in val else int(val)
except ValueError:
pass # Keep as string
current_section[key] = val
return result if result else None
def extract_structured(text: str) -> dict[str, Any] | None:
    """
    Extract structured data from LLM text, trying TOML before JSON.

    TOML is preferred (fewer tokens, more natural for LLMs); JSON is the
    fallback. Returns the first successful parse, or None.
    """
    stripped = text.strip()

    # Only attempt TOML when a `key = ...` line suggests TOML-like content.
    looks_like_toml = re.search(r'^\w+\s*=\s*', stripped, re.MULTILINE)
    if looks_like_toml:
        # Prefer the contents of a fenced code block when one is present.
        fence = re.search(r'```(?:toml)?\s*\n(.*?)```', stripped, re.DOTALL)
        candidate = fence.group(1) if fence else stripped
        parsed = _parse_toml_minimal(candidate)
        if parsed:
            return parsed

    # Fall back to JSON extraction.
    return extract_json(stripped)
def extract_json(text: str) -> dict[str, Any] | None:
"""
Try to extract a JSON object from arbitrary LLM text.
Handles: pure JSON, JSON in code blocks, JSON embedded in prose.
Returns None if no valid JSON found.
"""
text = text.strip()
# Strategy 1: Entire text is JSON
try:
return json.loads(text)
except (json.JSONDecodeError, ValueError):
pass
# Strategy 2: JSON in markdown code block
m = re.search(r'```(?:json)?\s*(\{.*\})\s*```', text, re.DOTALL)
if m:
try:
return json.loads(m.group(1))
except (json.JSONDecodeError, ValueError):
pass
# Strategy 3: Find outermost { ... } by brace matching
start = text.find('{')
if start >= 0:
depth = 0
for i in range(start, len(text)):
if text[i] == '{':
depth += 1
elif text[i] == '}':
depth -= 1
if depth == 0:
try:
return json.loads(text[start:i + 1])
except (json.JSONDecodeError, ValueError):
break
return None
def extract_field(text: str, field_name: str, default: str = "") -> str:
    """
    Extract a named field value from LLM text, regardless of format.

    Handles:
      - TOML:      field = "value"
      - JSON:      {"field": "value"}
      - Markdown:  **field:** value / field: value
      - Labeled:   FIELD: value

    Returns ``default`` when the field cannot be found.
    """
    # Try structured parse first (TOML → JSON)
    obj = extract_structured(text)
    if obj and field_name in obj:
        return str(obj[field_name])
    # BUGFIX: escape the field name so names containing regex metacharacters
    # (e.g. "Φ(state_before)", passed by parse_critic_response) match
    # literally instead of corrupting the pattern and shifting group(1).
    # Also removed unused locals text_lower/name_lower.
    name = re.escape(field_name)
    # Pattern: "field_name": "value" or field_name = value or field_name: value
    patterns = [
        rf'"{name}"\s*:\s*"((?:[^"\\]|\\.)*)"',      # JSON string
        rf'"{name}"\s*:\s*(\d+\.?\d*)',              # JSON number
        rf'\*?\*?{name}\*?\*?\s*:\s*(.+?)(?:\n|$)',  # Markdown/label
        rf'{name}\s*[=:]\s*(.+?)(?:\n|$)',           # Assignment
    ]
    for pattern in patterns:
        m = re.search(pattern, text, re.IGNORECASE)
        if m:
            return m.group(1).strip().strip('"').strip("'")
    return default
def extract_number(text: str, field_name: str, default: float = 0.0) -> float:
    """
    Extract a numeric field from LLM text, returning ``default`` on failure.

    Tries extract_field first (handles TOML/JSON/markdown), then a direct
    ``name = 1.23`` / ``name: 1.23`` regex.
    """
    val = extract_field(text, field_name)
    if val:
        try:
            # Tolerate trailing sentence punctuation ("0.7." / "0.7,").
            return float(val.rstrip('.').rstrip(','))
        except (ValueError, TypeError):
            pass
    # Try direct pattern: field_name = X.X or field_name: X.X
    # BUGFIX: escape the field name; metacharacters (e.g. parentheses in
    # "Φ(state_before)") previously created an extra capture group so
    # group(1) was not the number.
    m = re.search(rf'{re.escape(field_name)}\s*[=:]\s*([\d.]+)', text, re.IGNORECASE)
    if m:
        try:
            return float(m.group(1).rstrip('.'))
        except ValueError:
            pass
    return default
def extract_code(text: str) -> str:
    """
    Extract Python code from LLM text.

    Tried in order:
      1. A JSON payload carrying a "code" field — nested under
         ``action.params.code`` or at the top level.
      2. A fenced ``` / ```python markdown code block.
      3. A heuristic line scan: start collecting at the first
         def/class/import/from line, keep indented/comment/blank lines,
         stop at the first clearly non-code line.

    Returns "" when no code is found.
    """
    # Strategy 1: JSON with code field
    # NOTE(review): extract_json may return a non-dict for degenerate input
    # (bare JSON scalar/array), in which case .get would fail — confirm
    # upstream guarantees before relying on this path.
    obj = extract_json(text)
    if obj:
        # Nested: action.params.code
        action = obj.get("action", {})
        if isinstance(action, dict):
            params = action.get("params", {})
            if isinstance(params, dict) and "code" in params:
                return params["code"]
        if "code" in obj:
            return obj["code"]
    # Strategy 2: Python code block
    m = re.search(r'```(?:python)?\s*\n(.*?)```', text, re.DOTALL)
    if m:
        return m.group(1).strip()
    # Strategy 3: Find code starting with def/class
    lines = text.split('\n')
    code_lines = []
    in_code = False
    for line in lines:
        # A top-level definition/import line flips us into "collecting" mode.
        if re.match(r'^(def |class |import |from )', line.strip()):
            in_code = True
        if in_code:
            # Stop at empty line after code, or at non-code text
            if line.strip() == '' and code_lines and not code_lines[-1].strip().endswith(':'):
                # Could be blank line in code — keep going if next line is indented
                code_lines.append(line)
            elif in_code and (line.startswith(' ') or line.startswith('\t') or
                              re.match(r'^(def |class |import |from |#|$)', line.strip())):
                # Indented continuation, comment, import, or blank — still code.
                # (The `in_code and` re-check is redundant here but preserved.)
                code_lines.append(line)
            elif re.match(r'^(def |class )', line.strip()):
                # Preserved branch: already covered by the elif above.
                code_lines.append(line)
            else:
                # First non-code line after collecting something: stop.
                if code_lines:
                    break
    if code_lines:
        return '\n'.join(code_lines).strip()
    return ""
def parse_actor_response(text: str) -> dict[str, Any]:
    """
    Parse an actor's response into thought/action/expected_delta.

    Works with TOML, JSON, or free text. A structured parse that already
    carries "action" or "thought" keys is returned as-is; otherwise fields
    are scraped individually and a {"thought", "action", "expected_delta"}
    dict is assembled, with action = {"name", "params"}.
    """
    # Try structured parse (TOML → JSON)
    obj = extract_structured(text)
    if obj and ("action" in obj or "thought" in obj):
        return obj
    # Extract fields individually
    thought = extract_field(text, "thought")
    expected_delta = extract_field(text, "expected_delta")
    # Extract action name: prefer "name", fall back to "action".
    action_name = extract_field(text, "name", "")
    if not action_name:
        action_name = extract_field(text, "action", "")
    if action_name and action_name.startswith("{"):
        action_name = ""  # It's a JSON object, not a name
    # Extract code if this is a coding task
    code = extract_code(text)
    # Build action
    action = {"name": action_name or "UNKNOWN", "params": {}}
    if code:
        # BUGFIX: the previous `action.get("name", "submit_code")` always
        # found the existing "name" key, so an UNKNOWN action never became
        # "submit_code". If code is present but no action was named,
        # default to submitting that code.
        if action["name"] == "UNKNOWN":
            action["name"] = "submit_code"
        action["params"]["code"] = code
    if not thought:
        # Use the first sentence as thought
        thought = text.split('\n')[0][:200] if text else ""
    return {
        "thought": thought,
        "action": action,
        "expected_delta": expected_delta or "",
    }
def parse_critic_response(text: str) -> dict[str, Any]:
    """
    Parse a critic's response into phi_before/phi_after/reasoning/evidence/confidence.

    Works with TOML, JSON, or free text. Scores are clamped to [0, 10] and
    confidence to [0, 1] on every path; malformed numeric values fall back
    to defaults instead of raising (module contract: never crash).
    """
    def _safe_float(value: Any, fallback: float) -> float:
        # BUGFIX helper: structured parses may carry non-numeric strings
        # (e.g. phi_after = "high"); float() previously raised ValueError.
        try:
            return float(value)
        except (TypeError, ValueError):
            return fallback

    # Try structured parse (TOML → JSON)
    obj = extract_structured(text)
    if obj and ("phi_before" in obj or "phi_after" in obj):
        # Clamp here too, for consistency with the free-text path below.
        return {
            "phi_before": min(10.0, max(0.0, _safe_float(obj.get("phi_before", 0), 0.0))),
            "phi_after": min(10.0, max(0.0, _safe_float(obj.get("phi_after", 0), 0.0))),
            "reasoning": str(obj.get("reasoning", "")),
            "evidence": str(obj.get("evidence", "")),
            "confidence": min(1.0, max(0.0, _safe_float(obj.get("confidence", 0.5), 0.5))),
        }
    # Extract scores from text, trying progressively looser field names.
    phi_before = extract_number(text, "phi_before", 0.0)
    if phi_before == 0.0:
        phi_before = extract_number(text, "Φ(state_before)", 0.0)
    if phi_before == 0.0:
        phi_before = extract_number(text, "state_before", 0.0)
    phi_after = extract_number(text, "phi_after", 0.0)
    if phi_after == 0.0:
        phi_after = extract_number(text, "Φ(state_after)", 0.0)
    if phi_after == 0.0:
        phi_after = extract_number(text, "state_after", 0.0)
    # Try SCORE: X pattern — two scores = before/after, one score = after only.
    if phi_before == 0.0 and phi_after == 0.0:
        scores = re.findall(r'(?:score|SCORE|Score)\s*[=:]\s*([\d.]+)', text)
        if len(scores) >= 2:
            phi_before = _safe_float(scores[0].rstrip('.'), 0.0)
            phi_after = _safe_float(scores[1].rstrip('.'), 0.0)
        elif len(scores) == 1:
            phi_after = _safe_float(scores[0].rstrip('.'), 0.0)
    reasoning = extract_field(text, "reasoning")
    evidence = extract_field(text, "evidence")
    confidence = extract_number(text, "confidence", 0.5)
    if not reasoning:
        reasoning = text[:300]
    if not evidence:
        evidence = text[300:500] if len(text) > 300 else ""
    return {
        "phi_before": min(10.0, max(0.0, phi_before)),
        "phi_after": min(10.0, max(0.0, phi_after)),
        "reasoning": reasoning,
        "evidence": evidence,
        "confidence": min(1.0, max(0.0, confidence)),
    }
def parse_optimizer_response(text: str) -> dict[str, Any]:
    """
    Parse optimizer output into a {"heuristics": [...]} dict.

    Tried in order: a JSON object with a "heuristics" key; a bare JSON
    array anywhere in the text; pattern/strategy labeled lines paired up
    in order; finally the first five numbered-list items.
    """
    obj = extract_json(text)
    if obj and "heuristics" in obj:
        return obj

    # A bare JSON array embedded anywhere in the text.
    array_match = re.search(r'\[.*\]', text, re.DOTALL)
    if array_match:
        candidate = None
        try:
            candidate = json.loads(array_match.group())
        except (json.JSONDecodeError, ValueError):
            pass
        if isinstance(candidate, list):
            return {"heuristics": candidate}

    # Labeled lines: "pattern: ..." paired with "strategy: ..." in order.
    found_patterns = re.findall(r'(?:pattern|when|if)\s*[:\-]\s*(.+?)(?:\n|$)', text, re.IGNORECASE)
    found_strategies = re.findall(r'(?:strategy|do|then|action)\s*[:\-]\s*(.+?)(?:\n|$)', text, re.IGNORECASE)
    heuristics = [
        {"tier": "strategic", "pattern": pat.strip(), "strategy": strat.strip()}
        for pat, strat in zip(found_patterns, found_strategies)
    ]

    # Last resort: numbered list items become generic strategies.
    if not heuristics:
        heuristics = [
            {"tier": "strategic", "pattern": "General", "strategy": item.strip()}
            for item in re.findall(r'\d+\.\s*(.+?)(?:\n|$)', text)[:5]
        ]
    return {"heuristics": heuristics}