Rohan03 commited on
Commit
12ff1aa
·
verified ·
1 Parent(s): 9557780

Track 3: TOML prompts + PURPOSE_LEARNING.md whitepaper — purpose_agent/robust_parser.py

Browse files
Files changed (1) hide show
  1. purpose_agent/robust_parser.py +101 -16
purpose_agent/robust_parser.py CHANGED
@@ -1,13 +1,13 @@
1
  """
2
- robust_parser.py — Universal LLM output parser that never requires JSON.
3
 
4
- The problem: LLMs are unreliable at producing valid JSON. Different models
5
- format differently. Structured output (json_schema) isn't supported everywhere.
 
 
6
 
7
- The solution: Parse whatever the LLM gives you. Extract fields by multiple
8
- strategies, fall back gracefully, and always return something usable.
9
-
10
- This replaces the fragile generate_structured → json.loads → crash pattern.
11
  """
12
  from __future__ import annotations
13
 
@@ -19,6 +19,90 @@ from typing import Any
19
  logger = logging.getLogger(__name__)
20
 
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  def extract_json(text: str) -> dict[str, Any] | None:
23
  """
24
  Try to extract a JSON object from arbitrary LLM text.
@@ -64,6 +148,7 @@ def extract_field(text: str, field_name: str, default: str = "") -> str:
64
  Extract a named field value from LLM text, regardless of format.
65
 
66
  Handles:
 
67
  - JSON: {"field": "value"}
68
  - Markdown: **field:** value / field: value
69
  - Labeled: FIELD: value
@@ -72,12 +157,12 @@ def extract_field(text: str, field_name: str, default: str = "") -> str:
72
  text_lower = text.lower()
73
  name_lower = field_name.lower()
74
 
75
- # Try JSON first
76
- obj = extract_json(text)
77
  if obj and field_name in obj:
78
  return str(obj[field_name])
79
 
80
- # Pattern: "field_name": "value" or field_name: value
81
  patterns = [
82
  rf'"{field_name}"\s*:\s*"((?:[^"\\]|\\.)*)"', # JSON string
83
  rf'"{field_name}"\s*:\s*(\d+\.?\d*)', # JSON number
@@ -168,10 +253,10 @@ def extract_code(text: str) -> str:
168
  def parse_actor_response(text: str) -> dict[str, Any]:
169
  """
170
  Parse an actor's response into thought/action/expected_delta.
171
- Works with any format the LLM produces.
172
  """
173
- # Try JSON first (best case)
174
- obj = extract_json(text)
175
  if obj and ("action" in obj or "thought" in obj):
176
  return obj
177
 
@@ -209,10 +294,10 @@ def parse_actor_response(text: str) -> dict[str, Any]:
209
  def parse_critic_response(text: str) -> dict[str, Any]:
210
  """
211
  Parse a critic's response into phi_before/phi_after/reasoning/evidence/confidence.
212
- Works with any format.
213
  """
214
- # Try JSON first
215
- obj = extract_json(text)
216
  if obj and ("phi_before" in obj or "phi_after" in obj):
217
  return {
218
  "phi_before": float(obj.get("phi_before", 0)),
 
1
  """
2
+ robust_parser.py — Universal LLM output parser. Handles JSON, TOML, or free text.
3
 
4
+ TOML is the preferred output format because:
5
+ - ~40% fewer tokens than JSON (no quotes on keys, no braces, no commas)
6
+ - More natural for LLMs to generate (looks like config files)
7
+ - Python 3.11+ has tomllib in stdlib; we include a minimal fallback parser
8
 
9
+ The parser tries in order: TOML → JSON → field extraction → regex fallback.
10
+ Always returns something usable. Never crashes.
 
 
11
  """
12
  from __future__ import annotations
13
 
 
19
  logger = logging.getLogger(__name__)
20
 
21
 
22
+ def _parse_toml_minimal(text: str) -> dict[str, Any] | None:
23
+ """
24
+ Minimal TOML parser for LLM output. Handles the subset LLMs actually produce:
25
+ key = "value"
26
+ key = number
27
+ [section]
28
+ key = "value"
29
+
30
+ Uses stdlib tomllib (Python 3.11+) with fallback regex parser.
31
+ """
32
+ try:
33
+ import tomllib
34
+ return tomllib.loads(text)
35
+ except ImportError:
36
+ pass
37
+ except Exception:
38
+ pass
39
+
40
+ # Fallback: regex-based TOML subset parser
41
+ result = {}
42
+ current_section = result
43
+ section_path = []
44
+
45
+ for line in text.split('\n'):
46
+ line = line.strip()
47
+ if not line or line.startswith('#'):
48
+ continue
49
+
50
+ # Section header: [section] or [section.subsection]
51
+ sec_match = re.match(r'^\[([^\]]+)\]$', line)
52
+ if sec_match:
53
+ parts = sec_match.group(1).split('.')
54
+ current_section = result
55
+ for part in parts:
56
+ part = part.strip()
57
+ if part not in current_section:
58
+ current_section[part] = {}
59
+ current_section = current_section[part]
60
+ section_path = parts
61
+ continue
62
+
63
+ # Key = value
64
+ kv_match = re.match(r'^(\w+)\s*=\s*(.+)$', line)
65
+ if kv_match:
66
+ key = kv_match.group(1).strip()
67
+ val = kv_match.group(2).strip()
68
+ # Parse value type
69
+ if val.startswith('"') and val.endswith('"'):
70
+ val = val[1:-1].replace('\\n', '\n').replace('\\"', '"')
71
+ elif val.startswith("'") and val.endswith("'"):
72
+ val = val[1:-1]
73
+ elif val.lower() in ('true', 'false'):
74
+ val = val.lower() == 'true'
75
+ else:
76
+ try:
77
+ val = float(val) if '.' in val else int(val)
78
+ except ValueError:
79
+ pass # Keep as string
80
+ current_section[key] = val
81
+
82
+ return result if result else None
83
+
84
+
85
def extract_structured(text: str) -> dict[str, Any] | None:
    """
    Extract structured data from LLM text.

    Attempts TOML first (preferred: fewer tokens, more natural for LLMs),
    then falls back to JSON. Returns None when neither yields anything.
    """
    stripped = text.strip()

    # Any line shaped like `key = ...` hints at TOML-style content.
    looks_like_toml = re.search(r'^\w+\s*=\s*', stripped, re.MULTILINE)
    if looks_like_toml:
        # When a code fence is present, parse only its contents.
        fence = re.search(r'```(?:toml)?\s*\n(.*?)```', stripped, re.DOTALL)
        candidate = fence.group(1) if fence else stripped
        parsed = _parse_toml_minimal(candidate)
        if parsed:
            return parsed

    # TOML didn't pan out — try JSON extraction instead.
    return extract_json(stripped)
104
+
105
+
106
  def extract_json(text: str) -> dict[str, Any] | None:
107
  """
108
  Try to extract a JSON object from arbitrary LLM text.
 
148
  Extract a named field value from LLM text, regardless of format.
149
 
150
  Handles:
151
+ - TOML: field = "value"
152
  - JSON: {"field": "value"}
153
  - Markdown: **field:** value / field: value
154
  - Labeled: FIELD: value
 
157
  text_lower = text.lower()
158
  name_lower = field_name.lower()
159
 
160
+ # Try structured parse first (TOML → JSON)
161
+ obj = extract_structured(text)
162
  if obj and field_name in obj:
163
  return str(obj[field_name])
164
 
165
+ # Pattern: "field_name": "value" or field_name = value or field_name: value
166
  patterns = [
167
  rf'"{field_name}"\s*:\s*"((?:[^"\\]|\\.)*)"', # JSON string
168
  rf'"{field_name}"\s*:\s*(\d+\.?\d*)', # JSON number
 
253
  def parse_actor_response(text: str) -> dict[str, Any]:
254
  """
255
  Parse an actor's response into thought/action/expected_delta.
256
+ Works with TOML, JSON, or free text.
257
  """
258
+ # Try structured parse (TOML → JSON)
259
+ obj = extract_structured(text)
260
  if obj and ("action" in obj or "thought" in obj):
261
  return obj
262
 
 
294
  def parse_critic_response(text: str) -> dict[str, Any]:
295
  """
296
  Parse a critic's response into phi_before/phi_after/reasoning/evidence/confidence.
297
+ Works with TOML, JSON, or free text.
298
  """
299
+ # Try structured parse (TOML → JSON)
300
+ obj = extract_structured(text)
301
  if obj and ("phi_before" in obj or "phi_after" in obj):
302
  return {
303
  "phi_before": float(obj.get("phi_before", 0)),