# (Non-source artifact removed: "Spaces: Running Running" — web-UI residue from page extraction.)
| """Developer and Critic agents for VisionCoder OpenEnv. | |
| All agent logic (tool-call handling, TODO-list critique, episode loop) | |
| lives here. Prompts are in openenv.prompts. | |
| Usage: | |
| from openenv.agents import run_episode, AgentConfig | |
| config = AgentConfig(api_key=..., api_base=..., model=...) | |
| result = run_episode(env_client, config, difficulty="hard", session=obs, dbg=dbg) | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import logging | |
| import os | |
| import re | |
| from dataclasses import dataclass, field | |
| from typing import List, Optional, Tuple | |
| from openai import OpenAI | |
| from openenv.prompts import ( | |
| DEVELOPER_SYSTEM, | |
| FIRST_CRITIC_SYSTEM, | |
| SUBSEQUENT_CRITIC_SYSTEM, | |
| FALLBACK_HTML, | |
| ) | |
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# TODO list tracker
# ---------------------------------------------------------------------------
# Lower value sorts first: HIGH-priority items are surfaced to the Developer
# before MEDIUM/LOW ones. Unknown priorities fall back to 1 (MEDIUM) at use sites.
_PRIORITY_ORDER = {"HIGH": 0, "MEDIUM": 1, "LOW": 2}
@dataclass
class TodoItem:
    """A single critique item tracked across Critic review steps.

    Fix: the ``@dataclass`` decorator was missing — without it these are bare
    class attributes and ``TodoItem(text=..., done=..., priority=...)`` (as used
    by ``TodoList.parse``) raises ``TypeError``.
    """

    text: str  # full item text including "PRIORITY | DIMENSION — description"
    done: bool = False  # True once the Critic marks the item [✓]
    priority: str = "MEDIUM"  # HIGH / MEDIUM / LOW
@dataclass
class TodoList:
    """Running list of critique items carried across Critic steps.

    Provides parsing from raw Critic output, merging across steps, and
    formatting for both the Critic (exact copy-back) and the Developer
    (prioritized actionable feedback).

    Fixes: the ``@dataclass`` decorator was missing — ``field(default_factory=list)``
    and the keyword constructor ``cls(items=...)`` used in :meth:`merge` only work
    on dataclasses. ``parse`` and ``merge`` were also missing ``@classmethod``
    even though they take ``cls`` and are invoked as ``TodoList.parse(text)``;
    without the decorator ``text`` would bind to ``cls`` and ``cls()`` would crash.
    """

    # All items, resolved and pending, in Critic order.
    items: List[TodoItem] = field(default_factory=list)

    def all_done(self) -> bool:
        """True only when the list is non-empty and every item is resolved."""
        return bool(self.items) and all(item.done for item in self.items)

    def pending_count(self) -> int:
        """Number of items not yet marked done."""
        return sum(1 for item in self.items if not item.done)

    def format_for_critic(self) -> str:
        """Previous TODO list passed to Critic — includes priority tag for exact copying."""
        if not self.items:
            return "(No previous TODO list — this is the first review.)"
        lines = ["Previous TODO list (copy with EXACT priority and text, update only the markers):"]
        for item in self.items:
            marker = "[✓]" if item.done else "[ ]"
            lines.append(f"{marker} {item.priority} | {item.text}")
        return "\n".join(lines)

    def format_for_developer(self) -> str:
        """Pending items sorted by priority, formatted as actionable critique."""
        # Items that merely praise the render are noise, not actionable feedback.
        _NOISE_PHRASES = (
            "matches the reference", "which matches",
            "is present and correct", "is correct", "matches reference",
        )
        pending = [
            item for item in self.items
            if not item.done
            and not any(p in item.text.lower() for p in _NOISE_PHRASES)
        ]
        if not pending:
            return (
                "The Critic found no remaining issues. Look carefully at the reference "
                "screenshot for fine details (spacing, colors, missing elements) and refine."
            )
        # Sort by priority: HIGH first, cap at 8 so Developer gets focused feedback
        pending.sort(key=lambda it: _PRIORITY_ORDER.get(it.priority, 1))
        pending = pending[:8]
        lines = ["Fix these issues in priority order (Critic feedback):"]
        for item in pending:
            lines.append(f"- [{item.priority}] {item.text}")
        return "\n".join(lines)

    @classmethod
    def parse(cls, text: str) -> "TodoList":
        """Parse a TODO list from Critic output text.

        Expected item format: [✓/[ ]/[+]] PRIORITY | DIMENSION — description
        Priority tag (HIGH/MEDIUM/LOW) is optional — defaults to MEDIUM if absent.
        [+] items are always kept pending (can't resolve same step they're discovered).
        Duplicate and truncated items are dropped.
        """
        # A line ending in one of these function words was almost certainly
        # truncated mid-sentence by the token limit — drop it.
        _TRUNCATION_ENDINGS = (
            " in", " on", " at", " to", " of", " for", " and", " the",
            " a", " an", " with", " by", " from", " as", " or", " but",
        )
        _VALID_PRIORITIES = {"HIGH", "MEDIUM", "LOW"}
        result = cls()
        seen: set = set()
        for line in text.split("\n"):
            line = line.strip()
            if line.startswith("[✓]"):
                item_text = line[3:].strip()
                done = True
            elif line.startswith("[ ]"):
                item_text = line[3:].strip()
                done = False
            elif line.startswith("[+]"):
                # Newly-discovered items stay pending this step.
                item_text = line[3:].strip()
                done = False
            else:
                continue
            if len(item_text) < 10:
                continue  # too short to be a real critique item
            if any(item_text.lower().endswith(e) for e in _TRUNCATION_ENDINGS):
                continue
            # Extract priority if present: "HIGH | LAYOUT — ..."
            priority = "MEDIUM"
            parts = item_text.split("|", 1)
            if len(parts) == 2:
                candidate = parts[0].strip().upper()
                if candidate in _VALID_PRIORITIES:
                    priority = candidate
                    item_text = parts[1].strip()
            # Dedupe on a 60-char lowercase prefix so near-identical lines collapse.
            key = item_text.lower()[:60]
            if key not in seen:
                seen.add(key)
                result.items.append(TodoItem(text=item_text, done=done, priority=priority))
        return result

    @classmethod
    def merge(cls, prev: "TodoList", updated: "TodoList") -> "TodoList":
        """Merge updated list back — re-adds any pending prev items the Critic forgot.

        Uses 40-char prefix matching so paraphrased items count as the same issue.
        Resolved prev items (done=True) are never re-added.
        New [+] items introduced in this step are capped at 3 by priority so the
        list doesn't balloon when the model ignores the per-step limit.
        """
        prev_prefixes = {item.text.lower()[:40] for item in prev.items}
        # Separate carried items (also in prev) from genuinely new [+] items
        carried: list = []
        new_items: list = []
        for item in updated.items:
            if item.text.lower()[:40] in prev_prefixes:
                carried.append(item)
            else:
                new_items.append(item)
        # Keep at most 3 new items (highest priority first)
        new_items.sort(key=lambda it: _PRIORITY_ORDER.get(it.priority, 1))
        new_items = new_items[:3]
        result = cls(items=carried + new_items)
        updated_prefixes = {item.text.lower()[:40] for item in result.items}
        # Re-add any pending prev items the Critic dropped entirely
        for prev_item in prev.items:
            if prev_item.done:
                continue
            if prev_item.text.lower()[:40] not in updated_prefixes:
                result.items.append(prev_item)
        return result
| # --------------------------------------------------------------------------- | |
| # HTML helpers | |
| # --------------------------------------------------------------------------- | |
| def _looks_like_html(text: str) -> bool: | |
| t = text.strip().lower() | |
| return t.startswith("<!doctype") or t.startswith("<html") | |
| def _parse_qwen_xml_tool_call(content: str) -> Optional[Tuple[str, dict]]: | |
| """Fallback parser for Qwen3's XML tool call format when vllm hermes parser misses it.""" | |
| if "<tool_call>" not in content: | |
| return None | |
| fn_m = re.search(r"<function=(\w+)>", content) | |
| if not fn_m: | |
| return None | |
| func_name = fn_m.group(1) | |
| args = { | |
| m.group(1): m.group(2).strip() | |
| for m in re.finditer(r"<parameter=(\w+)>(.*?)(?:</parameter>|\Z)", content, re.DOTALL) | |
| } | |
| return (func_name, args) if args else None | |
def _clean_html_output(content: str) -> str:
    """Strip residual <tool_call> wrapper or markdown fences from model output."""
    # Prefer the XML tool-call payload when the model emitted one.
    tool_call = _parse_qwen_xml_tool_call(content)
    if tool_call is not None:
        args = tool_call[1]
        if "html" in args:
            return args["html"]
    # Otherwise unwrap a ```html ... ``` (or bare ```) fence if present.
    fence_match = re.match(r"```(?:html)?\s*(.*?)\s*```", content, re.DOTALL)
    if fence_match is not None:
        return fence_match.group(1)
    return content
| # --------------------------------------------------------------------------- | |
| # Developer agent | |
| # --------------------------------------------------------------------------- | |
def developer_turn(
    client: OpenAI,
    env_client,  # unused — kept for signature compatibility
    model: str,
    ref_b64: str,
    current_html: str,
    todo: Optional[TodoList] = None,
    dbg=None,
) -> str:
    """Developer generates HTML from the reference screenshot in a single LLM call.

    No tools — rendering is the environment's responsibility after step().
    On subsequent steps the Critic's TODO list is included so the Developer
    knows exactly what to fix.
    """
    if dbg:
        dbg.log_developer_input(current_html, todo.format_for_developer() if todo else None)
    # User message always starts with the reference image.
    parts: list = [
        {"type": "text", "text": "Reference screenshot (reproduce this UI):"},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{ref_b64}"}},
    ]
    # Revision mode requires prior HTML and a non-empty TODO list; otherwise
    # this is a cold-start generation.
    if current_html and todo and todo.items:
        instruction = (
            f"\n\nYour previous HTML:\n```html\n{current_html[:5000]}\n```\n\n"
            f"{todo.format_for_developer()}\n\n"
            "Output the revised HTML only."
        )
    else:
        instruction = "\n\nGenerate complete HTML with inline CSS. Output the HTML only."
    parts.append({"type": "text", "text": instruction})
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": DEVELOPER_SYSTEM},
            {"role": "user", "content": parts},
        ],
        max_tokens=4096,
        temperature=0.7,
    )
    raw = response.choices[0].message.content or ""
    html_out = _clean_html_output(raw)
    # Guard against non-HTML chatter: substitute the known-good fallback page.
    if not _looks_like_html(html_out):
        html_out = FALLBACK_HTML
    if dbg:
        dbg.log_developer_output(html_out)
    return html_out
| # --------------------------------------------------------------------------- | |
| # Critic agent | |
| # --------------------------------------------------------------------------- | |
def critic_turn(
    client: OpenAI,
    model: str,
    ref_b64: str,
    render_curr_b64: str,
    prev_todo: Optional[TodoList],
    render_prev_b64: Optional[str] = None,
    current_html: str = "",
    dbg=None,
) -> Tuple[str, TodoList]:
    """Critic reviews current render vs reference and returns (raw_text, updated TodoList).

    Receives the Developer's HTML source so it can write selector-specific CSS fixes
    instead of abstract visual observations.

    prev_todo being None marks the first review, which uses the comprehensive-audit
    system prompt; later reviews use the update-the-list prompt and get the previous
    TODO list echoed back for exact copying.
    """
    is_first = prev_todo is None
    if dbg:
        prev_critique_text = prev_todo.format_for_developer() if prev_todo else None
        dbg.log_critic_input(ref_b64, render_prev_b64, prev_critique_text, render_curr_b64)
    system = FIRST_CRITIC_SYSTEM if is_first else SUBSEQUENT_CRITIC_SYSTEM
    critic_messages = [{"role": "system", "content": system}]
    content: list = [
        {"type": "text", "text": "Reference screenshot:"},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{ref_b64}"}},
    ]
    # Before/after render pair lets the Critic judge whether the last revision helped.
    if render_prev_b64 and prev_todo:
        content += [
            {"type": "text", "text": "Previous render (before this step's revision):"},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{render_prev_b64}"}},
        ]
    content += [
        {"type": "text", "text": "Current render:"},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{render_curr_b64}"}},
    ]
    # HTML is truncated to 5000 chars to bound prompt size.
    if current_html:
        content.append({
            "type": "text",
            "text": (
                f"\nDeveloper's current HTML source (use exact selectors in your FIX instructions):\n"
                f"```html\n{current_html[:5000]}\n```"
            ),
        })
    if is_first:
        content.append({
            "type": "text",
            "text": (
                "\nThis is the first review. Perform a comprehensive visual audit covering "
                "LAYOUT, STRUCTURE, COLOR, TYPOGRAPHY, SPACING, and TEXT dimensions. "
                "Output your initial TODO LIST with [+] items only. "
                "Each item MUST include a → FIX: instruction with exact CSS."
            ),
        })
    else:
        content.append({
            "type": "text",
            "text": (
                f"\n{prev_todo.format_for_critic()}\n\n"
                "Update the TODO list based on what you see in the CURRENT RENDER and HTML. "
                "Mark fixed items [✓], keep unresolved items [ ] (update FIX selector if HTML changed), "
                "add new issues with [+]. Each item must have a → FIX: instruction. "
                "Stop after the last item — no STATUS or summary line."
            ),
        })
    critic_messages.append({"role": "user", "content": content})
    # Low temperature: the Critic should be consistent, not creative.
    response = client.chat.completions.create(
        model=model,
        messages=critic_messages,
        max_tokens=2048,
        temperature=0.1,
    )
    critique_text = response.choices[0].message.content or ""
    updated_todo = TodoList.parse(critique_text)
    if prev_todo:
        if updated_todo.all_done():
            # Critic explicitly marked every visible item [✓] — trust that signal.
            # Skipping merge avoids re-adding items the Critic intentionally resolved.
            pass
        else:
            updated_todo = TodoList.merge(prev_todo, updated_todo)
    if dbg:
        dbg.log_critic_output(critique_text, updated_todo)
    return critique_text, updated_todo
| # --------------------------------------------------------------------------- | |
| # Episode config | |
| # --------------------------------------------------------------------------- | |
@dataclass
class AgentConfig:
    """Connection and loop settings for one episode run.

    Fix: the ``@dataclass`` decorator was missing — callers construct this with
    keyword arguments (``AgentConfig(api_key=..., api_base=..., model=...)``),
    which requires the generated ``__init__``.
    """

    api_key: str  # API key for the OpenAI-compatible endpoint
    api_base: str  # base URL of the inference server
    model: str  # model name passed to chat.completions
    max_steps: int = 5  # maximum Developer(↔Critic) iterations per episode
| # --------------------------------------------------------------------------- | |
| # Episode runner | |
| # --------------------------------------------------------------------------- | |
@dataclass
class StepResult:
    """Outcome of one environment step, returned by all episode runners.

    Fix: the ``@dataclass`` decorator was missing — every runner builds this
    with keyword arguments (``StepResult(step=..., html=..., ...)``), which
    requires the generated ``__init__``.
    """

    step: int  # 1-based step index within the episode
    html: str  # HTML submitted to the environment this step
    reward: float  # scalar reward returned by the environment
    done: bool  # environment's terminal flag for this step
    critique: str  # raw Critic output text ("" when no Critic ran)
    todo: Optional[TodoList]  # TODO list associated with this step, if any
    render_full_b64: Optional[str]  # base64 PNG of the full-page render, if returned
    sub_rewards: Optional[dict]  # per-dimension reward breakdown, if provided
    error: Optional[str] = None  # truncated exception text if the Developer call failed
def run_episode(
    env_client,
    config: AgentConfig,
    session_id: str,
    ref_b64: str,
    dbg=None,
    on_step=None,  # optional callback(StepResult) → None, called immediately after env step
) -> List[StepResult]:
    """Run one full episode (Developer↔Critic loop) and return per-step results.

    Terminates when:
    - max_steps reached (env done=True)
    - Critic marks all TODO items resolved
    - No reward improvement for 2 consecutive steps (plateau)

    Monotonic reward guarantee: Developer always receives the best-seen HTML as
    its base, so regressions don't compound. If a step produces lower reward the
    Developer retries from the best-known state on the next step.
    """
    client = OpenAI(api_key=config.api_key, base_url=config.api_base)
    current_html = ""
    best_html = ""  # highest-reward HTML seen so far — the Developer's base each step
    best_reward = 0.0
    no_improve_streak = 0  # consecutive steps where reward failed to beat best_reward
    _MAX_NO_IMPROVE = 2
    todo: Optional[TodoList] = None
    render_prev: Optional[str] = None  # previous step's render, shown to the Critic
    results: List[StepResult] = []
    for step_i in range(config.max_steps):
        # Guard: Critic resolved everything
        if todo is not None and todo.pending_count() == 0:
            break
        # Guard: plateau — no improvement for N consecutive steps
        if no_improve_streak >= _MAX_NO_IMPROVE:
            print(
                f"[CRITIC] No improvement for {_MAX_NO_IMPROVE} consecutive steps "
                f"(best={best_reward:.3f}) — stopping early.",
                flush=True,
            )
            break
        error: Optional[str] = None
        # Developer always starts from the best-seen HTML to avoid compounding regressions
        try:
            current_html = developer_turn(
                client, env_client, config.model,
                ref_b64, best_html, todo, dbg,
            )
        except Exception as exc:
            # Developer failure is non-fatal: record it and submit the fallback page.
            error = str(exc)[:120]
            current_html = FALLBACK_HTML
        # Step the environment
        step_resp = env_client.post(
            "/step",
            json={"html": current_html, "session_id": session_id},
        )
        step_resp.raise_for_status()
        result = step_resp.json()
        reward = float(result.get("reward", 0.0))
        env_done = bool(result.get("done", False))
        render_full = result.get("render_full")
        sub_rewards = result.get("metadata", {}).get("rewards")
        # Monotonic tracking — update best only on genuine improvement
        if reward > best_reward:
            best_reward = reward
            best_html = current_html
            no_improve_streak = 0
        else:
            no_improve_streak += 1
        if dbg:
            dbg.log_step_result(reward, env_done, render_full, sub_rewards)
        step_n = step_i + 1
        sr = StepResult(
            step=step_n,
            html=current_html,
            reward=reward,
            done=env_done,
            critique="",
            todo=todo,
            render_full_b64=render_full,
            sub_rewards=sub_rewards,
            error=error,
        )
        # Notify caller immediately so [STEP] prints before [CRITIC]
        if on_step:
            on_step(sr)
        # Critic turn (skip on final env step)
        if not env_done:
            try:
                critique_text, todo = critic_turn(
                    client, config.model,
                    ref_b64, render_full,
                    prev_todo=todo,
                    render_prev_b64=render_prev,
                    current_html=current_html,
                    dbg=dbg,
                )
                # Attach Critic output to the already-emitted StepResult (sr is
                # the same object the on_step callback received).
                sr.critique = critique_text
                sr.todo = todo
                preview = critique_text.replace("\n", " ")[:200]
                print(
                    f"[CRITIC] step={step_n} reward={reward:.3f} best={best_reward:.3f} → {preview}",
                    flush=True,
                )
            except Exception as exc:
                # A failed Critic resets the loop to first-review mode next step.
                logger.warning("Critic failed: %s", exc)
                todo = None
        results.append(sr)
        render_prev = render_full
        if env_done:
            break
        if todo is not None and todo.pending_count() == 0:
            print(
                f"[CRITIC] All items resolved at step={step_n} reward={reward:.3f} — stopping.",
                flush=True,
            )
            break
    return results
| # --------------------------------------------------------------------------- | |
| # Approach B: Long-horizon Developer (no Critic, sees full history) | |
| # --------------------------------------------------------------------------- | |
def developer_turn_long_horizon(
    client: OpenAI,
    model: str,
    ref_b64: str,
    history: List[Tuple[str, str]],  # list of (render_full_b64, html)
    dbg=None,
) -> str:
    """Developer with full history: reference + all previous renders + all previous HTML."""
    # Reference image always leads the user message.
    parts: list = [
        {"type": "text", "text": "Reference screenshot (reproduce this UI):"},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{ref_b64}"}},
    ]
    # Replay every prior attempt: render image followed by its (truncated) HTML.
    for attempt_n, (render_b64, attempt_html) in enumerate(history, 1):
        parts.append({
            "type": "text",
            "text": f"\n\nStep {attempt_n} render:",
        })
        parts.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{render_b64}"},
        })
        parts.append({
            "type": "text",
            "text": f"Step {attempt_n} HTML:\n```html\n{attempt_html[:2000]}\n```",
        })
    if history:
        closing = (
            "\n\nAll your previous attempts are shown above. "
            "Generate improved HTML that better matches the reference. "
            "Output the HTML only."
        )
    else:
        closing = "\n\nGenerate complete HTML with inline CSS. Output the HTML only."
    parts.append({"type": "text", "text": closing})
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": DEVELOPER_SYSTEM},
            {"role": "user", "content": parts},
        ],
        max_tokens=4096,
        temperature=0.7,
    )
    html_out = _clean_html_output(response.choices[0].message.content or "")
    # Non-HTML chatter falls back to the known-good page.
    return html_out if _looks_like_html(html_out) else FALLBACK_HTML
def run_episode_long_dev(
    env_client,
    config: AgentConfig,
    session_id: str,
    ref_b64: str,
    dbg=None,
    on_step=None,  # optional callback(StepResult) → None, called right after env step
) -> List[StepResult]:
    """Approach B: Long-horizon Developer only — full history, no Critic.

    Each step the Developer sees the reference plus every previous
    (render, html) pair. Terminates when the environment reports done
    or max_steps is reached.
    """
    client = OpenAI(api_key=config.api_key, base_url=config.api_base)
    current_html = ""
    history: List[Tuple[str, str]] = []  # (render_full_b64, html) for each completed step
    results: List[StepResult] = []
    for step_i in range(config.max_steps):
        error: Optional[str] = None
        try:
            current_html = developer_turn_long_horizon(
                client, config.model, ref_b64, history, dbg
            )
        except Exception as exc:
            # Developer failure is non-fatal: record it and submit the fallback page.
            error = str(exc)[:120]
            current_html = FALLBACK_HTML
        # Step the environment with the generated HTML.
        step_resp = env_client.post(
            "/step",
            json={"html": current_html, "session_id": session_id},
        )
        step_resp.raise_for_status()
        result = step_resp.json()
        reward = float(result.get("reward", 0.0))
        env_done = bool(result.get("done", False))
        render_full = result.get("render_full")
        sub_rewards = result.get("metadata", {}).get("rewards")
        step_n = step_i + 1
        sr = StepResult(
            step=step_n,
            html=current_html,
            reward=reward,
            done=env_done,
            critique="",
            todo=None,
            render_full_b64=render_full,
            sub_rewards=sub_rewards,
            error=error,
        )
        if on_step:
            on_step(sr)
        # Only renders that actually came back extend the Developer's history.
        if render_full:
            history.append((render_full, current_html))
        results.append(sr)
        if env_done:
            break
    return results
| # --------------------------------------------------------------------------- | |
| # Approach C: Short-horizon Developer (no Critic, sees only last render) | |
| # --------------------------------------------------------------------------- | |
def developer_turn_short_horizon(
    client: OpenAI,
    model: str,
    ref_b64: str,
    prev_render_b64: Optional[str],
    prev_html: Optional[str],
    dbg=None,
) -> str:
    """Developer with short horizon: reference + only last render + only last HTML."""
    parts: list = [
        {"type": "text", "text": "Reference screenshot (reproduce this UI):"},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{ref_b64}"}},
    ]
    # Revision mode requires both the previous render and its HTML.
    if prev_render_b64 and prev_html:
        parts.append({"type": "text", "text": "\n\nYour previous render:"})
        parts.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{prev_render_b64}"}})
        parts.append({
            "type": "text",
            "text": (
                f"\n\nYour previous HTML:\n```html\n{prev_html[:3000]}\n```\n\n"
                "Compare the renders and output improved HTML only."
            ),
        })
    else:
        parts.append({
            "type": "text",
            "text": "\n\nGenerate complete HTML with inline CSS. Output the HTML only.",
        })
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": DEVELOPER_SYSTEM},
            {"role": "user", "content": parts},
        ],
        max_tokens=4096,
        temperature=0.7,
    )
    html_out = _clean_html_output(response.choices[0].message.content or "")
    # Non-HTML chatter falls back to the known-good page.
    return html_out if _looks_like_html(html_out) else FALLBACK_HTML
def run_episode_short_dev(
    env_client,
    config: AgentConfig,
    session_id: str,
    ref_b64: str,
    dbg=None,
    on_step=None,  # optional callback(StepResult) → None, called right after env step
) -> List[StepResult]:
    """Approach C: Short-horizon Developer only — sees only last render each step, no Critic."""
    client = OpenAI(api_key=config.api_key, base_url=config.api_base)
    current_html = ""
    prev_render: Optional[str] = None  # only the most recent render is retained
    results: List[StepResult] = []
    for step_i in range(config.max_steps):
        error: Optional[str] = None
        try:
            current_html = developer_turn_short_horizon(
                client, config.model, ref_b64,
                prev_render,
                # First step has no previous HTML to compare against.
                current_html if step_i > 0 else None,
                dbg,
            )
        except Exception as exc:
            # Developer failure is non-fatal: record it and submit the fallback page.
            error = str(exc)[:120]
            current_html = FALLBACK_HTML
        # Step the environment with the generated HTML.
        step_resp = env_client.post(
            "/step",
            json={"html": current_html, "session_id": session_id},
        )
        step_resp.raise_for_status()
        result = step_resp.json()
        reward = float(result.get("reward", 0.0))
        env_done = bool(result.get("done", False))
        render_full = result.get("render_full")
        sub_rewards = result.get("metadata", {}).get("rewards")
        step_n = step_i + 1
        sr = StepResult(
            step=step_n,
            html=current_html,
            reward=reward,
            done=env_done,
            critique="",
            todo=None,
            render_full_b64=render_full,
            sub_rewards=sub_rewards,
            error=error,
        )
        if on_step:
            on_step(sr)
        prev_render = render_full  # only keep the latest render
        results.append(sr)
        if env_done:
            break
    return results
| # --------------------------------------------------------------------------- | |
| # Approach D: Long-horizon Developer (low-res renders) + simple free-form Critic | |
| # --------------------------------------------------------------------------- | |
# System prompt for Approach D's free-form critic — no structured TODO format,
# just a concise description of the biggest mismatch.
_SIMPLE_CRITIC_SYSTEM = (
    "You are a UI reviewer. You will be shown a reference screenshot and a current render "
    "of HTML that is meant to reproduce it.\n\n"
    "Describe what needs to change the most to make the render match the reference. "
    "Be concise and specific — mention exact colors, sizes, or elements where helpful. "
    "You can write a short paragraph or a bullet list. No structured format required."
)
# System prompt for Approach D's developer — plain UI-to-code instructions with
# hard layout rules baked in, independent of the prompts in openenv.prompts.
_SIMPLE_DEV_SYSTEM = (
    "You are a UI-to-code expert. Given a reference screenshot of a web page, "
    "generate complete HTML with inline CSS that reproduces the layout as accurately as possible.\n\n"
    "Critical layout rules:\n"
    "- Always use `* { box-sizing: border-box; margin: 0; padding: 0; }` reset.\n"
    "- Page and all top-level sections must be full-width: `width: 100%; min-height: 100vh`.\n"
    "- Never center-constrain the overall page — only constrain inner content containers if the reference does.\n"
    "- Match background colors, section colors, and typography as precisely as possible.\n\n"
    "Output ONLY the raw HTML code starting with <!DOCTYPE html>. "
    "No explanations, no markdown fences — just the HTML."
)
def _simple_critic_turn(
    client: OpenAI,
    model: str,
    ref_b64: str,
    render_full_b64: str,
) -> str:
    """Simple free-form critic: compare ref vs render, say what needs to change most."""
    user_parts = [
        {"type": "text", "text": "Reference:"},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{ref_b64}"}},
        {"type": "text", "text": "Current render:"},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{render_full_b64}"}},
        {"type": "text", "text": "What needs to change the most?"},
    ]
    # Low temperature + small budget: short, consistent feedback.
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": _SIMPLE_CRITIC_SYSTEM},
            {"role": "user", "content": user_parts},
        ],
        max_tokens=512,
        temperature=0.1,
    )
    return response.choices[0].message.content or ""
def _developer_turn_d(
    client: OpenAI,
    model: str,
    ref_b64: str,
    history: List[Tuple[str, str]],  # (render_low_b64, html)
    critique: Optional[str],
) -> str:
    """Approach D developer: full-res ref + all previous low-res renders + all HTML + critic feedback."""
    parts: list = [
        {"type": "text", "text": "Reference screenshot (reproduce this UI):"},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{ref_b64}"}},
    ]
    # Replay prior attempts as low-res preview + truncated HTML pairs.
    for step_n, (render_low_b64, attempt_html) in enumerate(history, 1):
        parts.append({"type": "text", "text": f"\n\nStep {step_n} render (low-res preview):"})
        parts.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{render_low_b64}"}})
        parts.append({"type": "text", "text": f"Step {step_n} HTML:\n```html\n{attempt_html[:2000]}\n```"})
    # Closing instruction: prefer critic feedback, then generic improve, then cold start.
    if critique:
        closing = f"\n\nReviewer feedback on your last render:\n{critique}\n\nGenerate improved HTML addressing this feedback. Output the HTML only."
    elif history:
        closing = "\n\nGenerate improved HTML that better matches the reference. Output the HTML only."
    else:
        closing = "\n\nGenerate complete HTML with inline CSS. Output the HTML only."
    parts.append({"type": "text", "text": closing})
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": _SIMPLE_DEV_SYSTEM},
            {"role": "user", "content": parts},
        ],
        max_tokens=4096,
        temperature=0.7,
    )
    html_out = _clean_html_output(response.choices[0].message.content or "")
    # Non-HTML chatter falls back to the known-good page.
    return html_out if _looks_like_html(html_out) else FALLBACK_HTML
def run_episode_d(
    env_client,
    config: AgentConfig,
    session_id: str,
    ref_b64: str,
    dbg=None,
    on_step=None,  # optional callback(StepResult) → None, called right after env step
) -> List[StepResult]:
    """Approach D: long-horizon dev (low-res renders) + simple free-form critic.

    The Developer replays low-res renders of all prior attempts plus the latest
    free-form critique; the critic runs after every non-final step.
    """
    client = OpenAI(api_key=config.api_key, base_url=config.api_base)
    current_html = ""
    history: List[Tuple[str, str]] = []  # (render_low_b64, html)
    critique: Optional[str] = None  # latest free-form critic feedback, if any
    results: List[StepResult] = []
    for step_i in range(config.max_steps):
        error: Optional[str] = None
        try:
            current_html = _developer_turn_d(
                client, config.model, ref_b64, history, critique
            )
        except Exception as exc:
            # Developer failure is non-fatal: record it and submit the fallback page.
            error = str(exc)[:120]
            current_html = FALLBACK_HTML
        # Step the environment with the generated HTML.
        step_resp = env_client.post(
            "/step",
            json={"html": current_html, "session_id": session_id},
        )
        step_resp.raise_for_status()
        result = step_resp.json()
        reward = float(result.get("reward", 0.0))
        env_done = bool(result.get("done", False))
        render_full = result.get("render_full")
        render_low = result.get("render_low")
        sub_rewards = result.get("metadata", {}).get("rewards")
        step_n = step_i + 1
        sr = StepResult(
            step=step_n,
            html=current_html,
            reward=reward,
            done=env_done,
            # The critique shown THIS step (produced at the end of the previous step).
            critique=critique or "",
            todo=None,
            render_full_b64=render_full,
            sub_rewards=sub_rewards,
            error=error,
        )
        if on_step:
            on_step(sr)
        # Developer history uses the low-res render to keep prompts small.
        if render_low:
            history.append((render_low, current_html))
        # Critic turn (skip on final env step)
        if not env_done and render_full:
            try:
                critique = _simple_critic_turn(client, config.model, ref_b64, render_full)
                preview = critique.replace("\n", " ")[:200]
                print(f"[CRITIC-D] step={step_n} reward={reward:.2f} → {preview}", flush=True)
            except Exception as exc:
                # A failed critic just means no feedback next step.
                logger.warning("Critic-D failed: %s", exc)
                critique = None
        results.append(sr)
        if env_done:
            break
    return results