yhzheng1031 committed on
Commit
e6f4169
·
verified ·
1 Parent(s): 4da1734

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +60 -64
README.md CHANGED
@@ -22,20 +22,18 @@ pip install transformers==4.57.0
22
  ```
23
 
24
  ```python
25
- import os
26
- import re
27
- import json
28
- import math
29
-
30
  import torch
31
- from PIL import Image, ImageDraw
32
  from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
33
- from playwright.sync_api import sync_playwright
34
 
35
  from prompt_builder import SYSTEM_PROMPT, build_user_prompt
36
  from visual_hint import build_visual_hint
37
- from render_utils import render_html_to_image, save_demo_outputs
 
38
 
 
 
 
39
 
40
  MODEL_NAME = "GD-ML/Code2World"
41
 
@@ -45,34 +43,25 @@ model = Qwen3VLForConditionalGeneration.from_pretrained(
45
  attn_implementation="flash_attention_2",
46
  device_map="auto",
47
  )
48
- processor = AutoProcessor.from_pretrained(MODEL_NAME)
49
-
50
-
51
 
52
- def extract_clean_html(text: str) -> str:
53
- text = text.replace("```html", "").replace("```", "")
54
- start_match = re.search(r"<!DOCTYPE html>", text, re.IGNORECASE)
55
- end_match = re.search(r"</html>", text, re.IGNORECASE)
56
-
57
- if start_match and end_match:
58
- start_idx = start_match.start()
59
- end_idx = end_match.end()
60
- if end_idx > start_idx:
61
- return text[start_idx:end_idx]
62
-
63
- return text.strip()
64
 
65
 
 
 
 
66
 
67
- def build_messages(image: Image.Image, instruction: str, action: dict, semantic_desc=None):
68
  user_prompt = build_user_prompt(
69
  instruction_str=instruction,
70
  action=action,
71
- semantic_desc=semantic_desc,
72
  )
73
 
74
- return [
75
- {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
 
 
 
76
  {
77
  "role": "user",
78
  "content": [
@@ -81,15 +70,15 @@ def build_messages(image: Image.Image, instruction: str, action: dict, semantic_
81
  ],
82
  },
83
  ]
 
84
 
85
 
86
  @torch.inference_mode()
87
- def generate_html(image: Image.Image, instruction: str, action: dict, semantic_desc=None, max_new_tokens: int = 8192):
88
  messages = build_messages(
89
  image=image,
90
  instruction=instruction,
91
  action=action,
92
- semantic_desc=semantic_desc,
93
  )
94
 
95
  inputs = processor.apply_chat_template(
@@ -101,9 +90,14 @@ def generate_html(image: Image.Image, instruction: str, action: dict, semantic_d
101
  )
102
  inputs = inputs.to(model.device)
103
 
104
- generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
 
 
 
 
105
  generated_ids_trimmed = [
106
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
 
107
  ]
108
 
109
  output_text = processor.batch_decode(
@@ -112,58 +106,60 @@ def generate_html(image: Image.Image, instruction: str, action: dict, semantic_d
112
  clean_up_tokenization_spaces=False,
113
  )[0]
114
 
115
- return extract_clean_html(output_text)
 
116
 
117
 
 
 
 
 
 
 
 
 
 
 
118
 
119
- def run_demo(
120
- image_path: str,
121
- instruction: str,
122
- action: dict,
123
- step_pam: dict | None = None,
124
- semantic_desc: str | None = None,
125
- use_visual_hint: bool = True,
126
- max_new_tokens: int = 8192,
127
- output_dir: str = "./demo_outputs",
128
- ):
129
  image = Image.open(image_path).convert("RGB")
130
-
131
- if use_visual_hint:
132
- hinted_image = build_visual_hint(image, action, step_pam)
133
- else:
134
- hinted_image = image
135
 
136
  html = generate_html(
137
  image=hinted_image,
138
  instruction=instruction,
139
  action=action,
140
- semantic_desc=semantic_desc,
141
- max_new_tokens=max_new_tokens,
142
  )
143
 
144
  rendered_image = render_html_to_image(html)
145
- save_demo_outputs(output_dir, hinted_image, html, rendered_image)
 
 
 
 
 
 
146
 
147
  return hinted_image, html, rendered_image
148
 
149
 
 
 
 
 
150
  if __name__ == "__main__":
151
- example_action = {
152
- "action_type": "click",
153
- "x": 540,
154
- "y": 320,
155
- }
156
- example_step_pam = {
157
- "coordinate": [540, 320]
 
 
 
158
  }
159
 
160
- run_demo(
161
- image_path="./examples/current.png",
162
- instruction="Tap the search bar to start searching.",
163
- action=example_action,
164
- step_pam=example_step_pam,
165
- output_dir="./demo_outputs",
166
- )
167
  ```
168
 
169
  ## Citation
 
22
  ```
23
 
24
  ```python
 
 
 
 
 
25
  import torch
26
+ from PIL import Image
27
  from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
 
28
 
29
  from prompt_builder import SYSTEM_PROMPT, build_user_prompt
30
  from visual_hint import build_visual_hint
31
+ from render_utils import extract_clean_html, render_html_to_image, save_demo_outputs
32
+
33
 
34
+ # ============================================================
35
+ # 1. Load model
36
+ # ============================================================
37
 
38
  MODEL_NAME = "GD-ML/Code2World"
39
 
 
43
  attn_implementation="flash_attention_2",
44
  device_map="auto",
45
  )
 
 
 
46
 
47
+ processor = AutoProcessor.from_pretrained(MODEL_NAME)
 
 
 
 
 
 
 
 
 
 
 
48
 
49
 
50
+ # ============================================================
51
+ # 2. Helper functions
52
+ # ============================================================
53
 
54
+ def build_messages(image, instruction, action):
55
  user_prompt = build_user_prompt(
56
  instruction_str=instruction,
57
  action=action,
 
58
  )
59
 
60
+ messages = [
61
+ {
62
+ "role": "system",
63
+ "content": [{"type": "text", "text": SYSTEM_PROMPT}],
64
+ },
65
  {
66
  "role": "user",
67
  "content": [
 
70
  ],
71
  },
72
  ]
73
+ return messages
74
 
75
 
76
  @torch.inference_mode()
77
+ def generate_html(image, instruction, action, max_new_tokens=8192):
78
  messages = build_messages(
79
  image=image,
80
  instruction=instruction,
81
  action=action,
 
82
  )
83
 
84
  inputs = processor.apply_chat_template(
 
90
  )
91
  inputs = inputs.to(model.device)
92
 
93
+ generated_ids = model.generate(
94
+ **inputs,
95
+ max_new_tokens=max_new_tokens,
96
+ )
97
+
98
  generated_ids_trimmed = [
99
+ out_ids[len(in_ids):]
100
+ for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
101
  ]
102
 
103
  output_text = processor.batch_decode(
 
106
  clean_up_tokenization_spaces=False,
107
  )[0]
108
 
109
+ html = extract_clean_html(output_text)
110
+ return html
111
 
112
 
113
+ def run_demo(case_data, output_dir="./demo_outputs"):
114
+ """
115
+ case_data only needs these keys:
116
+ - images[0]
117
+ - instruction
118
+ - action
119
+ """
120
+ image_path = case_data["images"][0]
121
+ instruction = case_data["instruction"]
122
+ action = case_data["action"]
123
 
 
 
 
 
 
 
 
 
 
 
124
  image = Image.open(image_path).convert("RGB")
125
+ hinted_image = build_visual_hint(image, action)
 
 
 
 
126
 
127
  html = generate_html(
128
  image=hinted_image,
129
  instruction=instruction,
130
  action=action,
 
 
131
  )
132
 
133
  rendered_image = render_html_to_image(html)
134
+
135
+ save_demo_outputs(
136
+ output_dir=output_dir,
137
+ hinted_image=hinted_image,
138
+ html=html,
139
+ rendered_image=rendered_image,
140
+ )
141
 
142
  return hinted_image, html, rendered_image
143
 
144
 
145
+ # ============================================================
146
+ # 3. Example case
147
+ # ============================================================
148
+
149
  if __name__ == "__main__":
150
+ case_data = {
151
+ "images": [
152
+ "/mnt/workspace/zyh/wm_ability/android_control/test_images_mini/904_7.png"
153
+ ],
154
+ "instruction": "Click on the Search Omio button.",
155
+ "action": {
156
+ "action_type": "click",
157
+ "x": 540,
158
+ "y": 1470
159
+ }
160
  }
161
 
162
+ run_demo(case_data, output_dir="./demo_outputs")
 
 
 
 
 
 
163
  ```
164
 
165
  ## Citation