Spaces:

lapa-llm
/

lapa

Running on Zero

Vladyslav Humennyy Claude committed on Oct 3

Commit

14729c6

1 Parent(s): a113c8a

Fix image handling for Gradio compatibility

- Store images as file paths for Gradio display (type: "image")
- Keep base64 in _base64 metadata for model processing
- Clean metadata before displaying to avoid validation errors
- Combine text and image in single message structure
- Properly convert base64 to PIL images for processor

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (1) hide show

app.py +56 -16

app.py CHANGED Viewed

@@ -61,7 +61,7 @@ model, tokenizer, processor, device = load_model()
 def user(user_message, image_data, history: list):
-    """Format user message with optional image (like app_chat_vllm.py)."""
     import base64
     import io
     from PIL import Image
@@ -72,22 +72,30 @@ def user(user_message, image_data, history: list):
     stripped_message = user_message.strip()
-    # Format message with image in base64 format (matching app_chat_vllm.py)
     if image_data is not None:
-        # Convert PIL image to base64
         buffered = io.BytesIO()
         image_data.save(buffered, format="JPEG")
         img_base64 = base64.b64encode(buffered.getvalue()).decode()
         text_content = stripped_message if stripped_message else "Describe this image"
         updated_history.append({
             "role": "user",
             "content": [
                 {"type": "text", "text": text_content},
                 {
-                    "type": "image_url",
-                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                 },
             ],
         })
@@ -126,6 +134,33 @@ def _extract_text_from_content(content: Any) -> str:
     return str(content)
 @spaces.GPU
 def bot(
     history: list[dict[str, Any]]
@@ -147,7 +182,7 @@ def bot(
     # Check if any message contains images
     has_images = any(
         isinstance(msg.get("content"), list) and
-        any(item.get("type") == "image_url" for item in msg.get("content") if isinstance(item, dict))
         for msg in history
     )
@@ -156,8 +191,7 @@ def bot(
     # Use processor if images are present
     if processor is not None and has_images:
         try:
-            # Processor expects messages with PIL images, not base64
-            # We need to convert base64 back to PIL for the processor
             from PIL import Image
             import base64
             import io
@@ -175,13 +209,19 @@ def bot(
                         if isinstance(item, dict):
                             if item.get("type") == "text":
                                 formatted_content.append({"type": "text", "text": item.get("text", "")})
-                            elif item.get("type") == "image_url":
-                                # Extract base64 and convert to PIL
-                                img_url = item.get("image_url", {}).get("url", "")
-                                if img_url.startswith("data:image"):
-                                    base64_data = img_url.split(",")[1]
-                                    img_data = base64.b64decode(base64_data)
-                                    pil_image = Image.open(io.BytesIO(img_data))
                                     formatted_content.append({"type": "image", "image": pil_image})
                     if formatted_content:
                         processor_history.append({"role": role, "content": formatted_content})
@@ -241,7 +281,7 @@ def bot(
     # Yield tokens as they come in
     for new_text in streamer:
         history[-1]["content"] += new_text
-        yield history
     assistant_message = history[-1]["content"]
     logger.log_interaction(user=user_message_text, answer=assistant_message)

 def user(user_message, image_data, history: list):
+    """Format user message with optional image."""
     import base64
     import io
     from PIL import Image
     stripped_message = user_message.strip()
+    # If we have an image, save it to temp file for Gradio display and also encode as base64 for model
     if image_data is not None:
+        # Save to temp file for Gradio display
+        fd, tmp_path = tempfile.mkstemp(suffix=".jpg")
+        os.close(fd)
+        image_data.save(tmp_path, format="JPEG")
+        # Also encode as base64 for model processing (stored in metadata)
         buffered = io.BytesIO()
         image_data.save(buffered, format="JPEG")
         img_base64 = base64.b64encode(buffered.getvalue()).decode()
         text_content = stripped_message if stripped_message else "Describe this image"
+        # Store both text and image in a single message with base64 in metadata
         updated_history.append({
             "role": "user",
             "content": [
                 {"type": "text", "text": text_content},
                 {
+                    "type": "image",
+                    "path": tmp_path,
+                    "alt_text": "User uploaded image",
+                    "_base64": f"data:image/jpeg;base64,{img_base64}",  # Store base64 for model
                 },
             ],
         })
     return str(content)
+def _clean_history_for_display(history: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Copy history for Gradio display, dropping "_"-prefixed item keys such as _base64."""
+    cleaned = []
+    for message in history:
+        cleaned_message = {"role": message.get("role", "user")}  # only role and content are carried over
+        content = message.get("content")
+        if isinstance(content, str):
+            cleaned_message["content"] = content
+        elif isinstance(content, list):
+            cleaned_content = []
+            for item in content:
+                if isinstance(item, dict):
+                    # Drop every underscore-prefixed key (e.g. _base64); the input item is not mutated
+                    cleaned_item = {k: v for k, v in item.items() if not k.startswith("_")}
+                    cleaned_content.append(cleaned_item)
+                else:
+                    cleaned_content.append(item)
+            cleaned_message["content"] = cleaned_content
+        else:
+            cleaned_message["content"] = content
+        cleaned.append(cleaned_message)
+    return cleaned
 @spaces.GPU
 def bot(
     history: list[dict[str, Any]]
     # Check if any message contains images
     has_images = any(
         isinstance(msg.get("content"), list) and
+        any(item.get("type") == "image" for item in msg.get("content") if isinstance(item, dict))
         for msg in history
     )
     # Use processor if images are present
     if processor is not None and has_images:
         try:
+            # Processor expects messages with PIL images
             from PIL import Image
             import base64
             import io
                         if isinstance(item, dict):
                             if item.get("type") == "text":
                                 formatted_content.append({"type": "text", "text": item.get("text", "")})
+                            elif item.get("type") == "image":
+                                # Use _base64 metadata if available, otherwise load from path
+                                pil_image = None
+                                if "_base64" in item:
+                                    img_url = item["_base64"]
+                                    if img_url.startswith("data:image"):
+                                        base64_data = img_url.split(",")[1]
+                                        img_data = base64.b64decode(base64_data)
+                                        pil_image = Image.open(io.BytesIO(img_data))
+                                elif "path" in item:
+                                    pil_image = Image.open(item["path"])
+                                if pil_image is not None:
                                     formatted_content.append({"type": "image", "image": pil_image})
                     if formatted_content:
                         processor_history.append({"role": role, "content": formatted_content})
     # Yield tokens as they come in
     for new_text in streamer:
         history[-1]["content"] += new_text
+        yield _clean_history_for_display(history)
     assistant_message = history[-1]["content"]
     logger.log_interaction(user=user_message_text, answer=assistant_message)