Qwen3-VL-8B-Instruct

Running on Zero

App Files Files Community

akhaliq HF Staff commited on 23 days ago

Commit

609cf38

verified ·

1 Parent(s): 4913703

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +359 -0

app.py ADDED Viewed

	@@ -0,0 +1,359 @@

+I'll create a chat application using the Qwen3-VL-4B-Instruct model that can handle both text and image inputs. This will be a multimodal chatbot that can analyze images and respond to questions about them.
+```python
+import gradio as gr
+import torch
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+from PIL import Image
+import numpy as np
+from typing import List, Dict, Any, Optional, Tuple
+import spaces
+import io
+import base64
+# Initialize the model and processor
+model_id = "Qwen/Qwen2-VL-2B-Instruct"  # Using 2B version for better performance on Spaces
+# Load model with optimizations for inference
+model = Qwen2VLForConditionalGeneration.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto"
+)
+processor = AutoProcessor.from_pretrained(model_id)
+@spaces.GPU(duration=60)
+def process_chat_message(
+    message: str,
+    image: Optional[Image.Image],
+    history: List[Dict[str, Any]]
+) -> str:
+    """
+    Process a chat message with optional image input using Qwen3-VL model.
+    Args:
+        message: The user's text message
+        image: Optional PIL Image
+        history: Chat history
+    Returns:
+        The model's response
+    """
+    # Prepare the message content
+    content = []
+    # Add image if provided
+    if image is not None:
+        # Convert PIL image to format expected by the model
+        content.append({"type": "image", "image": image})
+    # Add text message
+    if message:
+        content.append({"type": "text", "text": message})
+    # Create the messages format for the model
+    messages = []
+    # Add history if exists (text only for simplicity)
+    for hist_item in history:
+        if hist_item["role"] == "user":
+            messages.append({
+                "role": "user",
+                "content": hist_item.get("content", "")
+            })
+        elif hist_item["role"] == "assistant":
+            messages.append({
+                "role": "assistant",
+                "content": hist_item.get("content", "")
+            })
+    # Add current message
+    if content:
+        messages.append({
+            "role": "user",
+            "content": content
+        })
+    # Prepare inputs for the model
+    text = processor.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    if image is not None:
+        inputs = processor(
+            text=[text],
+            images=[image],
+            return_tensors="pt"
+        ).to(model.device)
+    else:
+        inputs = processor(
+            text=[text],
+            return_tensors="pt"
+        ).to(model.device)
+    # Generate response
+    with torch.no_grad():
+        generated_ids = model.generate(
+            **inputs,
+            max_new_tokens=512,
+            temperature=0.7,
+            do_sample=True,
+            top_p=0.95
+        )
+    # Decode the generated response
+    generated_ids_trimmed = [
+        out_ids[len(in_ids):]
+        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    response = processor.batch_decode(
+        generated_ids_trimmed,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False
+    )[0]
+    return response
+def chat_fn(message: Dict[str, Any], history: List[List[Any]]) -> Tuple[str, List[List[Any]]]:
+    """
+    Main chat function that processes user input and returns response.
+    Args:
+        message: Dictionary containing text and optional files
+        history: Chat history as list of [user_msg, assistant_msg] pairs
+    Returns:
+        Empty string and updated history
+    """
+    text = message.get("text", "")
+    files = message.get("files", [])
+    # Process image if provided
+    image = None
+    if files and len(files) > 0:
+        try:
+            image = Image.open(files[0])
+            # Convert RGBA to RGB if necessary
+            if image.mode == "RGBA":
+                background = Image.new("RGB", image.size, (255, 255, 255))
+                background.paste(image, mask=image.split()[3])
+                image = background
+        except Exception as e:
+            print(f"Error loading image: {e}")
+            image = None
+    # Convert history to format expected by model
+    model_history = []
+    for user_msg, assistant_msg in history:
+        if isinstance(user_msg, dict):
+            model_history.append({"role": "user", "content": user_msg.get("text", "")})
+        elif isinstance(user_msg, str):
+            model_history.append({"role": "user", "content": user_msg})
+        if assistant_msg:
+            model_history.append({"role": "assistant", "content": assistant_msg})
+    # Get response from model
+    try:
+        response = process_chat_message(text, image, model_history)
+    except Exception as e:
+        response = f"Sorry, I encountered an error: {str(e)}"
+    # Update history
+    if image is not None:
+        # Store message with image indicator
+        user_message = {"text": text, "image": "[Image uploaded]"}
+    else:
+        user_message = text
+    history.append([user_message, response])
+    return "", history
+def retry_fn(history: List[List[Any]]) -> Tuple[str, List[List[Any]]]:
+    """Retry the last message."""
+    if not history:
+        return "", history
+    # Remove last assistant response and regenerate
+    last_user_msg = history[-1][0]
+    history = history[:-1]
+    # Recreate the message dict
+    if isinstance(last_user_msg, dict):
+        message = {"text": last_user_msg.get("text", "")}
+    else:
+        message = {"text": last_user_msg}
+    return chat_fn(message, history)
+def undo_fn(history: List[List[Any]]) -> List[List[Any]]:
+    """Undo the last message."""
+    if history:
+        return history[:-1]
+    return history
+def clear_fn() -> Tuple[None, List]:
+    """Clear the chat."""
+    return None, []
+# Create the Gradio interface
+with gr.Blocks(theme=gr.themes.Soft(), fill_height=True) as demo:
+    gr.Markdown(
+        """
+        # 🌟 Qwen3-VL Multimodal Chat
+        Chat with Qwen3-VL - A powerful vision-language model that can understand and discuss images!
+        **Features:**
+        - 📝 Text conversations
+        - 🖼️ Image understanding and analysis
+        - 🎨 Visual question answering
+        - 🔍 Detailed image descriptions
+        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown(
+                """
+                ### 💡 Tips:
+                - Upload an image and ask questions about it
+                - Try asking for detailed descriptions
+                - Ask about objects, colors, text in images
+                - Compare elements within the image
+                """
+            )
+            gr.Markdown(
+                """
+                ### 📸 Example Prompts:
+                - "What's in this image?"
+                - "Describe this scene in detail"
+                - "What text can you see?"
+                - "Count the objects in the image"
+                - "What's the mood of this image?"
+                """
+            )
+        with gr.Column(scale=3):
+            chatbot = gr.Chatbot(
+                label="Chat",
+                type="messages",
+                height=500,
+                show_copy_button=True,
+                bubble_full_width=False,
+                avatar_images=[None, "🤖"]
+            )
+            with gr.Row():
+                msg = gr.MultimodalTextbox(
+                    label="Message",
+                    placeholder="Type a message or upload an image...",
+                    file_types=["image"],
+                    submit_btn=True,
+                    stop_btn=False
+                )
+            with gr.Row():
+                retry_btn = gr.Button("🔄 Retry", variant="secondary", size="sm")
+                undo_btn = gr.Button("↩️ Undo", variant="secondary", size="sm")
+                clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="sm")
+    with gr.Accordion("⚙️ Advanced Settings", open=False):
+        gr.Markdown(
+            """
+            **Model Information:**
+            - Model: Qwen3-VL-4B-Instruct
+            - Optimized for vision-language tasks
+            - Supports multiple languages
+            - Best performance with clear, well-lit images
+            """
+        )
+    # Set up event handlers
+    msg.submit(
+        chat_fn,
+        inputs=[msg, chatbot],
+        outputs=[msg, chatbot],
+        queue=True
+    )
+    retry_btn.click(
+        retry_fn,
+        inputs=[chatbot],
+        outputs=[msg, chatbot],
+        queue=True
+    )
+    undo_btn.click(
+        undo_fn,
+        inputs=[chatbot],
+        outputs=[chatbot],
+        queue=False
+    )
+    clear_btn.click(
+        clear_fn,
+        outputs=[msg, chatbot],
+        queue=False
+    )
+    # Add examples
+    gr.Examples(
+        examples=[
+            {"text": "Hello! What can you help me with today?"},
+            {"text": "Can you describe an image if I upload one?"},
+            {"text": "What are your capabilities?"},
+        ],
+        inputs=msg,
+        label="Example Messages"
+    )
+if __name__ == "__main__":
+    demo.launch(
+        show_error=True,
+        share=False,
+        debug=True
+    )
+```
+Now let's create the requirements.txt file:
+```
+gradio
+transformers
+torch
+torchvision
+spaces
+Pillow
+numpy
+accelerate
+sentencepiece
+einops
+transformers_stream_generator
+```
+This application creates a multimodal chat interface with the following features:
+1. **Multimodal Input**: Users can send text messages, images, or both
+2. **Vision-Language Understanding**: The Qwen3-VL model can analyze images and answer questions about them
+3. **Chat History**: Maintains conversation context
+4. **Interactive Controls**: Retry, undo, and clear buttons for better user experience
+5. **GPU Optimization**: Uses the @spaces.GPU decorator for efficient inference
+6. **Clean UI**: Professional interface with helpful tips and examples
+The app can:
+- Describe images in detail
+- Answer questions about image content
+- Count objects in images
+- Read text from images
+- Discuss colors, composition, and mood
+- Maintain conversational context
+The interface is user-friendly with a clean design and provides guidance on how to use the multimodal capabilities effectively.