dung-vpt-uney committed
Commit 58fe08c · 1 Parent(s): 9c4a163

Deploy CoRGI demo - 2025-10-29 14:17:23

Features:
- Structured reasoning with CoRGI protocol
- ROI extraction using Qwen3-VL grounding
- Visual evidence synthesis
- Gradio UI with per-step visualization

Model: Qwen/Qwen3-VL-4B-Instruct

README.md CHANGED
@@ -12,7 +12,7 @@ license: apache-2.0
 
 # CoRGI Qwen3-VL Demo
 
-This Space showcases the CoRGI reasoning pipeline powered entirely by **Qwen/Qwen3-VL-8B-Thinking**.
+This Space showcases the CoRGI reasoning pipeline powered entirely by **Qwen/Qwen3-VL-4B-Instruct**.
 Upload an image, ask a visual question, and the app will:
 
 1. Generate structured reasoning steps with visual-verification flags.
@@ -24,7 +24,7 @@ Upload an image, ask a visual question, and the app will:
 ```bash
 pip install -r requirements.txt
 python examples/demo_qwen_corgi.py \
-  --model-id Qwen/Qwen3-VL-8B-Thinking \
+  --model-id Qwen/Qwen3-VL-4B-Instruct \
   --max-steps 3 \
   --max-regions 3
 ```
@@ -37,9 +37,11 @@ python app.py
 
 ## Configuration Notes
 
-- The Space queues requests sequentially on `cpu-basic` (ZeroGPU) hardware.
-- Set the `CORGI_QWEN_MODEL` environment variable to try another Qwen3-VL checkpoint (for example, `Qwen/Qwen3-VL-4B-Instruct`).
-- `max_steps` and `max_regions` sliders control how many reasoning steps and ROI candidates the model returns.
+- **Model**: Uses `Qwen/Qwen3-VL-4B-Instruct` (4B parameters, ~8GB VRAM)
+- **Single GPU**: Model loads on single GPU (cuda:0) to avoid memory fragmentation
+- **Hardware**: The Space runs on `cpu-basic` tier by default
+- **Customization**: Set `CORGI_QWEN_MODEL` environment variable to use a different checkpoint
+- **Sliders**: `max_steps` and `max_regions` control reasoning depth and ROI candidates
 
 ## UI Overview
 
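For readers who want to script the `CORGI_QWEN_MODEL` override described in the Configuration Notes, here is a minimal sketch (not part of the repo) that reads the variable and builds the `QwenGenerationConfig` dataclass defined in `corgi/qwen_client.py`; the explicit `os.environ` lookup is only illustrative.

```python
# Minimal sketch (not from the repo): pick the checkpoint the way the README
# describes, falling back to the new default when CORGI_QWEN_MODEL is unset.
import os

from corgi.qwen_client import QwenGenerationConfig

model_id = os.environ.get("CORGI_QWEN_MODEL", "Qwen/Qwen3-VL-4B-Instruct")
config = QwenGenerationConfig(model_id=model_id, max_new_tokens=512)
print(config.model_id)
```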
 
corgi/__pycache__/cli.cpython-312.pyc CHANGED
Binary files a/corgi/__pycache__/cli.cpython-312.pyc and b/corgi/__pycache__/cli.cpython-312.pyc differ
 
corgi/__pycache__/gradio_app.cpython-312.pyc CHANGED
Binary files a/corgi/__pycache__/gradio_app.cpython-312.pyc and b/corgi/__pycache__/gradio_app.cpython-312.pyc differ
 
corgi/__pycache__/parsers.cpython-312.pyc CHANGED
Binary files a/corgi/__pycache__/parsers.cpython-312.pyc and b/corgi/__pycache__/parsers.cpython-312.pyc differ
 
corgi/__pycache__/pipeline.cpython-312.pyc CHANGED
Binary files a/corgi/__pycache__/pipeline.cpython-312.pyc and b/corgi/__pycache__/pipeline.cpython-312.pyc differ
 
corgi/__pycache__/qwen_client.cpython-312.pyc CHANGED
Binary files a/corgi/__pycache__/qwen_client.cpython-312.pyc and b/corgi/__pycache__/qwen_client.cpython-312.pyc differ
 
corgi/__pycache__/types.cpython-312.pyc CHANGED
Binary files a/corgi/__pycache__/types.cpython-312.pyc and b/corgi/__pycache__/types.cpython-312.pyc differ
 
corgi/parsers.py CHANGED
@@ -127,13 +127,20 @@ def _normalize_step_markers(text: str) -> str:
 
 
 def _extract_statement(body: str) -> str | None:
-    statement_match = re.search(r"statement\s*[:\-]\s*(.+)", body, re.IGNORECASE)
-    candidate = statement_match.group(1) if statement_match else body
-    # Remove trailing sections that describe vision or reason metadata.
-    candidate = re.split(r"(?i)needs\s*vision|reason\s*[:\-]", candidate)[0]
-    candidate = candidate.strip().strip(".")
-    if not candidate:
+    statement_match = re.search(r"statement\s*[:\-]\s*(.+?)(?=\s*(?:needs\s*vision|reason\s*[:\-]|$))", body, re.IGNORECASE | re.DOTALL)
+    if statement_match:
+        candidate = statement_match.group(1)
+    else:
+        # Fallback: take first sentence or line before metadata
+        candidate = re.split(r"(?i)needs\s*vision|reason\s*[:\-]", body)[0]
+
+    # Clean up the candidate
+    candidate = candidate.strip().rstrip(".,;:")
+
+    # If still empty or too short, return None
+    if not candidate or len(candidate) < 5:
         return None
+
     return _clean_sentence(candidate)
 
 
146
 
corgi/qwen_client.py CHANGED
@@ -2,11 +2,14 @@ from __future__ import annotations
 
 from dataclasses import dataclass
 from typing import List, Optional
+import logging
 
 import torch
 from PIL import Image
 from transformers import AutoModelForImageTextToText, AutoProcessor
 
+logger = logging.getLogger(__name__)
+
 try:
     import spaces  # type: ignore
 except ImportError:  # pragma: no cover - only available on HF Spaces
@@ -98,13 +101,25 @@ def _ensure_cuda(model: AutoModelForImageTextToText) -> AutoModelForImageTextToT
 
 def _load_backend(model_id: str) -> tuple[AutoModelForImageTextToText, AutoProcessor]:
     if model_id not in _MODEL_CACHE:
-        torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+        # Check if hardware supports bfloat16
+        if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
+            torch_dtype = torch.bfloat16
+            logger.info("Using bfloat16 (hardware supported)")
+        elif torch.cuda.is_available():
+            torch_dtype = torch.float16  # Fallback to float16 if bfloat16 not supported
+            logger.info("Using float16 (bfloat16 not supported on this GPU)")
+        else:
+            torch_dtype = torch.float32
+            logger.info("Using float32 (CPU mode)")
+
+        # Use single GPU (cuda:0) instead of auto to avoid model sharding across multiple GPUs
+        device_map = "cuda:0" if torch.cuda.is_available() else "cpu"
         model = AutoModelForImageTextToText.from_pretrained(
             model_id,
             torch_dtype=torch_dtype,
-            device_map="auto",
+            device_map=device_map,
         )
-        model = _ensure_cuda(model).eval()
+        model = model.eval()
         processor = AutoProcessor.from_pretrained(model_id)
         _MODEL_CACHE[model_id] = model
         _PROCESSOR_CACHE[model_id] = processor
@@ -113,7 +128,7 @@ def _load_backend(model_id: str) -> tuple[AutoModelForImageTextToText, AutoProce
 
 @dataclass
 class QwenGenerationConfig:
-    model_id: str = "Qwen/Qwen3-VL-8B-Thinking"
+    model_id: str = "Qwen/Qwen3-VL-4B-Instruct"
     max_new_tokens: int = 512
     temperature: float | None = None
     do_sample: bool = False
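
The same loading policy can be exercised outside the cache helper; the sketch below is illustrative only and mirrors the diff rather than any public API of the repo:

```python
# Sketch of the dtype/device policy above: prefer bfloat16, fall back to float16
# on GPUs without bf16 support, use float32 on CPU, and pin the model to one device.
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

MODEL_ID = "Qwen/Qwen3-VL-4B-Instruct"

if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    dtype = torch.bfloat16
elif torch.cuda.is_available():
    dtype = torch.float16
else:
    dtype = torch.float32

device_map = "cuda:0" if torch.cuda.is_available() else "cpu"

model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID, torch_dtype=dtype, device_map=device_map
).eval()
processor = AutoProcessor.from_pretrained(MODEL_ID)
```
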
requirements.txt CHANGED
@@ -1,5 +1,5 @@
 accelerate>=0.34
-transformers>=4.45
+git+https://github.com/huggingface/transformers.git
 pillow
 torch
 torchvision
@@ -7,3 +7,4 @@ gradio>=4.44
 hydra-core
 antlr4-python3-runtime
 spaces
+qwen-vl-utils
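
A quick post-install check (a hypothetical snippet, not shipped with the repo) confirms that the git build of transformers exposes the classes imported by `corgi/qwen_client.py` and that `qwen-vl-utils` is importable:

```python
# Hypothetical smoke test after `pip install -r requirements.txt`.
import transformers
from transformers import AutoModelForImageTextToText, AutoProcessor  # used by corgi/qwen_client.py
from qwen_vl_utils import process_vision_info  # provided by the qwen-vl-utils package

print("transformers", transformers.__version__)
```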