dung-vpt-uney committed · Commit 58fe08c · Parent(s): 9c4a163

Deploy CoRGI demo - 2025-10-29 14:17:23

Features:
- Structured reasoning with CoRGI protocol
- ROI extraction using Qwen3-VL grounding
- Visual evidence synthesis
- Gradio UI with per-step visualization
Model: Qwen/Qwen3-VL-8B-Thinking
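The commit message above outlines the staged flow (reasoning steps, ROI grounding, evidence synthesis, UI). Purely as an illustration of that flow, not taken from the repository, the data carried between stages might be sketched like this (all names hypothetical):

```python
from dataclasses import dataclass, field

# Illustrative only: hypothetical containers mirroring the stages named in the
# commit message (reasoning steps -> ROI extraction -> evidence -> final answer).
@dataclass
class ReasoningStep:
    statement: str
    needs_vision: bool
    regions: list[tuple[int, int, int, int]] = field(default_factory=list)  # ROI boxes
    evidence: str | None = None  # visual evidence synthesized for this step

@dataclass
class CorgiResult:
    steps: list[ReasoningStep]
    answer: str
```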
- README.md +7 -5
- corgi/__pycache__/cli.cpython-312.pyc +0 -0
- corgi/__pycache__/gradio_app.cpython-312.pyc +0 -0
- corgi/__pycache__/parsers.cpython-312.pyc +0 -0
- corgi/__pycache__/pipeline.cpython-312.pyc +0 -0
- corgi/__pycache__/qwen_client.cpython-312.pyc +0 -0
- corgi/__pycache__/types.cpython-312.pyc +0 -0
- corgi/parsers.py +13 -6
- corgi/qwen_client.py +19 -4
- requirements.txt +2 -1
README.md
CHANGED
@@ -12,7 +12,7 @@ license: apache-2.0
 
 # CoRGI Qwen3-VL Demo
 
-This Space showcases the CoRGI reasoning pipeline powered entirely by **Qwen/Qwen3-VL-…**.
+This Space showcases the CoRGI reasoning pipeline powered entirely by **Qwen/Qwen3-VL-4B-Instruct**.
 Upload an image, ask a visual question, and the app will:
 
 1. Generate structured reasoning steps with visual-verification flags.
@@ -24,7 +24,7 @@ Upload an image, ask a visual question, and the app will:
 ```bash
 pip install -r requirements.txt
 python examples/demo_qwen_corgi.py \
-  --model-id Qwen/Qwen3-VL-… \
+  --model-id Qwen/Qwen3-VL-4B-Instruct \
   --max-steps 3 \
   --max-regions 3
 ```
@@ -37,9 +37,11 @@ python app.py
 
 ## Configuration Notes
 
-…
-…
-…
+- **Model**: Uses `Qwen/Qwen3-VL-4B-Instruct` (4B parameters, ~8GB VRAM)
+- **Single GPU**: Model loads on single GPU (cuda:0) to avoid memory fragmentation
+- **Hardware**: The Space runs on `cpu-basic` tier by default
+- **Customization**: Set `CORGI_QWEN_MODEL` environment variable to use a different checkpoint
+- **Sliders**: `max_steps` and `max_regions` control reasoning depth and ROI candidates
 
 ## UI Overview
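The `CORGI_QWEN_MODEL` override mentioned in the new Configuration Notes is not shown in this diff; a minimal sketch of how the app could consume it, assuming the variable is read once at startup with the commit's new default, is:

```python
import os

# Hypothetical wiring for the CORGI_QWEN_MODEL override described in the README;
# the variable name and the default checkpoint come from this commit, but how
# app.py actually reads them is an assumption.
DEFAULT_MODEL_ID = "Qwen/Qwen3-VL-4B-Instruct"
model_id = os.environ.get("CORGI_QWEN_MODEL", DEFAULT_MODEL_ID)
print(f"Loading checkpoint: {model_id}")
```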
corgi/__pycache__/cli.cpython-312.pyc
CHANGED
Binary files a/corgi/__pycache__/cli.cpython-312.pyc and b/corgi/__pycache__/cli.cpython-312.pyc differ

corgi/__pycache__/gradio_app.cpython-312.pyc
CHANGED
Binary files a/corgi/__pycache__/gradio_app.cpython-312.pyc and b/corgi/__pycache__/gradio_app.cpython-312.pyc differ

corgi/__pycache__/parsers.cpython-312.pyc
CHANGED
Binary files a/corgi/__pycache__/parsers.cpython-312.pyc and b/corgi/__pycache__/parsers.cpython-312.pyc differ

corgi/__pycache__/pipeline.cpython-312.pyc
CHANGED
Binary files a/corgi/__pycache__/pipeline.cpython-312.pyc and b/corgi/__pycache__/pipeline.cpython-312.pyc differ

corgi/__pycache__/qwen_client.cpython-312.pyc
CHANGED
Binary files a/corgi/__pycache__/qwen_client.cpython-312.pyc and b/corgi/__pycache__/qwen_client.cpython-312.pyc differ

corgi/__pycache__/types.cpython-312.pyc
CHANGED
Binary files a/corgi/__pycache__/types.cpython-312.pyc and b/corgi/__pycache__/types.cpython-312.pyc differ
corgi/parsers.py
CHANGED
@@ -127,13 +127,20 @@ def _normalize_step_markers(text: str) -> str:
 
 
 def _extract_statement(body: str) -> str | None:
-    statement_match = re.search(r"statement\s*[:\-]\s*(…
-    …
-    …
-    …
-    …
-    …
+    statement_match = re.search(r"statement\s*[:\-]\s*(.+?)(?=\s*(?:needs\s*vision|reason\s*[:\-]|$))", body, re.IGNORECASE | re.DOTALL)
+    if statement_match:
+        candidate = statement_match.group(1)
+    else:
+        # Fallback: take first sentence or line before metadata
+        candidate = re.split(r"(?i)needs\s*vision|reason\s*[:\-]", body)[0]
+
+    # Clean up the candidate
+    candidate = candidate.strip().rstrip(".,;:")
+
+    # If still empty or too short, return None
+    if not candidate or len(candidate) < 5:
         return None
+
     return _clean_sentence(candidate)
 
 
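For reference, the new `_extract_statement` logic can be exercised on its own; the sketch below re-implements just the regex and fallback path from the diff (the `_clean_sentence` helper is not part of this commit, so the example stops at the raw candidate):

```python
import re

def extract_statement_demo(body: str) -> str | None:
    # Same pattern as the updated corgi/parsers.py: capture the text after
    # "statement:" up to the "needs vision" / "reason:" metadata or end of block.
    match = re.search(
        r"statement\s*[:\-]\s*(.+?)(?=\s*(?:needs\s*vision|reason\s*[:\-]|$))",
        body,
        re.IGNORECASE | re.DOTALL,
    )
    if match:
        candidate = match.group(1)
    else:
        # Fallback: everything before the metadata markers.
        candidate = re.split(r"(?i)needs\s*vision|reason\s*[:\-]", body)[0]
    candidate = candidate.strip().rstrip(".,;:")
    return candidate if candidate and len(candidate) >= 5 else None

print(extract_statement_demo("Statement: The mug is left of the laptop. Needs vision: yes"))
# -> "The mug is left of the laptop"
```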
corgi/qwen_client.py
CHANGED
@@ -2,11 +2,14 @@ from __future__ import annotations
 
 from dataclasses import dataclass
 from typing import List, Optional
+import logging
 
 import torch
 from PIL import Image
 from transformers import AutoModelForImageTextToText, AutoProcessor
 
+logger = logging.getLogger(__name__)
+
 try:
     import spaces  # type: ignore
 except ImportError:  # pragma: no cover - only available on HF Spaces
@@ -98,13 +101,25 @@ def _ensure_cuda(model: AutoModelForImageTextToText) -> AutoModelForImageTextToText:
 
 def _load_backend(model_id: str) -> tuple[AutoModelForImageTextToText, AutoProcessor]:
     if model_id not in _MODEL_CACHE:
-        …
+        # Check if hardware supports bfloat16
+        if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
+            torch_dtype = torch.bfloat16
+            logger.info("Using bfloat16 (hardware supported)")
+        elif torch.cuda.is_available():
+            torch_dtype = torch.float16  # Fallback to float16 if bfloat16 not supported
+            logger.info("Using float16 (bfloat16 not supported on this GPU)")
+        else:
+            torch_dtype = torch.float32
+            logger.info("Using float32 (CPU mode)")
+
+        # Use single GPU (cuda:0) instead of auto to avoid model sharding across multiple GPUs
+        device_map = "cuda:0" if torch.cuda.is_available() else "cpu"
         model = AutoModelForImageTextToText.from_pretrained(
             model_id,
             torch_dtype=torch_dtype,
-            device_map=…
+            device_map=device_map,
         )
-        model = …
+        model = model.eval()
         processor = AutoProcessor.from_pretrained(model_id)
         _MODEL_CACHE[model_id] = model
         _PROCESSOR_CACHE[model_id] = processor
@@ -113,7 +128,7 @@ def _load_backend(model_id: str) -> tuple[AutoModelForImageTextToText, AutoProcessor]:
 
 @dataclass
 class QwenGenerationConfig:
-    model_id: str = "Qwen/Qwen3-VL-…"
+    model_id: str = "Qwen/Qwen3-VL-4B-Instruct"
     max_new_tokens: int = 512
     temperature: float | None = None
     do_sample: bool = False
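The dtype/device selection added to `_load_backend` can be isolated for testing without downloading a checkpoint; a small sketch under the same assumptions (bfloat16 on supporting GPUs, float16 otherwise, float32 on CPU, single-device placement) is shown below. The helper name is illustrative and not part of the repository.

```python
import torch

def pick_dtype_and_device() -> tuple[torch.dtype, str]:
    # Mirrors the logic introduced in _load_backend in this commit.
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        return torch.bfloat16, "cuda:0"   # Ampere or newer: native bfloat16
    if torch.cuda.is_available():
        return torch.float16, "cuda:0"    # older GPUs: fall back to float16
    return torch.float32, "cpu"           # cpu-basic Space tier

dtype, device_map = pick_dtype_and_device()
print(dtype, device_map)
```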
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
 accelerate>=0.34
-transformers
+git+https://github.com/huggingface/transformers.git
 pillow
 torch
 torchvision
@@ -7,3 +7,4 @@ gradio>=4.44
 hydra-core
 antlr4-python3-runtime
 spaces
+qwen-vl-utils
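Pinning transformers to the git main branch suggests the Qwen3-VL image-text-to-text classes are not yet available in a tagged release; a quick post-install sanity check, assuming the environment from requirements.txt, might be:

```python
# Run after `pip install -r requirements.txt` to confirm the git build of
# transformers exposes the classes used in corgi/qwen_client.py and that the
# newly added qwen-vl-utils package resolves.
from transformers import AutoModelForImageTextToText, AutoProcessor  # noqa: F401
import qwen_vl_utils  # noqa: F401

print("transformers and qwen-vl-utils import cleanly")
```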