dung-vpt-uney committed
Commit 58fe08c · 1 Parent(s): 9c4a163

Deploy CoRGI demo - 2025-10-29 14:17:23

Features:
- Structured reasoning with CoRGI protocol
- ROI extraction using Qwen3-VL grounding
- Visual evidence synthesis
- Gradio UI with per-step visualization

Model: Qwen/Qwen3-VL-4B-Instruct

README.md CHANGED
@@ -12,7 +12,7 @@ license: apache-2.0
 
 # CoRGI Qwen3-VL Demo
 
-This Space showcases the CoRGI reasoning pipeline powered entirely by **Qwen/Qwen3-VL-8B-Thinking**.
+This Space showcases the CoRGI reasoning pipeline powered entirely by **Qwen/Qwen3-VL-4B-Instruct**.
 Upload an image, ask a visual question, and the app will:
 
 1. Generate structured reasoning steps with visual-verification flags.
@@ -24,7 +24,7 @@ Upload an image, ask a visual question, and the app will:
 ```bash
 pip install -r requirements.txt
 python examples/demo_qwen_corgi.py \
-  --model-id Qwen/Qwen3-VL-8B-Thinking \
+  --model-id Qwen/Qwen3-VL-4B-Instruct \
   --max-steps 3 \
   --max-regions 3
 ```
@@ -37,9 +37,11 @@ python app.py
 
 ## Configuration Notes
 
-- The Space queues requests sequentially on `cpu-basic` (ZeroGPU) hardware.
-- Set the `CORGI_QWEN_MODEL` environment variable to try another Qwen3-VL checkpoint (for example, `Qwen/Qwen3-VL-4B-Instruct`).
-- `max_steps` and `max_regions` sliders control how many reasoning steps and ROI candidates the model returns.
+- **Model**: Uses `Qwen/Qwen3-VL-4B-Instruct` (4B parameters, ~8GB VRAM)
+- **Single GPU**: Model loads on single GPU (cuda:0) to avoid memory fragmentation
+- **Hardware**: The Space runs on `cpu-basic` tier by default
+- **Customization**: Set `CORGI_QWEN_MODEL` environment variable to use a different checkpoint
+- **Sliders**: `max_steps` and `max_regions` control reasoning depth and ROI candidates
 
 ## UI Overview
 
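For readers who want to script the `CORGI_QWEN_MODEL` override described in the Configuration Notes, here is a minimal sketch (not part of the repo) that reads the variable and builds the `QwenGenerationConfig` dataclass defined in `corgi/qwen_client.py`; the explicit `os.environ` lookup is only illustrative.

```python
# Minimal sketch (not from the repo): pick the checkpoint the way the README
# describes, falling back to the new default when CORGI_QWEN_MODEL is unset.
import os

from corgi.qwen_client import QwenGenerationConfig

model_id = os.environ.get("CORGI_QWEN_MODEL", "Qwen/Qwen3-VL-4B-Instruct")
config = QwenGenerationConfig(model_id=model_id, max_new_tokens=512)
print(config.model_id)
```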
 
corgi/__pycache__/cli.cpython-312.pyc CHANGED
Binary files a/corgi/__pycache__/cli.cpython-312.pyc and b/corgi/__pycache__/cli.cpython-312.pyc differ
 
corgi/__pycache__/gradio_app.cpython-312.pyc CHANGED
Binary files a/corgi/__pycache__/gradio_app.cpython-312.pyc and b/corgi/__pycache__/gradio_app.cpython-312.pyc differ
 
corgi/__pycache__/parsers.cpython-312.pyc CHANGED
Binary files a/corgi/__pycache__/parsers.cpython-312.pyc and b/corgi/__pycache__/parsers.cpython-312.pyc differ
 
corgi/__pycache__/pipeline.cpython-312.pyc CHANGED
Binary files a/corgi/__pycache__/pipeline.cpython-312.pyc and b/corgi/__pycache__/pipeline.cpython-312.pyc differ
 
corgi/__pycache__/qwen_client.cpython-312.pyc CHANGED
Binary files a/corgi/__pycache__/qwen_client.cpython-312.pyc and b/corgi/__pycache__/qwen_client.cpython-312.pyc differ
 
corgi/__pycache__/types.cpython-312.pyc CHANGED
Binary files a/corgi/__pycache__/types.cpython-312.pyc and b/corgi/__pycache__/types.cpython-312.pyc differ
 
corgi/parsers.py CHANGED
@@ -127,13 +127,20 @@ def _normalize_step_markers(text: str) -> str:
 
 
 def _extract_statement(body: str) -> str | None:
-    statement_match = re.search(r"statement\s*[:\-]\s*(.+)", body, re.IGNORECASE)
-    candidate = statement_match.group(1) if statement_match else body
-    # Remove trailing sections that describe vision or reason metadata.
-    candidate = re.split(r"(?i)needs\s*vision|reason\s*[:\-]", candidate)[0]
-    candidate = candidate.strip().strip(".")
-    if not candidate:
+    statement_match = re.search(r"statement\s*[:\-]\s*(.+?)(?=\s*(?:needs\s*vision|reason\s*[:\-]|$))", body, re.IGNORECASE | re.DOTALL)
+    if statement_match:
+        candidate = statement_match.group(1)
+    else:
+        # Fallback: take first sentence or line before metadata
+        candidate = re.split(r"(?i)needs\s*vision|reason\s*[:\-]", body)[0]
+
+    # Clean up the candidate
+    candidate = candidate.strip().rstrip(".,;:")
+
+    # If still empty or too short, return None
+    if not candidate or len(candidate) < 5:
         return None
+
     return _clean_sentence(candidate)
 
 
146
 
corgi/qwen_client.py CHANGED
@@ -2,11 +2,14 @@ from __future__ import annotations
 
 from dataclasses import dataclass
 from typing import List, Optional
+import logging
 
 import torch
 from PIL import Image
 from transformers import AutoModelForImageTextToText, AutoProcessor
 
+logger = logging.getLogger(__name__)
+
 try:
     import spaces  # type: ignore
 except ImportError:  # pragma: no cover - only available on HF Spaces
@@ -98,13 +101,25 @@ def _ensure_cuda(model: AutoModelForImageTextToText) -> AutoModelForImageTextToT
 
 def _load_backend(model_id: str) -> tuple[AutoModelForImageTextToText, AutoProcessor]:
     if model_id not in _MODEL_CACHE:
-        torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+        # Check if hardware supports bfloat16
+        if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
+            torch_dtype = torch.bfloat16
+            logger.info("Using bfloat16 (hardware supported)")
+        elif torch.cuda.is_available():
+            torch_dtype = torch.float16  # Fallback to float16 if bfloat16 not supported
+            logger.info("Using float16 (bfloat16 not supported on this GPU)")
+        else:
+            torch_dtype = torch.float32
+            logger.info("Using float32 (CPU mode)")
+
+        # Use single GPU (cuda:0) instead of auto to avoid model sharding across multiple GPUs
+        device_map = "cuda:0" if torch.cuda.is_available() else "cpu"
         model = AutoModelForImageTextToText.from_pretrained(
             model_id,
             torch_dtype=torch_dtype,
-            device_map="auto",
+            device_map=device_map,
         )
-        model = _ensure_cuda(model).eval()
+        model = model.eval()
         processor = AutoProcessor.from_pretrained(model_id)
         _MODEL_CACHE[model_id] = model
         _PROCESSOR_CACHE[model_id] = processor
@@ -113,7 +128,7 @@ def _load_backend(model_id: str) -> tuple[AutoModelForImageTextToText, AutoProce
 
 @dataclass
 class QwenGenerationConfig:
-    model_id: str = "Qwen/Qwen3-VL-8B-Thinking"
+    model_id: str = "Qwen/Qwen3-VL-4B-Instruct"
     max_new_tokens: int = 512
     temperature: float | None = None
     do_sample: bool = False
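
The same loading policy can be exercised outside the cache helper; the sketch below is illustrative only and mirrors the diff rather than any public API of the repo:

```python
# Sketch of the dtype/device policy above: prefer bfloat16, fall back to float16
# on GPUs without bf16 support, use float32 on CPU, and pin the model to one device.
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

MODEL_ID = "Qwen/Qwen3-VL-4B-Instruct"

if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    dtype = torch.bfloat16
elif torch.cuda.is_available():
    dtype = torch.float16
else:
    dtype = torch.float32

device_map = "cuda:0" if torch.cuda.is_available() else "cpu"

model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID, torch_dtype=dtype, device_map=device_map
).eval()
processor = AutoProcessor.from_pretrained(MODEL_ID)
```
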
requirements.txt CHANGED
@@ -1,5 +1,5 @@
 accelerate>=0.34
-transformers>=4.45
+git+https://github.com/huggingface/transformers.git
 pillow
 torch
 torchvision
@@ -7,3 +7,4 @@ gradio>=4.44
 hydra-core
 antlr4-python3-runtime
 spaces
+qwen-vl-utils
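
A quick post-install check (a hypothetical snippet, not shipped with the repo) confirms that the git build of transformers exposes the classes imported by `corgi/qwen_client.py` and that `qwen-vl-utils` is importable:

```python
# Hypothetical smoke test after `pip install -r requirements.txt`.
import transformers
from transformers import AutoModelForImageTextToText, AutoProcessor  # used by corgi/qwen_client.py
from qwen_vl_utils import process_vision_info  # provided by the qwen-vl-utils package

print("transformers", transformers.__version__)
```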