"""Command-line interface for the CoRGI reasoning pipeline."""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import Callable, Optional, TextIO

from PIL import Image

from .pipeline import CoRGIPipeline
from .qwen_client import Qwen3VLClient, QwenGenerationConfig
from .types import GroundedEvidence, ReasoningStep

DEFAULT_MODEL_ID = "Qwen/Qwen3-VL-8B-Thinking"


def build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the corgi-cli entry point."""
    parser = argparse.ArgumentParser(
        prog="corgi-cli",
        description="Run the CoRGI reasoning pipeline over an image/question pair.",
    )
    parser.add_argument("--image", type=Path, required=True, help="Path to the input image (jpg/png/etc.)")
    parser.add_argument("--question", type=str, required=True, help="Visual question for the image")
    parser.add_argument("--max-steps", type=int, default=4, help="Maximum number of reasoning steps to request")
    parser.add_argument(
        "--max-regions",
        type=int,
        default=4,
        help="Maximum number of grounded regions per visual step",
    )
    parser.add_argument(
        "--model-id",
        type=str,
        default=None,
        help="Optional override for the Qwen3-VL model identifier",
    )
    parser.add_argument(
        "--json-out",
        type=Path,
        default=None,
        help="Optional path to write the pipeline result as JSON",
    )
    return parser
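
# Example invocation (paths and question are illustrative):
#   corgi-cli --image samples/dog.jpg --question "What breed is the dog?" \
#       --max-steps 4 --json-out out/result.json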


def _format_step(step: ReasoningStep) -> str:
    needs = "yes" if step.needs_vision else "no"
    suffix = f"; reason: {step.reason}" if step.reason else ""
    return f"[{step.index}] {step.statement} (needs vision: {needs}{suffix})"


def _format_evidence_item(evidence: GroundedEvidence) -> str:
    bbox = ", ".join(f"{coord:.2f}" for coord in evidence.bbox)
    parts = [f"Step {evidence.step_index} | bbox=({bbox})"]
    if evidence.description:
        parts.append(f"desc: {evidence.description}")
    if evidence.confidence is not None:
        parts.append(f"conf: {evidence.confidence:.2f}")
    return " | ".join(parts)


def _default_pipeline_factory(model_id: Optional[str]) -> CoRGIPipeline:
    config = QwenGenerationConfig(model_id=model_id or DEFAULT_MODEL_ID)
    client = Qwen3VLClient(config=config)
    return CoRGIPipeline(vlm_client=client)
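
# execute_cli below takes pipeline_factory and output_stream as seams for testing.
# A minimal sketch (FakeClient is hypothetical, not part of this package):
#   buffer = io.StringIO()
#   execute_cli(..., pipeline_factory=lambda model_id: CoRGIPipeline(vlm_client=FakeClient()),
#               output_stream=buffer)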


def execute_cli(
    *,
    image_path: Path,
    question: str,
    max_steps: int,
    max_regions: int,
    model_id: Optional[str],
    json_out: Optional[Path],
    pipeline_factory: Callable[[Optional[str]], CoRGIPipeline] | None = None,
    output_stream: TextIO | None = None,
) -> None:
    """Run the pipeline on one image/question pair and print a plain-text report."""
    if output_stream is None:
        output_stream = sys.stdout
    factory = pipeline_factory or _default_pipeline_factory
    # Convert inside the context manager so the file handle is released promptly;
    # convert() returns a new in-memory image that outlives the closed file.
    with Image.open(image_path) as img:
        image = img.convert("RGB")
    pipeline = factory(model_id)
    result = pipeline.run(
        image=image,
        question=question,
        max_steps=max_steps,
        max_regions=max_regions,
    )
    print(f"Question: {question}", file=output_stream)
    print("-- Steps --", file=output_stream)
    for step in result.steps:
        print(_format_step(step), file=output_stream)
    if not result.steps:
        print("(no reasoning steps returned)", file=output_stream)
    print("-- Evidence --", file=output_stream)
    if result.evidence:
        for evidence in result.evidence:
            print(_format_evidence_item(evidence), file=output_stream)
    else:
        print("(no visual evidence)", file=output_stream)
    print("-- Answer --", file=output_stream)
    print(f"Answer: {result.answer}", file=output_stream)
    if json_out is not None:
        json_out.parent.mkdir(parents=True, exist_ok=True)
        with json_out.open("w", encoding="utf-8") as handle:
            json.dump(result.to_json(), handle, ensure_ascii=False, indent=2)
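
# Programmatic use (paths and question are illustrative):
#   execute_cli(
#       image_path=Path("samples/dog.jpg"),
#       question="What breed is the dog?",
#       max_steps=4,
#       max_regions=4,
#       model_id=None,
#       json_out=None,
#   )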


def main(argv: Optional[list[str]] = None) -> int:
    """Parse CLI arguments, run the pipeline, and return a process exit code."""
    parser = build_parser()
    args = parser.parse_args(argv)
    execute_cli(
        image_path=args.image,
        question=args.question,
        max_steps=args.max_steps,
        max_regions=args.max_regions,
        model_id=args.model_id,
        json_out=args.json_out,
    )
    return 0


__all__ = ["build_parser", "execute_cli", "main"]
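
# Optional direct-execution guard; an assumption on our part, since the package's
# console-script entry point (if any) is not shown here.
if __name__ == "__main__":
    sys.exit(main())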