import gradio as gr
import spaces
import torch
import math
import numpy as np
import os
from PIL import Image
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer, AutoConfig
# =============================================================================
# InternVL‑3 preprocessing utilities (image‑only version)
# =============================================================================
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size: int = 448):
    """Return torchvision transform matching InternVL pre-training."""
    return T.Compose(
        [
            T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
        ]
    )
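# Illustrative note: the transform maps a single PIL tile to a normalized float
# tensor of shape (3, input_size, input_size), i.e. (3, 448, 448) by default.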
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the tile grid (cols, rows) whose aspect ratio best matches the image."""
    best_ratio_diff = float("inf")
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        tgt_ar = ratio[0] / ratio[1]
        diff = abs(aspect_ratio - tgt_ar)
        if diff < best_ratio_diff or (diff == best_ratio_diff and area > 0.5 * image_size * image_size * ratio[0] * ratio[1]):
            best_ratio_diff = diff
            best_ratio = ratio
    return best_ratio
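# Worked example (illustrative): for a 2:1 panorama both (2, 1) and (4, 2) match the
# aspect ratio exactly, so the tie-break keeps the larger (4, 2) grid only when the
# source image has more pixels than half that grid's tile area (0.5 * 448 * 448 * 8),
# which prevents small images from being upscaled into many tiles.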
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Split an arbitrarily sized image into ≤12 tiles sized 448×448 (InternVL spec)."""
    ow, oh = image.size
    aspect_ratio = ow / oh
    target_ratios = sorted(
        {(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if min_num <= i * j <= max_num},
        key=lambda x: x[0] * x[1],
    )
    ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, ow, oh, image_size)
    tw, th = image_size * ratio[0], image_size * ratio[1]
    blocks = ratio[0] * ratio[1]
    resized = image.resize((tw, th))
    tiles = [
        resized.crop(
            (
                (idx % (tw // image_size)) * image_size,
                (idx // (tw // image_size)) * image_size,
                ((idx % (tw // image_size)) + 1) * image_size,
                ((idx // (tw // image_size)) + 1) * image_size,
            )
        )
        for idx in range(blocks)
    ]
    if use_thumbnail and blocks != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles
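# Example (illustrative): a 1024x768 photo is 4:3, so the (4, 3) grid is chosen, the
# image is resized to 1792x1344 and cut into twelve 448x448 tiles; with
# use_thumbnail=True a thirteenth 448x448 copy of the whole image is appended so the
# model also sees global context.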
def load_image(path: str, input_size: int = 448, max_num: int = 12):
    """Return tensor of shape (N, 3, H, W) ready for InternVL."""
    img = Image.open(path).convert("RGB")
    transform = build_transform(input_size)
    tiles = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([transform(t) for t in tiles])
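# Usage sketch (illustrative; "photo.jpg" is a hypothetical local file):
#   pixel_values = load_image("photo.jpg")                  # (N, 3, 448, 448), N = tiles + thumbnail
#   pixel_values = pixel_values.to(torch.bfloat16).cuda()   # as done in internvl_inference below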
# =============================================================================
# InternVL‑3‑14B model loading (multi‑GPU aware)
# =============================================================================
MODEL_ID = "OpenGVLab/InternVL3-14B"
def split_model(model_name: str):
    """Distribute LLM layers across GPUs, keeping vision encoder on GPU 0."""
    n_gpu = torch.cuda.device_count()
    if n_gpu < 2:
        return "auto"  # let transformers decide
    cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    n_layers = cfg.llm_config.num_hidden_layers  # type: ignore[attr-defined]
    # GPU 0 handles vision + some text layers => treat it as half a GPU.
    per_gpu = math.ceil(n_layers / (n_gpu - 0.5))
    alloc = [per_gpu] * n_gpu
    alloc[0] = math.ceil(alloc[0] * 0.5)
    # Modules that must share a device with the vision encoder on GPU 0.
    dmap = {
        "vision_model": 0,
        "mlp1": 0,
        "language_model.model.tok_embeddings": 0,
        "language_model.model.embed_tokens": 0,
        "language_model.output": 0,
        "language_model.model.norm": 0,
        "language_model.model.rotary_emb": 0,
        "language_model.lm_head": 0,
    }
    layer_idx = 0
    for gpu, n in enumerate(alloc):
        for _ in range(n):
            if layer_idx >= n_layers:
                break
            # The final decoder layer is pinned to GPU 0 next to the output head.
            dmap[f"language_model.model.layers.{layer_idx}"] = 0 if layer_idx == n_layers - 1 else gpu
            layer_idx += 1
    return dmap
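# Worked example (illustrative, assuming 2 GPUs and a 48-layer language model):
# per_gpu = ceil(48 / 1.5) = 32 and alloc = [16, 32], so layers 0-15 sit on GPU 0 next
# to the vision encoder, layers 16-47 go to GPU 1, and the last layer is pulled back to
# GPU 0 so it shares a device with the output head and final norm.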
device_map = split_model(MODEL_ID)
model = AutoModel.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
    device_map=device_map,
).eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=False)
# =============================================================================
# Inference function (image‑only)
# =============================================================================
@spaces.GPU
def internvl_inference(image_path: str | None, text_input: str | None = None):
    if image_path is None:
        return "Please upload an image first."
    pixel_values = load_image(image_path, max_num=12).to(torch.bfloat16).cuda()
    prompt = f"<image>\n{text_input}" if text_input else "<image>\n"
    gen_cfg = dict(max_new_tokens=1024, do_sample=True)
    return model.chat(tokenizer, pixel_values, prompt, gen_cfg)
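# Usage sketch (illustrative; "example.webp" is the sample image referenced in the
# Examples section below):
#   print(internvl_inference("example.webp", "Explain this image."))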
# =============================================================================
# Gradio UI (image‑only, Gradio 5 compatible)
# =============================================================================
DESCRIPTION = (
    "[InternVL 3-14B demo](https://huggingface.co/OpenGVLab/InternVL3-14B) — "
    "upload an image and ask anything about it."
)
css = """
#output_text {
height: 500px;
overflow: auto;
border: 1px solid #ccc;
}
"""
with gr.Blocks(css=css, theme="origin") as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        # Left column: image, question, submit button (stacked vertically)
        with gr.Column(scale=1):
            input_image = gr.Image(label="Upload Image", type="filepath")
            text_input = gr.Textbox(label="Question")
            submit_btn = gr.Button("Submit")
        # Right column: model output
        with gr.Column(scale=1):
            output_text = gr.Textbox(label="Model Output", elem_id="output_text")
    # 🔽 Add an example
    gr.Examples(
        examples=[["example.webp", "explain this image"]],
        inputs=[input_image, text_input],
        outputs=output_text,
        fn=internvl_inference,  # set so the example runs immediately when clicked
        cache_examples=True,    # cache the results (optional)
        label="Try an example",  # display label (optional)
    )
    submit_btn.click(internvl_inference, [input_image, text_input], [output_text])
if __name__ == "__main__":
    demo.launch()