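"""Gradio demo for the torettomarui/Llava-qw vision-language model.

Upload an image and ask a question; the model answers based on the image.
Requires a CUDA GPU (this Space runs on an NVIDIA L4).
"""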
import gradio as gr
import torch
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoTokenizer

from Models.modeling_llavaqw import LlavaQwModel
# ImageNet normalization statistics used by the vision encoder.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
model_name = "torettomarui/Llava-qw"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
model = LlavaQwModel.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # load weights directly in bfloat16
    trust_remote_code=True,
).eval().cuda()
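# Inference-only setup: the weights stay in bfloat16 on the GPU, so image
# tensors must be cast to the same dtype before the forward pass (see
# preprocess_image below).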
def build_transform(input_size):
    """Return a torchvision pipeline: RGB-convert, resize, tensorize, normalize."""
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD),
    ])
    return transform
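# For example, build_transform(448) maps a PIL image to a (3, 448, 448)
# float tensor normalized with the ImageNet statistics above.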
def preprocess_image(image, image_size=448):
    """Convert a PIL image into a bfloat16 batch of shape (1, 3, image_size, image_size)."""
    transform = build_transform(image_size)
    pixel_values = transform(image)  # `image` is a PIL image, not a file path
    return torch.stack([pixel_values]).to(torch.bfloat16).cuda()
def generate_response(image, text):
    pixel_values = preprocess_image(image)
    generation_config = dict(max_new_tokens=2048, do_sample=False)  # greedy decoding
    # Prepend the <image> placeholder the model's chat interface expects.
    question = '<image>\n' + text
    response = model.chat(tokenizer, pixel_values, question, generation_config)
    return response
# Example image and question shown in the UI.
examples = [
    ["./text.png", "What is the text in the image?"],
]
iface = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Image(type="pil", label="Upload an image"),
        gr.Textbox(lines=2, placeholder="Type your question..."),
    ],
    outputs="text",
    title="Llava-QW",
    description="Upload an image and ask a question; the model will generate an answer.",
    examples=examples,
)
iface.launch()
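# To run locally (assumes a CUDA GPU plus the Models/ package and text.png
# from this Space):
#   python app.py
#
# The handler can also be called directly, e.g.:
#   from PIL import Image
#   print(generate_response(Image.open("./text.png"), "What is the text in the image?"))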