import torch
import gradio as gr
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
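# Assumed environment (versions not pinned in the original): torch,
# transformers, accelerate, bitsandbytes, gradio, and the deepseek_vl
# package from https://github.com/deepseek-ai/DeepSeek-VL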

# Model path
model_path = "deepseek-ai/deepseek-vl-7b-chat"

# ==== BitsAndBytes 4-bit quantization settings ====
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # force float16 compute
    bnb_4bit_use_double_quant=True
)
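# Note (assumption about intent): bnb_4bit_quant_type is left at its
# default ("fp4"); the "nf4" data type recommended by the QLoRA authors
# for pretrained weights can be selected with bnb_4bit_quant_type="nf4".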

# Load the processor and tokenizer
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

# Load the model (4-bit quantization + float16)
vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
).eval()
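# Rough estimate: with 4-bit weights a 7B model occupies about 4-5 GB of
# VRAM; device_map="auto" lets accelerate place layers on the available
# GPU(s) and offload the remainder to CPU RAM if necessary.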

# ==== Single-image inference function ====
def chat_with_image(image, user_message):
    try:
        # Build the conversation in DeepSeek-VL's chat format
        conversation = [
            {"role": "User", "content": "<image_placeholder>" + user_message, "images": [image]},
            {"role": "Assistant", "content": ""}
        ]
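        # Each "<image_placeholder>" in the content is paired with one entry
        # in "images"; to ask about several images, add one placeholder per
        # image.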
        # Input processing: Gradio's gr.Image(type="pil") already provides a
        # PIL image, so use it directly (load_pil_images() expects file
        # paths, not PIL objects)
        pil_images = [image.convert("RGB")]
        prepare_inputs = vl_chat_processor(
            conversations=conversation,
            images=pil_images,
            force_batchify=True
        ).to(vl_gpt.device)
        # Correct dtype handling: cast only floating-point tensors to
        # float16; input_ids must stay long, and the boolean image masks
        # must stay bool so they can still be used for mask indexing
        new_inputs = {}
        for k in prepare_inputs.keys():
            v = prepare_inputs[k]
            if torch.is_tensor(v) and v.is_floating_point():
                new_inputs[k] = v.to(torch.float16)
            else:
                new_inputs[k] = v
        prepare_inputs = new_inputs
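        # Rationale (assumption about the original failure mode): the
        # processor emits pixel_values in float32 while the 4-bit model
        # computes in float16, so leaving them unconverted would typically
        # raise a dtype mismatch inside the vision encoder.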
        # Build the joint text+image input embeddings; generation starts
        # from embeddings because the image features are spliced into the
        # token embedding sequence
        inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
        # Generate the answer
        outputs = vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs["attention_mask"],
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=128,  # keep generations short to save memory
            do_sample=False,
            use_cache=True
        )
        # Decode the generated tokens
        answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
        return answer
    except Exception as e:
        return f"Error: {e}"

# ==== Gradio Web UI ====
demo = gr.Interface(
    fn=chat_with_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=2, placeholder="Ask about the image...")
    ],
    outputs="text",
    title="DeepSeek-VL-7B-Chat Demo (4-bit, float16)",
    description="Upload an image and ask a question; the model generates an answer about the image."
)
if __name__ == "__main__":
    demo.launch()
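    # To reach the demo from other machines, Gradio's launch() also accepts
    # e.g. demo.launch(server_name="0.0.0.0", server_port=7860).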