import gradio as gr
from transformers import AutoProcessor, LlavaForConditionalGeneration
import torch
from PIL import Image
model_name = "llava-hf/llava-1.5-13b" |
|
model = LlavaForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") |
|
processor = AutoProcessor.from_pretrained(model_name) |
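# Note: 13B parameters in float16 are roughly 26 GB of weights alone, so this
# assumes a correspondingly large GPU (or several, via device_map="auto");
# "llava-hf/llava-1.5-7b-hf" is a drop-in alternative for smaller cards.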


def chat(image, text):
    # Build the LLaVA 1.5 chat prompt; the <image> token marks where the
    # visual features are spliced into the text.
    if image:
        image = Image.open(image)
        prompt = f"USER: <image>\n{text} ASSISTANT:"
    else:
        prompt = f"USER: {text} ASSISTANT:"
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(
        model.device, torch.float16
    )
    outputs = model.generate(**inputs, max_new_tokens=256)
    # batch_decode echoes the prompt, so return only the part after ASSISTANT:.
    answer = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    return answer.split("ASSISTANT:")[-1].strip()
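# Example (assuming a local image file): chat("photo.jpg", "What is in this
# image?") returns the model's plain-text answer.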


# type="filepath" hands the uploaded image to chat() as a path, matching
# the Image.open() call above.
interface = gr.Interface(
    fn=chat,
    inputs=[gr.Image(type="filepath"), gr.Textbox(label="Prompt")],
    outputs="text",
    title="LLaVA - Multimodal AI",
    description="This AI can chat and analyze images.",
)
interface.launch()
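# launch() serves the UI locally (http://127.0.0.1:7860 by default);
# launch(share=True) would additionally create a temporary public URL.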