import gradio as gr
from llava.model.builder import load_pretrained_model
from llava.mm_utils import process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates
from PIL import Image
import copy
import torch
import warnings
warnings.filterwarnings("ignore")
pretrained = "AI-Safeguard/Ivy-VL-llava"
model_name = "llava_qwen"
# Use the GPU when available; float16 inference on CPU is slow and some ops are unsupported
device = "cuda" if torch.cuda.is_available() else "cpu"
device_map = "auto"

# Load model, tokenizer, and image processor
tokenizer, model, image_processor, max_length = load_pretrained_model(
    pretrained, None, model_name, device_map=device_map, attn_implementation="sdpa"
)
model.eval()
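# Note: with device_map="auto", Accelerate places the weights on the available
# GPU(s) and falls back to CPU; max_length (the model's context window) is
# returned by load_pretrained_model but not needed below.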
def respond(image, question, temperature, max_tokens):
    try:
        # Preprocess the image into model-ready tensors
        image_tensor = process_images([image], image_processor, model.config)
        image_tensor = [_image.to(dtype=torch.float16, device=device) for _image in image_tensor]

        # Build the conversation prompt with the image token prepended to the question
        conv_template = "qwen_1_5"
        formatted_question = DEFAULT_IMAGE_TOKEN + "\n" + question
        conv = copy.deepcopy(conv_templates[conv_template])
        conv.append_message(conv.roles[0], formatted_question)
        conv.append_message(conv.roles[1], None)
        prompt_question = conv.get_prompt()

        # Tokenize the prompt, mapping the image placeholder to IMAGE_TOKEN_INDEX
        input_ids = tokenizer_image_token(
            prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
        ).unsqueeze(0).to(device)
        image_sizes = [image.size]

        # Generate a response; sampling must be enabled for temperature to have any effect
        cont = model.generate(
            input_ids,
            images=image_tensor,
            image_sizes=image_sizes,
            do_sample=temperature > 0,
            temperature=temperature,
            max_new_tokens=max_tokens,
        )
        text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)
        return text_outputs[0]
    except Exception as e:
        return f"Error: {str(e)}"
# Gradio interface: validate inputs before calling the model
def chat_interface(image, question, temperature, max_tokens):
    if image is None or not question:
        return "Please provide both an image and a question."
    return respond(image, question, temperature, max_tokens)
demo = gr.Interface(
    fn=chat_interface,
    inputs=[
        gr.Image(type="pil", label="Input Image"),
        gr.Textbox(label="Question"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max Tokens"),
    ],
    outputs="text",
    title="AI-Safeguard Ivy-VL-llava Image Question Answering",
    description="Upload an image and ask a question about it. The model will provide a response based on the visual and textual input.",
)
if __name__ == "__main__":
    demo.launch()
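# On Hugging Face Spaces the plain launch() above is sufficient; for a temporary
# public URL from a local machine, Gradio also supports demo.launch(share=True).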