# Co-Instruct / app.py
import gradio as gr
import requests
from PIL import Image
import torch
from transformers import AutoModelForCausalLM
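
# Load the Co-Instruct preview model (custom remote code) in float16 on the first GPU.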
model = AutoModelForCausalLM.from_pretrained(
    "q-future/co-instruct-preview",
    trust_remote_code=True,
    torch_dtype=torch.float16,
    attn_implementation="eager",
    device_map={"": "cuda:0"},
)
def chat(message, history, image_1, image_2, image_3, image_4):
    print(history)
    if history:
        if image_1 is not None and image_2 is None:
            past_message = "USER: The image: <|image|> " + history[0][0] + " ASSISTANT:" + history[0][1] + "</s>"
            for i in range(len(history) - 1):
                past_message += "USER:" + history[i + 1][0] + " ASSISTANT:" + history[i + 1][1] + "</s>"
            message = past_message + "USER:" + message + " ASSISTANT:"
            images = [image_1]
        if image_1 is not None and image_2 is not None:
            if image_3 is None:
                past_message = "USER: The first image: <|image|>\nThe second image: <|image|>" + history[0][0] + " ASSISTANT:" + history[0][1] + "</s>"
                for i in range(len(history) - 1):
                    past_message += "USER:" + history[i + 1][0] + " ASSISTANT:" + history[i + 1][1] + "</s>"
                message = past_message + "USER:" + message + " ASSISTANT:"
                images = [image_1, image_2]
            else:
                if image_4 is None:
                    past_message = "USER: The first image: <|image|>\nThe second image: <|image|>\nThe third image:<|image|>" + history[0][0] + " ASSISTANT:" + history[0][1] + "</s>"
                    for i in range(len(history) - 1):
                        past_message += "USER:" + history[i + 1][0] + " ASSISTANT:" + history[i + 1][1] + "</s>"
                    message = past_message + "USER:" + message + " ASSISTANT:"
                    images = [image_1, image_2, image_3]
                else:
                    past_message = "USER: The first image: <|image|>\nThe second image: <|image|>\nThe third image:<|image|>\nThe fourth image:<|image|>" + history[0][0] + " ASSISTANT:" + history[0][1] + "</s>"
                    for i in range(len(history) - 1):
                        past_message += "USER:" + history[i + 1][0] + " ASSISTANT:" + history[i + 1][1] + "</s>"
                    message = past_message + "USER:" + message + " ASSISTANT:"
                    images = [image_1, image_2, image_3, image_4]
    else:
        if image_1 is not None and image_2 is None:
            message = "USER: The image: <|image|> " + message + " ASSISTANT:"
            images = [image_1]
        if image_1 is not None and image_2 is not None:
            if image_3 is None:
                message = "USER: The first image: <|image|>\nThe second image: <|image|>" + message + " ASSISTANT:"
                images = [image_1, image_2]
            else:
                if image_4 is None:
                    message = "USER: The first image: <|image|>\nThe second image: <|image|>\nThe third image:<|image|>" + message + " ASSISTANT:"
                    images = [image_1, image_2, image_3]
                else:
                    message = "USER: The first image: <|image|>\nThe second image: <|image|>\nThe third image:<|image|>\nThe fourth image:<|image|>" + message + " ASSISTANT:"
                    images = [image_1, image_2, image_3, image_4]
    print(message)
    # Generate up to 300 new tokens; clamp the ids to a non-negative range before decoding
    # (negative placeholder ids would otherwise break the tokenizer), then return only the
    # text after the final "ASSISTANT:" marker.
    output_ids = model.chat(message, images, max_new_tokens=300)
    return model.tokenizer.batch_decode(output_ids.clamp(0, 100000))[0].split("ASSISTANT:")[-1]
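
# Gradio UI: four optional image uploads are passed to chat() alongside each text message.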
with gr.Blocks(title="img") as demo:
title_markdown = ("""
<h3 align="center">*Super Version of Q-Instruct with Multi-image (up to 4, same as GPT-4V) Support!*</h3>
<h1 align="center"><a href="https://github.com/Q-Future/Q-Instruct"><img src="https://github.com/Q-Future/Q-Instruct/blob/main/q_instruct_logo.png?raw=true", alt="Q-Instruct (mPLUG-Owl-2)" border="0" style="margin: 0 auto; height: 85px;" /></a> </h1>
<h2 align="center">Q-Instruct: Improving Low-level Visual Abilities for Multi-modality Foundation Models</h2>
<h5 align="center"> Please find our more accurate visual scoring demo on <a href='https://huggingface.co/spaces/teowu/OneScorer'>[OneScorer]</a>!</h2>
<div align="center">
<div style="display:flex; gap: 0.25rem;" align="center">
<a href='https://github.com/Q-Future/Q-Instruct'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
<a href="https://Q-Instruct.github.io/Q-Instruct/fig/Q_Instruct_v0_1_preview.pdf"><img src="https://img.shields.io/badge/Technical-Report-red"></a>
<a href='https://github.com/Q-Future/Q-Instruct/stargazers'><img src='https://img.shields.io/github/stars/Q-Future/Q-Instruct.svg?style=social'></a>
</div>
</div>
""")
gr.Markdown(title_markdown)
    with gr.Row():
        input_img_1 = gr.Image(type='pil', label="Image 1 (First image)")
        input_img_2 = gr.Image(type='pil', label="Image 2 (Second image)")
        input_img_3 = gr.Image(type='pil', label="Image 3 (Third image)")
        input_img_4 = gr.Image(type='pil', label="Image 4 (Fourth image)")
    gr.ChatInterface(fn=chat, additional_inputs=[input_img_1, input_img_2, input_img_3, input_img_4])

demo.launch(share=True)