|
import requests |
|
from PIL import Image |
|
import gradio as gr |
|
from transformers import AutoProcessor, Blip2ForConditionalGeneration |
|
import torch |
|
|
|
|
|
# Stylesheet for the chat layout. The selectors match the elem_id values given
# to the right-hand column ("column_container"), the question textbox
# ("input_prompt") and the chatbot ("chatbot-component").
# NOTE(review): the gr.Blocks(...) call in this file passes its own inline CSS
# string rather than this variable — confirm whether `css` is still needed.
css = """
#column_container {
  position: relative;
  height: 800px;
  max-width: 700px;
  display: flex;
  flex-direction: column;
  background-color: lightgray;
  border: 1px solid gray;
  border-radius: 5px;
  padding: 10px;
  box-shadow: 2px 2px 5px gray;
  margin-left: auto;
  margin-right: auto;
}
#input_prompt {
  position: fixed;
  bottom: 0;
  max-width: 680px;
}
#chatbot-component {
  overflow: auto;
}
"""
|
|
|
# Load the BLIP-2 processor and model once at import time so that every
# request handled by `predict` reuses the same objects.
_CHECKPOINT = "Salesforce/blip2-opt-2.7b"

processor = AutoProcessor.from_pretrained(_CHECKPOINT)
model = Blip2ForConditionalGeneration.from_pretrained(
    _CHECKPOINT,
    torch_dtype=torch.float16,
)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): the weights are float16 even on the CPU fallback — confirm
# half-precision inference is supported there by the installed torch version.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model.to(device)
|
|
|
def upload_button_config():
    """Return a Gradio update that hides the component it is applied to.

    Wired to the chatbot's change event with the upload button as output, so
    the button disappears once the chat has content.
    """
    hide = gr.update(visible=False)
    return hide
|
|
|
def update_textbox_config(text_in):
    """Return a Gradio update that makes the target component visible.

    ``text_in`` receives the current textbox value from the event but is not
    used; only the visibility flag is changed.
    """
    reveal = gr.update(visible=True)
    return reveal
|
|
|
|
|
def _fit_to_width(image, basewidth=512):
    """Return a copy of *image* scaled to *basewidth* pixels wide, keeping its aspect ratio."""
    # PNG cannot store every PIL mode (CMYK JPEGs are a common case), so
    # normalise anything other than RGB/RGBA before the image is saved.
    if image.mode not in ("RGB", "RGBA"):
        image = image.convert("RGB")

    width, height = image.size
    wpercent = basewidth / float(width)
    # Clamp to 1 so an extremely wide image cannot round down to a
    # zero-pixel height.
    hsize = max(1, int(float(height) * float(wpercent)))
    return image.resize((basewidth, hsize))


def predict(btn_upload, counter, image_hid, input, history):
    """Handle one chat turn: register the uploaded image, or answer a question about it.

    Args:
        btn_upload: The uploaded file, passed straight to ``Image.open``.
        counter: Number of turns handled so far; ``0`` selects the
            image-upload path, anything else the question-answering path.
        image_hid: The image stored by the upload turn; fed to the processor
            together with the question.
        input: The user's text for this turn.
        history: List of ``(input, response)`` tuples, or a falsy value when
            the conversation is empty.

    Returns:
        A 5-tuple ``(history, history, image_name, counter + 1, image)``.
        On the upload turn ``image_name`` is the saved file name and
        ``image`` the resized picture. On later turns ``image_name`` is a
        no-op Gradio update and ``image`` is ``image_hid`` unchanged.
    """
    import uuid  # local import: only needed to build per-upload file names

    history = history or []

    if counter == 0:
        # The resized copy is independent of the opened file, so the source
        # can be released as soon as the copy exists.
        with Image.open(btn_upload) as uploaded:
            image_in = _fit_to_width(uploaded)

        # A per-upload name keeps concurrent sessions (the queue runs several
        # workers) from overwriting each other's picture.
        # NOTE: these files are not removed here.
        img_name = f"uploaded_image_{uuid.uuid4().hex}.png"
        image_in.save(img_name)

        response = '<img src="/file=' + img_name + '">'
        history.append((input, response))
        return history, history, img_name, counter + 1, image_in

    print(f"prompt is :{input}")

    prompt = f"Question: {input} Answer: "
    inputs = processor(image_hid, text=prompt, return_tensors="pt").to(device, torch.float16)

    generated_ids = model.generate(**inputs, max_new_tokens=10)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    print(f"generated_text is : {generated_text}")

    history.append((input, generated_text))
    # gr.update() leaves the hidden file-name textbox holding the name that
    # was stored on the upload turn.
    return history, history, gr.update(), counter + 1, image_hid
|
|
|
|
|
# Static HTML shown in the left-hand column: title plus a short description
# of the model with links to its authors and paper.
HEADER_HTML = """<div style="text-align: center; max-width: 700px; margin: 0 auto;">
        <div
        style="
            display: inline-flex;
            align-items: center;
            gap: 0.8rem;
            font-size: 1.75rem;
        "
        >
        <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
            Bringing Visual Conversations to Life with BLIP2
        </h1>
        </div>
        <p style="margin-bottom: 10px; font-size: 94%">
        Blip2 is functioning as an <b>instructed zero-shot image-to-text generation</b> model using OPT-2.7B in this Space.
        It shows a wide range of capabilities including visual conversation, visual knowledge reasoning, visual commensense reasoning, storytelling,
        personalized image-to-text generation etc.<br>
        BLIP-2 by <a href="https://huggingface.co/Salesforce" target="_blank">Salesforce</a> is now available in🤗Transformers!
        This model was contributed by <a href="https://twitter.com/NielsRogge" target="_blank">nielsr</a>.
        The BLIP-2 model was proposed in <a href="https://arxiv.org/abs/2301.12597" target="_blank">BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models</a>
        by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.<br><br>
        </p></div>"""

# "Duplicate Space" badge and caption shown under the header.
DUPLICATE_HTML = """<a href="https://huggingface.co/spaces/ysharma/InstructPix2Pix_Chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate Space with GPU Upgrade for fast Inference & no queue<br>"""

with gr.Blocks(css="#chatbot-component {height: 600px} #input_prompt {position: absolute; bottom: 0;}") as demo:
    with gr.Row():
        # Left column: descriptive header.
        with gr.Column(scale=1):
            gr.HTML(HEADER_HTML)
            gr.HTML(DUPLICATE_HTML)

        # Right column: upload button, question box, chat window and the
        # hidden components that carry state between turns.
        with gr.Column(elem_id="column_container", scale=2):
            btn_upload = gr.UploadButton(
                "Upload image!",
                file_types=["image"],
                file_count="single",
                elem_id="upload_button",
            )
            text_in = gr.Textbox(
                value='',
                placeholder="Type your questions here and press enter",
                elem_id="input_prompt",
                visible=False,
                label='Great! Now you can ask questions to get more information about the image',
            )
            chatbot = gr.Chatbot(elem_id='chatbot-component', label='Converse with Images')
            state_in = gr.State()
            counter_out = gr.Number(visible=False, value=0, precision=0)
            text_out = gr.Textbox(visible=False)
            image_hid = gr.Image(visible=False)

    # Uploading an image and submitting a question both go through `predict`
    # with the same inputs and outputs.
    chat_inputs = [btn_upload, counter_out, image_hid, text_in, state_in]
    chat_outputs = [chatbot, state_in, text_out, counter_out, image_hid]

    btn_upload.upload(predict, chat_inputs, chat_outputs)
    # The question box starts hidden and is revealed by the upload event.
    btn_upload.upload(fn=update_textbox_config, inputs=text_in, outputs=text_in)

    text_in.submit(predict, chat_inputs, chat_outputs)

    # Any change to the chatbot hides the upload button.
    chatbot.change(fn=upload_button_config, outputs=btn_upload)

demo.queue(concurrency_count=10)
demo.launch(debug=True)