# Chat_With_Blip2 / app.py
import requests
from PIL import Image
import gradio as gr
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch
css = """
#column_container {
position: relative;
height: 800px;
max-width: 700px;
display: flex;
flex-direction: column;
background-color: lightgray;
border: 1px solid gray;
border-radius: 5px;
padding: 10px;
box-shadow: 2px 2px 5px gray;
margin-left: auto;
margin-right: auto;
}
#input_prompt {
position: fixed;
bottom: 0;
max-width: 680px;
}
#chatbot-component {
overflow: auto;
}
"""
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
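# The checkpoint is loaded in float16, so inference is intended to run on a CUDA device;
# on a CPU-only machine the half-precision inputs built in predict() may not be supported.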
# Event-handler helpers: hide the upload button once an image has been uploaded,
# and reveal the question textbox so the user can start the conversation
def upload_button_config():
    return gr.update(visible=False)
def update_textbox_config(text_in):
    return gr.update(visible=True)
# Takes the user question (and, on the first call, the uploaded image) and generates a response
def predict(btn_upload, counter, image_hid, input, history):
    if counter == 0:
        image_in = Image.open(btn_upload)
        # Resize the image to a fixed width while preserving the aspect ratio
        basewidth = 512
        wpercent = (basewidth / float(image_in.size[0]))
        hsize = int((float(image_in.size[1]) * float(wpercent)))
        image_in = image_in.resize((basewidth, hsize))  #, Image.Resampling.LANCZOS)
        # Save the resized image to disk so it can be rendered inside the chatbot
        #seed = random.randint(0, 1000000)
        img_name = "uploaded_image.png"  #f"./edited_image_{seed}.png"
        image_in.save(img_name)
        # Add the image to the conversation state
        history = history or []
        response = '<img src="/file=' + img_name + '">'
        history.append((input, response))
        counter += 1
        return history, history, img_name, counter, image_in
    # Process the prompt
    print(f"prompt is :{input}")
    # Format the prompt as - Question: Is this photo unusual? Answer:
    prompt = f"Question: {input} Answer: "
    inputs = processor(image_hid, text=prompt, return_tensors="pt").to(device, torch.float16)
    # Generate the response
    generated_ids = model.generate(**inputs, max_new_tokens=10)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    print(f"generated_text is : {generated_text}")
    # Add the answer to the conversation state
    history = history or []
    response = generated_text
    history.append((input, response))
    counter += 1
    return history, history, "uploaded_image.png", counter, image_hid
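# For reference, a minimal standalone sketch of the same BLIP-2 Q&A call that predict() makes above
# (not used by the app; "example.jpg" is a hypothetical local file):
#
#   image = Image.open("example.jpg")
#   prompt = "Question: what is shown in this picture? Answer: "
#   inputs = processor(image, text=prompt, return_tensors="pt").to(device, torch.float16)
#   generated_ids = model.generate(**inputs, max_new_tokens=10)
#   print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip())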
# Blocks layout - keeping this CSS note here for the moment - "#chatbot-component .overflow-y-auto{height:800px}"
with gr.Blocks(css="#chatbot-component {height: 600px} #input_prompt {position: absolute; bottom: 0;}") as demo:
    with gr.Row():
        with gr.Column(scale=1):
            #with gr.Accordion("See details"):
gr.HTML("""<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<div
style="
display: inline-flex;
align-items: center;
gap: 0.8rem;
font-size: 1.75rem;
"
>
<h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
Bringing Visual Conversations to Life with BLIP2
</h1>
</div>
<p style="margin-bottom: 10px; font-size: 94%">
Blip2 is functioning as an <b>instructed zero-shot image-to-text generation</b> model using OPT-2.7B in this Space.
It shows a wide range of capabilities including visual conversation, visual knowledge reasoning, visual commensense reasoning, storytelling,
personalized image-to-text generation etc.<br>
BLIP-2 by <a href="https://huggingface.co/Salesforce" target="_blank">Salesforce</a> is now available in🤗Transformers!
This model was contributed by <a href="https://twitter.com/NielsRogge" target="_blank">nielsr</a>.
The BLIP-2 model was proposed in <a href="https://arxiv.org/abs/2301.12597" target="_blank">BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models</a>
by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.<br><br>
</p></div>""")
gr.HTML("""<a href="https://huggingface.co/spaces/ysharma/InstructPix2Pix_Chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate Space with GPU Upgrade for fast Inference & no queue<br>""")
        with gr.Column(elem_id="column_container", scale=2):
            btn_upload = gr.UploadButton("Upload image!", file_types=["image"], file_count="single", elem_id="upload_button")
            text_in = gr.Textbox(value='', placeholder="Type your questions here and press enter", elem_id="input_prompt", visible=False, label='Great! Now you can ask questions to get more information about the image')
            chatbot = gr.Chatbot(elem_id='chatbot-component', label='Converse with Images')
            # Hidden state/value holders shared across the event handlers
            state_in = gr.State()
            counter_out = gr.Number(visible=False, value=0, precision=0)
            text_out = gr.Textbox(visible=False)  # holds the saved image filename
            image_hid = gr.Image(visible=False)  #, type='pil')
    # Event listeners
    # On image upload: run predict() to register the image, then reveal the question textbox
    btn_upload.upload(predict, [btn_upload, counter_out, image_hid, text_in, state_in], [chatbot, state_in, text_out, counter_out, image_hid])
    btn_upload.upload(fn=update_textbox_config, inputs=text_in, outputs=text_in)
    # On question submit: run predict() to answer; once the chat updates, hide the upload button
    text_in.submit(predict, [btn_upload, counter_out, image_hid, text_in, state_in], [chatbot, state_in, text_out, counter_out, image_hid])
    chatbot.change(fn=upload_button_config, outputs=btn_upload)  #, scroll_to_output = True)
demo.queue(concurrency_count=10)
demo.launch(debug=True) #, width="80%", height=2000)