# Chat_With_Blip2 / app.py
import requests
from PIL import Image
import gradio as gr
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch
css = """
#column_container {
position: relative;
height: 800px;
max-width: 700px;
display: flex;
flex-direction: column;
background-color: lightgray;
border: 1px solid gray;
border-radius: 5px;
padding: 10px;
box-shadow: 2px 2px 5px gray;
margin-left: auto;
margin-right: auto;
}
#input_prompt {
position: fixed;
bottom: 0;
max-width: 680px;
}
#chatbot-component {
overflow: auto;
}
"""
# Load the BLIP-2 (OPT-2.7B) processor and model in half precision, then move the model to GPU if available
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
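
# Optional quick sanity check (a minimal sketch, kept commented out so it does not run at
# Space startup). It mirrors the generate call used in predict() below; the COCO image URL
# is an illustrative assumption only, not something this Space relies on.
# sample_image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
# sample_inputs = processor(sample_image, text="Question: How many cats are there? Answer: ", return_tensors="pt").to(device, torch.float16)
# sample_ids = model.generate(**sample_inputs, max_new_tokens=10)
# print(processor.batch_decode(sample_ids, skip_special_tokens=True)[0].strip())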

def upload_button_config():
    # Hide the upload button once an image has been uploaded
    return gr.update(visible=False)

def update_textbox_config(text_in):
    # Reveal the question textbox once an image has been uploaded
    return gr.update(visible=True)

# Takes the user input (and the uploaded image on the first turn) and generates the response
def predict(btn_upload, counter, image_hid, input, history):
    if counter == 0:
        image_in = Image.open(btn_upload)
        # Resize the image to a 512px width, keeping the aspect ratio
        basewidth = 512
        wpercent = basewidth / float(image_in.size[0])
        hsize = int(float(image_in.size[1]) * wpercent)
        image_in = image_in.resize((basewidth, hsize))  # , Image.Resampling.LANCZOS)
        # Save the image so it can be rendered inside the chatbot
        img_name = "uploaded_image.png"
        image_in.save(img_name)
        # Add state
        history = history or []
        response = '<img src="/file=' + img_name + '">'
        history.append((input, response))
        counter += 1
        return history, history, img_name, counter, image_in

    # Process the prompt
    print(f"prompt is : {input}")
    # Format the prompt as - Question: Is this photo unusual? Answer:
    prompt = f"Question: {input} Answer: "
    inputs = processor(image_hid, text=prompt, return_tensors="pt").to(device, torch.float16)
    # Generate the response
    generated_ids = model.generate(**inputs, max_new_tokens=10)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    print(f"generated_text is : {generated_text}")
    # Add state
    history = history or []
    response = generated_text
    history.append((input, response))
    counter += 1
    return history, history, "uploaded_image.png", counter, image_hid

# Blocks layout - leaving this here for the moment - "#chatbot-component .overflow-y-auto{height:800px}"
with gr.Blocks(css="#chatbot-component {height: 600px} #input_prompt {position: absolute; bottom: 0;}") as demo:
    with gr.Row():
        with gr.Column(scale=1):
            #with gr.Accordion("See details"):
gr.HTML("""<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<div
style="
display: inline-flex;
align-items: center;
gap: 0.8rem;
font-size: 1.75rem;
"
>
<h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
Bringing Visual Conversations to Life with BLIP2
</h1>
</div>
<p style="margin-bottom: 10px; font-size: 94%">
Blip2 is functioning as an <b>instructed zero-shot image-to-text generation</b> model using OPT-2.7B in this Space.
It shows a wide range of capabilities including visual conversation, visual knowledge reasoning, visual commensense reasoning, storytelling,
personalized image-to-text generation etc.<br>
BLIP-2 by <a href="https://huggingface.co/Salesforce" target="_blank">Salesforce</a> is now available in🤗Transformers!
This model was contributed by <a href="https://twitter.com/NielsRogge" target="_blank">nielsr</a>.
The BLIP-2 model was proposed in <a href="https://arxiv.org/abs/2301.12597" target="_blank">BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models</a>
by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.<br><br>
</p></div>""")
gr.HTML("""<a href="https://huggingface.co/spaces/ysharma/InstructPix2Pix_Chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate Space with GPU Upgrade for fast Inference & no queue<br>""")
        with gr.Column(elem_id="column_container", scale=2):
            btn_upload = gr.UploadButton("Upload image!", file_types=["image"], file_count="single", elem_id="upload_button")
            text_in = gr.Textbox(value='', placeholder="Type your questions here and press enter", elem_id="input_prompt", visible=False, label='Great! Now you can ask questions to get more information about the image')
            chatbot = gr.Chatbot(elem_id='chatbot-component', label='Converse with Images')
            state_in = gr.State()
            counter_out = gr.Number(visible=False, value=0, precision=0)
            text_out = gr.Textbox(visible=False)  # getting the saved image name out
            image_hid = gr.Image(visible=False)  # , type='pil')

    # Using event listeners
    # On upload: run predict to echo the image into the chat, then reveal the question textbox
    btn_upload.upload(predict, [btn_upload, counter_out, image_hid, text_in, state_in], [chatbot, state_in, text_out, counter_out, image_hid])
    btn_upload.upload(fn=update_textbox_config, inputs=text_in, outputs=text_in)
    # On submit: run predict to answer the question about the uploaded image
    text_in.submit(predict, [btn_upload, counter_out, image_hid, text_in, state_in], [chatbot, state_in, text_out, counter_out, image_hid])
    # Once the chat has content, hide the upload button
    chatbot.change(fn=upload_button_config, outputs=btn_upload)  # , scroll_to_output=True)
demo.queue(concurrency_count=10)
demo.launch(debug=True) #, width="80%", height=2000)