import requests
from PIL import Image
import gradio as gr
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch
# Stylesheet for the chat layout: a fixed-size, horizontally centred column container,
# a prompt box pinned to the bottom of the viewport, and a scrollable chatbot area.
# NOTE(review): the gr.Blocks(...) call visible below passes its own inline CSS string
# rather than this variable — confirm whether `css` is still used further down the file.
css = """
#column_container {
position: relative;
height: 800px;
max-width: 700px;
display: flex;
flex-direction: column;
background-color: lightgray;
border: 1px solid gray;
border-radius: 5px;
padding: 10px;
box-shadow: 2px 2px 5px gray;
margin-left: auto;
margin-right: auto;
}
#input_prompt {
position: fixed;
bottom: 0;
max-width: 680px;
}
#chatbot-component {
overflow: auto;
}
"""
# Load the BLIP-2 (OPT-2.7B) processor and half-precision model once at import time.
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype=torch.float16,
)

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)
def upload_button_config():
    """Build a Gradio update payload that sets ``visible=False`` on its target component."""
    hidden = gr.update(visible=False)
    return hidden
def update_textbox_config(text_in):
    """Build a Gradio update payload that sets ``visible=True`` on its target component.

    ``text_in`` is accepted so the function can be wired to an event that
    supplies one input, but its value is not used.
    """
    shown = gr.update(visible=True)
    return shown
# Takes the user's input and generates the response
def _resize_to_width(image, basewidth):
    """Return a copy of *image* scaled to *basewidth* pixels wide, keeping its aspect ratio."""
    wpercent = basewidth / float(image.size[0])
    # Clamp to 1 px: truncation yields 0 for extremely wide images, which resize() rejects.
    hsize = max(1, int(float(image.size[1]) * float(wpercent)))
    return image.resize((basewidth, hsize))


def predict(btn_upload, counter, image_hid, input, history):
    """Handle one chat turn: ingest the uploaded image first, then answer prompts.

    On the first call (``counter == 0``) the uploaded file is opened, resized
    to a width of 512 px and saved as ``uploaded_image.png``; the model is not
    called and the turn is recorded with an empty response. On every later
    call the prompt is wrapped as ``Question: <input> Answer: `` and the model
    generates up to 10 new tokens for the image held in ``image_hid``.

    Args:
        btn_upload: Path or file object accepted by ``Image.open``; only read
            when ``counter == 0``.
        counter: Number of turns handled so far.
        image_hid: Image returned by the first call; only read when
            ``counter != 0``.
        input: The user's message for this turn (the name shadows the builtin
            but is kept because it is part of the call signature).
        history: List of ``(input, response)`` tuples, or a falsy value to
            start a new list.

    Returns:
        A tuple ``(history, history, image_path, counter + 1, image)``.

    Raises:
        ValueError: If ``counter == 0`` and no file was uploaded.
    """
    # NOTE: the file name is fixed, so every call writes to / refers to the same path.
    img_name = "uploaded_image.png"

    if counter == 0:
        if btn_upload is None:
            raise ValueError("No image was uploaded; upload an image before chatting.")
        # The context manager releases the file handle Pillow holds once the
        # resized copy (which no longer depends on the file) has been made.
        with Image.open(btn_upload) as uploaded:
            image_out = _resize_to_width(uploaded, 512)
        image_out.save(img_name)
        response = ''
    else:
        print(f"prompt is :{input}")
        # Prompt format expected here - Question: Is this photo unusual? Answer:
        prompt = f"Question: {input} Answer: "
        # Cast the inputs to the dtype the model was loaded with so they always match.
        inputs = processor(image_hid, text=prompt, return_tensors="pt").to(device, model.dtype)
        generated_ids = model.generate(**inputs, max_new_tokens=10)
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
        print(f"generated_text is : {generated_text}")
        response = generated_text
        image_out = image_hid

    # Record the turn; the history is returned twice, as in the original output tuple.
    history = history or []
    history.append((input, response))
    counter += 1
    return history, history, img_name, counter, image_out
#Blocks Layout - leaving this here for the moment - "#chatbot-component .overflow-y-auto{height:800px}"
with gr.Blocks(css="#chatbot-component {height: 600px} #input_prompt {position: absolute; bottom: 0;}") as demo:
with gr.Row():
with gr.Column(scale=1):
#with gr.Accordion("See details"):
gr.HTML("""
Blip2 is functioning as an instructed zero-shot image-to-text generation model using OPT-2.7B in this Space.
It shows a wide range of capabilities including visual conversation, visual knowledge reasoning, visual commensense reasoning, storytelling,
personalized image-to-text generation etc.
BLIP-2 by Salesforce is now available in🤗Transformers!
This model was contributed by nielsr.
The BLIP-2 model was proposed in BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models
by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.