"""Gradio demo: chat about an uploaded image using BLIP-2 (OPT-2.7B).

The first interaction uploads an image, which is resized, saved to disk and
cached in a hidden image component. Every later interaction sends the typed
question plus the cached image to BLIP-2 and appends the answer to the chat.
"""
import requests
from PIL import Image
import gradio as gr
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch

# Alternative stylesheet. It is not passed to gr.Blocks below (an inline CSS
# string is used there); the value is kept unchanged for reference.
css = (
    " #column_container {"
    " position: relative; height: 800px; max-width: 700px;"
    " display: flex; flex-direction: column;"
    " background-color: lightgray; border: 1px solid gray; border-radius: 5px;"
    " padding: 10px; box-shadow: 2px 2px 5px gray;"
    " margin-left: auto; margin-right: auto; }"
    " #input_prompt { position: fixed; bottom: 0; max-width: 680px; }"
    " #chatbot-component { overflow: auto; } "
)

MODEL_ID = "Salesforce/blip2-opt-2.7b"
UPLOADED_IMAGE_NAME = "uploaded_image.png"
RESIZE_WIDTH = 512

device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is only used on GPU; on CPU the model is kept in float32 so
# that generation does not depend on float16 CPU kernels.
model_dtype = torch.float16 if device == "cuda" else torch.float32

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = Blip2ForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype=model_dtype)
model.to(device)


def upload_button_config():
    """Return a component update that hides the upload button."""
    return gr.update(visible=False)


def update_textbox_config(text_in):
    """Return a component update that makes the question textbox visible.

    ``text_in`` is the current textbox value supplied by the event listener;
    it is not used.
    """
    return gr.update(visible=True)


def _load_resized_image(uploaded_file, base_width=RESIZE_WIDTH):
    """Open ``uploaded_file`` and return a copy scaled to ``base_width`` pixels wide.

    The height is scaled by the same factor as the width and clamped to at
    least one pixel so that extremely wide images cannot produce a zero height.
    """
    with Image.open(uploaded_file) as original:
        scale = base_width / float(original.size[0])
        new_height = max(1, int(float(original.size[1]) * scale))
        # resize() returns a new image, so the opened source can be closed.
        return original.resize((base_width, new_height))


def _answer_question(image, question):
    """Run BLIP-2 on ``image`` with ``question`` and return the decoded answer."""
    # Prompt format: "Question: Is this photo unusual? Answer: "
    prompt = f"Question: {question} Answer: "
    inputs = processor(image, text=prompt, return_tensors="pt").to(device, model_dtype)
    generated_ids = model.generate(**inputs, max_new_tokens=10)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()


# Takes input and generates the response.
def predict(btn_upload, counter, image_hid, input, history):
    """Handle one chat turn.

    Args:
        btn_upload: The uploaded file from the upload button; only read on the
            first turn (``counter == 0``).
        counter: Number of turns handled so far.
        image_hid: The image cached in the hidden image component; used as the
            model input on every turn after the first.
        input: The text typed by the user.
        history: List of ``(input, response)`` pairs, or ``None`` when empty.

    Returns:
        A tuple ``(history, history, image_name, counter + 1, image)`` matching
        the outputs wired to the event listeners (chatbot, state, hidden
        textbox, counter, hidden image).
    """
    history = history or []

    if counter == 0:
        # First turn: resize the upload, store it on disk and cache it.
        image_in = _load_resized_image(btn_upload)
        image_in.save(UPLOADED_IMAGE_NAME)
        # NOTE(review): this literal only contains newlines; it looks like
        # HTML markup (presumably an image tag for the saved file) was lost
        # from the source. Confirm against the upstream version.
        response = "\n\n"
        history.append((input, response))
        return history, history, UPLOADED_IMAGE_NAME, counter + 1, image_in

    # Later turns: answer the question about the cached image.
    print(f"prompt is :{input}")
    generated_text = _answer_question(image_hid, input)
    print(f"generated_text is : {generated_text}")

    history.append((input, generated_text))
    return history, history, UPLOADED_IMAGE_NAME, counter + 1, image_hid


# Blocks Layout - leaving this here for moment - "#chatbot-component .overflow-y-auto{height:800px}"
with gr.Blocks(css="#chatbot-component {height: 600px} #input_prompt {position: absolute; bottom: 0;}") as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.HTML("""

Bringing Visual Conversations to Life with BLIP2

Blip2 is functioning as an instructed zero-shot image-to-text generation model using OPT-2.7B in this Space. It shows a wide range of capabilities including visual conversation, visual knowledge reasoning, visual commensense reasoning, storytelling, personalized image-to-text generation etc.
BLIP-2 by Salesforce is now available in🤗Transformers! This model was contributed by nielsr. The BLIP-2 model was proposed in BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.

""")
            gr.HTML("""

Duplicate Space with GPU Upgrade for fast Inference & no queue
""")
        with gr.Column(elem_id="column_container", scale=2):
            btn_upload = gr.UploadButton(
                "Upload image!",
                file_types=["image"],
                file_count="single",
                elem_id="upload_button",
            )
            # Hidden until an image has been uploaded.
            text_in = gr.Textbox(
                value='',
                placeholder="Type your questions here and press enter",
                elem_id="input_prompt",
                visible=False,
                label='Great! Now you can ask questions to get more information about the image',
            )
            chatbot = gr.Chatbot(elem_id='chatbot-component', label='Converse with Images')
            state_in = gr.State()
            # Hidden components carrying per-session data between turns.
            counter_out = gr.Number(visible=False, value=0, precision=0)
            text_out = gr.Textbox(visible=False)  # getting image name out
            image_hid = gr.Image(visible=False)

    # Using Event Listeners
    predict_inputs = [btn_upload, counter_out, image_hid, text_in, state_in]
    predict_outputs = [chatbot, state_in, text_out, counter_out, image_hid]

    btn_upload.upload(predict, predict_inputs, predict_outputs)
    btn_upload.upload(fn=update_textbox_config, inputs=text_in, outputs=text_in)
    text_in.submit(predict, predict_inputs, predict_outputs)
    # Once the chat shows its first entry, hide the upload button.
    chatbot.change(fn=upload_button_config, outputs=btn_upload)

demo.queue(concurrency_count=10)
demo.launch(debug=True)