"""Gradio front end for a BLIP-2 visual question answering chatbot.

The app keeps a flat conversation history (question, answer, question, ...),
turns it into a text prompt, and sends the prompt together with the uploaded
image to a remote generation endpoint.
"""

import json
from io import BytesIO

import gradio as gr
import requests
from PIL import Image


def encode_image(image):
    """Serialize ``image`` as JPEG into an in-memory buffer ready for upload.

    Args:
        image: Object exposing a PIL-style ``save(fp, format=...)`` method.

    Returns:
        A ``BytesIO`` holding the JPEG bytes, positioned at offset 0.
    """
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    # Saving leaves the cursor at the end of the buffer; rewind so that the
    # consumer reads the whole payload instead of zero bytes.
    buffered.seek(0)
    return buffered


def query_api(image, prompt, decoding_method):
    """Send the image and prompt to the generation endpoint.

    Args:
        image: Image to upload; it is JPEG-encoded via ``encode_image``.
        prompt: Text prompt built from the conversation history.
        decoding_method: Name of the decoding strategy; nucleus sampling is
            requested only when this equals ``"Nucleus sampling"``.

    Returns:
        The decoded JSON body when the server answers with status 200,
        otherwise a string starting with ``"Error: "`` followed by the
        response text.
    """
    # local host for testing
    # NOTE(review): the endpoint is empty in this copy of the file and must be
    # set to a real URL before the request can succeed.
    url = ""

    headers = {
        'User-Agent': 'BLIP-2 HuggingFace Space'
    }

    data = {
        "prompt": prompt,
        "use_nucleus_sampling": decoding_method == "Nucleus sampling",
    }

    files = {"image": encode_image(image)}

    response = requests.post(url, data=data, files=files, headers=headers)

    if response.status_code == 200:
        return response.json()
    return "Error: " + response.text


def prepend_question(text):
    """Return ``text`` stripped, lower-cased and tagged as a question."""
    return "question: " + text.strip().lower()


def prepend_answer(text):
    """Return ``text`` stripped, lower-cased and tagged as an answer."""
    return "answer: " + text.strip().lower()


def get_prompt_from_history(history):
    """Build the model prompt from the flat conversation history.

    Entries at even positions are treated as questions and entries at odd
    positions as answers; the tagged lines are joined with newlines.
    """
    return "\n".join(
        prepend_question(turn) if index % 2 == 0 else prepend_answer(turn)
        for index, turn in enumerate(history)
    )


def _drop_prefix(text, prefixes):
    """Return ``text`` without the first entry of ``prefixes`` it starts with."""
    for prefix in prefixes:
        if text.startswith(prefix):
            # Slice by the real prefix length so no stray space is left over.
            return text[len(prefix):]
    return text


def postp_answer(text):
    """Strip a leading ``"answer: "`` or ``"a: "`` tag from a model answer."""
    return _drop_prefix(text, ("answer: ", "a: "))


def prep_question(text):
    """Normalize a user question before it is stored in the history.

    A leading ``"question: "`` or ``"q: "`` tag is removed and a trailing
    ``"?"`` is appended when the text does not already end with one.
    """
    text = _drop_prefix(text, ("question: ", "q: "))
    if not text.endswith("?"):
        text += "?"
    return text


def _pair_turns(history):
    """Group the flat history into (question, answer) tuples for display."""
    # zip stops at the shorter slice, so a trailing unanswered question is
    # left out of the result.
    return list(zip(history[::2], history[1::2]))


def inference(image, text_input, decoding_method, history=None):
    """Run one chat turn and return the updated conversation.

    Args:
        image: Image the question refers to.
        text_input: Raw question typed by the user.
        decoding_method: Decoding strategy name forwarded to ``query_api``.
        history: Flat list of previous turns (question, answer, ...). A new
            list is created when ``None``; a supplied list is extended in
            place.

    Returns:
        A ``(chat, history)`` pair: ``chat`` is a list of (question, answer)
        tuples for the chatbot display, ``history`` is the flat turn list to
        carry over as state.
    """
    # A fresh list per call: a mutable default would be shared by every call
    # that omits ``history``.
    if history is None:
        history = []

    question = prep_question(text_input)
    history.append(question)

    prompt = get_prompt_from_history(history)
    output = query_api(image, prompt, decoding_method)

    if isinstance(output, str):
        # query_api reports failures as a plain string. Show the full message
        # for this turn, but keep the failed exchange out of the stored
        # history so later prompts stay well-formed.
        history.pop()
        return _pair_turns(history) + [(question, output)], history

    history.append(postp_answer(output[0]))
    return _pair_turns(history), history


inputs = [
    gr.inputs.Image(type='pil'),
    gr.inputs.Textbox(lines=2, label="Text input"),
    gr.inputs.Radio(
        choices=['Nucleus sampling', 'Beam search'],
        type="value",
        default="Nucleus sampling",
        label="Text Decoding Method",
    ),
    "state",
]

outputs = ["chatbot", "state"]

title = "BLIP-2"

description = """Gradio demo for BLIP-2, a multimodal chatbot from Salesforce Research. To use it, simply upload your image, or click one of the examples to load them. Please visit our project webpage.

Disclaimer: This is a research prototype and is not intended for production use. No data including but not restricted to text and images is collected.

"""

article = (
    "\n\nBLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image "
    "Encoders and Large Language Models"
)

iface = gr.Interface(
    inference,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
)
iface.launch(enable_queue=True)