# OtterHD-Demo / app.py
import base64
import datetime
import hashlib
import io
import json
import os

import gradio as gr
import requests
from PIL import Image

# Directory for conversation logs; the logging helpers below build paths under it.
LOGDIR = "log"

def decode_image(encoded_image: str) -> Image.Image:
    """Decode a base64 string produced by encode_image back into a PIL image."""
    decoded_bytes = base64.b64decode(encoded_image.encode("utf-8"))
    buffer = io.BytesIO(decoded_bytes)
    image = Image.open(buffer)
    return image

def encode_image(image: Image.Image, format: str = "PNG") -> str:
    """Encode a PIL image as a base64 string for the JSON payload."""
    with io.BytesIO() as buffer:
        image.save(buffer, format=format)
        encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return encoded_image

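
# A quick round-trip check for the two helpers above -- a minimal sketch; the
# 8x8 test image is invented for illustration and is not part of the demo:
#     img = Image.new("RGB", (8, 8), "red")
#     assert decode_image(encode_image(img)).size == img.size
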
def get_conv_log_filename():
    """Return the path of today's conversation log file under LOGDIR."""
    t = datetime.datetime.now()
    name = os.path.join(LOGDIR, f"{t.year}-{t.month:02d}-{t.day:02d}-conv.json")
    return name


def get_conv_image_dir():
    """Return the directory for logged images, creating it if needed."""
    name = os.path.join(LOGDIR, "images")
    os.makedirs(name, exist_ok=True)
    return name

def get_image_name(image, image_dir=None):
    """Name an image by the MD5 of its PNG bytes so identical uploads share a file."""
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    image_bytes = buffer.getvalue()
    md5 = hashlib.md5(image_bytes).hexdigest()
    if image_dir is not None:
        image_name = os.path.join(image_dir, md5 + ".png")
    else:
        image_name = md5 + ".png"
    return image_name

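
# The three logging helpers above are not wired into the demo yet. Below is a
# minimal sketch of how they could persist one conversation turn; log_turn and
# its record layout are assumptions, not part of the original app.
def log_turn(image: Image.Image, prompt: str, answer: str) -> None:
    # get_conv_image_dir() also creates LOGDIR itself via os.makedirs.
    image_path = get_image_name(image, get_conv_image_dir())
    if not os.path.exists(image_path):
        image.save(image_path)  # content-addressed name, so duplicates are skipped
    record = {"prompt": prompt, "answer": answer, "image": image_path}
    with open(get_conv_log_filename(), "a") as f:
        f.write(json.dumps(record) + "\n")
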
def resize_image(image, max_size):
    """Downscale so the longer side equals max_size while keeping the aspect ratio."""
    width, height = image.size
    aspect_ratio = float(width) / float(height)
    if width > height:
        new_width = max_size
        new_height = int(new_width / aspect_ratio)
    else:
        new_height = max_size
        new_width = int(new_height * aspect_ratio)
    resized_image = image.resize((new_width, new_height))
    return resized_image

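
# resize_image is not called anywhere in this demo, but it could shrink the
# upload before base64-encoding it. A hedged usage sketch (the 1024-pixel cap
# is an assumption):
#     thumb = resize_image(image_input, 1024)
#     base64_image_str = encode_image(thumb)
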
def http_bot(image_input, text_input, request: gr.Request):
    """Send the image/question pair to the OtterHD backend and return its answer."""
    print(f"http_bot. ip: {request.client.host}")
    print(f"Prompt request: {text_input}")
    base64_image_str = encode_image(image_input)
    payload = {
        "content": [
            {
                "prompt": text_input,
                "image": base64_image_str,
            }
        ],
        "token": "sk-OtterHD",
    }
    # Log only a prefix of the base64 payload to keep the console readable.
    print(
        "request: ",
        {
            "prompt": text_input,
            "image": base64_image_str[:10],
        },
    )
    url = "https://rouge-surrey-katrina-signatures.trycloudflare.com/app/otter"
    headers = {"Content-Type": "application/json"}
    response = requests.post(url, headers=headers, json=payload)
    response.raise_for_status()  # surface backend errors instead of a KeyError below
    results = response.json()
    print("response: ", {"result": results["result"]})
    return results["result"]

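
# For reference, the exchange http_bot performs (request schema read off the
# payload above; the response is assumed to carry only a "result" field):
#
#   POST /app/otter            Content-Type: application/json
#   {"content": [{"prompt": "<question>", "image": "<base64 PNG>"}],
#    "token": "sk-OtterHD"}
#
#   200 OK -> {"result": "<model answer>"}
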
title = """
# OTTER-HD: A High-Resolution Multi-modality Model
[[Otter Codebase]](https://github.com/Luodian/Otter) [[Paper]](https://arxiv.org/abs/2311.04219) [[Checkpoints & Benchmarks]](https://huggingface.co/Otter-AI)

**OtterHD** is a multimodal model fine-tuned from [Fuyu-8B](https://huggingface.co/adept/fuyu-8b) to facilitate a more fine-grained interpretation of high-resolution visual input *without an explicit vision encoder module*. All image patches are linearly transformed and processed together with text tokens. This is a very innovative and elegant exploration. Fascinated by this approach, we followed the same path: we open-sourced the fine-tuning script for Fuyu-8B and improved training throughput by 4-5x with [Flash-Attention-2](https://github.com/Dao-AILab/flash-attention).

**Tips**:
- High-res images are large, so transferring them from the HF Space to our backend server may take a while. Please be patient while waiting for the response.
- We are fine-tuning the model on a LLaVA-1.5/LRV/LLaVAR data mixture to balance detailed recognition against hallucination reduction. Stay tuned!
- Please do not upload NSFW images, and keep questions relevant to the uploaded image. We will ban the IP address of anyone we find misusing the demo.
"""
css = """
#mkd {
height: 1000px;
overflow: auto;
border: 1px solid #ccc;
}
"""
if __name__ == "__main__":
    with gr.Blocks(css=css) as demo:
        gr.Markdown(title)
        # Reserved for multi-turn dialogue; unused in this single-turn demo.
        dialog_state = gr.State()
        input_state = gr.State()
        with gr.Tab("Ask a Question"):
            with gr.Row(equal_height=True):
                with gr.Column(scale=2):
                    image_input = gr.Image(label="Upload a High-Res Image", type="pil")
                with gr.Column(scale=1):
                    vqa_output = gr.Textbox(label="Output")
                    text_input = gr.Textbox(label="Ask a Question")
                    vqa_btn = gr.Button("Send It")
            gr.Examples(
                [
                    [
                        "./assets/IMG_00095.png",
                        "How many camels are inside this image?",
                    ],
                    [
                        "./assets/IMG_00057.png",
                        "What's this image about?",
                    ],
                    [
                        "./assets/IMG_00040.png",
                        "What are the scene texts in this image?",
                    ],
                    [
                        "./assets/IMG_00012.png",
                        "How many apples are there? Count them row by row.",
                    ],
                    [
                        "./assets/IMG_00080.png",
                        "What is this and where is it from?",
                    ],
                    [
                        "./assets/IMG_00041.png",
                        "What are the scene texts in this image?",
                    ],
                ],
                inputs=[image_input, text_input],
                outputs=[vqa_output],
                fn=http_bot,
                label="Click on any Examples below👇",
            )
        vqa_btn.click(fn=http_bot, inputs=[image_input, text_input], outputs=vqa_output)
    demo.launch()