import base64
import datetime
import hashlib
import io
import json
import os

import gradio as gr
import requests
from PIL import Image

LOGDIR = "log"  # log directory used by the conversation-logging helpers below
# logger = build_logger("otter", LOGDIR)
# no_change_btn = gr.Button.update()
# enable_btn = gr.Button.update(interactive=True)
# disable_btn = gr.Button.update(interactive=False)


def decode_image(encoded_image: str) -> Image.Image:
    """Decode a base64 string back into a PIL image."""
    decoded_bytes = base64.b64decode(encoded_image.encode("utf-8"))
    buffer = io.BytesIO(decoded_bytes)
    image = Image.open(buffer)
    return image


def encode_image(image: Image.Image, format: str = "PNG") -> str:
    """Serialize a PIL image to a base64-encoded string."""
    with io.BytesIO() as buffer:
        image.save(buffer, format=format)
        encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return encoded_image
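

# A minimal round-trip sanity check for the two helpers above (a sketch, not
# part of the original demo): encode a tiny image and confirm that decoding
# restores its size and mode.
def _roundtrip_selftest():
    img = Image.new("RGB", (8, 8), color=(255, 0, 0))
    restored = decode_image(encode_image(img))
    assert restored.size == img.size and restored.mode == img.mode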


def get_conv_log_filename():
    """Return the path of today's conversation log file."""
    t = datetime.datetime.now()
    name = os.path.join(LOGDIR, f"{t.year}-{t.month:02d}-{t.day:02d}-conv.json")
    return name


def get_conv_image_dir():
    """Return (creating it if needed) the directory for logged images."""
    name = os.path.join(LOGDIR, "images")
    os.makedirs(name, exist_ok=True)
    return name


def get_image_name(image, image_dir=None):
    """Build a content-addressed filename from the MD5 of the image's PNG bytes."""
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    image_bytes = buffer.getvalue()
    md5 = hashlib.md5(image_bytes).hexdigest()
    if image_dir is not None:
        image_name = os.path.join(image_dir, md5 + ".png")
    else:
        image_name = md5 + ".png"
    return image_name
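

# The three logging helpers above are not called anywhere in this demo. A
# hedged sketch of how they could be wired together to record one conversation
# turn (the record layout here is an assumption, not an original log format):
def _log_turn_sketch(image: Image.Image, prompt: str, reply: str) -> None:
    image_path = get_image_name(image, get_conv_image_dir())
    if not os.path.exists(image_path):
        image.save(image_path)  # content-addressed: identical images share one file
    record = {
        "time": datetime.datetime.now().isoformat(),
        "prompt": prompt,
        "reply": reply,
        "image": image_path,
    }
    with open(get_conv_log_filename(), "a") as f:
        f.write(json.dumps(record) + "\n")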


def resize_image(image, max_size):
    """Downscale so the longer side equals max_size, preserving the aspect ratio."""
    width, height = image.size
    aspect_ratio = float(width) / float(height)
    if width > height:
        new_width = max_size
        new_height = int(new_width / aspect_ratio)
    else:
        new_height = max_size
        new_width = int(new_height * aspect_ratio)
    resized_image = image.resize((new_width, new_height))
    return resized_image
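

# Worked example (a sketch, not part of the original demo): with max_size=1024,
# a 4000x3000 input becomes 1024x768, preserving the 4:3 aspect ratio.
def _resize_demo():
    img = Image.new("RGB", (4000, 3000))
    assert resize_image(img, 1024).size == (1024, 768)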


def http_bot(image_input, text_input, request: gr.Request):
    """Send the image and prompt to the OtterHD backend and return its reply."""
    print(f"http_bot. ip: {request.client.host}")
    print(f"Prompt request: {text_input}")
    base64_image_str = encode_image(image_input)
    payload = {
        "content": [
            {
                "prompt": text_input,
                "image": base64_image_str,
            }
        ],
        "token": "sk-OtterHD",
    }
    # Log only a prefix of the base64 payload to keep the console readable.
    print(
        "request: ",
        {
            "prompt": text_input,
            "image": base64_image_str[:10],
        },
    )
    url = "https://rouge-surrey-katrina-signatures.trycloudflare.com/app/otter"
    headers = {"Content-Type": "application/json"}
    response = requests.post(url, headers=headers, data=json.dumps(payload))
    results = response.json()
    print("response: ", {"result": results["result"]})
    return results["result"]
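

# A hedged sketch (not part of the original demo) of a defensive wrapper around
# http_bot: requests.post raises RequestException subclasses on network
# failures, response.json() raises ValueError on non-JSON bodies, and
# results["result"] raises KeyError if the backend reply is malformed.
def http_bot_safe(image_input, text_input, request: gr.Request):
    try:
        return http_bot(image_input, text_input, request)
    except (requests.RequestException, KeyError, ValueError) as e:
        return f"Backend request failed: {e}"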


title = """
# OTTER-HD: A High-Resolution Multi-modality Model

[[Otter Codebase]](https://github.com/Luodian/Otter) [[Paper]](https://arxiv.org/abs/2311.04219) [[Checkpoints & Benchmarks]](https://huggingface.co/Otter-AI)

**OtterHD** is a multimodal model fine-tuned from [Fuyu-8B](https://huggingface.co/adept/fuyu-8b) for more fine-grained interpretation of high-resolution visual input, *without an explicit vision encoder module*: all image patches are linearly transformed and processed together with text tokens. We find this a very innovative and elegant design. Following this direction, we open-sourced a fine-tuning script for Fuyu-8B and improved training throughput by 4-5x with [Flash-Attention-2](https://github.com/Dao-AILab/flash-attention).

**Tips**:
- High-resolution images are large, so transfer from the HF Space to our backend server can take a while. Please be patient while waiting for a response.
- We are working on fine-tuning the model on a LLaVA-1.5/LRV/LLaVAR data mixture to balance detailed recognition against hallucination reduction. Stay tuned!
- Please do not upload NSFW images or ask inappropriate questions. We will ban the IP address of anyone found misusing the demo.
"""

css = """
#mkd {
    height: 1000px;
    overflow: auto;
    border: 1px solid #ccc;
}
"""

if __name__ == "__main__":
    with gr.Blocks(css=css) as demo:
        gr.Markdown(title)
        dialog_state = gr.State()
        input_state = gr.State()
        with gr.Tab("Ask a Question"):
            with gr.Row(equal_height=True):
                with gr.Column(scale=2):
                    image_input = gr.Image(label="Upload a High-Res Image", type="pil")
                with gr.Column(scale=1):
                    vqa_output = gr.Textbox(label="Output")
                    text_input = gr.Textbox(label="Ask a Question")
                    vqa_btn = gr.Button("Send It")
            gr.Examples(
                [
                    ["./assets/IMG_00095.png", "How many camels are inside this image?"],
                    ["./assets/IMG_00057.png", "What's this image about?"],
                    ["./assets/IMG_00040.png", "What are the scene texts in this image?"],
                    ["./assets/IMG_00012.png", "How many apples are there? Count them row by row."],
                    ["./assets/IMG_00080.png", "What is this and where is it from?"],
                    ["./assets/IMG_00041.png", "What are the scene texts in this image?"],
                ],
                inputs=[image_input, text_input],
                outputs=[vqa_output],
                fn=http_bot,
                label="Click on any example below 👇",
            )
        vqa_btn.click(fn=http_bot, inputs=[image_input, text_input], outputs=vqa_output)

    demo.launch()