import torch
import re
import os
import gradio as gr
from threading import Thread
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
from PIL import Image
from io import BytesIO
import base64
SECRET_TOKEN = os.getenv('SECRET_TOKEN', 'default_secret')
# Regex pattern to match data URI scheme
data_uri_pattern = re.compile(r'data:image/(png|jpeg|jpg|webp);base64,')
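# Illustrative example (hypothetical payload): the pattern matches the prefix of
# "data:image/png;base64,iVBORw0..." so that sub() leaves only the raw base64 payload.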
def readb64(b64):
    # Remove any data URI scheme prefix with regex
    b64 = data_uri_pattern.sub("", b64)
    # Decode and open the image with PIL
    img = Image.open(BytesIO(base64.b64decode(b64)))
    return img
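# Quick round-trip sketch (illustrative only, not executed here): encode a tiny
# PIL image into a data URI and decode it back through readb64.
#   buf = BytesIO()
#   Image.new("RGB", (8, 8)).save(buf, format="PNG")
#   uri = "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()
#   assert readb64(uri).size == (8, 8)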
#
# This version works in the official demo, but not when I fork it, and I'm not sure why.
#
# import subprocess
# subprocess.run('pip3 install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
#
# model_id = "vikhyatk/moondream2"
# revision = "2024-04-02"
# tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
# moondream = AutoModelForCausalLM.from_pretrained(
#     model_id, trust_remote_code=True, revision=revision,
#     torch_dtype=torch.bfloat16, device_map={"": "cuda"},
#     attn_implementation="flash_attention_2"
# )
# moondream.eval()
#
# So let's use an older revision instead.
if torch.cuda.is_available():
    device, dtype = "cuda", torch.float16
else:
    device, dtype = "cpu", torch.float32
model_id = "vikhyatk/moondream2"
tokenizer = AutoTokenizer.from_pretrained(model_id, revision="2024-03-06")
moondream = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, revision="2024-03-06"
).to(device=device, dtype=dtype)
moondream.eval()
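# Non-streaming usage sketch, for reference (the handler below streams instead;
# "example.jpg" is a placeholder path):
#   img = Image.open("example.jpg")
#   image_embeds = moondream.encode_image(img)
#   print(moondream.answer_question(image_embeds, "Describe this image.", tokenizer))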
def answer_question(secret_token, image_base64, prompt):
    if secret_token != SECRET_TOKEN:
        raise gr.Error(
            'Invalid secret token. Please fork the original space if you want to use it for yourself.')
    img = readb64(image_base64)
    image_embeds = moondream.encode_image(img)
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    # Run generation on a background thread so we can consume the streamer here
    thread = Thread(
        target=moondream.answer_question,
        kwargs={
            "image_embeds": image_embeds,
            "question": prompt,
            "tokenizer": tokenizer,
            "streamer": streamer,
        },
    )
    thread.start()
    buffer = ""
    for new_text in streamer:
        # Strip stray end-of-sequence markers the model can emit mid-stream
        clean_text = re.sub("<$|<END$", "", new_text)
        buffer += clean_text
    return buffer.strip()
with gr.Blocks() as demo:
    gr.HTML("""
    <div style="z-index: 100; position: fixed; top: 0px; right: 0px; left: 0px; bottom: 0px; width: 100%; height: 100%; background: white; display: flex; align-items: center; justify-content: center; color: black;">
        <div style="text-align: center; color: black;">
            <p style="color: black;">This space is a headless component of the cloud rendering engine used by AiTube.</p>
            <p style="color: black;">It is not available for public use, but you can use the <a href="https://huggingface.co/spaces/vikhyatk/moondream2" target="_blank">original space</a>.</p>
        </div>
    </div>""")

    # Headless API surface: secret token, base64 image, and prompt in; answer out
    token = gr.Textbox()
    image_input = gr.Textbox()
    prompt = gr.Textbox()
    submit = gr.Button()
    output = gr.Textbox()

    submit.click(answer_question, [token, image_input, prompt], output)

demo.queue().launch()
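# Calling the space programmatically, a sketch assuming the gradio_client package
# (the three positional inputs map to token, base64 image, and prompt above):
#   from gradio_client import Client
#   client = Client("http://localhost:7860/")
#   result = client.predict(
#       "default_secret",              # must match SECRET_TOKEN
#       "data:image/png;base64,...",   # base64-encoded image (elided)
#       "Describe this image.",
#       fn_index=0,
#   )
#   print(result)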