import base64
import os
import re
from io import BytesIO
from threading import Thread

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

SECRET_TOKEN = os.getenv('SECRET_TOKEN', 'default_secret')

# Regex pattern to match data URI scheme
data_uri_pattern = re.compile(r'data:image/(png|jpeg|jpg|webp);base64,')
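# e.g. data_uri_pattern.sub("", "data:image/png;base64,AAAA") == "AAAA"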

def readb64(b64):
    # Remove any data URI scheme prefix with regex
    b64 = data_uri_pattern.sub("", b64)
    # Decode and open the image with PIL
    img = Image.open(BytesIO(base64.b64decode(b64)))
    return img
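
# Quick sanity check for readb64 (a sketch, not executed by the app):
#
#   buf = BytesIO()
#   Image.new("RGB", (1, 1), "red").save(buf, format="PNG")
#   b64 = "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()
#   assert readb64(b64).size == (1, 1)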


#
# This version works in the official demo, but when I fork the space it
# doesn't work, and I'm not sure why:
#
#import subprocess
#subprocess.run('pip3 install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
#model_id = "vikhyatk/moondream2"
#revision = "2024-04-02"
#tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
#moondream = AutoModelForCausalLM.from_pretrained(
#    model_id, trust_remote_code=True, revision=revision,
#    torch_dtype=torch.bfloat16, device_map={"": "cuda"},
#    attn_implementation="flash_attention_2"
#)
#moondream.eval()

# So let's fall back to an older revision that loads without flash-attn
if torch.cuda.is_available():
    device, dtype = "cuda", torch.float16
else:
    device, dtype = "cpu", torch.float32
model_id = "vikhyatk/moondream2"
tokenizer = AutoTokenizer.from_pretrained(model_id, revision="2024-03-06")
moondream = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, revision="2024-03-06"
).to(device=device, dtype=dtype)
moondream.eval()
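
# For reference, a direct non-streaming call at this revision looks like this
# (a sketch; "example.jpg" is a placeholder):
#
#   img = Image.open("example.jpg")
#   enc = moondream.encode_image(img)
#   print(moondream.answer_question(enc, "Describe this image.", tokenizer))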

def answer_question(secret_token, image_b64, prompt):
    if secret_token != SECRET_TOKEN:
        raise gr.Error(
            'Invalid secret token. Please fork the original space if you want to use it for yourself.')

    img = readb64(image_b64)
    
    image_embeds = moondream.encode_image(img)
    
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    
    thread = Thread(
        target=moondream.answer_question,
        kwargs={
            "image_embeds": image_embeds,
            "question": prompt,
            "tokenizer": tokenizer,
            "streamer": streamer,
        },
    )
    thread.start()

    buffer = ""
    for new_text in streamer:
        
        # do we really need this?
        clean_text = re.sub("<$|<END$", "", new_text)
        
        buffer += new_text
    
    return buffer.strip()
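
# Local smoke test (a sketch; reuses the b64 string from the readb64 sanity
# check above and the default token from SECRET_TOKEN):
#
#   print(answer_question("default_secret", b64, "What color is the image?"))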

with gr.Blocks() as demo:
    gr.HTML("""
        <div style="z-index: 100; position: fixed; top: 0px; right: 0px; left: 0px; bottom: 0px; width: 100%; height: 100%; background: white; display: flex; align-items: center; justify-content: center; color: black;">
        <div style="text-align: center; color: black;">
        <p style="color: black;">This space is a headless component of the cloud rendering engine used by AiTube.</p>
        <p style="color: black;">It is not available for public use, but you can use the <a href="https://huggingface.co/spaces/vikhyatk/moondream2" target="_blank">original space</a>.</p>
        </div>
        </div>""")
    # The UI is hidden behind the overlay above; these components exist to
    # expose the API endpoint.
    token = gr.Textbox()
    image_b64 = gr.Textbox()
    prompt = gr.Textbox()
    submit = gr.Button()
    output = gr.Textbox()
    submit.click(answer_question, [token, image_b64, prompt], output)

demo.queue().launch()
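
# Calling the endpoint from another process (a sketch using gradio_client;
# the Space name and api_name are assumptions):
#
#   from gradio_client import Client
#   client = Client("your-username/your-space")
#   answer = client.predict(
#       "your-secret-token",        # secret_token
#       "<base64-encoded image>",   # image_b64
#       "Describe this image.",     # prompt
#       api_name="/predict",
#   )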