import spaces
import argparse
import torch
import re
import gradio as gr
from threading import Thread
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
from PIL import Image

# Load the moondream2 vision-language model, pinned to a fixed revision so
# trust_remote_code updates cannot silently change behavior.
model_id = "vikhyat/moondream2"
revision = "2024-04-02"
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
moondream = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, revision=revision, torch_dtype=torch.float32
)
moondream.eval()


@spaces.GPU(duration=10)
def answer_question(images, prompts):
    """Answer the same question about each uploaded image in one batch.

    Args:
        images: Files from the ``gr.File`` multi-upload (file objects / paths),
            or a single PIL image; ``None`` when nothing was uploaded.
        prompts: The question text (a single string from the input Textbox).

    Returns:
        One string with one answer per line, in upload order ("" if no images).
    """
    if not images:
        return ""
    if not isinstance(images, (list, tuple)):
        images = [images]
    # gr.File yields file objects or paths; convert each to a PIL image.
    pil_images = [
        img if isinstance(img, Image.Image) else Image.open(getattr(img, "name", img))
        for img in images
    ]
    # BUG FIX: the original pre-encoded with encode_image() and passed the
    # torch.cat of embeddings as `images=` — batch_answer expects raw PIL
    # images and performs the encoding itself.
    answers = moondream.batch_answer(
        images=pil_images,
        # batch_answer needs one prompt per image; the UI supplies a single
        # question, so ask it of every image.
        prompts=[prompts] * len(pil_images),
        tokenizer=tokenizer,
    )
    # The output component is a single Textbox, so join answers line-by-line.
    return "\n".join(answers)


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 🌔 moondream2
        A tiny vision language model. [GitHub](https://github.com/vikhyat/moondream)
        """
    )
    with gr.Row():
        prompts = gr.Textbox(label="Input", placeholder="Type here...", scale=4)
        submit = gr.Button("Submit")
    with gr.Row():
        # BUG FIX: gr.Image has no `multiple` kwarg; gr.File with
        # file_count="multiple" is the supported multi-upload component.
        images = gr.File(
            label="Upload Images", file_count="multiple", file_types=["image"]
        )
        # BUG FIX: gr.Textbox has no `multiple` kwarg; a multi-line box holds
        # one answer per line instead.
        output = gr.Textbox(label="Response", lines=4)
    submit.click(answer_question, [images, prompts], output)
    prompts.submit(answer_question, [images, prompts], output)

demo.queue().launch()