File size: 1,500 Bytes
eb99994
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import spaces
import argparse
import torch
import re
import gradio as gr
from threading import Thread
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
from PIL import Image

# Command-line parser object; no arguments are registered on it in this part
# of the file.
parser = argparse.ArgumentParser()

# Hugging Face Hub repository and the pinned revision used for both the
# tokenizer and the model weights, so the two always match.
model_id = "vikhyat/moondream2"
revision = "2024-04-02"

tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

# trust_remote_code=True lets transformers execute the modelling code shipped
# in the model repository. Weights are loaded in float32.
# NOTE(review): the model is never moved to a device here even though
# answer_question is decorated with spaces.GPU -- confirm that is intended.
moondream = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=revision,
    trust_remote_code=True,
    torch_dtype=torch.float32,
)
moondream.eval()

@spaces.GPU(duration=10)
def answer_question(images, prompts):
    """Answer prompts about one or more images with the moondream model.

    Args:
        images: A single image, or a list/tuple of images, each of which is
            passed to ``moondream.encode_image``. A ``gr.Image(type="pil")``
            input supplies one PIL image, or ``None`` when nothing has been
            uploaded.
        prompts: A single prompt string, or a list/tuple of prompts. A single
            string is reused for every image.

    Returns:
        The first answer (a single value) when a single image was given;
        otherwise the answers from ``moondream.batch_answer`` as a list.

    Raises:
        gr.Error: If no image was supplied.
    """
    single_image = not isinstance(images, (list, tuple))
    if images is None or (not single_image and len(images) == 0):
        # Surface a readable message in the UI instead of a TypeError from
        # iterating None / a RuntimeError from concatenating nothing.
        raise gr.Error("Please upload an image first.")

    # Normalise scalar inputs to batches: a lone PIL image is not iterable and
    # a lone string would otherwise be iterated character by character.
    if single_image:
        images = [images]
    if isinstance(prompts, str):
        prompts = [prompts] * len(images)

    # assumes each embedding carries a leading batch dimension so they can be
    # stacked along dim 0 -- TODO confirm against the pinned model revision
    image_embeds = torch.cat(
        [moondream.encode_image(img) for img in images], dim=0
    )
    # NOTE(review): batch_answer receives pre-computed embeddings via
    # `images=`, exactly as before; verify the pinned revision accepts them.
    answers = list(
        moondream.batch_answer(
            images=image_embeds,
            prompts=prompts,
            tokenizer=tokenizer,
        )
    )
    # Scalar in, scalar out: a single text output should get a plain string,
    # not a one-element list.
    return answers[0] if single_image else answers

# Gradio UI: a prompt box with a submit button on the first row, an image
# input and a response box on the second. Both clicking the button and pressing
# Enter in the prompt box call answer_question(images, prompts).
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 🌔 moondream2
        A tiny vision language model. [GitHub](https://github.com/vikhyat/moondream)
        """
    )
    with gr.Row():
        prompts = gr.Textbox(label="Input", placeholder="Type here...", scale=4)
        submit = gr.Button("Submit")
    with gr.Row():
        # `multiple` is not a parameter of gr.Image or gr.Textbox, so it is not
        # passed: recent Gradio releases reject unknown keyword arguments.
        images = gr.Image(type="pil", label="Upload Images")
        output = gr.Textbox(label="Response")
    submit.click(answer_question, [images, prompts], output)
    prompts.submit(answer_question, [images, prompts], output)

# Enable request queuing, then start the web server.
demo.queue().launch()