File size: 3,023 Bytes
6a8ca1f
 
 
 
 
04fc1f1
6a8ca1f
 
 
 
 
 
 
 
 
 
e9cc0b5
 
6a8ca1f
 
de50a7e
04fc1f1
ee5e19e
e27d897
db2ea29
 
ee5e19e
e27d897
ee5e19e
db2ea29
 
 
 
 
 
 
 
 
 
 
 
576d10c
db2ea29
 
 
 
 
 
 
 
 
6a8ca1f
 
 
 
 
e27d897
 
8a8a62b
e27d897
8a8a62b
db2ea29
6a8ca1f
 
ee5e19e
fefde70
 
 
6a8ca1f
 
69cfbe8
e9ecb71
69cfbe8
6a8ca1f
69cfbe8
6a8ca1f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import spaces
import torch
import re
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from PIL import Image

# Run on CUDA in half precision when a GPU is visible; otherwise fall back to
# CPU in full precision.
device, dtype = (
    ("cuda", torch.float16) if torch.cuda.is_available() else ("cpu", torch.float32)
)

# Checkpoint identifier and the revision that both the tokenizer and the
# model weights are loaded from.
model_id = "vikhyatk/moondream2"
revision = "2024-04-02"

tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

# trust_remote_code=True is passed so the model class shipped with the
# checkpoint repository is used.
moondream = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    revision=revision,
    torch_dtype=dtype,
)
moondream = moondream.to(device=device)
moondream.eval()  # inference only

@spaces.GPU
def answer_questions(image_tuples, prompt_text):
    """Ask every prompt about every uploaded image and tabulate the answers.

    Args:
        image_tuples: Gallery value. Each entry is indexed with ``[0]`` to
            obtain the image, which must support ``convert("RGB")``. Entries
            whose first element is ``None`` are skipped. ``None`` or an empty
            sequence is accepted and yields no rows.
        prompt_text: Prompts separated by commas. Surrounding whitespace is
            stripped and blank segments (e.g. from a trailing comma) are
            ignored. ``None`` is treated as an empty string.

    Returns:
        A dict ``{'headers': prompts, 'data': rows}``. Each row is
        ``[image_name, answer_to_prompt_1, answer_to_prompt_2, ...]`` where
        ``image_name`` is ``"image<N>"`` and ``N`` is the 1-based position of
        the image in ``image_tuples``.
    """
    # NOTE(review): each row carries a leading image-name cell that has no
    # counterpart in 'headers'; kept as-is for compatibility with existing
    # consumers -- confirm whether a name column header should be added.
    prompts = [
        cleaned
        for cleaned in (part.strip() for part in (prompt_text or "").split(','))
        if cleaned
    ]
    print(f"prompts\n{prompts}\n")

    # Remember each image's original 1-based position so row names still
    # match the upload order when empty entries are skipped, and convert to
    # RGB once here instead of once per prompt.
    numbered_images = [
        (position, entry[0].convert("RGB"))
        for position, entry in enumerate(image_tuples or [], start=1)
        if entry[0] is not None
    ]
    if not numbered_images:
        # Nothing to ask the model about; avoid calling it with empty batches.
        return {'headers': prompts, 'data': []}

    rgb_images = [image for _, image in numbered_images]

    # One batched model call per prompt: the same prompt is asked of every
    # image, so the number of prompts and images need not match.
    answers = [
        moondream.batch_answer(
            images=rgb_images,
            prompts=[prompt] * len(rgb_images),
            tokenizer=tokenizer,
        )
        for prompt in prompts
    ]

    # Transpose the per-prompt answer lists into one row per image.
    data = []
    for row_index, (position, _) in enumerate(numbered_images):
        image_answers = [per_prompt[row_index] for per_prompt in answers]
        print(f"image{position}_answers \n {image_answers} \n")
        data.append([f"image{position}"] + image_answers)

    return {'headers': prompts, 'data': data}

# Build the UI: an image gallery and a comma-separated prompt box feed
# answer_questions; its return value is shown in the "Responses" text area.
with gr.Blocks() as demo:
    gr.Markdown("# moondream2 unofficial batch processing demo")
    # The handler asks every prompt about every image, so the instructions
    # say so rather than asking for one prompt per image.
    gr.Markdown("1. Select images\n2. Enter prompts separated by commas (every prompt is asked about every image). Ex: Describe this image, What is in this image?\n\n")
    gr.Markdown("*Tested and Running on free CPU space tier currently so results may take a bit to process compared to using GPU space hardware*")
    gr.Markdown("## 🌔 moondream2\nA tiny vision language model. [GitHub](https://github.com/vikhyatk/moondream)")
    with gr.Row():
        img = gr.Gallery(label="Upload Images", type="pil")
    with gr.Row():
        prompt = gr.Textbox(label="Input Prompts", placeholder="Enter prompts separated by commas (every prompt is asked about every image). Ex: Describe this image, What is in this image?", lines=8)
    with gr.Row():
        submit = gr.Button("Submit")
    # NOTE(review): answer_questions returns a dict with 'headers' and 'data';
    # a table component may present it better than a text area -- confirm.
    output = gr.TextArea(label="Responses", lines=30)
    submit.click(answer_questions, [img, prompt], output)

# Enable request queuing and start the app.
demo.queue().launch()