import gradio as gr
import spaces

from standalone_velvet import setup_models

# Load the Velvet checkpoint once at start-up; keep the model on the GPU and
# reuse the tokenizer and image feature collator across requests.
models_dict = setup_models("visual_bloom.torch")
visual_bloom = models_dict["visual_bloom"].to("cuda")
tokenizer = models_dict["tokenizer"]
image_feature_collator = models_dict["image_feature_collator"]


# `@spaces.GPU` asks Hugging Face Spaces to allocate a GPU for the duration of this call.
@spaces.GPU
def run_inference(text_input, image_input):
    # Encode the image and tokenize the instruction prompt, then move both to the GPU.
    image_features, image_attentions = image_feature_collator([image_input])
    instruction_inputs = tokenizer([text_input], return_tensors="pt")
    language_output = visual_bloom.generate(
        image_features.to("cuda"),
        image_attentions.to("cuda"),
        instruction_inputs["input_ids"].to("cuda"),
        instruction_inputs["attention_mask"].to("cuda"),
    )

    # Decode the generated tokens and keep only the first sentence.
    human_output = tokenizer.decode(language_output[0], skip_special_tokens=True)
    return human_output.split(".")[0]
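
# A minimal sketch of calling `run_inference` directly (outside the Gradio UI),
# assuming the `examples/cat.png` image used in the examples below is available:
#
#     from PIL import Image
#     print(run_inference("Generate caption in en:", Image.open("examples/cat.png")))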


if __name__ == "__main__":
    markdown = """
    # Quick introduction

    We propose a prompt-based vision-language model.
    The model can caption images and answer questions about them.
    It is trained on CC3M, COCO, VQAv2, OK-VQA, TextCaps, and TextVQA.
    Because these datasets were translated with Google Translate,
    they collectively contain millions of image-text pairs in both English and Vietnamese.

    For further details, please refer to [Velvet](https://github.com/dinhanhx/velvet?tab=readme-ov-file#introduction).

    # Usage

    ## Run with pre-defined examples

    1. Scroll to the bottom of the page to see the examples.
    2. Click one of them.
    3. Click the `Run Inference` button.

    ## Run with user-defined inputs

    ### 1. Prepare text input

    Image captioning:
    - `Generate caption in en:`
    - `Generate caption in vi:`

    Visual question answering:
    - `Generate answer in en: <question>?`
    - `Generate answer in vi: <question>?`

    Don't forget to replace `<question>` with your own question, in either English or Vietnamese.

    When writing a prompt, you can also refer to the examples at the bottom of the page.

    ### 2. Prepare image input

    Follow the instructions in the Image Input box. A wide range of image types is supported by PIL.

    ### 3. Click the `Run Inference` button
    """
    examples = [
        ["Generate caption in en:", "examples/cat.png"],
        ["Generate caption in vi:", "examples/cat.png"],
        ["Generate answer in en: what is the color of the cat?", "examples/cat.png"],
        ["Generate answer in vi: màu sắc của con mèo là gì?", "examples/cat.png"],
    ]

    # Build the demo UI: instruction text and image in, generated text out.
    with gr.Blocks() as demo:
        gr.Markdown(markdown)

        text_input = gr.Textbox(label="Text Input")
        image_input = gr.Image(label="Image Input", type="pil")

        text_output = gr.Textbox(label="Text Output")

        infer_button = gr.Button("Run Inference")
        infer_button.click(
            run_inference, inputs=[text_input, image_input], outputs=text_output
        )

        gr.Examples(
            examples=examples,
            inputs=[text_input, image_input],
        )
    demo.launch()