import gradio as gr
import spaces

from standalone_velvet import setup_models

# Load the Velvet model components and move the vision-language model to the GPU.
models_dict = setup_models("visual_bloom.torch")
visual_bloom = models_dict["visual_bloom"].to('cuda')
tokenizer = models_dict["tokenizer"]
image_feature_collator = models_dict["image_feature_collator"]


@spaces.GPU
def run_inference(text_input, image_input):
    # Encode the image and tokenize the instruction, then generate text on the GPU.
    image_features, image_attentions = image_feature_collator([image_input])
    instruction_inputs = tokenizer([text_input], return_tensors="pt")
    language_output = visual_bloom.generate(
        image_features.to('cuda'),
        image_attentions.to('cuda'),
        instruction_inputs["input_ids"].to('cuda'),
        instruction_inputs["attention_mask"].to('cuda'),
    )
    human_output = tokenizer.decode(language_output[0], skip_special_tokens=True)
    # Keep only the first sentence of the decoded output.
    return human_output.split(".")[0]


if __name__ == "__main__":
    markdown = """
# Quick introduction

We have proposed a prompt-based vision-language model. The model can caption images and answer questions about images.
It is trained on CC3M, COCO, VQAv2, OK-VQA, TextCaps, and TextVQA. With the help of Google Translate, these datasets collectively contain millions of image-text pairs in English and Vietnamese.
For further details, please refer to [Velvet](https://github.com/dinhanhx/velvet?tab=readme-ov-file#introduction).

# Usage

## Run with pre-defined examples

1. Scroll to the bottom of the page to see the examples.
2. Click one of them.
3. Click the `Run Inference` button.

## Run with user-defined inputs

### 1. Prepare text input

Image captioning:
- `Generate caption in en:`
- `Generate caption in vi:`

Visual question answering:
- `Generate answer in en: <question>?`
- `Generate answer in vi: <question>?`

Don't forget to replace `<question>` with your own question, either in English or Vietnamese.
To write the prompt, one can refer to the examples at the bottom of the page.

### 2. Prepare image input

Follow the instructions in the Image Input box. A wide range of image types is supported by PIL.

### 3. Click the `Run Inference` button
"""
    examples = [
        ["Generate caption in en:", "examples/cat.png"],
        ["Generate caption in vi:", "examples/cat.png"],
        ["Generate answer in en: what is the color of the cat?", "examples/cat.png"],
        ["Generate answer in vi: màu sắc của con mèo là gì?", "examples/cat.png"],
    ]

    with gr.Blocks() as demo:
        gr.Markdown(markdown)
        text_input = gr.Textbox(label="Text Input")
        image_input = gr.Image(label="Image Input", type="pil")
        text_output = gr.Textbox(label="Text Output")

        infer_button = gr.Button("Run Inference")
        infer_button.click(
            run_inference, inputs=[text_input, image_input], outputs=text_output
        )

        gr.Examples(
            examples=examples,
            inputs=[text_input, image_input],
        )

    demo.launch()
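
# The prompts described above can also be exercised without the Gradio UI. A minimal
# sketch (assumptions: the models above loaded successfully and examples/cat.png
# exists), e.g. from a Python shell after importing this module:
#
#   from PIL import Image
#   cat_image = Image.open("examples/cat.png")
#   print(run_inference("Generate caption in en:", cat_image))
#   print(run_inference("Generate answer in en: what is the color of the cat?", cat_image))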