nielsr (HF staff) committed
Commit 2773523
1 parent: ae381b2

Create app.py

Files changed (1):
app.py (+45, -0)
app.py ADDED
@@ -0,0 +1,45 @@
+ import gradio as gr
+ from transformers import AutoProcessor, AutoModelForCausalLM, BlipForConditionalGeneration
+ import torch
+
+ # Download an example image (two cats on a couch, from COCO val2017) for the demo
+ torch.hub.download_url_to_file('http://images.cocodataset.org/val2017/000000039769.jpg', 'cats.jpg')
+
+ git_processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
+ git_model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")
+
+ blip_processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+ blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+
+ def generate_caption(processor, model, image):
+     inputs = processor(images=image, return_tensors="pt")
+
+     generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=50)
+
+     generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+     return generated_caption
+
+
+ def generate_captions(image):
+     caption_git = generate_caption(git_processor, git_model, image)
+
+     caption_blip = generate_caption(blip_processor, blip_model, image)
+
+     return caption_git, caption_blip
+
+
+ examples = [["cats.jpg"]]
+ title = "Interactive demo: comparing image captioning models"
+ description = "Gradio demo for comparing GIT and BLIP, two image captioning models. To use it, simply upload your image and click 'submit', or click one of the examples to load them. Read more at the links below."
+ article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2205.14100' target='_blank'>GIT: A Generative Image-to-text Transformer for Vision and Language</a> | <a href='https://arxiv.org/abs/2201.12086' target='_blank'>BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation</a></p>"
+
+ interface = gr.Interface(fn=generate_captions,
+                          inputs=gr.inputs.Image(type="pil"),
+                          outputs=[gr.outputs.Textbox(label="Generated caption by GIT"), gr.outputs.Textbox(label="Generated caption by BLIP")],
+                          examples=examples,
+                          title=title,
+                          description=description,
+                          article=article,
+                          enable_queue=True)
+ interface.launch(debug=True)
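
For a quick sanity check outside the Gradio UI, the captioning functions above can also be called directly. A minimal sketch, assuming app.py has already loaded both models and downloaded cats.jpg as shown in the diff:

    from PIL import Image

    # Open the example COCO image that app.py downloads at startup
    image = Image.open("cats.jpg").convert("RGB")

    # Run both captioners on the same image and print the results side by side
    caption_git, caption_blip = generate_captions(image)
    print("GIT: ", caption_git)
    print("BLIP:", caption_blip)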