nielsr (HF staff) committed
Commit: 05c2134
1 parent: d6b2a16

Update app.py

Files changed (1): app.py (+23, -10)
app.py CHANGED
@@ -5,11 +5,17 @@ import torch
 torch.hub.download_url_to_file('http://images.cocodataset.org/val2017/000000039769.jpg', 'cats.jpg')
 torch.hub.download_url_to_file('https://huggingface.co/datasets/nielsr/textcaps-sample/resolve/main/stop_sign.png', 'stop_sign.png')
 
-git_processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
-git_model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")
+git_processor_base = AutoProcessor.from_pretrained("microsoft/git-base-coco")
+git_model_base = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")
+
+git_processor_large = AutoProcessor.from_pretrained("microsoft/git-large-coco")
+git_model_large = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco")
 
-blip_processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+blip_processor_base = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+blip_model_base = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+
+blip_processor_large = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+blip_model_large = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
 
 vitgpt_processor = AutoImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 vitgpt_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
@@ -17,8 +23,10 @@ vitgpt_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-capt
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-git_model.to(device)
-blip_model.to(device)
+git_model_base.to(device)
+blip_model_base.to(device)
+git_model_large.to(device)
+blip_model_large.to(device)
 vitgpt_model.to(device)
 
 def generate_caption(processor, model, image, tokenizer=None):
@@ -35,16 +43,21 @@ def generate_caption(processor, model, image, tokenizer=None):
 
 
 def generate_captions(image):
-    caption_git = generate_caption(git_processor, git_model, image)
+    caption_git_base = generate_caption(git_processor_base, git_model_base, image)
+
+    caption_git_large = generate_caption(git_processor_large, git_model_large, image)
+
+    caption_blip_base = generate_caption(blip_processor_base, blip_model_base, image)
 
-    caption_blip = generate_caption(blip_processor, blip_model, image)
+    caption_blip_large = generate_caption(blip_processor_large, blip_model_large, image)
 
     caption_vitgpt = generate_caption(vitgpt_processor, vitgpt_model, image, vitgpt_tokenizer)
 
-    return caption_git, caption_blip, caption_vitgpt
+    return caption_git_base, caption_git_large, caption_blip_base, caption_blip_large, caption_vitgpt
 
 
 examples = [["cats.jpg"], ["stop_sign.png"]]
+outputs = [gr.outputs.Textbox(label="Caption generated by GIT-base"), gr.outputs.Textbox(label="Caption generated by GIT-large"), gr.outputs.Textbox(label="Caption generated by BLIP-base"), gr.outputs.Textbox(label="Caption generated by BLIP-large"), gr.outputs.Textbox(label="Caption generated by ViT+GPT-2")]
 
 title = "Interactive demo: comparing image captioning models"
 description = "Gradio Demo to compare GIT, BLIP and ViT-2-GPT2, 3 state-of-the-art captioning models. To use it, simply upload your image and click 'submit', or click one of the examples to load them. Read more at the links below."
@@ -52,7 +65,7 @@ article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2102.033
 
 interface = gr.Interface(fn=generate_captions,
                          inputs=gr.inputs.Image(type="pil"),
-                         outputs=[gr.outputs.Textbox(label="Caption generated by GIT"), gr.outputs.Textbox(label="Caption generated by BLIP"), gr.outputs.Textbox(label="Caption generated by ViT+GPT-2")],
+                         outputs=outputs,
                          examples=examples,
                          title=title,
                          description=description,
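
The commit only changes which checkpoints are loaded and how they are wired into the Gradio interface; the body of the shared generate_caption helper is untouched and does not appear in the hunks. As a rough sketch of how such a helper typically works with these transformers checkpoints (the exact preprocessing and generation arguments in the real app.py may differ, and max_length=50 below is an assumed value):

```python
def generate_caption(processor, model, image, tokenizer=None):
    # Turn the PIL image into pixel values on the same device as the model.
    inputs = processor(images=image, return_tensors="pt").to(model.device)

    # Autoregressively generate caption token ids (max_length is an assumed value).
    generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=50)

    # The ViT+GPT-2 pipeline decodes with a separate GPT-2 tokenizer;
    # GIT and BLIP decode through their processors.
    if tokenizer is not None:
        caption = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    else:
        caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return caption.strip()
```

With a helper along those lines, the updated generate_captions simply fans the same image out to all five processor/model pairs and returns five strings in the order expected by the new outputs list.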