File size: 1,116 Bytes
bee5682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce396c3
bee5682
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from PIL import Image
from transformers import VisionEncoderDecoderModel , ViTFeatureExtractor , PreTrainedTokenizerFast
import gradio as gr

# Hub checkpoints used by this demo. The captioning model, the image
# preprocessor and the tokenizer are each pulled from a different repository.
# NOTE(review): confirm the preprocessor/tokenizer checkpoints match what the
# captioning model was trained with.
_CAPTION_CHECKPOINT = "ydshieh/vit-gpt2-coco-en"
_PREPROCESSOR_CHECKPOINT = "google/vit-base-patch32-224-in21k"
_TOKENIZER_CHECKPOINT = "distilgpt2"

# Loaded once at import time; caption_images reads these module-level objects.
model = VisionEncoderDecoderModel.from_pretrained(_CAPTION_CHECKPOINT)
vit_feature_extractor = ViTFeatureExtractor.from_pretrained(_PREPROCESSOR_CHECKPOINT)
tokenizer = PreTrainedTokenizerFast.from_pretrained(_TOKENIZER_CHECKPOINT)


def caption_images(image):
    """Generate a caption for a single image.

    Args:
        image: The picture to describe. The Gradio input in this file is
            declared with ``type='pil'``, so this is normally a PIL image.
            ``None`` (for example a submit without an upload) is accepted.

    Returns:
        The first decoded sequence with surrounding whitespace removed, or
        an empty string when ``image`` is ``None``.
    """
    # Nothing to caption: return early instead of failing inside the
    # feature extractor.
    if image is None:
        return ""

    # Bring PIL images in other modes (grayscale, RGBA, ...) to RGB before
    # preprocessing; presumably the ViT encoder expects 3-channel input.
    # Objects without a ``mode`` attribute are passed through untouched.
    if getattr(image, "mode", "RGB") != "RGB":
        image = image.convert("RGB")

    pixel_values = vit_feature_extractor(images=image, return_tensors="pt").pixel_values
    # Beam search with 5 beams; inference is pinned to the CPU.
    generated_ids = model.generate(pixel_values.to('cpu'), num_beams=5)
    captions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    return captions[0].strip()


# Input and output widgets for the demo; the uploaded image is handed to the
# callback as a PIL object.
inputs = [gr.components.Image(type='pil', label='Original Image')]
outputs = [gr.components.Textbox(label='Caption')]

# Page text and the sample image offered alongside the upload widget.
title = "Simple Image captioning Application"
description = "Upload an image to see the caption generated"
example = ['messi.jpg']

# Wire caption_images into a Gradio interface and start serving it.
demo = gr.Interface(
    fn=caption_images,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    examples=example,
)
demo.launch(debug=True)