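"""Image Captioning demo.

A Gradio app that captions uploaded images with the
nlpconnect/vit-gpt2-image-captioning VisionEncoderDecoder model
(ViT encoder + GPT-2 decoder).
"""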
import torch
import gradio as gr
from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel

# Use a GPU when available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# One checkpoint provides the ViT encoder, the GPT-2 decoder, and the tokenizer.
model_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
feature_extractor = ViTFeatureExtractor.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint).to(device)


def predict(image, max_length=64, num_beams=4):
    # Nothing submitted: the image input is optional.
    if image is None:
        return ""
    # Preprocess: ensure 3 channels, then convert to ViT pixel values.
    image = image.convert("RGB")
    pixel_values = feature_extractor(image, return_tensors="pt").pixel_values.to(device)
    # Generate caption token ids with beam search (num_beams was previously unused).
    caption_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)[0]
    # Decode, dropping special tokens such as <|endoftext|>; keep only the first line.
    caption_text = tokenizer.decode(caption_ids, skip_special_tokens=True).split("\n")[0]
    return caption_text
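# Quick sanity check outside Gradio (assumes a local image file, e.g. test.jpg):
#   from PIL import Image
#   print(predict(Image.open("test.jpg")))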



# Gradio UI components (the legacy gr.inputs / gr.outputs API has been removed).
input_image = gr.Image(label="Upload your Image", type="pil")
output_text = gr.Textbox(label="Captions")
# Example images expected alongside this script: example1.jpg ... example6.jpg.
examples = [f"example{i}.jpg" for i in range(1, 7)]

description = "Image captioning application made using transformers"
title = "Image Captioning 🖼️"
article = "Created by: Shreyas Dixit"

interface = gr.Interface(
    fn=predict,
    inputs=input_image,
    outputs=output_text,
    examples=examples,
    title=title,
    description=description,
    article=article,
)
interface.launch(debug=True)
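# When running locally (outside Hugging Face Spaces), a public URL can be
# requested with: interface.launch(debug=True, share=True)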