|
|
|
"""Copy of caption.ipynb |
|
|
|
Automatically generated by Colaboratory. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1nybx9b_W5IsJz9G0GHvDx6KQKiTv_gt3 |
|
|
|
## Image Caption Generator |
|
|
|
We use a Transformers model (nlpconnect/vit-gpt2-image-captioning) to generate captions for an image. |
|
|
|
### Installation |
|
|
|
|
|
|
|
1. Transformers |
|
2. PyTorch |
|
3. Pillow (PIL Image) |
|
|
|
|
|
@misc {nlp_connect_2022, |
|
|
|
author = { {NLP Connect} }, |
|
title = { vit-gpt2-image-captioning (Revision 0e334c7) }, |
|
year = 2022, |
|
url = { https://huggingface.co/nlpconnect/vit-gpt2-image-captioning }, |
|
doi = { 10.57967/hf/0222 }, |
|
publisher = { Hugging Face } |
|
} |
|
""" |
|
|
|
|
|
|
|
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer |
|
import torch |
|
from PIL import Image |
|
import pandas as pd |
|
|
|
# Hugging Face checkpoint shared by the model, the image pre-processor and
# the tokenizer, so all three stay in sync.
_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

# Load the three pretrained pieces of the captioning pipeline.
model = VisionEncoderDecoderModel.from_pretrained(_CHECKPOINT)
feature_extractor = ViTFeatureExtractor.from_pretrained(_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)

# Run on the GPU when torch reports one as available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Generation settings for the single-caption case: beam search with a
# short maximum output length.
max_length = 16
num_beams = 8
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)
|
|
|
def _captions_to_frame(decoded):
    """Build the numbered caption table from decoded model outputs."""
    captions = [text.strip().capitalize() + '.' for text in decoded]
    return pd.DataFrame(
        {"No.": range(1, len(captions) + 1), "Captions": captions}
    )


def cap_generation(img, Num_of_captions):
    """Generate one or more captions for an image.

    Args:
        img: A PIL image. It is converted to RGB if needed and shrunk to a
            quarter of its width and height before feature extraction.
        Num_of_captions: Requested number of captions. Any value that
            cannot be converted to an integer (e.g. ``None`` from a cleared
            input field) and any value below 1 is treated as 1.

    Returns:
        A pandas DataFrame with columns ``"No."`` (1-based index) and
        ``"Captions"`` (capitalized caption text ending with a period).

    Raises:
        ValueError: If ``img`` is ``None``.
    """
    if img is None:
        raise ValueError("No image was provided; please upload an image.")

    # Normalise the requested count; fall back to a single caption rather
    # than failing on an empty or non-numeric value, and never ask the
    # model for fewer than one sequence.
    try:
        Num_of_captions = max(1, int(Num_of_captions))
    except (TypeError, ValueError, OverflowError):
        Num_of_captions = 1

    if img.mode != "RGB":
        img = img.convert(mode="RGB")

    # Downscale to a quarter of each dimension, but keep at least one pixel
    # per side: a zero-sized target is rejected by PIL for tiny images.
    width, height = img.size
    img = img.resize((max(1, width // 4), max(1, height // 4)))

    pixel_values = feature_extractor(images=[img], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    if Num_of_captions == 1:
        # Single caption: use the module-level generation settings.
        output_ids = model.generate(pixel_values, **gen_kwargs)
    else:
        # Several captions: sample so the returned sequences can differ.
        output_ids = model.generate(
            pixel_values,
            max_length=100,
            num_return_sequences=Num_of_captions,
            do_sample=True,
        )

    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return _captions_to_frame(decoded)
|
|
|
|
|
import gradio as gr |
|
|
|
import gradio as gr |
|
# Input widgets: the picture to caption and how many captions to return.
inputs = [
    gr.inputs.Image(type='pil', label='Original Image'),
    gr.inputs.Number(default=1, label="Number Of Captions"),
]

# cap_generation returns a pandas DataFrame, so the output is a dataframe widget.
outputs = [gr.outputs.Dataframe(type="pandas")]

# Static text passed to the interface.
title = "Image Captioning Using VIT-GPT2 "
description = "Image Captioning with vit-gpt2"
article = " <a href = 'https://huggingface.co/nlpconnect/vit-gpt2-image-captioning'> Model </a>"

# Example inputs are currently disabled (kept as an inert string literal).
'''examples = [
    ['Image3.png']
]'''

# Wire the captioning function to the widgets and start the app.
interface = gr.Interface(
    fn=cap_generation,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    article=article,
    theme="huggingface",
)

interface.launch()
|
|
|
|