# -*- coding: utf-8 -*-
"""Copy of caption.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1nybx9b_W5IsJz9G0GHvDx6KQKiTv_gt3
## Image Caption Generator
We are going to use a Transformers model to generate captions from an image.
### Installation
1. Transformers
2. Pytorch
3. Pillow (PIL)
@misc {nlp_connect_2022,
author = { {NLP Connect} },
title = { vit-gpt2-image-captioning (Revision 0e334c7) },
year = 2022,
url = { https://huggingface.co/nlpconnect/vit-gpt2-image-captioning },
doi = { 10.57967/hf/0222 },
publisher = { Hugging Face }
}
"""
#!pip install transformers
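# The remaining dependencies from the installation list above (PyTorch, Pillow),
# plus pandas for the results table; these are assumed to be preinstalled in
# Colab, so install them only if missing:
#!pip install torch pillow pandas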
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image
import pandas as pd
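# Load the pretrained ViT-encoder / GPT-2-decoder captioning model together with
# its image feature extractor and tokenizer from the Hugging Face Hub.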
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
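# Default generation settings: captions are capped at 16 tokens and decoded
# with beam search over 8 beams.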
max_length = 16
num_beams = 8
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
def cap_generation(img, Num_of_captions):
    """Generate one or more captions for a PIL image and return them as a DataFrame."""
    images = []
    Num_of_captions = int(Num_of_captions)
    if img.mode != "RGB":
        img = img.convert(mode="RGB")
    width, height = img.size
    new_size = (int(width / 4), int(height / 4))
    # Resize the image for faster computation.
    img = img.resize(new_size)
    images.append(img)
    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)
    if Num_of_captions == 1:
        # Single caption: deterministic beam search with the default gen_kwargs.
        output_ids = model.generate(pixel_values, **gen_kwargs)
    else:
        # Multiple captions: sample the decoder to get diverse outputs.
        output_ids = model.generate(
            pixel_values,
            max_length=100,
            num_return_sequences=Num_of_captions,
            do_sample=True,
        )
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    result = [s.capitalize() + '.' for s in preds]
    data = {"No.": range(1, len(result) + 1), "Captions": result}
    df = pd.DataFrame(data)
    return df
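# Quick sanity check outside the Gradio UI (a minimal sketch; "example.jpg" is
# just a placeholder path, not a file shipped with this notebook):
# sample_img = Image.open("example.jpg")
# print(cap_generation(sample_img, 3))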
#!pip install gradio
import gradio as gr
inputs = [
    gr.Image(type="pil", label="Original Image"),
    gr.Number(value=1, label="Number Of Captions"),
]
outputs = [gr.Dataframe(type="pandas")]
title = "Image Captioning Using ViT-GPT2"
description = "Image Captioning with vit-gpt2"
article = " <a href = 'https://huggingface.co/nlpconnect/vit-gpt2-image-captioning'> Model </a>"
'''examples = [
['Image3.png']
]'''
interface = gr.Interface(
cap_generation,
inputs,
outputs=outputs,
title=title,
description=description,
article=article,
theme="huggingface",
)
interface.launch()
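# When running in Colab, interface.launch(share=True) can be used instead to
# expose a temporary public URL for the demo.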