# -*- coding: utf-8 -*-
"""Copy of caption.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1nybx9b_W5IsJz9G0GHvDx6KQKiTv_gt3

## Image Caption Generator

We use the Transformers vit-gpt2-image-captioning model to generate captions from an image.

### Installation

1. Transformers
2. PyTorch
3. Pillow (PIL)
4. Gradio

Model citation:

@misc{nlp_connect_2022,
  author    = {{NLP Connect}},
  title     = {vit-gpt2-image-captioning (Revision 0e334c7)},
  year      = 2022,
  url       = {https://huggingface.co/nlpconnect/vit-gpt2-image-captioning},
  doi       = {10.57967/hf/0222},
  publisher = {Hugging Face}
}
"""

#!pip install transformers

from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image
import pandas as pd

# Load the pretrained encoder-decoder model, its image processor, and its tokenizer.
# ViTImageProcessor replaces the deprecated ViTFeatureExtractor.
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Run on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Beam-search settings for the single-caption case.
max_length = 16
num_beams = 8
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

def cap_generation(img, num_of_captions):
    """Generate one beam-searched caption, or several sampled captions,
    for a PIL image, and return them as a pandas DataFrame."""
    num_of_captions = int(num_of_captions)

    # The model expects RGB input.
    if img.mode != "RGB":
        img = img.convert(mode="RGB")

    # Downscale the image to speed up preprocessing; the processor resizes
    # to the model's input resolution regardless.
    width, height = img.size
    img = img.resize((max(1, width // 4), max(1, height // 4)))

    pixel_values = image_processor(images=[img], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    if num_of_captions == 1:
        # A single caption: deterministic beam search.
        output_ids = model.generate(pixel_values, **gen_kwargs)
    else:
        # Several distinct captions: sample instead of beam search.
        output_ids = model.generate(
            pixel_values,
            max_length=100,
            num_return_sequences=num_of_captions,
            do_sample=True,
        )

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    result = [pred.strip().capitalize() + "." for pred in preds]
    data = {"No.": range(1, len(result) + 1), "Captions": result}
    return pd.DataFrame(data)

#!pip install gradio

import gradio as gr

# gr.inputs.* / gr.outputs.* and string themes such as "huggingface" were
# removed in Gradio 4; use the top-level components instead.
inputs = [
    gr.Image(type="pil", label="Original Image"),
    gr.Number(value=1, label="Number Of Captions"),
]
outputs = [gr.Dataframe(type="pandas")]

title = "Image Captioning Using ViT-GPT2"
description = "Image captioning with vit-gpt2"
article = '<a href="https://huggingface.co/nlpconnect/vit-gpt2-image-captioning">Model</a>'

'''examples = [
    ['Image3.png']
]'''

interface = gr.Interface(
    cap_generation,
    inputs,
    outputs=outputs,
    title=title,
    description=description,
    article=article,
)

interface.launch()
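
# Optional smoke test of cap_generation without the Gradio UI (a minimal
# sketch; uncomment to run). It assumes a local file named "Image3.png",
# the sample referenced in the commented-out examples above, exists next to
# this script. Note that interface.launch() blocks until the server is
# closed, so this would only run afterwards.
#
# df = cap_generation(Image.open("Image3.png"), 3)
# print(df)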