# -*- coding: utf-8 -*-
"""app.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1nybx9b_W5IsJz9G0GHvDx6KQKiTv_gt3

## Image Caption Generator

We use a Transformers model to generate captions from an image.

### Installation

1. Transformers
2. PyTorch
3. Image (PIL)

To install a package, run `pip install package_name`.
In Colab, PyTorch and PIL come preinstalled.

@misc {nlp_connect_2022,
    author       = { {NLP Connect} },
    title        = { vit-gpt2-image-captioning (Revision 0e334c7) },
    year         = 2022,
    url          = { https://huggingface.co/nlpconnect/vit-gpt2-image-captioning },
    doi          = { 10.57967/hf/0222 },
    publisher    = { Hugging Face }
}
"""

import os

# !pip install transformers
# !pip install openai
# !pip install gradio
import gradio as gr
import openai
import pandas as pd
import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer

# ---------------------------------------------------------------------------
# Image-description model (ViT encoder + GPT-2 decoder)
# ---------------------------------------------------------------------------
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# NOTE: these settings are defined but not passed to `model.generate` below,
# which uses its own explicit sampling arguments.
max_length = 16
num_beams = 8
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}


def cap_generation(img):
    """Generate five sampled descriptions of a PIL image with ViT-GPT2.

    Args:
        img: A ``PIL.Image.Image``. It is converted to RGB if necessary and
            downscaled to a quarter of its width and height before being
            passed to the feature extractor.

    Returns:
        A list of five strings, each stripped, capitalized and terminated
        with a period.
    """
    if img.mode != "RGB":
        img = img.convert(mode="RGB")

    # Resize the image for faster computation. Each side is clamped to at
    # least 1 px so that images smaller than 4 px do not produce a
    # zero-sized target, which PIL rejects.
    width, height = img.size
    new_size = (max(1, width // 4), max(1, height // 4))
    img = img.resize(new_size)

    pixel_values = feature_extractor(images=[img], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    # Sampling (rather than beam search) yields five varied descriptions.
    output_ids = model.generate(
        pixel_values,
        max_length=100,
        num_return_sequences=5,
        do_sample=True,
    )

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return [pred.strip().capitalize() + '.' for pred in preds]


# ---------------------------------------------------------------------------
# Caption writing via the OpenAI completion API
# ---------------------------------------------------------------------------
# SECURITY: the API key must never be committed to source control. It is read
# from the environment instead; any key that was previously hard-coded in this
# file should be considered leaked and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# NOTE(review): this uses the legacy `openai.Completion` interface with the
# 'text-davinci-003' engine -- confirm both are still available for the
# installed `openai` package version.
model_engine = 'text-davinci-003'

# Shared prompt for both caption modes. `{request}` is the only part that
# differs between them. The wording (including its spelling) is runtime text
# sent to the model and is intentionally preserved as-is.
_PROMPT_TEMPLATE = """
I want to post an image on Instagram. \
I need a good caption for that image. A caption \
is a wity statement that decribes an image in a \
relatable manner. To Generate the captions \
I am using Vit-Gpt2 model to extract the \
descriptions of the image. \
I want you Read the list of descrptions generated by vit-gpt2 for an image delimited by triple backticks.Since the \
descriptions are generated by an Ai model , all the descriptions might not be accurate. \
From all these descriptions undertand what the image is. After understanding the image generate {request} for \
that image ,the caption should include some relevent emojis also so, that I can post that on social media like instagram. \
Only return captions.
```{descriptions}```
"""


def captions(img, num_captions):
    """Produce Instagram-style caption text for an image.

    The image is first described by ``cap_generation``; those descriptions
    are then embedded in a prompt sent to the OpenAI completion endpoint.

    Args:
        img: A ``PIL.Image.Image``, or ``None`` when no image was supplied.
        num_captions: ``'Multiple'`` to request several captions; any other
            value requests a single caption.

    Returns:
        The completion text with surrounding whitespace removed, or a short
        message asking for an image when ``img`` is ``None``.
    """
    if img is None:
        # Without this guard `cap_generation` fails with AttributeError.
        return "Please upload an image first."

    descriptions = cap_generation(img)

    if num_captions == 'Multiple':
        request, max_tokens = "some creative captions", 2048
    else:
        request, max_tokens = "a single captions", 1024

    prompt = _PROMPT_TEMPLATE.format(request=request, descriptions=descriptions)
    completion = openai.Completion.create(
        engine=model_engine,
        prompt=prompt,
        max_tokens=max_tokens,
        n=1,
        stop=None,
    )
    return completion.choices[0].text.strip()


# ---------------------------------------------------------------------------
# Gradio user interface
# ---------------------------------------------------------------------------
inputs = [
    gr.inputs.Image(type='pil', label="Upload your Image here"),
    gr.inputs.Dropdown(
        choices=["Single", "Multiple"],
        label="Select how many captions you want",
        default="Multiple",
    ),
]

outputs = [gr.outputs.Textbox(label="text")]

title = "Image Captioning"
description = "Image Captioning with vit-gpt"
article = " Model "

interface = gr.Interface(
    captions,
    inputs,
    outputs=outputs,
    title=title,
    description=description,
    article=article,
)
interface.launch()