# -*- coding: utf-8 -*-
"""app.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1nybx9b_W5IsJz9G0GHvDx6KQKiTv_gt3

## Image Caption Generator

We are going to use a Transformers model to generate a caption from an image.

### Installation

1. Transformers
2. PyTorch
3. Pillow (PIL, for images)

To install a package, run `pip install package_name`. In Colab, PyTorch comes preinstalled, and the same goes for PIL.
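For example, in a Colab cell (only Transformers is needed there, per the note above):

```
!pip install transformers
```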
@misc {nlp_connect_2022,
    author    = { {NLP Connect} },
    title     = { vit-gpt2-image-captioning (Revision 0e334c7) },
    year      = 2022,
    url       = { https://huggingface.co/nlpconnect/vit-gpt2-image-captioning },
    doi       = { 10.57967/hf/0222 },
    publisher = { Hugging Face }
}
"""
#!pip install transformers
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image
# Load the pretrained ViT-GPT2 captioning model and its preprocessing utilities.
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Run on GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Generation settings used below: sample five candidate descriptions per image.
gen_kwargs = {"max_length": 100, "num_return_sequences": 5, "do_sample": True}
def cap_generation(img):
    """Generate several candidate descriptions for a PIL image."""
    images = []
    if img.mode != "RGB":
        img = img.convert(mode="RGB")
    # Resize the image for faster computation.
    width, height = img.size
    new_size = (int(width / 4), int(height / 4))
    img = img.resize(new_size)
    images.append(img)
    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)
    output_ids = model.generate(pixel_values, **gen_kwargs)
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return [s.capitalize() + '.' for s in preds]
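# A minimal usage sketch for cap_generation, assuming a hypothetical local
# file "sample.jpg" exists (uncomment to try):
#
# with Image.open("sample.jpg") as sample:
#     print(cap_generation(sample))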
#!pip install openai
import os

import openai

# Read the API key from the environment instead of hardcoding a secret.
openai.api_key = os.getenv("OPENAI_API_KEY")
model_engine = 'text-davinci-003'
def captions(img, num_captions):
    """Turn the model's raw descriptions into Instagram-ready captions."""
    descriptions = cap_generation(img)
    # Both modes share one prompt; only the requested count and token budget differ.
    if num_captions == 'Multiple':
        request = "generate some creative captions"
        max_tokens = 2048
    else:
        request = "generate a single caption"
        max_tokens = 1024
    prompt = f"""
    I want to post an image on Instagram and need a good caption for it. \
    A caption is a witty statement that describes an image in a relatable \
    manner. To generate the captions, I am using the ViT-GPT2 model to \
    extract descriptions of the image. \
    Read the list of descriptions generated by ViT-GPT2 for an image, \
    delimited by triple backticks. Since the descriptions are generated by \
    an AI model, they might not all be accurate. \
    From all these descriptions, understand what the image is. \
    After understanding the image, {request} for it. The caption should \
    also include some relevant emojis, so that I can post it on social \
    media like Instagram. \
    Only return captions.
    ```{descriptions}```
    """
    completion = openai.Completion.create(
        engine=model_engine,
        prompt=prompt,
        max_tokens=max_tokens,
        n=1,
        stop=None,
    )
    return completion.choices[0].text
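# A minimal usage sketch for captions, again assuming a hypothetical
# "sample.jpg" and that OPENAI_API_KEY is set in the environment:
#
# with Image.open("sample.jpg") as sample:
#     print(captions(sample, "Multiple"))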
#!pip install gradio
import gradio as gr
# Gradio 3+ components (the gr.inputs/gr.outputs namespaces are deprecated).
inputs = [
    gr.Image(type="pil", label="Upload your image here"),
    gr.Dropdown(choices=["Single", "Multiple"], value="Multiple", label="Select how many captions you want"),
]
outputs = [gr.Textbox(label="Captions")]

title = "Image Captioning"
description = "Image Captioning with vit-gpt"
article = "<a href='https://huggingface.co/nlpconnect/vit-gpt2-image-captioning'>Model</a>"
interface = gr.Interface(
    fn=captions,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    article=article,
)
interface.launch()
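# When running locally or in Colab (rather than on a hosted Space), launch()
# can also create a temporary public link:
#
# interface.launch(share=True)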