# -*- coding: utf-8 -*-
"""app.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1nybx9b_W5IsJz9G0GHvDx6KQKiTv_gt3
## Image Caption Generator
We use a Transformers vision-encoder-decoder model (a ViT encoder with a GPT-2 decoder) to generate captions from an image.
### Installation
1. transformers
2. torch (PyTorch)
3. Pillow (PIL)
Install each with `pip install <package_name>`.
In Colab, PyTorch comes preinstalled, and the same goes for Pillow (`PIL.Image`).
@misc {nlp_connect_2022,
author = { {NLP Connect} },
title = { vit-gpt2-image-captioning (Revision 0e334c7) },
year = 2022,
url = { https://huggingface.co/nlpconnect/vit-gpt2-image-captioning },
doi = { 10.57967/hf/0222 },
publisher = { Hugging Face }
}
"""
#!pip install transformers
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image
# Load the pretrained ViT-GPT2 captioning model along with its image processor and tokenizer.
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Run on GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Generation settings: sample five candidate descriptions per image.
gen_kwargs = {"max_length": 100, "num_return_sequences": 5, "do_sample": True}

def cap_generation(img):
    """Generate several candidate descriptions for a PIL image."""
    if img.mode != "RGB":
        img = img.convert(mode="RGB")

    # Resize the image to a quarter of its dimensions for faster computation.
    width, height = img.size
    img = img.resize((width // 4, height // 4))

    pixel_values = feature_extractor(images=[img], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs)
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return [s.capitalize() + '.' for s in preds]
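# A minimal local sanity check for cap_generation: a sketch that assumes an
# "example.jpg" file sits next to this script (that filename is hypothetical),
# kept commented out so it does not run when the Space starts:
#
#     descriptions = cap_generation(Image.open("example.jpg"))
#     for d in descriptions:
#         print(d)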
#!pip install openai
import os

import openai

# Read the API key from the environment rather than hardcoding a secret in source.
openai.api_key = os.environ.get("OPENAI_API_KEY")
model_engine = 'text-davinci-003'
def captions(img, num_captions):
    """Turn the model's descriptions of an image into Instagram-ready captions."""
    descriptions = cap_generation(img)

    if num_captions == 'Multiple':
        request = "generate some creative captions"
        max_tokens = 2048
    else:
        request = "generate a single caption"
        max_tokens = 1024

    prompt = f"""
    I want to post an image on Instagram. \
    I need a good caption for that image. A caption \
    is a witty statement that describes an image in a \
    relatable manner. To generate the captions, \
    I am using the ViT-GPT2 model to extract \
    descriptions of the image. \
    Read the list of descriptions generated by ViT-GPT2 for an image, delimited by triple backticks. Since the \
    descriptions are generated by an AI model, they might not all be accurate. \
    From all these descriptions, understand what the image shows.
    After understanding the image, {request} for \
    that image; the caption should also include some relevant emojis so that I can post it on social media like Instagram. \
    Only return captions.
    ```{descriptions}```
    """

    completion = openai.Completion.create(
        engine=model_engine,
        prompt=prompt,
        max_tokens=max_tokens,
        n=1,
        stop=None,
    )
    return completion.choices[0].text
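# A hedged end-to-end sketch of the pipeline without the web UI (assumes the
# hypothetical "example.jpg" file and an OPENAI_API_KEY in the environment;
# commented out so only the Gradio app runs in the Space):
#
#     img = Image.open("example.jpg")
#     print(captions(img, "Single"))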
#!pip install gradio
import gradio as gr
# Gradio 3+ component API (the older gr.inputs / gr.outputs namespace is deprecated).
inputs = [
    gr.Image(type='pil', label="Upload your Image here"),
    gr.Dropdown(choices=["Single", "Multiple"], value="Multiple", label="Select how many captions you want"),
]
outputs = [gr.Textbox(label="Generated captions")]
title = "Image Captioning"
description = "Image captioning with ViT-GPT2"
article = "<a href='https://huggingface.co/nlpconnect/vit-gpt2-image-captioning'>Model</a>"
interface = gr.Interface(
    fn=captions,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    article=article,
)
interface.launch()