# -*- coding: utf-8 -*-
"""app.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1nybx9b_W5IsJz9G0GHvDx6KQKiTv_gt3

## Image Caption Generator 

We are going to use a Transformers model to generate a caption from an image.

### Installation


1.   Transformers
2.   PyTorch
3.   PIL (Pillow), which provides `Image`

To install a package, run `pip install package_name`.

In Colab, PyTorch comes preinstalled, and so does PIL (which provides `Image`).

@misc {nlp_connect_2022,

	author       = { {NLP Connect} },
	title        = { vit-gpt2-image-captioning (Revision 0e334c7) },
	year         = 2022,
	url          = { https://huggingface.co/nlpconnect/vit-gpt2-image-captioning },
	doi          = { 10.57967/hf/0222 },
	publisher    = { Hugging Face }
}
"""

#!pip install transformers

import os

import pandas as pd
import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer

# A single Hugging Face checkpoint supplies the captioning model, its image
# preprocessor and its tokenizer.
_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
feature_extractor = ViTFeatureExtractor.from_pretrained(_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(_CHECKPOINT)
model.to(device)

# Beam-search style generation settings. NOTE(review): nothing in the visible
# code reads `gen_kwargs` -- `cap_generation` passes its own arguments to
# `model.generate` -- confirm whether these are still needed.
max_length = 16
num_beams = 8
gen_kwargs = {
    "max_length": max_length,
    "num_beams": num_beams,
}

def cap_generation(img, num_descriptions=5, max_length=100):
  """Generate candidate descriptions of an image with the ViT-GPT2 model.

  The image is converted to RGB if necessary, shrunk to a quarter of its
  width and height, passed through the module-level ``feature_extractor``
  and ``model``, and the generated token sequences are decoded with the
  module-level ``tokenizer``.

  Args:
    img: A PIL image (it must expose ``mode``, ``size``, ``convert`` and
      ``resize``).
    num_descriptions: Number of sequences requested from ``model.generate``
      (``num_return_sequences``). Defaults to 5.
    max_length: ``max_length`` passed to ``model.generate``. Defaults to 100.

  Returns:
    A list of decoded descriptions, each stripped of surrounding whitespace,
    capitalized, and terminated with a period.
  """
  if img.mode != "RGB":
    img = img.convert(mode="RGB")

  # Resize the image for faster computation. Each side is clamped to at least
  # one pixel: an image narrower or shorter than 4 px would otherwise produce
  # a zero-sized target, which cannot be resized to.
  width, height = img.size
  new_size = (max(1, width // 4), max(1, height // 4))
  img = img.resize(new_size)

  pixel_values = feature_extractor(images=[img], return_tensors="pt").pixel_values
  pixel_values = pixel_values.to(device)

  # do_sample=True draws the sequences by sampling, so the returned
  # descriptions can differ from one another.
  output_ids = model.generate(
      pixel_values,
      max_length=max_length,
      num_return_sequences=num_descriptions,
      do_sample=True,
  )
  preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
  return [pred.strip().capitalize() + '.' for pred in preds]

#!pip install openai

import openai
# The API key is read from the OPENAI_API_KEY environment variable so that no
# secret lives in the source file. A key that has ever been committed in
# plain text should be treated as compromised and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Name of the OpenAI completion model that `captions` sends its prompt to.
model_engine = 'text-davinci-003'
def captions(img, num_captions):
  """Turn an image into Instagram-style caption text.

  The image is first described by ``cap_generation``; those descriptions are
  embedded in a prompt and sent to the OpenAI completion endpoint using the
  module-level ``model_engine``.

  Args:
    img: A PIL image, or ``None`` when no image was supplied.
    num_captions: ``'Multiple'`` asks for several captions (up to 2048
      completion tokens); any other value asks for a single caption (up to
      1024 completion tokens).

  Returns:
    The text of the first completion choice with surrounding whitespace
    removed, or a short instruction message when ``img`` is ``None``.

  NOTE(review): ``openai.Completion`` and ``text-davinci-003`` belong to
  OpenAI's legacy Completions API -- confirm both are still available for the
  installed ``openai`` package before relying on this function.
  """
  # Without an image there is nothing to describe; answer with a readable
  # message instead of failing on attribute access inside cap_generation.
  if img is None:
    return "Please upload an image first."

  descriptions = cap_generation(img)

  # The two prompts differ only in how many captions they ask for (and in
  # incidental whitespace); they are kept verbatim so the text sent to the
  # model is unchanged.
  if num_captions == 'Multiple':
    max_tokens = 2048
    prompt = f"""
  I want to post an image on Instagram. \
  I need a good caption for that image. A caption \
  is a wity statement that decribes an image in a \
  relatable manner. To Generate the captions \
  I am using Vit-Gpt2 model to extract the \
  descriptions of the image. \
I want you Read the list of descrptions generated by vit-gpt2 for an image delimited by triple backticks.Since the \
descriptions are generated by an Ai model , all the descriptions might not be accurate. \
From all these descriptions undertand what the image is.
After understanding the image generate some creative captions for \
that image ,the caption should include some relevent emojis also so, that I can post that  on social media like instagram. \
Only return captions.
```{descriptions}```
"""
  else:
    max_tokens = 1024
    prompt = f"""
  I want to post an image on Instagram. \
  I need a good caption for that image. A caption \
  is a wity statement that decribes an image in a \
  relatable manner. To Generate the captions \
  I am using Vit-Gpt2 model to extract the \
  descriptions of the image. \
  I want you Read the list of descrptions generated by vit-gpt2 for an image delimited by triple backticks.Since the \
  descriptions are generated by an Ai model , all the descriptions might not be accurate. \
  From all these descriptions undertand what the image is.
  After understanding the image generate a single captions for \
  that image ,the caption should include some relevent emojis also so, that I can post that  on social media like instagram. \
  Only return captions.
  ```{descriptions}```
  """

  completion = openai.Completion.create(
      engine=model_engine,
      prompt=prompt,
      max_tokens=max_tokens,
      n=1,
      stop=None,
  )
  # Strip the surrounding whitespace so the text box shows only the caption
  # text.
  return completion.choices[0].text.strip()

#!pip install gradio
import gradio as gr

import gradio as gr
# Text shown on the page around the demo.
title = "Image Captioning"
description = "Image Captioning with vit-gpt"
article = " <a href  = 'https://huggingface.co/nlpconnect/vit-gpt2-image-captioning'> Model </a>"

# Input widgets, in the order `captions` receives them: the image (delivered
# as a PIL image) and the caption-count choice.
# NOTE(review): `gr.inputs` / `gr.outputs` look like Gradio's older component
# namespaces -- confirm they exist in the installed Gradio version.
inputs = [
    gr.inputs.Image(label="Upload your Image here", type="pil"),
    gr.inputs.Dropdown(
        label="Select how many captions you want",
        choices=["Single", "Multiple"],
        default="Multiple",
    ),
]
# A single text box receives the string returned by `captions`.
outputs = [gr.outputs.Textbox(label="text")]

interface = gr.Interface(
    fn=captions,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    article=article,
)

# Start the web app.
interface.launch()