import gradio as gr
import requests
import os
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import time
import cv2
import numpy as np
import webcolors
import json
import re
from gradio_client import Client
import ast
import spaces
from profanityfilter import ProfanityFilter
import torch
from diffusers import DiffusionPipeline, AutoencoderKL, DPMSolverMultistepScheduler

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type != 'cuda':
    raise ValueError("need to run on GPU")

vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
text2img_pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    vae=vae,
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
).to(device)
text2img_pipe.scheduler = DPMSolverMultistepScheduler.from_config(text2img_pipe.scheduler.config)

# Performance/memory optimizations
text2img_pipe.enable_model_cpu_offload()
text2img_pipe.unet.to(memory_format=torch.channels_last)  # in-place operation
# pipeline_img2img.enable_sequential_cpu_offload()
text2img_pipe.enable_vae_tiling()
text2img_pipe.enable_attention_slicing()

text2img_pipe.load_lora_weights('sessex/tabi-0-LoRA')

negative_prompt = "nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"


def prepare_image_for_watermark(image):
    # Create a new white background image
    background = Image.new('RGB', (1200, 1500), color='white')

    # Calculate the position to paste the image onto the white background
    x_offset = (1200 - image.width) // 2
    y_offset = 48  # position from top

    # Paste the resized image onto the white background at the specified position
    background.paste(image, (x_offset, y_offset))

    return background


# Initialize the profanity filter
pf = ProfanityFilter()

def filter_inappropriate(input_text):
    # Filter out inappropriate words
    pf.censor_char = ' '
    filtered_text = pf.censor(input_text)
    return filtered_text.strip()


# find the closest color name to rgb value
# def closest_color(rgb_color):
#     min_colors = {}
#     for key, name in webcolors.CSS3_HEX_TO_NAMES.items():
#         r_c, g_c, b_c = webcolors.hex_to_rgb(key)
#         rd = (r_c - rgb_color[0]) ** 2
#         gd = (g_c - rgb_color[1]) ** 2
#         bd = (b_c - rgb_color[2]) ** 2
#         min_colors[(rd + gd + bd)] = name
#     return min_colors[min(min_colors.keys())]

# def get_dominant_colors(img_filepath):
#     # Load the image from file path
#     img_data = Image.open(img_filepath)
#     # Convert the image to a NumPy array
#     img = np.array(img_data)
#
#     # k-means clustering to create palette of most dominant n_colors
#     pixels = np.float32(img.reshape(-1, 3))
#     n_colors = 2
#     criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 200, .1)
#     flags = cv2.KMEANS_RANDOM_CENTERS
#     _, labels, palette = cv2.kmeans(pixels, n_colors, None, criteria, 10, flags)
#
#     # get names of dominant colors
#     dominant_colors = []
#     for color in palette:
#         color_name = closest_color(color)
#         dominant_colors.append(color_name)
#
#     return dominant_colors


def get_image_caption(image):
    client = Client("https://vikhyatk-moondream1.hf.space/")
    result = client.predict(
        image,  # filepath in 'image' Image component
        "What colors and patterns appear in this photo?",
        api_name="/answer_question"
    )
    # print(result)
    return result

def get_image_keywords(image):
    # get img2text description
    caption = get_image_caption(image)

    # get colors
    # colors_list = get_dominant_colors(image)
    # colors = ", ".join(colors_list)

    return caption, ""
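# The color-extraction path is currently disabled, so get_image_keywords always returns an
# empty color string. A minimal sketch, assuming the commented-out get_dominant_colors()
# helper above were restored as written, of how colors could be returned alongside the caption:
#
#     caption = get_image_caption(image)
#     colors = ", ".join(get_dominant_colors(image))
#     return caption, colors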
from transformers import pipeline

pipe = pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta", torch_dtype=torch.bfloat16, device_map="auto")

def construct_prompt(image_caption, image_colors, user_input):
    agent_maker_sys = f"""
You are an AI whose job is to help users create their own custom shoe image which will reflect the colors, characteristics, or aesthetics from an image described by users.
In particular, you need to respond succinctly and write a prompt for an image generation model.
The response must include the word "mm-tabi" which will trigger the style of shoe.
The response should avoid any descriptions of a man or woman and should not include any articles of clothing or accessories from the Caption.
The response should always start with "surreal photo of mm-tabi boot with split toe".
The response should always end with "still life in the style of retrofuturism, unconventional, dreamy, fantasy, digital video distortion, lens aberration, highly detailed, hd, 8k".
The response should only use one or two aspects from the Caption provided by the user that could easily be applied to a still life scene or characteristic of the shoe, like color or texture or an object.

For example, if a user says, "Keywords: California dogs sunshine shopping /n Caption: The photo features a woman wearing a blue sweater with a red and white design. The sweater is a prominent feature in the image, and it is the main focus of the scene. The background is plain, with no other colors or patterns visible. The woman is standing in front of a building, which serves as a backdrop for the photo."
, provide immediately an image prompt that describes a still life photo of a shoe corresponding to the keywords, color, and objects or stylistic elements from the caption provided. Immediately STOP after that. It should be in this format:
"surreal photo of mm-tabi boot with split toe, surrounded by California dogs sunshine shopping, still life in the style of retrofuturism, unconventional, dreamy, fantasy, digital video distortion, lens aberration, highly detailed, hd, 8k, blue"

If a user says, "Keywords: Bunny /n Caption: The photo features a young woman wearing a black sweatshirt with a red and white pattern. She is standing in a large, empty room, which appears to be a mall or a similar public space. The room has a white ceiling and is decorated with various colors and patterns, creating a visually interesting environment. There is also a handbag visible in the scene, placed close to the woman."
, provide immediately an image prompt that describes a still life photo of a shoe corresponding to the keywords, color, and objects or stylistic elements from the caption provided. Immediately STOP after that. It should be in this format:
"surreal photo of mm-tabi boot with split toe, surrounded by bunny, still life in the style of retrofuturism, unconventional, dreamy, fantasy, digital video distortion, lens aberration, highly detailed, hd, 8k, red, white"

If a user says, "Keywords: Sun /n Caption: The photo features a woman wearing a striped shirt, which has a combination of black, white, and gray colors. She is also wearing glasses, and her smile adds a positive touch to the image. Additionally, she is holding a cell phone in her hand, which is being photographed. The background of the photo is plain, with no visible patterns or colors, allowing the focus to be on the woman and her attire."
, provide immediately an image prompt that describes a still life photo of a shoe corresponding to the keywords, color, and objects or stylistic elements from the caption provided. Immediately STOP after that. It should be in this format:
"surreal photo of mm-tabi boot with split toe, surrounded by sun, still life in the style of retrofuturism, unconventional, dreamy, fantasy, digital video distortion, lens aberration, highly detailed, hd, 8k"

Here's another example, if a user says, "Keywords: Chitose Abe /n Caption: The photo features a young man wearing a black jacket and a white baseball cap. He is smiling and posing for the camera, with a tan and black jacket and a white cap. The man is carrying a bag, which is visible in the image. The background of the photo is white, and there is a person standing behind the young man, possibly a friend or a passerby."
, provide immediately an image prompt that describes a still life photo of a shoe corresponding to the keywords, color, and objects or stylistic elements from the caption provided. Immediately STOP after that. It should be in this format:
"surreal photo of mm-tabi boot with split toe, surrounded by Chitose-Abe inspired, still life in the style of retrofuturism, unconventional, dreamy, fantasy, digital video distortion, lens aberration, highly detailed, hd, 8k, black, white"

Here's another example, if a user says, "Keywords: Painted /n Caption: The photo features a young woman with black hair, wearing a black shirt. She is holding a cupcake with green frosting and red candy on top. The cupcake itself has a green frosting and red candy, which adds a pop of color to the scene. The overall image showcases a combination of black, green, and red colors, along with the woman's smiling expression, creating a visually appealing and vibrant scene."
, provide immediately an image prompt that describes a still life photo of a shoe corresponding to the keywords, color, and objects or stylistic elements from the caption provided. Immediately STOP after that. It should be in this format:
"surreal photo of mm-tabi boot with split toe, painted, surrounded by cupcake, still life in the style of retrofuturism, unconventional, dreamy, fantasy, digital video distortion, lens aberration, highly detailed, hd, 8k, green, red"

Here's another example, if a user says, "Keywords: Sneaker /n Caption: The photo features a woman wearing a black top with a white bottom. The top has a black collar, and the woman is posing in front of a large building. The building's interior is decorated with white tiles, which create a contrasting background for the woman's outfit."
, provide immediately an image prompt that describes a still life photo of a shoe corresponding to the keywords, color, and objects or stylistic elements from the caption provided. Immediately STOP after that. It should be in this format:
"surreal photo of mm-tabi sneaker with split toe, still life in the style of retrofuturism, unconventional, dreamy, fantasy, digital video distortion, lens aberration, highly detailed, hd, 8k, black"

Here's another example, if a user says, "Keywords: Ruby black /n Caption: The photo features a woman wearing a dark blue sweater with a black and white checkered pattern. She is posing in front of a large, white building, which could be a hotel or a mall. The background is plain, with no visible patterns or colors. The woman is holding a handbag, which is also black and white in design."
, provide immediately an image prompt that describes a still life photo of a shoe corresponding to the keywords, color, and objects or stylistic elements from the caption provided. Immediately STOP after that. It should be in this format:
"surreal photo of mm-tabi boot with split toe, dark blue, surrounded by ruby black, still life in the style of retrofuturism, unconventional, dreamy, fantasy, digital video distortion, lens aberration, highly detailed, hd, 8k"""

    instruction = f"""
<|system|>
{agent_maker_sys}
<|user|>
"""

    prompt = f"{instruction.strip()}\n Keywords: {user_input} \n Caption: {image_caption}"
    print(f"INPUTS: \n Keywords: {user_input} \n Caption: {image_caption}")

    with torch.no_grad():
        outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)

    # Strip the echoed system/user turns so only the assistant's prompt remains
    pattern = r'\<\|system\|\>(.*?)\<\|assistant\|\>'
    cleaned_text = re.sub(pattern, '', outputs[0]["generated_text"], flags=re.DOTALL)
    return cleaned_text.lstrip("\n")


def text2img_inference(prompt):
    gr.Info('Image generation request sent')
    with torch.no_grad():
        image = text2img_pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=1024,
            height=1024,
            num_inference_steps=30,
            guidance_scale=7.5
        ).images[0]
    return image


def generate_image(user_input, input_image, image_prompt):
    start_time = time.time()

    if image_prompt == "":
        # generate keywords from image
        gr.Info('Starting to generate caption for input image')
        img_caption, img_colors = get_image_keywords(input_image)

        # filter user input
        gr.Info('Processing keywords for inappropriate language')
        user_input = filter_inappropriate(user_input)

        # construct prompt from image caption, image colors, and user input
        gr.Info('Constructing prompt')
        full_prompt = construct_prompt(img_caption, img_colors, user_input)
        print(f"FULL PROMPT: {full_prompt}")

    prompt = image_prompt if image_prompt != "" else full_prompt

    # text2img generation with full prompt construction
    image = text2img_inference(prompt)
    watermarkable_image = prepare_image_for_watermark(image)

    end_time = time.time()
    torch.cuda.empty_cache()

    # Calculate the elapsed time
    elapsed_time = end_time - start_time
    print("Elapsed time:", elapsed_time, "seconds")

    return watermarkable_image, prompt, image


gradio_app = gr.Interface(
    fn=generate_image,
    inputs=[gr.Text(label="User Keywords"), gr.Image(label="Input Image", type='filepath'), gr.Text(label="Generated Prompt")],
    outputs=[gr.Image(label="Image Generation"), gr.Text(label="Image Prompt"), gr.Image(label="Raw Image Generation")],
    title="Custom Tabi",
    description="Enter keywords and upload an image to generate a custom Tabi boot"
)

gradio_app.launch(debug=True)
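# A minimal manual-test sketch: generate_image can also be called directly, bypassing the
# Gradio UI. The "input.jpg" path below is a hypothetical placeholder, and the empty third
# argument makes the function construct the prompt from the image caption and keywords rather
# than using a pre-written prompt.
#
#     watermarked, prompt, raw = generate_image("sunset surf", "input.jpg", "")
#     watermarked.save("tabi_generation.png")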