import gradio as gr
import openai
from dotenv import load_dotenv
import os

import torch

from diffusers import ControlNetModel, StableDiffusionControlNetPipeline, UniPCMultistepScheduler
import cv2
from PIL import Image
import numpy as np
from diffusers.utils import load_image

# Download the Vermeer sample image and convert it to a NumPy array for OpenCV.
image = np.array(
    load_image(
        "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
    )
)

# Lower / upper thresholds handed to the Canny edge detector.
low_threshold, high_threshold = 100, 200

# Run edge detection, then replicate the single-channel edge map across three
# channels so it can be wrapped as a 3-channel PIL image for the pipeline.
image = np.stack([cv2.Canny(image, low_threshold, high_threshold)] * 3, axis=2)
canny_image = Image.fromarray(image)

# Load the ControlNet weights and the Stable Diffusion v1.5 pipeline in half precision.
# NOTE(review): the conditioning image prepared earlier in this file is a Canny
# edge map, but the checkpoint loaded here is the "scribble" ControlNet — confirm
# whether "lllyasviel/sd-controlnet-canny" was intended.
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-scribble", torch_dtype=torch.float16
)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)

# The models are intentionally NOT moved to "cuda" beforehand: model CPU offload
# loads the individual model components on the GPU on demand, and eagerly placing
# the whole pipeline on the GPU first would only raise peak GPU memory at startup
# before everything is moved back to the CPU.
pipe.enable_model_cpu_offload()

# prompt = "closeup face photo of caucasian lady in black clothes, night city street, bokeh"
# negative_prompt = "(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime:1.4), text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck"

# Number of denoising steps; predict() passes this as `num_inference_steps`.
n_steps = 25

# Seed torch's RNG with 0 and keep the returned generator; predict() hands this
# same object to the pipeline on every call.
# NOTE(review): because one generator object is shared across calls, its state
# presumably advances between requests, so only the first call starts from
# seed 0 — confirm whether per-request reproducibility is wanted.
generator = torch.manual_seed(0)

# out_image = pipe(
#     prompt=prompt, num_inference_steps=20, generator=generator, image=canny_image
# ).images[0]

def predict(prompt, negative_prompt):
    """Run the ControlNet pipeline once and return the first generated image.

    The call is conditioned on the module-level ``canny_image`` and uses the
    module-level ``n_steps`` and ``generator`` for the step count and RNG.

    Args:
        prompt: Value forwarded to the pipeline as ``prompt``.
        negative_prompt: Value forwarded to the pipeline as ``negative_prompt``.

    Returns:
        The first element of the pipeline output's ``images``.
    """
    pipeline_kwargs = {
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "num_inference_steps": n_steps,
        "generator": generator,
        "image": canny_image,
    }
    result = pipe(**pipeline_kwargs)
    return result.images[0]


# Web UI: two text inputs feed predict(prompt, negative_prompt) and the result
# is shown as an image.
# `label`/`placeholder` are used instead of `value` so the boxes start empty:
# `value` pre-fills the textbox, which made the literal strings "prompt" and
# "negative prompt" the actual model inputs unless the user deleted them first.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="Prompt", placeholder="prompt"),
        gr.Textbox(label="Negative prompt", placeholder="negative prompt"),
    ],
    outputs="image",
)

if __name__ == "__main__":
    demo.launch()