import base64
import io
import os

import gradio as gr
import numpy as np
import shortuuid
from PIL import Image
from transformers import pipeline

# Speech-recognition and zero-shot image classification pipelines.
asr = pipeline("automatic-speech-recognition")
zero = pipeline("zero-shot-image-classification")
# zero = gr.Interface.load("spaces/Datatrooper/zero-shot-image-classification")
# tts = gr.Interface.load("spaces/osanseviero/tortoisse-tts")

# Latent Diffusion text-to-image model, loaded from its Hugging Face Space.
latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")


def text2image_latent(text, steps, width, height, images, diversity):
    """Generate images from a text prompt and save them as temporary PNG files."""
    results = latent(text, steps, width, height, images, diversity)
    temp_dir = "./tmp"
    os.makedirs(temp_dir, exist_ok=True)
    image_paths = []
    for image in results[1]:
        # Each result is a base64-encoded PNG data URL; strip the prefix and decode.
        image_str = image[0].replace("data:image/png;base64,", "")
        decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
        img = Image.open(io.BytesIO(decoded_bytes))
        image_path = f"{temp_dir}/{shortuuid.uuid()}.png"
        img.save(image_path)
        image_paths.append(image_path)
    return image_paths


def speech_to_text(mic=None, file=None, state=""):
    """Transcribe a mic recording or an uploaded audio file, appending to the session state."""
    if mic is not None:
        audio = mic
    elif file is not None:
        audio = file
    else:
        return "You must either provide a mic recording or a file", state
    transcription = asr(audio)["text"]
    state += transcription + " "
    return state, state


def zero_shot(image, text_input):
    """Classify an image against a comma-separated list of candidate labels."""
    PIL_image = Image.fromarray(np.uint8(image)).convert("RGB")
    labels = text_input.split(",")
    res = zero(images=PIL_image, candidate_labels=labels,
               hypothesis_template="This is a photo of a {}")
    return {dic["label"]: dic["score"] for dic in res}


with gr.Blocks() as demo:
    gr.Markdown(
        """
        - 🎤 Input voice/text
        - ✨ Convert the voice/text to an image via Latent Diffusion
        - 🤖 Given a list of labels and a selected image from the gallery, do zero-shot classification
        - 🎛️ Coming soon: TTS (audio) of your output label, as: Your output looks like "label of zero-shot"
        """)
    with gr.Row():
        with gr.Column():
            mic = gr.Audio(source="microphone", type="filepath", optional=True)
            file = gr.Audio(source="upload", type="filepath", optional=True)
            state = gr.State("")  # session state holding the accumulated transcription
            text = gr.Textbox(label="Text", placeholder="If you don't want to record or upload your voice you can input text here")
            with gr.Row():
                speech_to_text_btn = gr.Button("Speech to text go brrr")
        with gr.Column():
            steps = gr.Slider(label="Steps - more steps can increase quality but will take longer to generate", value=50, minimum=1, maximum=50, step=1)
            width = gr.Slider(label="Width", value=256, minimum=32, maximum=256, step=32)
            height = gr.Slider(label="Height", value=256, minimum=32, maximum=256, step=32)
            images = gr.Slider(label="Images - How many images you wish to generate", value=1, minimum=1, maximum=4, step=1)
            diversity = gr.Slider(label="Diversity scale - How different from one another you wish the images to be", value=15.0, minimum=1.0, maximum=15.0)
            # gallery = [gr.outputs.Image(type="pil"), gr.outputs.Textbox(label="Error")]
            gallery = gr.Gallery(label="Individual images")
            with gr.Row():
                get_image_latent = gr.Button("Generate Image go brr")
        with gr.Column():
            text_input = gr.Textbox(label="Candidate labels", placeholder="Input a list of labels separated by commas")
            label = gr.Label()
            with gr.Row():
                zero_shot_clf = gr.Button("Classify Image go brr")
    # Wire the buttons to their handlers; the button is named speech_to_text_btn
    # so it does not shadow the speech_to_text function it calls.
    speech_to_text_btn.click(speech_to_text, inputs=[mic, file, state], outputs=[text, state])
    get_image_latent.click(text2image_latent, inputs=[text, steps, width, height, images, diversity], outputs=gallery)
    zero_shot_clf.click(zero_shot, inputs=[gallery, text_input], outputs=label)

demo.launch()