Spaces:
Runtime error
Runtime error
File size: 3,736 Bytes
ed93d68 c2377c4 ed93d68 ac022fd 974534b ed93d68 ac022fd 4581829 0283c36 5a9e78d ed93d68 a16eddc ac022fd ed93d68 ac022fd ed93d68 f1bb426 ed93d68 08b33e5 ed93d68 4581829 ac022fd 4581829 ed93d68 0283c36 ed93d68 08b33e5 4a1b464 ed93d68 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import io, os, base64
from PIL import Image
import gradio as gr
import shortuuid
from transformers import pipeline
# Default-model ASR pipeline; used by speech_to_text() below.
asr = pipeline("automatic-speech-recognition")
# Remote latent-diffusion Space loaded as a callable; used by text2image_latent().
latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")
# Default-model zero-shot image classifier; used by zero_shot() below.
zero = pipeline("zero-shot-image-classification")
#tts = gr.Interface.load("spaces/osanseviero/tortoisse-tts")
def text2image_latent(text, steps, width, height, images, diversity):
    """Generate images from *text* via the remote latent-diffusion Space.

    Args mirror the Space's API: diffusion ``steps``, output ``width`` /
    ``height``, number of ``images``, and ``diversity`` scale.

    Returns:
        list[str]: paths of the decoded PNG files (suitable for a gr.Gallery).
    """
    print(text)
    results = latent(text, steps, width, height, images, diversity)

    temp_dir = './tmp'
    # exist_ok makes the call idempotent -- no race-prone exists() pre-check needed.
    os.makedirs(temp_dir, exist_ok=True)

    image_paths = []
    # results[1] is a list of single-element lists holding base64 data URIs
    # (assumed from the stripping below -- TODO confirm against the Space).
    for image in results[1]:
        image_str = image[0].replace("data:image/png;base64,", "")
        img = Image.open(io.BytesIO(base64.b64decode(image_str)))
        image_path = f'{temp_dir}/{shortuuid.uuid()}.png'
        img.save(image_path)  # reuse the computed path instead of re-formatting it
        image_paths.append(image_path)
    return image_paths
def speech_to_text(mic=None, file=None):
    """Transcribe audio with the ASR pipeline.

    Prefers the microphone recording when both sources are given; returns
    an instructional message when neither is provided.
    """
    audio = mic if mic is not None else file
    if audio is None:
        return "You must either provide a mic recording or a file"
    return asr(audio)["text"]
def zero_shot(image, text_input):
    """Run zero-shot classification of *image* against candidate labels.

    Returns a {label: score} mapping in the shape gr.Label expects.
    """
    predictions = zero(image, text_input)
    label_scores = {}
    for prediction in predictions:
        label_scores[prediction["label"]] = prediction["score"]
    return label_scores
# UI layout + event wiring for the demo.
with gr.Blocks() as demo:
    gr.Markdown( """
#input voice/text
#convert text to image via dalle
#given list of labels and a selected image from gallery do zero-shot classification
#Coming soon: tts your output label as: Your output looks like "label of zero-shot"
""")
    with gr.Row():
        with gr.Column():
            audio_file = [
                gr.Audio(source="microphone", type="filepath", optional=True),
                gr.Audio(source="upload", type="filepath", optional=True)]
            text = gr.Textbox(placeholder="If you dont want to record or upload your voice you can input text here")
            with gr.Row():
                # Named *_btn so it does not shadow the speech_to_text()
                # function -- a same-named variable here would make .click()
                # receive the Button object as its callback.
                speech_to_text_btn = gr.Button("Speech to text go brrr", css={"margin-top": "1em"})
        with gr.Column():
            steps = gr.inputs.Slider(label="Steps - more steps can increase quality but will take longer to generate",default=50,maximum=50,minimum=1,step=1)
            width = gr.inputs.Slider(label="Width", default=256, step=32, maximum=256, minimum=32)
            height = gr.inputs.Slider(label="Height", default=256, step=32, maximum = 256, minimum=32)
            images = gr.inputs.Slider(label="Images - How many images you wish to generate", default=1, step=1, minimum=1, maximum=4)
            diversity = gr.inputs.Slider(label="Diversity scale - How different from one another you wish the images to be",default=15.0, minimum=1.0, maximum=15.0)
            #gallery = [gr.outputs.Image(type="pil"),gr.outputs.Textbox(label="Error")]
            gallery = gr.Gallery(label="Individual images")
            with gr.Row():
                get_image_latent = gr.Button("Generate Image go brr")
        with gr.Column():
            text_input = gr.Textbox(placeholder="input a list of labels separated by commas")
            label = gr.Label()
            with gr.Row():
                zero_shot_clf = gr.Button("Classify Image go brr")
    # Event wiring: each button drives one of the module-level functions.
    speech_to_text_btn.click(speech_to_text, inputs=audio_file, outputs=text)
    get_image_latent.click(text2image_latent, inputs=[text,steps,width,height,images,diversity], outputs=gallery)
    zero_shot_clf.click(zero_shot, inputs=[gallery,text_input], outputs=label)

demo.launch(enable_queue=False)