Spaces:
Runtime error
Runtime error
| import io, os, base64 | |
| from PIL import Image | |
| import gradio as gr | |
| import shortuuid | |
| import numpy as np | |
| from transformers import pipeline | |
# Speech-recognition pipeline used by speech_to_text(); no model is named
# here, so the choice of checkpoint is left to the transformers library.
asr = pipeline(task="automatic-speech-recognition")

# Remote latent-diffusion Space, loaded as a callable that
# text2image_latent() invokes with the prompt and generation settings.
latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")
def text2image_latent(text, steps, width, height, images, diversity):
    """Generate images for ``text`` with the remote latent-diffusion Space.

    All arguments are forwarded unchanged to the module-level ``latent``
    callable. Each image it returns is decoded from base64 and written to
    ``./tmp`` under a random ``shortuuid`` file name.

    Args:
        text: Prompt to visualize (also echoed to stdout).
        steps: Forwarded to ``latent`` as-is.
        width: Forwarded to ``latent`` as-is.
        height: Forwarded to ``latent`` as-is.
        images: Forwarded to ``latent`` as-is.
        diversity: Forwarded to ``latent`` as-is.

    Returns:
        list[str]: Paths of the PNG files written, in the order the images
        were returned.
    """
    print(text)
    results = latent(text, steps, width, height, images, diversity)

    # The output directory does not depend on the image, so create it once.
    # exist_ok=True already tolerates an existing directory, which makes a
    # separate os.path.exists() check redundant (and racy).
    temp_dir = './tmp'
    os.makedirs(temp_dir, exist_ok=True)

    image_paths = []
    # NOTE(review): assumes results[1] is a sequence whose items carry a
    # base64 data URL as their first element - confirm against the Space.
    for image in results[1]:
        # Keep only the payload after the data-URL header
        # ("data:image/png;base64,"). Base64 never contains a comma, so a
        # string without a header passes through untouched, and headers
        # with another MIME type are stripped as well.
        encoded = image[0].split(",", 1)[-1]
        decoded_bytes = base64.decodebytes(encoded.encode("utf-8"))
        img = Image.open(io.BytesIO(decoded_bytes))

        # Build the path once and reuse it for both saving and returning.
        image_path = f'{temp_dir}/{shortuuid.uuid()}.png'
        img.save(image_path)
        image_paths.append(image_path)
    return image_paths
def speech_to_text(mic=None, file=None):
    """Transcribe an audio input, preferring ``mic`` over ``file``.

    Args:
        mic: Microphone recording, or None when nothing was recorded.
        file: Uploaded audio, or None when nothing was uploaded.

    Returns:
        The ``"text"`` entry of the module-level ``asr`` result for the
        chosen input, or a fixed hint message when both inputs are None.
    """
    # Nothing to transcribe: tell the user instead of calling the model.
    if mic is None and file is None:
        return "You must either provide a mic recording or a file"

    # The microphone recording wins whenever it is present.
    chosen_audio = file if mic is None else mic
    recognized = asr(chosen_audio)
    return recognized["text"]
# Introductory Markdown shown at the top of the page.
INTRO_MARKDOWN = """
# 🎤 Sing or tell your story and let this Space ✨ visualize your story along
## Inspired by this [tweet](https://twitter.com/karenxcheng/status/1516816114994454529?s=20&t=moq2vK5430JoerJXBTkIuA)
### Soon to be added:
- Near real time(streaming option)
- Option playback of you audio relayed with video
"""

# NOTE(review): the nesting below was reconstructed from a copy of the file
# whose indentation had been stripped - confirm the layout against the
# running app.
with gr.Blocks() as demo:
    gr.Markdown(INTRO_MARKDOWN)

    with gr.Row():
        # First column: audio inputs, the prompt text box and the
        # transcription trigger.
        with gr.Column():
            mic_audio = gr.Audio(
                source="microphone",
                type="filepath",
                optional=True,
                label="Speak here...",
            )
            uploaded_audio = gr.Audio(
                source="upload",
                type="filepath",
                optional=True,
                label="Or if you want upload here...",
            )
            # Order matters: it maps onto speech_to_text(mic, file).
            audio_file = [mic_audio, uploaded_audio]

            text = gr.Textbox(
                label="Text",
                placeholder="If you dont want to record or upload your voice you can input text here",
            )
            with gr.Row():
                transcribe_button = gr.Button("Speech to text go brrr")

        # Second column: generation settings, the result gallery and the
        # generation trigger.
        with gr.Column():
            steps = gr.inputs.Slider(
                minimum=1,
                maximum=50,
                step=1,
                default=1,
                label="Steps - more steps can increase quality but will take longer to generate",
            )
            width = gr.inputs.Slider(
                minimum=32,
                maximum=256,
                step=32,
                default=256,
                label="Width",
            )
            height = gr.inputs.Slider(
                minimum=32,
                maximum=256,
                step=32,
                default=256,
                label="Height",
            )
            images = gr.inputs.Slider(
                minimum=1,
                maximum=4,
                step=1,
                default=1,
                label="Images - How many images you wish to generate",
            )
            diversity = gr.inputs.Slider(
                minimum=1.0,
                maximum=15.0,
                default=15.0,
                label="Diversity scale - How different from one another you wish the images to be",
            )
            gallery = gr.Gallery(label="Individual images")
            with gr.Row():
                generate_button = gr.Button("Generate Image go brr")

    # Event wiring: the transcription fills the text box, and the text box
    # (typed or transcribed) is the prompt for image generation.
    transcribe_button.click(speech_to_text, inputs=audio_file, outputs=text)
    generate_button.click(
        text2image_latent,
        inputs=[text, steps, width, height, images, diversity],
        outputs=gallery,
    )

demo.launch(debug=True, enable_queue=True)