# NOTE: this file was recovered from a scraped Hugging Face Spaces page;
# the page header (runtime-error banner, commit hashes, line-number gutter)
# has been removed so the module is valid Python.
import io, os, base64
from PIL import Image
import gradio as gr
import shortuuid
import numpy as np
from transformers import pipeline
# Module-level model setup (runs once at import; downloads weights on first run).
# Default English automatic-speech-recognition pipeline for the mic/upload inputs.
asr = pipeline("automatic-speech-recognition")
# Proxy to the hosted latent-diffusion Space, callable like a local function.
latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")
def text2image_latent(text, steps, width, height, images, diversity):
    """Generate images for a text prompt via the latent-diffusion Space.

    Args:
        text: prompt to render.
        steps, width, height, images, diversity: generation controls,
            forwarded verbatim to the remote Space.

    Returns:
        List of file paths of the PNGs saved under ``./tmp``.
    """
    print(text)  # log the prompt to the Space console for debugging
    results = latent(text, steps, width, height, images, diversity)
    # Create the output directory once, before the loop — not per image.
    temp_dir = './tmp'
    os.makedirs(temp_dir, exist_ok=True)
    image_paths = []
    # results[1] appears to be a list of [base64 data-URL] entries —
    # TODO confirm against the multimodalart/latentdiffusion Space output.
    for image in results[1]:
        image_str = image[0].replace("data:image/png;base64,", "")
        img = Image.open(io.BytesIO(base64.b64decode(image_str)))
        # Random short id keeps concurrent users from clobbering each other's files.
        image_path = f'{temp_dir}/{shortuuid.uuid()}.png'
        img.save(image_path)  # reuse the computed path instead of re-formatting it
        image_paths.append(image_path)
    return image_paths
def speech_to_text(mic=None, file=None):
    """Transcribe audio to text with the module-level ASR pipeline.

    Prefers the microphone recording over the uploaded file when both are
    given; returns an instructional message (shown in the textbox) when
    neither input is provided.
    """
    audio = mic if mic is not None else file
    if audio is None:
        return "You must either provide a mic recording or a file"
    return asr(audio)["text"]
# --- Gradio UI: record/upload speech -> transcribe -> generate images ---------
# NOTE(review): uses the legacy Gradio API (gr.inputs.Slider, Audio(source=...),
# optional=..., launch(enable_queue=...)); pinned to an old gradio version —
# confirm before upgrading the dependency.
with gr.Blocks() as demo:
    gr.Markdown( """
    # 🎤 Sing or tell your story and let this Space ✨ visualize your story along
    ## Inspired by this [tweet](https://twitter.com/karenxcheng/status/1516816114994454529?s=20&t=moq2vK5430JoerJXBTkIuA)
    ### Soon to be added:
    - Near real time(streaming option)
    - Option playback of you audio relayed with video
    """)
    with gr.Row():
        # Left column: audio capture/upload and the transcription textbox.
        with gr.Column():
            # Both Audio widgets feed speech_to_text as (mic, file).
            audio_file =[
                gr.Audio(source="microphone", type="filepath", optional=True, label="Speak here..."),
                gr.Audio(source="upload", type="filepath", optional=True, label="Or if you want upload here...")]
            text = gr.Textbox(label="Text", placeholder="If you dont want to record or upload your voice you can input text here")
            with gr.Row():
                s2t = gr.Button("Speech to text go brrr")
        # Right column: generation controls and the output gallery.
        with gr.Column():
            steps = gr.inputs.Slider(label="Steps - more steps can increase quality but will take longer to generate",default=1,maximum=50,minimum=1,step=1)
            width = gr.inputs.Slider(label="Width", default=256, step=32, maximum=256, minimum=32)
            height = gr.inputs.Slider(label="Height", default=256, step=32, maximum = 256, minimum=32)
            images = gr.inputs.Slider(label="Images - How many images you wish to generate", default=1, step=1, minimum=1, maximum=4)
            diversity = gr.inputs.Slider(label="Diversity scale - How different from one another you wish the images to be",default=15.0, minimum=1.0, maximum=15.0)
            gallery = gr.Gallery(label="Individual images")
            with gr.Row():
                get_image_latent = gr.Button("Generate Image go brr")
    # Wire buttons to the handlers defined above.
    s2t.click(speech_to_text, inputs=audio_file, outputs=text)
    get_image_latent.click(text2image_latent, inputs=[text, steps, width, height, images, diversity], outputs=gallery)
# Fixed: stray " |" scrape artifact removed from the end of this line.
demo.launch(enable_queue=True, debug=True)