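"""Gradio Blocks app: record or upload speech, transcribe it with a transformers
ASR pipeline, and visualize the transcript as images by calling the
multimodalart/latentdiffusion Space."""
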
import base64
import io
import os

import gradio as gr
import shortuuid
from PIL import Image
from transformers import pipeline

# Speech-to-text via the transformers pipeline's default ASR model.
asr = pipeline("automatic-speech-recognition")
# Reuse the hosted latent-diffusion Space as a callable image generator.
latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")
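# Note (inferred from how `results` is consumed below): calling `latent` returns
# a tuple whose second element is a list of [base64 PNG data URI] entries.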


def text2image_latent(text, steps, width, height, images, diversity):
    """Generate images for `text` via the loaded latent-diffusion Space."""
    results = latent(text, steps, width, height, images, diversity)
    temp_dir = './tmp'
    os.makedirs(temp_dir, exist_ok=True)
    image_paths = []
    for image in results[1]:
        # Strip the data-URI prefix and decode the raw PNG bytes.
        image_str = image[0].replace("data:image/png;base64,", "")
        decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
        img = Image.open(io.BytesIO(decoded_bytes))
        # Save under a short random filename so the gallery can read it from disk.
        image_path = f'{temp_dir}/{shortuuid.uuid()}.png'
        img.save(image_path)
        image_paths.append(image_path)
    return image_paths


def speech_to_text(mic=None, file=None):
    """Transcribe a mic recording or an uploaded file, preferring the mic."""
    if mic is not None:
        audio = mic
    elif file is not None:
        audio = file
    else:
        return "You must either provide a mic recording or a file"
    # Both Audio components use type="filepath", so `audio` is a path the
    # ASR pipeline can consume directly.
    transcription = asr(audio)["text"]
    return transcription

    
with gr.Blocks() as demo:
    gr.Markdown( """
    # 🎤 Sing or tell your story and let this Space ✨  visualize your story along 
    ## Inspired by this [tweet](https://twitter.com/karenxcheng/status/1516816114994454529?s=20&t=moq2vK5430JoerJXBTkIuA)
    ### Soon to be added:    
    - Near real time(streaming option)
    - Option playback of you audio relayed with video
     """)    
    with gr.Row():
        with gr.Column():
            audio_file = [
                gr.Audio(source="microphone", type="filepath", label="Speak here..."),
                gr.Audio(source="upload", type="filepath", label="...or upload your audio here"),
            ]
            text = gr.Textbox(label="Text", placeholder="If you don't want to record or upload your voice, you can enter text here")
            with gr.Row():
                s2t = gr.Button("Speech to text go brrr")
        with gr.Column():
            steps = gr.Slider(label="Steps - more steps can increase quality but take longer to generate", minimum=1, maximum=50, value=1, step=1)
            width = gr.Slider(label="Width", minimum=32, maximum=256, value=256, step=32)
            height = gr.Slider(label="Height", minimum=32, maximum=256, value=256, step=32)
            images = gr.Slider(label="Images - how many images you wish to generate", minimum=1, maximum=4, value=1, step=1)
            diversity = gr.Slider(label="Diversity scale - how different from one another you wish the images to be", minimum=1.0, maximum=15.0, value=15.0)
            gallery = gr.Gallery(label="Individual images")
            with gr.Row():
                get_image_latent = gr.Button("Generate Image go brr")

    # Wire the buttons: transcribe into the textbox, then generate from its text.
    s2t.click(speech_to_text, inputs=audio_file, outputs=text)
    get_image_latent.click(text2image_latent, inputs=[text, steps, width, height, images, diversity], outputs=gallery)
      
# Enable the request queue so long-running generation jobs aren't cut off by timeouts.
demo.launch(enable_queue=True, debug=True)
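
# Assumes gradio, transformers (with a torch backend), shortuuid, and pillow
# are installed; run locally with `python app.py`.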