Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -4,29 +4,29 @@ import gradio as gr
|
|
4 |
import shortuuid
|
5 |
import numpy as np
|
6 |
from transformers import pipeline
|
7 |
-
from moviepy.editor import AudioFileClip, ImageClip
|
8 |
|
9 |
asr = pipeline("automatic-speech-recognition")
|
10 |
latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")
|
11 |
|
12 |
-
|
13 |
-
def text2image_latent(text):
|
14 |
-
|
15 |
-
width
|
16 |
-
|
17 |
-
|
18 |
-
diversity=5
|
19 |
-
image_bytes = latent(text, steps, width, height, num_images, diversity)
|
20 |
-
|
21 |
-
generated_images = []
|
22 |
-
for image in image_bytes[1]:
|
23 |
image_str = image[0]
|
24 |
image_str = image_str.replace("data:image/png;base64,","")
|
25 |
decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
|
26 |
img = Image.open(io.BytesIO(decoded_bytes))
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
def speech_to_text(mic=None, file=None):
|
32 |
if mic is not None:
|
@@ -38,22 +38,6 @@ def speech_to_text(mic=None, file=None):
|
|
38 |
transcription = asr(audio)["text"]
|
39 |
return transcription
|
40 |
|
41 |
-
def combine_audio_image(audio_file, gallery):
    """Build a video from the gallery images plus the audio track and return its path.

    Parameters
    ----------
    audio_file : str
        Path to an audio file readable by moviepy's ``AudioFileClip``.
    gallery : iterable of str
        Base64-encoded ``data:image/png;base64,...`` image strings.

    Returns
    -------
    str
        Path of the written video file, ``"out.mp4"``.
    """
    generated_images = []
    for image_str in gallery:
        image_str = image_str.replace("data:image/png;base64,","")
        decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
        img = Image.open(io.BytesIO(decoded_bytes))
        generated_images.append(img)

    # combine generated images with audio file and return "out.mp4"
    audio_clip = AudioFileClip(audio_file)
    # NOTE(review): ImageClip renders a single still frame; if a multi-image
    # slideshow is intended, ImageSequenceClip would be needed — confirm.
    generated_images_clip = ImageClip(np.array(generated_images))
    # BUG FIX: the original did audio_clip.set_audio(generated_images_clip.audio),
    # i.e. it attached the (nonexistent) audio of the image clip onto the audio
    # clip and then tried to write that as video — a runtime error. The VIDEO
    # clip must carry the audio and the audio's duration.
    final_clip = generated_images_clip.set_audio(audio_clip).set_duration(audio_clip.duration)
    final_clip.write_videofile("out.mp4")

    return "out.mp4"
|
57 |
|
58 |
with gr.Blocks() as demo:
|
59 |
gr.Markdown( """
|
@@ -66,22 +50,22 @@ with gr.Blocks() as demo:
|
|
66 |
with gr.Row():
|
67 |
with gr.Column():
|
68 |
audio_file =[
|
69 |
-
gr.Audio(source="microphone", type="filepath", optional=True, label="Speak here..."),
|
70 |
gr.Audio(source="upload", type="filepath", optional=True, label="Or if you want upload here...")]
|
71 |
text = gr.Textbox(label="Text", placeholder="If you dont want to record or upload your voice you can input text here")
|
72 |
with gr.Row():
|
73 |
s2t = gr.Button("Speech to text go brrr")
|
74 |
with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
75 |
gallery = gr.Gallery(label="Individual images")
|
76 |
with gr.Row():
|
77 |
-
get_image_latent = gr.Button("Generate Image go brr")
|
78 |
-
|
79 |
-
video = gr.Video(label="Video with audio")
|
80 |
-
with gr.Row():
|
81 |
-
get_video_latent = gr.Button("Generate Video go brr")
|
82 |
-
|
83 |
s2t.click(speech_to_text, inputs=audio_file, outputs=text)
|
84 |
-
get_image_latent.click(text2image_latent, inputs=text, outputs=gallery)
|
85 |
-
get_video_latent.click(combine_audio_image, inputs=[audio_file, gallery], outputs=video)
|
86 |
|
87 |
demo.launch(enable_queue=True, debug=True)
|
|
|
4 |
import shortuuid
|
5 |
import numpy as np
|
6 |
from transformers import pipeline
|
|
|
7 |
|
8 |
asr = pipeline("automatic-speech-recognition")
|
9 |
latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")
|
10 |
|
11 |
def text2image_latent(text, steps, width, height, images, diversity):
    """Generate images from *text* with the loaded latent-diffusion Space.

    Forwards all generation parameters to the module-level ``latent``
    interface, decodes each returned base64 PNG, saves it under ``./tmp``
    with a shortuuid filename, and returns the list of saved file paths
    (the shape ``gr.Gallery`` expects as output).

    Parameters
    ----------
    text : str
        Prompt for the diffusion model.
    steps, width, height, images, diversity :
        Generation knobs passed straight through to ``latent``.

    Returns
    -------
    list[str]
        Paths of the PNG files written to ``./tmp``.
    """
    print(text)
    results = latent(text, steps, width, height, images, diversity)
    image_paths = []
    temp_dir = './tmp'
    # exist_ok=True already tolerates a pre-existing directory, so the
    # original per-iteration os.path.exists guard was redundant (and racy);
    # create the directory once, before the loop.
    os.makedirs(temp_dir, exist_ok=True)
    # results[1] holds one [base64_data_url] entry per generated image.
    for image in results[1]:
        image_str = image[0]
        image_str = image_str.replace("data:image/png;base64,","")
        decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
        img = Image.open(io.BytesIO(decoded_bytes))
        image_path = f'{temp_dir}/{shortuuid.uuid()}.png'
        # Reuse image_path for both save and return instead of rebuilding
        # the same f-string twice (the original could silently diverge).
        img.save(image_path)
        image_paths.append(image_path)
    return image_paths
|
30 |
|
31 |
def speech_to_text(mic=None, file=None):
|
32 |
if mic is not None:
|
|
|
38 |
transcription = asr(audio)["text"]
|
39 |
return transcription
|
40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
with gr.Blocks() as demo:
|
43 |
gr.Markdown( """
|
|
|
50 |
with gr.Row():
|
51 |
with gr.Column():
|
52 |
audio_file =[
|
53 |
+
gr.Audio(source="microphone", type="filepath", optional=True, label="Speak here..."),
|
54 |
gr.Audio(source="upload", type="filepath", optional=True, label="Or if you want upload here...")]
|
55 |
text = gr.Textbox(label="Text", placeholder="If you dont want to record or upload your voice you can input text here")
|
56 |
with gr.Row():
|
57 |
s2t = gr.Button("Speech to text go brrr")
|
58 |
with gr.Column():
|
59 |
+
steps = gr.inputs.Slider(label="Steps - more steps can increase quality but will take longer to generate",default=1,maximum=50,minimum=1,step=1)
|
60 |
+
width = gr.inputs.Slider(label="Width", default=256, step=32, maximum=256, minimum=32)
|
61 |
+
height = gr.inputs.Slider(label="Height", default=256, step=32, maximum = 256, minimum=32)
|
62 |
+
images = gr.inputs.Slider(label="Images - How many images you wish to generate", default=1, step=1, minimum=1, maximum=4)
|
63 |
+
diversity = gr.inputs.Slider(label="Diversity scale - How different from one another you wish the images to be",default=15.0, minimum=1.0, maximum=15.0)
|
64 |
gallery = gr.Gallery(label="Individual images")
|
65 |
with gr.Row():
|
66 |
+
get_image_latent = gr.Button("Generate Image go brr")
|
67 |
+
|
|
|
|
|
|
|
|
|
68 |
s2t.click(speech_to_text, inputs=audio_file, outputs=text)
|
69 |
+
get_image_latent.click(text2image_latent, inputs=[text, steps, width, height, images, diversity], outputs=gallery)
|
|
|
70 |
|
71 |
demo.launch(enable_queue=True, debug=True)
|