Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -4,31 +4,30 @@ import gradio as gr
 import shortuuid
 import numpy as np
 from transformers import pipeline
+from moviepy.editor import AudioFileClip, ImageClip
 
 asr = pipeline("automatic-speech-recognition")
 latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")
 
-#
-
-
-
-
-
-
-
-
+# function by Epoching
+def text2image_latent(text):
+    steps=25
+    width=256
+    height=256
+    num_images=1
+    diversity=5
+    image_bytes = latent(text, steps, width, height, num_images, diversity)
+
+    # Algo from spaces/Gradio-Blocks/latent_gpt2_story/blob/main/app.py
+    generated_images = []
+    for image in image_bytes[1]:
         image_str = image[0]
         image_str = image_str.replace("data:image/png;base64,","")
         decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
         img = Image.open(io.BytesIO(decoded_bytes))
-
-
-
-        os.makedirs(temp_dir, exist_ok=True)
-        image_path = f'{temp_dir}/{url}.png'
-        img.save(f'{temp_dir}/{url}.png')
-        image_paths.append(image_path)
-    return(image_paths)
+        generated_images.append(img)
+
+    return generated_images
 
 
 def speech_to_text(mic=None, file=None):
@@ -41,6 +40,24 @@ def speech_to_text(mic=None, file=None):
     transcription = asr(audio)["text"]
     return transcription
 
+
+
+def combine_audio_image(audio_file, gallery):
+    "Create and rerturn a combined image from the audio and image"
+    generated_images = []
+    for image_str in gallery:
+        image_str = image_str.replace("data:image/png;base64,","")
+        decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
+        img = Image.open(io.BytesIO(decoded_bytes))
+        generated_images.append(img)
+
+    # combine generated images with audio file and return "out.mp4"
+    audio_clip = AudioFileClip(audio_file)
+    generated_images_clip = ImageClip(np.array(generated_images))
+    final_clip = audio_clip.set_audio(generated_images_clip.audio).set_duration(audio_clip.duration)
+    final_clip.write_videofile("out.mp4")
+
+    return "out.mp4"
 
 with gr.Blocks() as demo:
     gr.Markdown( """
@@ -53,22 +70,22 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             audio_file =[
-                gr.Audio(source="microphone", type="filepath", optional=True, label="Speak here..."),
+                gr.Audio(source="microphone", type="filepath", optional=True, label="Speak here..."),
                 gr.Audio(source="upload", type="filepath", optional=True, label="Or if you want upload here...")]
             text = gr.Textbox(label="Text", placeholder="If you dont want to record or upload your voice you can input text here")
             with gr.Row():
                 s2t = gr.Button("Speech to text go brrr")
         with gr.Column():
-            steps = gr.inputs.Slider(label="Steps - more steps can increase quality but will take longer to generate",default=1,maximum=50,minimum=1,step=1)
-            width = gr.inputs.Slider(label="Width", default=256, step=32, maximum=256, minimum=32)
-            height = gr.inputs.Slider(label="Height", default=256, step=32, maximum=256, minimum=32)
-            images = gr.inputs.Slider(label="Images - How many images you wish to generate", default=1, step=1, minimum=1, maximum=4)
-            diversity = gr.inputs.Slider(label="Diversity scale - How different from one another you wish the images to be",default=15.0, minimum=1.0, maximum=15.0)
             gallery = gr.Gallery(label="Individual images")
             with gr.Row():
-                get_image_latent = gr.Button("Generate Image go brr")
-
+                get_image_latent = gr.Button("Generate Image go brr")
+        with gr.Column():
+            video = gr.Video(label="Video with audio")
+            with gr.Row():
+                get_video_latent = gr.Button("Generate Video go brr")
+
     s2t.click(speech_to_text, inputs=audio_file, outputs=text)
-    get_image_latent.click(text2image_latent, inputs=
+    get_image_latent.click(text2image_latent, inputs=text, outputs=gallery)
+    get_video_latent.click(combine_audio_image, inputs=[audio_file, gallery], output=video)
 
 demo.launch(enable_queue=True, debug=True)
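The same three-line base64 decode now appears in both `text2image_latent` and `combine_audio_image`, and both copies lean on `base64`, `io`, and `PIL.Image` being imported in lines 1-3, above the first hunk. A small shared helper would keep the two copies in sync; `b64_to_pil` below is a hypothetical name, not part of the commit:

```python
import base64
import io

from PIL import Image

# Hypothetical helper, not part of the commit: one place to decode the
# "data:image/png;base64,..." strings returned by the latentdiffusion space.
def b64_to_pil(image_str):
    image_str = image_str.replace("data:image/png;base64,", "")
    decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
    return Image.open(io.BytesIO(decoded_bytes))
```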
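The Space's "Runtime error" status is consistent with two problems in the new `combine_audio_image` (whose docstring's "rerturn a combined image" presumably means "return a combined video"): moviepy's `ImageClip` wraps a single H×W×3 frame, so stacking several images into one array with `np.array(generated_images)` does not produce a slideshow, and the clips are crossed at the end, since `audio_clip.set_audio(...)` leaves `final_clip` an audio clip, which has no `write_videofile` method. A minimal sketch of a working variant, assuming moviepy 1.x, equally sized frames (the committed defaults generate 256×256 images), and an even split of the audio's duration across images:

```python
import numpy as np
from moviepy.editor import AudioFileClip, ImageSequenceClip

# Sketch, not the committed code: takes decoded PIL images instead of raw
# gallery strings, builds the video clip first, then attaches the audio.
def combine_audio_image(audio_file, images):
    audio_clip = AudioFileClip(audio_file)
    frames = [np.array(img) for img in images]  # ImageSequenceClip wants arrays
    durations = [audio_clip.duration / len(frames)] * len(frames)
    video_clip = ImageSequenceClip(frames, durations=durations)
    final_clip = video_clip.set_audio(audio_clip)  # audio goes on the video clip
    final_clip.write_videofile("out.mp4", fps=24)
    return "out.mp4"
```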
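The last added line also has two wiring problems: Gradio event listeners take `outputs` (plural), so `output=video` should fail with a `TypeError`, and `audio_file` is a plain Python list of two `gr.Audio` components, so `inputs=[audio_file, gallery]` nests a list where Gradio expects components. Flattening it means `combine_audio_image` would need to accept the microphone and upload values separately, the way `speech_to_text(mic=None, file=None)` already does. A hedged sketch of the corrected wiring:

```python
# Sketch: assumes combine_audio_image is reworked to take (mic, file, gallery),
# mirroring speech_to_text's two optional audio inputs.
get_video_latent.click(
    combine_audio_image,
    inputs=[*audio_file, gallery],  # mic, upload, gallery as three components
    outputs=video,                  # `outputs`, not `output`
)
```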