Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -4,29 +4,29 @@ import gradio as gr
|
|
4 |
import shortuuid
|
5 |
import numpy as np
|
6 |
from transformers import pipeline
|
7 |
-
from moviepy.editor import AudioFileClip, ImageClip
|
8 |
|
9 |
asr = pipeline("automatic-speech-recognition")
|
10 |
latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")
|
11 |
|
12 |
-
|
13 |
-
def text2image_latent(text):
|
14 |
-
|
15 |
-
width
|
16 |
-
|
17 |
-
|
18 |
-
diversity=5
|
19 |
-
image_bytes = latent(text, steps, width, height, num_images, diversity)
|
20 |
-
|
21 |
-
generated_images = []
|
22 |
-
for image in image_bytes[1]:
|
23 |
image_str = image[0]
|
24 |
image_str = image_str.replace("data:image/png;base64,","")
|
25 |
decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
|
26 |
img = Image.open(io.BytesIO(decoded_bytes))
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
def speech_to_text(mic=None, file=None):
|
32 |
if mic is not None:
|
@@ -38,22 +38,6 @@ def speech_to_text(mic=None, file=None):
|
|
38 |
transcription = asr(audio)["text"]
|
39 |
return transcription
|
40 |
|
41 |
-
def combine_audio_image(audio_file, gallery):
    """Build a video from the gallery images plus the audio track and return its path.

    Parameters
    ----------
    audio_file : str
        Path to an audio file readable by moviepy's ``AudioFileClip``.
    gallery : iterable of str
        Base64-encoded ``data:image/png;base64,...`` image strings.

    Returns
    -------
    str
        Path of the written video file, ``"out.mp4"``.
    """
    generated_images = []
    for image_str in gallery:
        image_str = image_str.replace("data:image/png;base64,","")
        decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
        img = Image.open(io.BytesIO(decoded_bytes))
        generated_images.append(img)

    # combine generated images with audio file and return "out.mp4"
    audio_clip = AudioFileClip(audio_file)
    # NOTE(review): ImageClip renders a single still frame; if a multi-image
    # slideshow is intended, ImageSequenceClip would be needed — confirm.
    generated_images_clip = ImageClip(np.array(generated_images))
    # BUG FIX: the original did audio_clip.set_audio(generated_images_clip.audio),
    # i.e. it attached the (nonexistent) audio of the image clip onto the audio
    # clip and then tried to write that as video — a runtime error. The VIDEO
    # clip must carry the audio and the audio's duration.
    final_clip = generated_images_clip.set_audio(audio_clip).set_duration(audio_clip.duration)
    final_clip.write_videofile("out.mp4")

    return "out.mp4"
|
57 |
|
58 |
with gr.Blocks() as demo:
|
59 |
gr.Markdown( """
|
@@ -66,22 +50,22 @@ with gr.Blocks() as demo:
|
|
66 |
with gr.Row():
|
67 |
with gr.Column():
|
68 |
audio_file =[
|
69 |
-
gr.Audio(source="microphone", type="filepath", optional=True, label="Speak here..."),
|
70 |
gr.Audio(source="upload", type="filepath", optional=True, label="Or if you want upload here...")]
|
71 |
text = gr.Textbox(label="Text", placeholder="If you dont want to record or upload your voice you can input text here")
|
72 |
with gr.Row():
|
73 |
s2t = gr.Button("Speech to text go brrr")
|
74 |
with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
75 |
gallery = gr.Gallery(label="Individual images")
|
76 |
with gr.Row():
|
77 |
-
get_image_latent = gr.Button("Generate Image go brr")
|
78 |
-
|
79 |
-
video = gr.Video(label="Video with audio")
|
80 |
-
with gr.Row():
|
81 |
-
get_video_latent = gr.Button("Generate Video go brr")
|
82 |
-
|
83 |
s2t.click(speech_to_text, inputs=audio_file, outputs=text)
|
84 |
-
get_image_latent.click(text2image_latent, inputs=text, outputs=gallery)
|
85 |
-
get_video_latent.click(combine_audio_image, inputs=[audio_file, gallery], outputs=video)
|
86 |
|
87 |
demo.launch(enable_queue=True, debug=True)
|
|
|
4 |
import shortuuid
|
5 |
import numpy as np
|
6 |
from transformers import pipeline
|
|
|
7 |
|
8 |
asr = pipeline("automatic-speech-recognition")
|
9 |
latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")
|
10 |
|
11 |
def text2image_latent(text, steps, width, height, images, diversity):
    """Generate images from *text* with the loaded latent-diffusion Space.

    Forwards all generation parameters to the module-level ``latent``
    interface, decodes each returned base64 PNG, saves it under ``./tmp``
    with a shortuuid filename, and returns the list of saved file paths
    (the shape ``gr.Gallery`` expects as output).

    Parameters
    ----------
    text : str
        Prompt for the diffusion model.
    steps, width, height, images, diversity :
        Generation knobs passed straight through to ``latent``.

    Returns
    -------
    list[str]
        Paths of the PNG files written to ``./tmp``.
    """
    print(text)
    results = latent(text, steps, width, height, images, diversity)
    image_paths = []
    temp_dir = './tmp'
    # exist_ok=True already tolerates a pre-existing directory, so the
    # original per-iteration os.path.exists guard was redundant (and racy);
    # create the directory once, before the loop.
    os.makedirs(temp_dir, exist_ok=True)
    # results[1] holds one [base64_data_url] entry per generated image.
    for image in results[1]:
        image_str = image[0]
        image_str = image_str.replace("data:image/png;base64,","")
        decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
        img = Image.open(io.BytesIO(decoded_bytes))
        image_path = f'{temp_dir}/{shortuuid.uuid()}.png'
        # Reuse image_path for both save and return instead of rebuilding
        # the same f-string twice (the original could silently diverge).
        img.save(image_path)
        image_paths.append(image_path)
    return image_paths
|
30 |
|
31 |
def speech_to_text(mic=None, file=None):
|
32 |
if mic is not None:
|
|
|
38 |
transcription = asr(audio)["text"]
|
39 |
return transcription
|
40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
with gr.Blocks() as demo:
|
43 |
gr.Markdown( """
|
|
|
50 |
with gr.Row():
|
51 |
with gr.Column():
|
52 |
audio_file =[
|
53 |
+
gr.Audio(source="microphone", type="filepath", optional=True, label="Speak here..."),
|
54 |
gr.Audio(source="upload", type="filepath", optional=True, label="Or if you want upload here...")]
|
55 |
text = gr.Textbox(label="Text", placeholder="If you dont want to record or upload your voice you can input text here")
|
56 |
with gr.Row():
|
57 |
s2t = gr.Button("Speech to text go brrr")
|
58 |
with gr.Column():
|
59 |
+
steps = gr.inputs.Slider(label="Steps - more steps can increase quality but will take longer to generate",default=1,maximum=50,minimum=1,step=1)
|
60 |
+
width = gr.inputs.Slider(label="Width", default=256, step=32, maximum=256, minimum=32)
|
61 |
+
height = gr.inputs.Slider(label="Height", default=256, step=32, maximum = 256, minimum=32)
|
62 |
+
images = gr.inputs.Slider(label="Images - How many images you wish to generate", default=1, step=1, minimum=1, maximum=4)
|
63 |
+
diversity = gr.inputs.Slider(label="Diversity scale - How different from one another you wish the images to be",default=15.0, minimum=1.0, maximum=15.0)
|
64 |
gallery = gr.Gallery(label="Individual images")
|
65 |
with gr.Row():
|
66 |
+
get_image_latent = gr.Button("Generate Image go brr")
|
67 |
+
|
|
|
|
|
|
|
|
|
68 |
s2t.click(speech_to_text, inputs=audio_file, outputs=text)
|
69 |
+
get_image_latent.click(text2image_latent, inputs=[text, steps, width, height, images, diversity], outputs=gallery)
|
|
|
70 |
|
71 |
demo.launch(enable_queue=True, debug=True)
|