muhtasham committed on
Commit eb87daa
1 Parent(s): f8fe0f3

Update app.py

Files changed (1)
  app.py +24 -40
app.py CHANGED
@@ -4,29 +4,29 @@ import gradio as gr
  import shortuuid
  import numpy as np
  from transformers import pipeline
- from moviepy.editor import AudioFileClip, ImageClip
 
  asr = pipeline("automatic-speech-recognition")
  latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")
 
- # function by Epoching
- def text2image_latent(text):
-     steps=25
-     width=256
-     height=256
-     num_images=1
-     diversity=5
-     image_bytes = latent(text, steps, width, height, num_images, diversity)
-
-     generated_images = []
-     for image in image_bytes[1]:
+
+ def text2image_latent(text, steps, width, height, images, diversity):
+     print(text)
+     results = latent(text, steps, width, height, images, diversity)
+     image_paths = []
+     for image in results[1]:
          image_str = image[0]
          image_str = image_str.replace("data:image/png;base64,","")
          decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
          img = Image.open(io.BytesIO(decoded_bytes))
-         generated_images.append(img)
-
-     return generated_images
+         url = shortuuid.uuid()
+         temp_dir = './tmp'
+         if not os.path.exists(temp_dir):
+             os.makedirs(temp_dir, exist_ok=True)
+         image_path = f'{temp_dir}/{url}.png'
+         img.save(f'{temp_dir}/{url}.png')
+         image_paths.append(image_path)
+     return(image_paths)
+
 
  def speech_to_text(mic=None, file=None):
      if mic is not None:
@@ -38,22 +38,6 @@ def speech_to_text(mic=None, file=None):
      transcription = asr(audio)["text"]
      return transcription
 
- def combine_audio_image(audio_file, gallery):
-     "Create and rerturn a combined image from the audio and image"
-     generated_images = []
-     for image_str in gallery:
-         image_str = image_str.replace("data:image/png;base64,","")
-         decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
-         img = Image.open(io.BytesIO(decoded_bytes))
-         generated_images.append(img)
-
-     # combine generated images with audio file and return "out.mp4"
-     audio_clip = AudioFileClip(audio_file)
-     generated_images_clip = ImageClip(np.array(generated_images))
-     final_clip = audio_clip.set_audio(generated_images_clip.audio).set_duration(audio_clip.duration)
-     final_clip.write_videofile("out.mp4")
-
-     return "out.mp4"
 
  with gr.Blocks() as demo:
      gr.Markdown( """
@@ -66,22 +50,22 @@ with gr.Blocks() as demo:
      with gr.Row():
          with gr.Column():
              audio_file =[
-                 gr.Audio(source="microphone", type="filepath", optional=True, label="Speak here..."),
+                 gr.Audio(source="microphone", type="filepath", optional=True, label="Speak here..."),
                  gr.Audio(source="upload", type="filepath", optional=True, label="Or if you want upload here...")]
              text = gr.Textbox(label="Text", placeholder="If you dont want to record or upload your voice you can input text here")
              with gr.Row():
                  s2t = gr.Button("Speech to text go brrr")
          with gr.Column():
+             steps = gr.inputs.Slider(label="Steps - more steps can increase quality but will take longer to generate",default=1,maximum=50,minimum=1,step=1)
+             width = gr.inputs.Slider(label="Width", default=256, step=32, maximum=256, minimum=32)
+             height = gr.inputs.Slider(label="Height", default=256, step=32, maximum = 256, minimum=32)
+             images = gr.inputs.Slider(label="Images - How many images you wish to generate", default=1, step=1, minimum=1, maximum=4)
+             diversity = gr.inputs.Slider(label="Diversity scale - How different from one another you wish the images to be",default=15.0, minimum=1.0, maximum=15.0)
              gallery = gr.Gallery(label="Individual images")
              with gr.Row():
-                 get_image_latent = gr.Button("Generate Image go brr")
-         with gr.Column():
-             video = gr.Video(label="Video with audio")
-             with gr.Row():
-                 get_video_latent = gr.Button("Generate Video go brr")
-
+                 get_image_latent = gr.Button("Generate Image go brr")
+
      s2t.click(speech_to_text, inputs=audio_file, outputs=text)
-     get_image_latent.click(text2image_latent, inputs=text, outputs=gallery)
-     get_video_latent.click(combine_audio_image, inputs=[audio_file, gallery], outputs=video)
+     get_image_latent.click(text2image_latent, inputs=[text, steps, width, height, images, diversity], outputs=gallery)
 
      demo.launch(enable_queue=True, debug=True)
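For context on the new text2image_latent: each item returned by the latentdiffusion Space is a base64 data-URL, which the updated code decodes and writes to a shortuuid-named PNG under ./tmp, so gr.Gallery receives file paths instead of PIL images. Below is a minimal standalone sketch of that decode-and-save step; the helper name save_data_url_png is illustrative only (it is not part of app.py), and it assumes the "data:image/png;base64,..." string format shown in the diff as well as the os, io, base64, PIL, and shortuuid imports the Space already relies on.

    # Illustrative sketch, not part of the commit: decode one base64 data-URL
    # and save it as a PNG under ./tmp, mirroring the updated text2image_latent.
    import base64
    import io
    import os

    import shortuuid
    from PIL import Image

    def save_data_url_png(image_str, temp_dir="./tmp"):
        """Return the path of a PNG written from a 'data:image/png;base64,...' string."""
        image_str = image_str.replace("data:image/png;base64,", "")
        decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
        img = Image.open(io.BytesIO(decoded_bytes))
        os.makedirs(temp_dir, exist_ok=True)  # create ./tmp on first use
        image_path = f"{temp_dir}/{shortuuid.uuid()}.png"
        img.save(image_path)
        return image_path

Inside the loop over results[1], usage would amount to image_paths.append(save_data_url_png(image[0])).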