muhtasham committed
Commit 6d981ed
Parent: a32bdc6

Update app.py

Files changed (1): app.py (+43, −26)
app.py CHANGED

@@ -4,31 +4,30 @@ import gradio as gr
 import shortuuid
 import numpy as np
 from transformers import pipeline
+from moviepy.editor import AudioFileClip, ImageClip
 
 asr = pipeline("automatic-speech-recognition")
 latent = gr.Interface.load("spaces/multimodalart/latentdiffusion")
 
-#zero = pipeline("zero-shot-image-classification")
-#zero = gr.Interface.load("spaces/Datatrooper/zero-shot-image-classification")
-#tts = gr.Interface.load("spaces/osanseviero/tortoisse-tts")
-
-def text2image_latent(text, steps, width, height, images, diversity):
-    print(text)
-    results = latent(text, steps, width, height, images, diversity)
-    image_paths = []
-    for image in results[1]:
+# function by Epoching
+def text2image_latent(text):
+    steps = 25
+    width = 256
+    height = 256
+    num_images = 1
+    diversity = 5
+    image_bytes = latent(text, steps, width, height, num_images, diversity)
+
+    # Algo from spaces/Gradio-Blocks/latent_gpt2_story/blob/main/app.py
+    generated_images = []
+    for image in image_bytes[1]:
         image_str = image[0]
         image_str = image_str.replace("data:image/png;base64,", "")
         decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
         img = Image.open(io.BytesIO(decoded_bytes))
-        url = shortuuid.uuid()
-        temp_dir = './tmp'
-        if not os.path.exists(temp_dir):
-            os.makedirs(temp_dir, exist_ok=True)
-        image_path = f'{temp_dir}/{url}.png'
-        img.save(f'{temp_dir}/{url}.png')
-        image_paths.append(image_path)
-    return(image_paths)
+        generated_images.append(img)
+
+    return generated_images
 
 
 def speech_to_text(mic=None, file=None):
@@ -41,6 +40,24 @@ def speech_to_text(mic=None, file=None):
     transcription = asr(audio)["text"]
     return transcription
 
+
+def combine_audio_image(mic, file, gallery):
+    """Create and return a video that combines the audio with the generated image."""
+    audio_file = mic if mic is not None else file
+    generated_images = []
+    for image_str in gallery:
+        image_str = image_str.replace("data:image/png;base64,", "")
+        decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
+        img = Image.open(io.BytesIO(decoded_bytes))
+        generated_images.append(img)
+
+    # combine the generated image with the audio file and return "out.mp4"
+    audio_clip = AudioFileClip(audio_file)
+    generated_images_clip = ImageClip(np.array(generated_images[0]))
+    final_clip = generated_images_clip.set_audio(audio_clip).set_duration(audio_clip.duration)
+    final_clip.write_videofile("out.mp4", fps=24)
+
+    return "out.mp4"
 
 with gr.Blocks() as demo:
     gr.Markdown("""
@@ -53,22 +70,22 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             audio_file = [
                 gr.Audio(source="microphone", type="filepath", optional=True, label="Speak here..."),
                 gr.Audio(source="upload", type="filepath", optional=True, label="Or if you want, upload here...")]
             text = gr.Textbox(label="Text", placeholder="If you don't want to record or upload your voice, you can input text here")
             with gr.Row():
                 s2t = gr.Button("Speech to text go brrr")
         with gr.Column():
-            steps = gr.inputs.Slider(label="Steps - more steps can increase quality but will take longer to generate", default=1, maximum=50, minimum=1, step=1)
-            width = gr.inputs.Slider(label="Width", default=256, step=32, maximum=256, minimum=32)
-            height = gr.inputs.Slider(label="Height", default=256, step=32, maximum=256, minimum=32)
-            images = gr.inputs.Slider(label="Images - How many images you wish to generate", default=1, step=1, minimum=1, maximum=4)
-            diversity = gr.inputs.Slider(label="Diversity scale - How different from one another you wish the images to be", default=15.0, minimum=1.0, maximum=15.0)
             gallery = gr.Gallery(label="Individual images")
             with gr.Row():
                 get_image_latent = gr.Button("Generate Image go brr")
+        with gr.Column():
+            video = gr.Video(label="Video with audio")
+            with gr.Row():
+                get_video_latent = gr.Button("Generate Video go brr")
 
     s2t.click(speech_to_text, inputs=audio_file, outputs=text)
-    get_image_latent.click(text2image_latent, inputs=[text, steps, width, height, images, diversity], outputs=gallery)
+    get_image_latent.click(text2image_latent, inputs=text, outputs=gallery)
+    get_video_latent.click(combine_audio_image, inputs=[*audio_file, gallery], outputs=video)
 
 demo.launch(enable_queue=True, debug=True)
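
Note: after this commit, text2image_latent and combine_audio_image repeat the same base64-decoding loop. A minimal sketch of that step as a shared helper, assuming (as the committed code does) that each gallery entry is a "data:image/png;base64,..." string; the name decode_base64_image is illustrative, not part of the commit:

import base64
import io

from PIL import Image


def decode_base64_image(image_str: str) -> Image.Image:
    """Decode one "data:image/png;base64,..." string into a PIL image."""
    # Strip the data-URI prefix, then base64-decode the remaining PNG bytes.
    image_str = image_str.replace("data:image/png;base64,", "")
    decoded_bytes = base64.decodebytes(bytes(image_str, "utf-8"))
    return Image.open(io.BytesIO(decoded_bytes))

With such a helper, each loop in the commit would reduce to a one-liner like generated_images = [decode_base64_image(s) for s in gallery].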
 
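The committed combine_audio_image shows a single still image for the whole clip. A sketch of the same audio/image combination that instead spreads several generated images evenly across the audio's duration, assuming moviepy 1.x (ImageSequenceClip, set_audio, set_duration, write_videofile) and equally sized PIL images; images_to_video is an illustrative name, not part of the commit:

import numpy as np
from moviepy.editor import AudioFileClip, ImageSequenceClip


def images_to_video(images, audio_path, out_path="out.mp4"):
    """Write a video that shows each PIL image for an equal share of the audio."""
    audio_clip = AudioFileClip(audio_path)
    frames = [np.array(img) for img in images]
    # One frame per equal time slice of the audio.
    fps = len(frames) / audio_clip.duration
    video = ImageSequenceClip(frames, fps=fps)
    video = video.set_audio(audio_clip).set_duration(audio_clip.duration)
    video.write_videofile(out_path, fps=24)
    return out_path

With the commit's defaults (num_images=1) this degenerates to the single-ImageClip case the commit implements.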