Dhruv Diddi committed
Commit e1d4069 • 1 Parent(s): a8c30fe

any text to stable diffusion

Files changed (1):
  1. app.py +16 -191
app.py CHANGED
@@ -1,117 +1,19 @@
 import gradio as gr
-#import torch
-import whisper
 from datetime import datetime
 from PIL import Image
 import flag
 import os
-#MY_SECRET_TOKEN=os.environ.get('HF_TOKEN_SD')
-
-#from diffusers import StableDiffusionPipeline
 
 stable_diffusion = gr.Blocks.load(name="spaces/stabilityai/stable-diffusion")
 ### ————————————————————————————————————————
 
-title="Whisper to Stable Diffusion"
-
-### ————————————————————————————————————————
-
-whisper_model = whisper.load_model("small")
-
-#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-#pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=MY_SECRET_TOKEN)
-#pipe.to(device)
-
-### ————————————————————————————————————————
+title="Any Text to Stable Diffusion"
 
 def get_images(prompt):
     gallery_dir = stable_diffusion(prompt, fn_index=2)
     return [os.path.join(gallery_dir, img) for img in os.listdir(gallery_dir)]
 
 
-def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):
-
-    whisper_results = translate(audio)
-    prompt = whisper_results[2]
-    images = get_images(prompt)
-
-    return whisper_results[0], whisper_results[1], whisper_results[2], images
-
-#def diffuse(prompt, guidance_scale, nb_iterations, seed):
-#
-#    generator = torch.Generator(device=device).manual_seed(int(seed))
-#
-#    print("""
-#    —
-#    Sending prompt to Stable Diffusion ...
-#    —
-#    """)
-#    print("prompt: " + prompt)
-#    print("guidance scale: " + str(guidance_scale))
-#    print("inference steps: " + str(nb_iterations))
-#    print("seed: " + str(seed))
-#
-#    images_list = pipe(
-#        [prompt] * 2,
-#        guidance_scale=guidance_scale,
-#        num_inference_steps=nb_iterations,
-#        generator=generator
-#    )
-#
-#    images = []
-#
-#    safe_image = Image.open(r"unsafe.png")
-#
-#    for i, image in enumerate(images_list["sample"]):
-#        if(images_list["nsfw_content_detected"][i]):
-#            images.append(safe_image)
-#        else:
-#            images.append(image)
-#
-#
-#    print("Stable Diffusion has finished")
-#    print("——————————————————————————————————————————————")
-#
-#    return images
-
-def translate(audio):
-    print("""
-    —
-    Sending audio to Whisper ...
-    —
-    """)
-    # current dateTime
-    now = datetime.now()
-    # convert to string
-    date_time_str = now.strftime("%Y-%m-%d %H:%M:%S")
-    print('DateTime String:', date_time_str)
-
-    audio = whisper.load_audio(audio)
-    audio = whisper.pad_or_trim(audio)
-
-    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
-
-    _, probs = whisper_model.detect_language(mel)
-
-    transcript_options = whisper.DecodingOptions(task="transcribe", fp16 = False)
-    translate_options = whisper.DecodingOptions(task="translate", fp16 = False)
-
-    transcription = whisper.decode(whisper_model, mel, transcript_options)
-    translation = whisper.decode(whisper_model, mel, translate_options)
-
-    print("language spoken: " + transcription.language)
-    print("transcript: " + transcription.text)
-    print("——————————————————————————————————————————————")
-    print("translated: " + translation.text)
-    if transcription.language == "en":
-        tr_flag = flag.flag('GB')
-    else:
-        tr_flag = flag.flag(transcription.language)
-    return tr_flag, transcription.text, translation.text
-
-### ————————————————————————————————————————
-
 css = """
 .container {
     max-width: 880px;
@@ -274,15 +176,14 @@ with gr.Blocks(css=css) as demo:
     with gr.Column():
         gr.HTML('''
             <h1>
-                Whisper to Stable Diffusion
+                Any Text to Stable Diffusion
            </h1>
            <p style='text-align: center;'>
-                Ask stable diffusion for images by speaking (or singing 🤗) in your native language ! Try it in French 😉
+                Ask stable diffusion in any language !
            </p>
 
            <p style='text-align: center;'>
-                This demo is wired to the official SD Space • Offered by Sylvain <a href='https://twitter.com/fffiloni' target='_blank'>@fffiloni</a> • <img id='visitor-badge' alt='visitor badge' src='https://visitor-badge.glitch.me/badge?page_id=gradio-blocks.whisper-to-stable-diffusion' style='display: inline-block' /><br />
-                —
+                This demo is connected to StableDiffusion Space • Offered by ddiddi <br />
            </p>
 
         ''')
@@ -291,35 +192,12 @@ with gr.Blocks(css=css) as demo:
 
     gr.Markdown(
         """
-
-        ## 1. Record audio or Upload an audio file:
+        ## 1. Stable Diffusion Config
        """
     )
 
-    with gr.Tab(label="Record audio input", elem_id="record_tab"):
-        with gr.Column():
-            record_input = gr.Audio(
-                source="microphone",
-                type="filepath",
-                show_label=False,
-                elem_id="record_btn"
-            )
-            with gr.Row():
-                audio_r_translate = gr.Button("Check Whisper first ? 👍", elem_id="check_btn_1")
-                audio_r_direct_sd = gr.Button("Magic Whisper › SD right now!", elem_id="magic_btn_1")
-
-    with gr.Tab(label="Upload audio input", elem_id="upload_tab"):
-        with gr.Column():
-            upload_input = gr.Audio(
-                source="upload",
-                type="filepath",
-                show_label=False,
-                elem_id="upload_area"
-            )
-            with gr.Row():
-                audio_u_translate = gr.Button("Check Whisper first ? 👍", elem_id="check_btn_2")
-                audio_u_direct_sd = gr.Button("Magic Whisper › SD right now!", elem_id="magic_btn_2")
 
+
     with gr.Accordion(label="Stable Diffusion Settings", elem_id="sd_settings", visible=False):
         with gr.Row():
             guidance_scale = gr.Slider(2, 15, value = 7, label = 'Guidance Scale')
@@ -328,28 +206,27 @@ with gr.Blocks(css=css) as demo:
 
     gr.Markdown(
         """
-        ## 2. Check Whisper output, correct it if necessary:
+        ## 2. Enter prompt
        """
     )
 
    with gr.Row():
 
        transcripted_output = gr.Textbox(
-            label="Transcription in your detected spoken language",
+            label="Enter prompt",
            lines=3,
-            elem_id="transcripted"
+            elem_id="transcript"
        )
-        language_detected_output = gr.Textbox(label="Native language", elem_id="spoken_lang",lines=3)
-
+
        with gr.Column():
            translated_output = gr.Textbox(
-                label="Transcript translated in English by Whisper",
+                label="in English",
                lines=4,
                elem_id="translated"
            )
            with gr.Row():
                clear_btn = gr.Button(value="Clear")
-                diffuse_btn = gr.Button(value="OK, Diffuse this prompt !", elem_id="diffuse_btn")
+                diffuse_btn = gr.Button(value="YES", elem_id="diffuse_btn")
 
    clear_btn.click(fn=lambda value: gr.update(value=""), inputs=clear_btn, outputs=translated_output)
 
@@ -362,8 +239,8 @@ with gr.Blocks(css=css) as demo:
 
 
    gr.Markdown("""
-        ## 3. Wait for Stable Diffusion Results ☕️
-        Inference time is about ~20-30 seconds, when it's your turn 😬
+        ## 3. Stable Diffusion Results
+        Inference time is about ~30-40 seconds
        """
    )
 
@@ -371,11 +248,8 @@ with gr.Blocks(css=css) as demo:
 
 
    gr.Markdown("""
-        ### 📌 About the models
-        <p style='font-size: 1em;line-height: 1.5em;'>
-        <strong>Whisper</strong> is a general-purpose speech recognition model.<br /><br />
-        It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification. <br />
-        —
+        ### 📌 Resources
+
        </p>
        <p style='font-size: 1em;line-height: 1.5em;'>
        <strong>Stable Diffusion</strong> is a state of the art text-to-image model that generates images from text.
@@ -406,49 +280,6 @@ with gr.Blocks(css=css) as demo:
 
    """, elem_id="about")
 
-    audio_r_translate.click(translate,
-                            inputs = record_input,
-                            outputs = [
-                                language_detected_output,
-                                transcripted_output,
-                                translated_output
-                            ])
-
-    audio_u_translate.click(translate,
-                            inputs = upload_input,
-                            outputs = [
-                                language_detected_output,
-                                transcripted_output,
-                                translated_output
-                            ])
-
-    audio_r_direct_sd.click(magic_whisper_to_sd,
-                            inputs = [
-                                record_input,
-                                guidance_scale,
-                                nb_iterations,
-                                seed
-                            ],
-                            outputs = [
-                                language_detected_output,
-                                transcripted_output,
-                                translated_output,
-                                sd_output
-                            ])
-
-    audio_u_direct_sd.click(magic_whisper_to_sd,
-                            inputs = [
-                                upload_input,
-                                guidance_scale,
-                                nb_iterations,
-                                seed
-                            ],
-                            outputs = [
-                                language_detected_output,
-                                transcripted_output,
-                                translated_output,
-                                sd_output
-                            ])
 
    diffuse_btn.click(get_images,
                      inputs = [
@@ -456,12 +287,6 @@ with gr.Blocks(css=css) as demo:
                      ],
                      outputs = sd_output
    )
-    gr.HTML('''
-        <div class="footer">
-            <p>Whisper by <a href="https://github.com/openai/whisper" target="_blank">OpenAI</a> - Stable Diffusion by <a href="https://huggingface.co/CompVis" target="_blank">CompVis</a> and <a href="https://huggingface.co/stabilityai" target="_blank">Stability AI</a>
-            </p>
-        </div>
-    ''')
 
 
 if __name__ == "__main__":
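
Pattern note: after this commit, app.py no longer runs Whisper or a local diffusers pipeline; every prompt is forwarded to the hosted stabilityai/stable-diffusion Space. A minimal standalone sketch of that delegation, reusing the same Gradio 3.x calls as the new app.py (fn_index=2 is the Space's own endpoint index and could change; the smoke test at the end is hypothetical and not part of the commit):

import os
import gradio as gr

# Load the public Space as a local callable (Gradio 3.x API, as in app.py).
stable_diffusion = gr.Blocks.load(name="spaces/stabilityai/stable-diffusion")

def get_images(prompt):
    # fn_index=2 selects the Space's generation endpoint; it returns a
    # directory path containing the generated images.
    gallery_dir = stable_diffusion(prompt, fn_index=2)
    return [os.path.join(gallery_dir, img) for img in os.listdir(gallery_dir)]

if __name__ == "__main__":
    # Hypothetical smoke test: print the paths of the generated images.
    print(get_images("a watercolor fox"))

Because generation happens in the remote Space, the app needs no local GPU and no HF token, which is consistent with the commit deleting the commented-out StableDiffusionPipeline setup and the whisper_model load.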