Yusin committed
Commit ccbd1ab
1 Parent(s): 79d11aa

Update app.py

Files changed (1)
  1. app.py +6 -322
app.py CHANGED
@@ -1,36 +1,17 @@
  import gradio as gr
- #import torch
- #import whisper
- #from datetime import datetime
  from PIL import Image
- #import flag
  import os
- #MY_SECRET_TOKEN=os.environ.get('HF_TOKEN_SD')

  #from diffusers import StableDiffusionPipeline
-
  whisper = gr.Interface.load(name="spaces/sanchit-gandhi/whisper-large-v2")
  stable_diffusion = gr.Blocks.load(name="spaces/runwayml/stable-diffusion-v1-5")
  ### β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
-
- title="Whisper to Stable Diffusion"
-
+ title="Talking to Stable Diffusion"
  ### β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
-
- #whisper_model = whisper.load_model("small")
-
- #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
- #pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=MY_SECRET_TOKEN)
- #pipe.to(device)
-
- ### β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
-
  def get_images(prompt):
      gallery_dir = stable_diffusion(prompt, fn_index=2)
      return [os.path.join(gallery_dir, img) for img in os.listdir(gallery_dir)]

-
  def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):

      whisper_results = translate_better(audio)
@@ -39,42 +20,7 @@ def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):

      return whisper_results[0], whisper_results[1], images

- #def diffuse(prompt, guidance_scale, nb_iterations, seed):
- #
- #    generator = torch.Generator(device=device).manual_seed(int(seed))
- #
- #    print("""
- #    β€”
- #    Sending prompt to Stable Diffusion ...
- #    β€”
- #    """)
- #    print("prompt: " + prompt)
- #    print("guidance scale: " + str(guidance_scale))
- #    print("inference steps: " + str(nb_iterations))
- #    print("seed: " + str(seed))
- #
- #    images_list = pipe(
- #        [prompt] * 2,
- #        guidance_scale=guidance_scale,
- #        num_inference_steps=nb_iterations,
- #        generator=generator
- #    )
- #
- #    images = []
- #
- #    safe_image = Image.open(r"unsafe.png")
- #
- #    for i, image in enumerate(images_list["sample"]):
- #        if(images_list["nsfw_content_detected"][i]):
- #            images.append(safe_image)
- #        else:
- #            images.append(image)
- #
- #
- #    print("Stable Diffusion has finished")
- #    print("β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”")
- #
- #    return images
+

  def translate_better(audio):
      print("""
@@ -91,227 +37,14 @@ def translate_better(audio):
      return transcribe_text_result, translate_text_result


- #def translate(audio):
- #    print("""
- #    β€”
- #    Sending audio to Whisper ...
- #    β€”
- #    """)
- #    # current dateTime
- #    now = datetime.now()
- #    # convert to string
- #    date_time_str = now.strftime("%Y-%m-%d %H:%M:%S")
- #    print('DateTime String:', date_time_str)
- #
- #    audio = whisper.load_audio(audio)
- #    audio = whisper.pad_or_trim(audio)
- #
- #    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
- #
- #    _, probs = whisper_model.detect_language(mel)
- #
- #    transcript_options = whisper.DecodingOptions(task="transcribe", fp16 = False)
- #    translate_options = whisper.DecodingOptions(task="translate", fp16 = False)
- #
- #    transcription = whisper.decode(whisper_model, mel, transcript_options)
- #    translation = whisper.decode(whisper_model, mel, translate_options)
- #
- #    print("language spoken: " + transcription.language)
- #    print("transcript: " + transcription.text)
- #    print("β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”")
- #    print("translated: " + translation.text)
- #    if transcription.language == "en":
- #        tr_flag = flag.flag('GB')
- #    else:
- #        tr_flag = flag.flag(transcription.language)
- #    return tr_flag, transcription.text, translation.text
-
- ### β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
-
- css = """
- .container {
- max-width: 780px;
- margin: auto;
- padding-top: 1.5rem;
- }
- a {
- text-decoration: underline;
- }
- h1 {
- font-weight: 900;
- margin-bottom: 7px;
- text-align: center;
- font-size: 2em;
- margin-bottom: 1em;
- }
- #w2sd_container{
- margin-top: 20px;
- }
- .footer {
- margin-bottom: 45px;
- margin-top: 35px;
- text-align: center;
- border-bottom: 1px solid #e5e5e5;
- }
- .footer>p {
- font-size: .8rem;
- display: inline-block;
- padding: 0 10px;
- transform: translateY(10px);
- background: white;
- }
- .dark .footer {
- border-color: #303030;
- }
- .dark .footer>p {
- background: #0b0f19;
- }
- .tabitem {
- border-bottom-left-radius: 10px;
- border-bottom-right-radius: 10px;
- }
- #record_tab, #upload_tab {
- font-size: 1.2em;
- }
- #record_btn{
-
- }
- #record_btn > div > button > span {
- width: 2.375rem;
- height: 2.375rem;
- }
- #record_btn > div > button > span > span {
- width: 2.375rem;
- height: 2.375rem;
- }
- audio {
- margin-bottom: 10px;
- }
- div#record_btn > .mt-6{
- margin-top: 0!important;
- }
- div#record_btn > .mt-6 button {
- font-size: 2em;
- width: 100%;
- padding: 20px;
- height: 160px;
- }
- div#upload_area {
- height: 11.1rem;
- }
- div#upload_area > div.w-full > div {
- min-height: 9rem;
- }
- #check_btn_1, #check_btn_2{
- color: #fff;
- --tw-gradient-from: #4caf50;
- --tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to);
- --tw-gradient-to: #4caf50;
- border-color: #8bc34a;
- }
- #magic_btn_1, #magic_btn_2{
- color: #fff;
- --tw-gradient-from: #f44336;
- --tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to);
- --tw-gradient-to: #ff9800;
- border-color: #ff9800;
- }
- input::-webkit-inner-spin-button, input::-webkit-outer-spin-button {
- -webkit-appearance: none;
- }
- input[type=number] {
- -moz-appearance: textfield;
- }
- input[type=range] {
- -webkit-appearance: none;
- cursor: pointer;
- height: 1px;
- background: currentColor;
- }
- input[type=range]::-webkit-slider-thumb {
- -webkit-appearance: none;
- width: 0.5em;
- height: 1.2em;
- border-radius: 10px;
- background: currentColor;
- }
- input[type=range]::-moz-range-thumb{
- width: 0.5em;
- height: 1.2em;
- border-radius: 10px;
- background: currentColor;
- }
- div#spoken_lang textarea {
- font-size: 4em;
- line-height: 1em;
- text-align: center;
- }
- div#transcripted {
- flex: 4;
- }
- div#translated textarea {
- font-size: 1.5em;
- line-height: 1.25em;
- }
- #sd_settings {
- margin-bottom: 20px;
- }
- #diffuse_btn {
- color: #fff;
- font-size: 1em;
- margin-bottom: 20px;
- --tw-gradient-from: #4caf50;
- --tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to);
- --tw-gradient-to: #4caf50;
- border-color: #8bc34a;
- }
- #notice {
- padding: 20px 14px 10px;
- display: flex;
- align-content: space-evenly;
- gap: 20px;
- line-height: 1em;
- font-size: .8em;
- border: 1px solid #374151;
- border-radius: 10px;
- }
- #about {
- padding: 20px;
- }
- #notice > div {
- flex: 1;
- }
-
- """
-
- ### β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”

  with gr.Blocks(css=css) as demo:
- with gr.Column():
- gr.HTML('''
- <h1>
- Whisper to Stable Diffusion
- </h1>
- <p style='text-align: center;'>
- Ask stable diffusion for images by speaking (or singing πŸ€—) in your native language ! Try it in French πŸ˜‰
- </p>
-
- <p style='text-align: center;'>
- This demo is wired to the official SD Space β€’ Offered by Sylvain <a href='https://twitter.com/fffiloni' target='_blank'>@fffiloni</a> β€’ <img id='visitor-badge' alt='visitor badge' src='https://visitor-badge.glitch.me/badge?page_id=gradio-blocks.whisper-to-stable-diffusion' style='display: inline-block' /><br />
- β€”
- </p>
-
- ''')
- # with gr.Row(elem_id="w2sd_container"):
- # with gr.Column():
-
  gr.Markdown(
  """
-
- ## 1. Record audio or Upload an audio file:
+ ## 1. Say what you want:
  """
  )
-
+ with gr.Column():
  with gr.Tab(label="Record audio input", elem_id="record_tab"):
  with gr.Column():
  record_input = gr.Audio(
@@ -369,13 +102,7 @@ with gr.Blocks(css=css) as demo:

  clear_btn.click(fn=lambda value: gr.update(value=""), inputs=clear_btn, outputs=translated_output)

-
-
-
-
- # with gr.Column():
-
-
+

  gr.Markdown("""
  ## 3. Wait for Stable Diffusion Results β˜•οΈ
@@ -386,42 +113,6 @@

  sd_output = gr.Gallery().style(grid=2, height="auto")


- gr.Markdown("""
- ### πŸ“Œ About the models
- <p style='font-size: 1em;line-height: 1.5em;'>
- <strong>Whisper</strong> is a general-purpose speech recognition model.<br /><br />
- It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification. <br />
- β€”
- </p>
- <p style='font-size: 1em;line-height: 1.5em;'>
- <strong>Stable Diffusion</strong> is a state of the art text-to-image model that generates images from text.
- </p>
- <div id="notice">
- <div>
- LICENSE
- <p style='font-size: 0.8em;'>
- The model is licensed with a <a href="https://huggingface.co/spaces/CompVis/stable-diffusion-license" target="_blank">CreativeML Open RAIL-M</a> license.</p>
- <p style='font-size: 0.8em;'>
- The authors claim no rights on the outputs you generate, you are free to use them and are accountable for their use which must not go against the provisions set in this license.</p>
- <p style='font-size: 0.8em;'>
- The license forbids you from sharing any content that violates any laws, produce any harm to a person, disseminate any personal information that would be meant for harm, spread misinformation and target vulnerable groups.</p>
- <p style='font-size: 0.8em;'>
- For the full list of restrictions please <a href="https://huggingface.co/spaces/CompVis/stable-diffusion-license" target="_blank" target="_blank">read the license</a>.
- </p>
- </div>
- <div>
- Biases and content acknowledgment
- <p style='font-size: 0.8em;'>
- Despite how impressive being able to turn text into image is, beware to the fact that this model may output content that reinforces or exacerbates societal biases, as well as realistic faces, pornography and violence.</p>
- <p style='font-size: 0.8em;'>
- The model was trained on the <a href="https://laion.ai/blog/laion-5b/" target="_blank">LAION-5B dataset</a>, which scraped non-curated image-text-pairs from the internet (the exception being the removal of illegal content) and is meant for research purposes.</p>
- <p style='font-size: 0.8em;'> You can read more in the <a href="https://huggingface.co/CompVis/stable-diffusion-v1-4" target="_blank">model card</a>.
- </p>
- </div>
- </div>
-
- """, elem_id="about")
-
  audio_r_translate.click(translate_better,
  inputs = record_input,
  outputs = [
@@ -472,13 +163,6 @@

  ],
  outputs = sd_output
  )
- gr.HTML('''
- <div class="footer">
- <p>Whisper by <a href="https://github.com/openai/whisper" target="_blank">OpenAI</a> - Stable Diffusion by <a href="https://huggingface.co/CompVis" target="_blank">CompVis</a> and <a href="https://huggingface.co/stabilityai" target="_blank">Stability AI</a>
- </p>
- </div>
- ''')
-

  if __name__ == "__main__":
  demo.queue(max_size=32, concurrency_count=20).launch()
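
Note: after this commit, app.py simply chains two hosted Spaces: Whisper for speech-to-English-text and Stable Diffusion for text-to-image. The snippet below is a minimal sketch of that flow for reference, not part of the commit. The Space names and the fn_index=2 call are taken from the diff; the exact call into the loaded Whisper interface (one audio filepath in, translated text out) and the helper name speech_to_images are assumptions, since translate_better's body lies outside the changed hunks.

import os
import gradio as gr

# Load both remote Spaces, as in the committed file (Gradio 3.x loading API).
whisper = gr.Interface.load(name="spaces/sanchit-gandhi/whisper-large-v2")
stable_diffusion = gr.Blocks.load(name="spaces/runwayml/stable-diffusion-v1-5")

def speech_to_images(audio_path):
    # Assumed call shape: the loaded Whisper interface is callable and returns
    # translated English text for the given audio file.
    translated_text = whisper(audio_path)
    # fn_index=2 targets the Space's text-to-image endpoint, exactly as in
    # get_images(); it returns a directory containing the generated images.
    gallery_dir = stable_diffusion(translated_text, fn_index=2)
    return [os.path.join(gallery_dir, img) for img in os.listdir(gallery_dir)]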
 