vignesh584 committed
Commit 0cf81b0 · verified · 1 Parent(s): ecd92b1

Create app.py

Files changed (1):
  1. app.py +466 -0
app.py ADDED
@@ -0,0 +1,466 @@
+ import gradio as gr
+ from gradio_client import Client, handle_file
+
+ import os
+ import shutil
+ from huggingface_hub import snapshot_download
+ from mutagen.mp3 import MP3
+ from pydub import AudioSegment
+ from PIL import Image
+ import ffmpeg
+ os.chdir(os.path.dirname(os.path.abspath(__file__)))
+ from scripts.inference import inference_process
+ import argparse
+ import uuid
+
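+ # NOTE: pydub and the ffmpeg-python bindings both expect the ffmpeg binary
+ # to be installed and available on the system PATH.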
+
+
+ #hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
+
+ AUDIO_MAX_DURATION = 40000  # maximum audio length, in milliseconds
+
+ #############
+ # UTILITIES #
+ #############
+
+ def is_mp3(file_path):
+     try:
+         audio = MP3(file_path)
+         return True
+     except Exception:
+         return False
+
+ def convert_mp3_to_wav(mp3_file_path, wav_file_path):
+     # Load the MP3 file
+     audio = AudioSegment.from_mp3(mp3_file_path)
+     # Export as WAV file
+     audio.export(wav_file_path, format="wav")
+     return wav_file_path
+
+
+ def trim_audio(file_path, output_path, max_duration):
+     # Load the audio file
+     audio = AudioSegment.from_wav(file_path)
+
+     # Check the length of the audio in milliseconds
+     audio_length = len(audio)
+
+     # If the audio is longer than the maximum duration, trim it
+     if audio_length > max_duration:
+         trimmed_audio = audio[:max_duration]
+     else:
+         trimmed_audio = audio
+
+     # Export the trimmed audio to a new file
+     trimmed_audio.export(output_path, format="wav")
+
+     return output_path
+
+
+ def add_silence_to_wav(wav_file_path, duration_s=1):
+     # Load the WAV file
+     audio = AudioSegment.from_wav(wav_file_path)
+     # Create duration_s seconds of silence
+     silence = AudioSegment.silent(duration=duration_s * 1000)  # duration is in milliseconds
+     # Add the silence to the end of the audio file
+     audio_with_silence = audio + silence
+     # Export the modified audio
+     audio_with_silence.export(wav_file_path, format="wav")
+     return wav_file_path
+
+ def check_mp3(file_path):
+
+     if is_mp3(file_path):
+         unique_id = uuid.uuid4()
+         wav_file_path = f"{os.path.splitext(file_path)[0]}-{unique_id}.wav"
+         converted_audio = convert_mp3_to_wav(file_path, wav_file_path)
+         print(f"File converted to {wav_file_path}")
+
+         return converted_audio, gr.update(value=converted_audio, visible=True)
+     else:
+         print("The file is not an MP3 file.")
+
+         return file_path, gr.update(value=file_path, visible=True)
+
+ def check_and_convert_webp_to_png(input_path, output_path):
+     try:
+         # Open the image file
+         with Image.open(input_path) as img:
+             # Check if the image is in WebP format
+             if img.format == 'WEBP':
+                 # Convert and save as PNG
+                 img.save(output_path, 'PNG')
+                 print(f"Converted {input_path} to {output_path}")
+                 return output_path
+             else:
+                 print(f"The file {input_path} is not in WebP format.")
+                 return input_path
+     except IOError:
+         print(f"Cannot open {input_path}. The file might not exist or is not an image.")
+         # Fall back to the original path so callers still receive a file path
+         return input_path
+
+ def convert_user_uploded_webp(input_path):
+
+     # Convert to PNG if necessary
+     input_file = input_path
+     unique_id = uuid.uuid4()
+     output_file = f"converted_to_png_portrait-{unique_id}.png"
+     ready_png = check_and_convert_webp_to_png(input_file, output_file)
+     print(f"PORTRAIT PNG FILE: {ready_png}")
+     return ready_png
+
+ def clear_audio_elms():
+     return gr.update(value=None, visible=False)
+
+ def change_video_codec(input_file, output_file, codec='libx264', audio_codec='aac'):
+     try:
+         (
+             ffmpeg
+             .input(input_file)
+             .output(output_file, vcodec=codec, acodec=audio_codec)
+             .run(overwrite_output=True)
+         )
+         print(f'Successfully changed codec of {input_file} and saved as {output_file}')
+     except ffmpeg.Error as e:
+         print(f'Error occurred: {e.stderr.decode()}')
+
+
+
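+ # get_talk() delegates the talking-head generation to the public
+ # "fffiloni/dreamtalk" Space via gradio_client and returns the path of the
+ # video that Space produces.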
+ def get_talk(image_in, speech):
+     client = Client("fffiloni/dreamtalk")
+     result = client.predict(
+         audio_input=handle_file(speech),
+         image_path=handle_file(image_in),
+         emotional_style="M030_front_neutral_level1_001.mat",
+         api_name="/infer"
+     )
+     print(result)
+     return result['video']
+
+ #######################################################
+ # Gradio APIs for optional image and voice generation #
+ #######################################################
+
+ def generate_portrait(prompt_image):
+     if prompt_image is None or prompt_image == "":
+         raise gr.Error("Can't generate a portrait without a prompt!")
+
+     try:
+         client = Client("ByteDance/SDXL-Lightning")
+     except Exception:
+         raise gr.Error("ByteDance/SDXL-Lightning space's API might not be ready; please wait, or upload an image instead.")
+
+     result = client.predict(
+         prompt=prompt_image,
+         ckpt="4-Step",
+         api_name="/generate_image"
+     )
+     print(result)
+
+     # Convert to PNG if necessary
+     input_file = result
+     unique_id = uuid.uuid4()
+     output_file = f"converted_to_png_portrait-{unique_id}.png"
+     ready_png = check_and_convert_webp_to_png(input_file, output_file)
+     print(f"PORTRAIT PNG FILE: {ready_png}")
+
+     return ready_png
+
+ def generate_voice_with_parler(prompt_audio, voice_description):
+     if prompt_audio is None or prompt_audio == "":
+         raise gr.Error("Can't generate a voice without text to synthesize!")
+     if voice_description is None or voice_description == "":
+         gr.Info(
+             "For better control, you may want to provide a voice character description next time.",
+             duration=10,
+             visible=True
+         )
+     try:
+         client = Client("parler-tts/parler_tts_mini")
+     except Exception:
+         raise gr.Error("parler-tts/parler_tts_mini space's API might not be ready; please wait, or upload an audio file instead.")
+
+     result = client.predict(
+         text=prompt_audio,
+         description=voice_description,
+         api_name="/gen_tts"
+     )
+     print(result)
+     return result, gr.update(value=result, visible=True)
+
+ def get_whisperspeech(prompt_audio_whisperspeech, audio_to_clone):
+     try:
+         client = Client("collabora/WhisperSpeech")
+     except Exception:
+         raise gr.Error("collabora/WhisperSpeech space's API might not be ready; please wait, or upload an audio file instead.")
+
+     result = client.predict(
+         multilingual_text=prompt_audio_whisperspeech,
+         speaker_audio=handle_file(audio_to_clone),
+         speaker_url="",
+         cps=14,
+         api_name="/whisper_speech_demo"
+     )
+     print(result)
+     return result, gr.update(value=result, visible=True)
+
+
+ ########################
+ # TALKING PORTRAIT GEN #
+ ########################
+
+
+
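+ # Full pipeline: send the chosen portrait and voice to dreamtalk, then
+ # re-encode the resulting video to H.264/AAC so browsers can play it back.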
+ def generate_talking_portrait(portrait, voice):
+
+     talking_portrait_vid = get_talk(portrait, voice)
+
+     # Convert the video to a broadly readable codec; use the basename so the
+     # output is written to the working directory rather than a missing path.
+     final_output_file = f"converted_{os.path.basename(talking_portrait_vid)}"
+     change_video_codec(talking_portrait_vid, final_output_file)
+
+     return final_output_file
+
+
+
+
+ css = '''
+ #col-container {
+     margin: 0 auto;
+ }
+ #column-names {
+     margin-top: 50px;
+ }
+ #main-group {
+     background-color: transparent;
+ }
+ .tabs {
+     background-color: unset;
+ }
+ #image-block {
+     flex: 1;
+ }
+ #video-block {
+     flex: 9;
+ }
+ #audio-block, #audio-clone-elm {
+     flex: 1;
+ }
+ div#audio-clone-elm > .audio-container > button {
+     height: 180px!important;
+ }
+ div#audio-clone-elm > .audio-container > button > .wrap {
+     font-size: 0.9em;
+ }
+ #text-synth, #voice-desc {
+     height: 130px;
+ }
+ #text-synth-wsp {
+     height: 120px;
+ }
+ #audio-column, #result-column {
+     display: flex;
+ }
+ #gen-voice-btn {
+     flex: 1;
+ }
+ #parler-tab, #whisperspeech-tab {
+     padding: 0;
+ }
+ #main-submit {
+     flex: 1;
+ }
+ #pro-tips {
+     margin-top: 50px;
+ }
+ div#warning-ready {
+     background-color: #ecfdf5;
+     padding: 0 16px 16px;
+     margin: 20px 0;
+     color: #030303!important;
+ }
+ div#warning-ready > .gr-prose > h2, div#warning-ready > .gr-prose > p {
+     color: #057857!important;
+ }
+ div#warning-duplicate {
+     background-color: #ebf5ff;
+     padding: 0 16px 16px;
+     margin: 20px 0;
+     color: #030303!important;
+ }
+ div#warning-duplicate > .gr-prose > h2, div#warning-duplicate > .gr-prose > p {
+     color: #0f4592!important;
+ }
+ div#warning-duplicate strong {
+     color: #0f4592;
+ }
+ p.actions {
+     display: flex;
+     align-items: center;
+     margin: 20px 0;
+ }
+ div#warning-duplicate .actions a {
+     display: inline-block;
+     margin-right: 10px;
+ }
+ .dark #warning-duplicate {
+     background-color: #0c0c0c !important;
+     border: 1px solid white !important;
+ }
+ '''
+
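+ # The selectors above target components via the elem_id values assigned in
+ # the Blocks layout below.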
+ with gr.Blocks(css=css) as demo:
+     with gr.Column(elem_id="col-container"):
+         gr.Markdown("""
+         # CPS - 584 Deep Learning Project by Vignesh Yanamalamanda and Srija Tatineni
+
+         This is achieved with the help of several open-source models: Stable Diffusion XL Lightning | Parler TTS | WhisperSpeech | Hallo
+
+
+         Thanks to Professor Mehdi for inspiring us to be creative while learning. FYI: 4-5 seconds of audio takes ~5 minutes per inference, so please be patient.
+         """)
+         with gr.Row(elem_id="column-names"):
+             gr.Markdown("## 1. Load Image or Type")
+             gr.Markdown("## 2. Load Voice or Type")
+             gr.Markdown("## 3. Result")
+         with gr.Group(elem_id="main-group"):
+             with gr.Row():
+                 with gr.Column():
+
+                     portrait = gr.Image(
+                         sources=["upload"],
+                         type="filepath",
+                         format="png",
+                         elem_id="image-block"
+                     )
+
+                     prompt_image = gr.Textbox(
+                         label="Generate image",
+                         lines=2,
+                         max_lines=2
+                     )
+
+                     gen_image_btn = gr.Button("Generate portrait (optional)")
+
+                 with gr.Column(elem_id="audio-column"):
+
+                     voice = gr.Audio(
+                         type="filepath",
+                         elem_id="audio-block"
+                     )
+
+                     preprocess_audio_file = gr.File(visible=False)
+
+
+                     with gr.Tab("Parler TTS", elem_id="parler-tab"):
+
+                         prompt_audio = gr.Textbox(
+                             label="Text to synthesize",
+                             lines=3,
+                             max_lines=3,
+                             elem_id="text-synth"
+                         )
+
+                         voice_description = gr.Textbox(
+                             label="Voice description",
+                             lines=3,
+                             max_lines=3,
+                             elem_id="voice-desc"
+                         )
+
+                         gen_voice_btn = gr.Button("Generate voice (optional)")
+
+                     with gr.Tab("WhisperSpeech", elem_id="whisperspeech-tab"):
+                         prompt_audio_whisperspeech = gr.Textbox(
+                             label="Text to synthesize",
+                             lines=2,
+                             max_lines=2,
+                             elem_id="text-synth-wsp"
+                         )
+                         audio_to_clone = gr.Audio(
+                             label="Voice to clone",
+                             type="filepath",
+                             elem_id="audio-clone-elm"
+                         )
+                         gen_wsp_voice_btn = gr.Button("Generate voice clone (optional)")
+
+                 with gr.Column(elem_id="result-column"):
+
+                     result = gr.Video(
+                         elem_id="video-block"
+                     )
+
+                     submit_btn = gr.Button("Go Talking Portrait!", elem_id="main-submit")
+
+         with gr.Row(elem_id="pro-tips"):
+             gr.Markdown("""
+             # Project done in Summer 2024 at the University of Dayton, Dayton, OH
+
+
+             """)
+
+             gr.Markdown("""
+             # The application is built with Gradio; see the accompanying files for reference
+
+             """)
+
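+     # Wire up the UI events: normalize uploads, call the optional generator
+     # Spaces, and run the talking-portrait pipeline on submit.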
+     portrait.upload(
+         fn=convert_user_uploded_webp,
+         inputs=[portrait],
+         outputs=[portrait],
+         queue=False,
+         show_api=False
+     )
+
+     voice.upload(
+         fn=check_mp3,
+         inputs=[voice],
+         outputs=[voice, preprocess_audio_file],
+         queue=False,
+         show_api=False
+     )
+
+     voice.clear(
+         fn=clear_audio_elms,
+         inputs=None,
+         outputs=[preprocess_audio_file],
+         queue=False,
+         show_api=False
+     )
+
+     gen_image_btn.click(
+         fn=generate_portrait,
+         inputs=[prompt_image],
+         outputs=[portrait],
+         queue=False,
+         show_api=False
+     )
+
+     gen_voice_btn.click(
+         fn=generate_voice_with_parler,
+         inputs=[prompt_audio, voice_description],
+         outputs=[voice, preprocess_audio_file],
+         queue=False,
+         show_api=False
+     )
+
+     gen_wsp_voice_btn.click(
+         fn=get_whisperspeech,
+         inputs=[prompt_audio_whisperspeech, audio_to_clone],
+         outputs=[voice, preprocess_audio_file],
+         queue=False,
+         show_api=False
+     )
+
+     submit_btn.click(
+         fn=generate_talking_portrait,
+         inputs=[portrait, voice],
+         outputs=[result],
+         show_api=False
+     )
+
+
+ demo.queue(max_size=100).launch(show_error=True, show_api=False, share=True)