RO-Rtechs committed on
Commit
ffed168
·
verified ·
1 Parent(s): e5bb6b4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -115
app.py CHANGED
@@ -12,12 +12,11 @@ import edge_tts
12
  import asyncio
13
  import librosa
14
  import traceback
15
- import soundfile as sf
16
  from pedalboard import Pedalboard, Reverb, Compressor, HighpassFilter
17
  from pedalboard.io import AudioFile
18
  from pydub import AudioSegment
19
  import noisereduce as nr
20
- import numpy as np
21
 
22
  logging.getLogger("infer_rvc_python").setLevel(logging.ERROR)
23
 
@@ -32,7 +31,7 @@ PITCH_ALGO_OPT = [
32
  "harvest",
33
  "crepe",
34
  "rmvpe",
35
- "rmvpe+",
36
  ]
37
 
38
 
@@ -137,43 +136,71 @@ def add_audio_effects(audio_list):
137
 
138
 
139
  def apply_noisereduce(audio_list):
140
- # https://github.com/sa-if/Audio-Denoiser
141
- print("Noice reduce")
142
 
143
  result = []
144
  for audio_path in audio_list:
145
  out_path = f'{os.path.splitext(audio_path)[0]}_noisereduce.wav'
146
-
147
  try:
148
  # Load audio file
149
  audio = AudioSegment.from_file(audio_path)
150
-
151
  # Convert audio to numpy array
152
  samples = np.array(audio.get_array_of_samples())
153
-
154
  # Reduce noise
155
  reduced_noise = nr.reduce_noise(samples, sr=audio.frame_rate, prop_decrease=0.6)
156
-
157
  # Convert reduced noise signal back to audio
158
  reduced_audio = AudioSegment(
159
- reduced_noise.tobytes(),
160
- frame_rate=audio.frame_rate,
161
  sample_width=audio.sample_width,
162
  channels=audio.channels
163
  )
164
-
165
  # Save reduced audio to file
166
  reduced_audio.export(out_path, format="wav")
167
  result.append(out_path)
168
-
169
  except Exception as e:
170
  traceback.print_exc()
171
- print(f"Error noisereduce: {str(e)}")
172
  result.append(audio_path)
173
 
174
  return result
175
 
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  @spaces.GPU()
178
  def convert_now(audio_files, random_tag, converter):
179
  return converter(
@@ -196,10 +223,11 @@ def run(
196
  c_b_p,
197
  active_noise_reduce,
198
  audio_effects,
 
199
  ):
200
  if not audio_files:
201
- raise ValueError("The audio pls")
202
-
203
  if isinstance(audio_files, str):
204
  audio_files = [audio_files]
205
 
@@ -207,7 +235,7 @@ def run(
207
  file_m, file_index = find_my_model(file_m, file_index)
208
  print(file_m, file_index)
209
 
210
- random_tag = "USER_"+str(random.randint(10000000, 99999999))
211
 
212
  converter.apply_conf(
213
  tag=random_tag,
@@ -219,18 +247,23 @@ def run(
219
  respiration_median_filtering=r_m_f,
220
  envelope_ratio=e_r,
221
  consonant_breath_protection=c_b_p,
222
- resample_sr=44100 if audio_files[0].endswith('.mp3') else 0,
223
  )
224
  time.sleep(0.1)
225
 
226
- result = convert_now(audio_files, random_tag, converter)
 
 
 
 
 
227
 
228
  if active_noise_reduce:
229
  result = apply_noisereduce(result)
230
 
231
  if audio_effects:
232
  result = add_audio_effects(result)
233
-
234
  return result
235
 
236
 
@@ -340,15 +373,19 @@ def active_tts_conf():
340
  return gr.Checkbox(
341
  False,
342
  label="TTS",
343
- # info="",
344
  container=False,
345
  )
346
 
347
 
348
  def tts_voice_conf():
349
  return gr.Dropdown(
350
- label="tts voice",
351
- choices=voices,
 
 
 
 
 
352
  visible=False,
353
  value="en-US-EmmaMultilingualNeural-Female",
354
  )
@@ -371,12 +408,11 @@ def tts_button_conf():
371
  visible=False,
372
  )
373
 
374
-
375
  def tts_play_conf():
376
  return gr.Checkbox(
377
  False,
378
  label="Play",
379
- # info="",
380
  container=False,
381
  visible=False,
382
  )
@@ -386,7 +422,6 @@ def sound_gui():
386
  return gr.Audio(
387
  value=None,
388
  type="filepath",
389
- # format="mp3",
390
  autoplay=True,
391
  visible=False,
392
  )
@@ -396,7 +431,6 @@ def denoise_conf():
396
  return gr.Checkbox(
397
  False,
398
  label="Denoise",
399
- # info="",
400
  container=False,
401
  visible=True,
402
  )
@@ -406,7 +440,6 @@ def effects_conf():
406
  return gr.Checkbox(
407
  False,
408
  label="Effects",
409
- # info="",
410
  container=False,
411
  visible=True,
412
  )
@@ -414,12 +447,12 @@ def effects_conf():
414
 
415
  def infer_tts_audio(tts_voice, tts_text, play_tts):
416
  out_dir = "output"
417
- folder_tts = "USER_"+str(random.randint(10000, 99999))
418
-
419
  os.makedirs(out_dir, exist_ok=True)
420
  os.makedirs(os.path.join(out_dir, folder_tts), exist_ok=True)
421
  out_path = os.path.join(out_dir, folder_tts, "tts.mp3")
422
-
423
  asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(out_path))
424
  if play_tts:
425
  return [out_path], out_path
@@ -437,7 +470,7 @@ def show_components_tts(value_active):
437
  visible=value_active
438
  )
439
 
440
-
441
  def get_gui(theme):
442
  with gr.Blocks(theme=theme) as app:
443
  gr.Markdown(title)
@@ -482,70 +515,12 @@ def get_gui(theme):
482
  res_fc = respiration_filter_conf()
483
  envel_r = envelope_ratio_conf()
484
  const = consonant_protec_conf()
485
- with gr.Row():
486
- with gr.Column():
487
- with gr.Row():
488
- denoise_gui = denoise_conf()
489
- effects_gui = effects_conf()
490
- button_base = button_conf()
491
- output_base = output_conf()
492
-
493
- button_base.click(
494
- run,
495
- inputs=[
496
- aud,
497
- model,
498
- algo,
499
- algo_lvl,
500
- indx,
501
- indx_inf,
502
- res_fc,
503
- envel_r,
504
- const,
505
- denoise_gui,
506
- effects_gui,
507
- ],
508
- outputs=[output_base],
509
- )
510
 
511
-
512
- gr.Examples(
513
- examples=[
514
- [
515
- ["./test.ogg"],
516
- "./model.pth",
517
- "rmvpe+",
518
- 0,
519
- "./model.index",
520
- 0.75,
521
- 3,
522
- 0.25,
523
- 0.50,
524
- ],
525
- [
526
- ["./example2/test2.ogg"],
527
- "./example2/model_link.txt",
528
- "rmvpe+",
529
- 0,
530
- "./example2/index_link.txt",
531
- 0.75,
532
- 3,
533
- 0.25,
534
- 0.50,
535
- ],
536
- [
537
- ["./example3/test3.wav"],
538
- "./example3/zip_link.txt",
539
- "rmvpe+",
540
- 0,
541
- None,
542
- 0.75,
543
- 3,
544
- 0.25,
545
- 0.50,
546
- ],
547
-
548
- ],
549
  fn=run,
550
  inputs=[
551
  aud,
@@ -557,27 +532,12 @@ def get_gui(theme):
557
  res_fc,
558
  envel_r,
559
  const,
 
 
560
  ],
561
- outputs=[output_base],
562
- cache_examples=False,
563
  )
564
 
565
- return app
566
-
567
 
568
- if __name__ == "__main__":
569
-
570
- tts_voice_list = asyncio.new_event_loop().run_until_complete(edge_tts.list_voices())
571
- voices = sorted([f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list])
572
-
573
- app = get_gui(theme)
574
-
575
- app.queue(default_concurrency_limit=40)
576
-
577
- app.launch(
578
- max_threads=40,
579
- share=False,
580
- show_error=True,
581
- quiet=False,
582
- debug=False,
583
- )
 
12
  import asyncio
13
  import librosa
14
  import traceback
15
+ import numpy as np
16
  from pedalboard import Pedalboard, Reverb, Compressor, HighpassFilter
17
  from pedalboard.io import AudioFile
18
  from pydub import AudioSegment
19
  import noisereduce as nr
 
20
 
21
  logging.getLogger("infer_rvc_python").setLevel(logging.ERROR)
22
 
 
31
  "harvest",
32
  "crepe",
33
  "rmvpe",
34
+ "rmvpe+"
35
  ]
36
 
37
 
 
136
 
137
 
138
  def apply_noisereduce(audio_list):
139
+ # https://github.com/saif/Audio-Denoiser
140
+ print("Noise reduction")
141
 
142
  result = []
143
  for audio_path in audio_list:
144
  out_path = f'{os.path.splitext(audio_path)[0]}_noisereduce.wav'
145
+
146
  try:
147
  # Load audio file
148
  audio = AudioSegment.from_file(audio_path)
149
+
150
  # Convert audio to numpy array
151
  samples = np.array(audio.get_array_of_samples())
152
+
153
  # Reduce noise
154
  reduced_noise = nr.reduce_noise(samples, sr=audio.frame_rate, prop_decrease=0.6)
155
+
156
  # Convert reduced noise signal back to audio
157
  reduced_audio = AudioSegment(
158
+ reduced_noise.tobytes(),
159
+ frame_rate=audio.frame_rate,
160
  sample_width=audio.sample_width,
161
  channels=audio.channels
162
  )
163
+
164
  # Save reduced audio to file
165
  reduced_audio.export(out_path, format="wav")
166
  result.append(out_path)
167
+
168
  except Exception as e:
169
  traceback.print_exc()
170
+ print(f"Error in noise reduction: {str(e)}")
171
  result.append(audio_path)
172
 
173
  return result
174
 
175
 
176
+ def split_audio_into_chunks(audio_file, chunk_length_ms=30000):
177
+ """
178
+ Splits an audio file into smaller chunks.
179
+ :param audio_file: Path to the input audio file.
180
+ :param chunk_length_ms: Length of each chunk in milliseconds (default is 30 seconds).
181
+ :return: List of chunk file paths.
182
+ """
183
+ try:
184
+ audio = AudioSegment.from_file(audio_file)
185
+ chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
186
+ chunk_paths = []
187
+
188
+ base_name = os.path.splitext(os.path.basename(audio_file))[0]
189
+ output_dir = os.path.join(os.path.dirname(audio_file), f"{base_name}_chunks")
190
+ os.makedirs(output_dir, exist_ok=True)
191
+
192
+ for index, chunk in enumerate(chunks):
193
+ chunk_path = os.path.join(output_dir, f"{base_name}_chunk_{index + 1}.wav")
194
+ chunk.export(chunk_path, format="wav")
195
+ chunk_paths.append(chunk_path)
196
+
197
+ return chunk_paths
198
+ except Exception as e:
199
+ traceback.print_exc()
200
+ print(f"Error splitting audio into chunks: {str(e)}")
201
+ return [audio_file]
202
+
203
+
204
  @spaces.GPU()
205
  def convert_now(audio_files, random_tag, converter):
206
  return converter(
 
223
  c_b_p,
224
  active_noise_reduce,
225
  audio_effects,
226
+ chunk_length_ms=30000
227
  ):
228
  if not audio_files:
229
+ raise ValueError("Please provide audio files")
230
+
231
  if isinstance(audio_files, str):
232
  audio_files = [audio_files]
233
 
 
235
  file_m, file_index = find_my_model(file_m, file_index)
236
  print(file_m, file_index)
237
 
238
+ random_tag = "USER_" + str(random.randint(10000000, 99999999))
239
 
240
  converter.apply_conf(
241
  tag=random_tag,
 
247
  respiration_median_filtering=r_m_f,
248
  envelope_ratio=e_r,
249
  consonant_breath_protection=c_b_p,
250
+ resample_sr=44100 if audio_files[0].endswith('.mp3') else 0,
251
  )
252
  time.sleep(0.1)
253
 
254
+ # Split each audio file into chunks
255
+ chunked_audio_files = []
256
+ for audio_file in audio_files:
257
+ chunked_audio_files.extend(split_audio_into_chunks(audio_file, chunk_length_ms))
258
+
259
+ result = convert_now(chunked_audio_files, random_tag, converter)
260
 
261
  if active_noise_reduce:
262
  result = apply_noisereduce(result)
263
 
264
  if audio_effects:
265
  result = add_audio_effects(result)
266
+
267
  return result
268
 
269
 
 
373
  return gr.Checkbox(
374
  False,
375
  label="TTS",
 
376
  container=False,
377
  )
378
 
379
 
380
  def tts_voice_conf():
381
  return gr.Dropdown(
382
+ label="TTS Voice",
383
+ choices=[
384
+ "en-US-EmmaMultilingualNeural-Female",
385
+ "en-US-GuyMultilingualNeural-Male",
386
+ "en-GB-SoniaNeural-Female",
387
+ "fr-FR-DeniseNeural-Female"
388
+ ],
389
  visible=False,
390
  value="en-US-EmmaMultilingualNeural-Female",
391
  )
 
408
  visible=False,
409
  )
410
 
411
+
412
  def tts_play_conf():
413
  return gr.Checkbox(
414
  False,
415
  label="Play",
 
416
  container=False,
417
  visible=False,
418
  )
 
422
  return gr.Audio(
423
  value=None,
424
  type="filepath",
 
425
  autoplay=True,
426
  visible=False,
427
  )
 
431
  return gr.Checkbox(
432
  False,
433
  label="Denoise",
 
434
  container=False,
435
  visible=True,
436
  )
 
440
  return gr.Checkbox(
441
  False,
442
  label="Effects",
 
443
  container=False,
444
  visible=True,
445
  )
 
447
 
448
  def infer_tts_audio(tts_voice, tts_text, play_tts):
449
  out_dir = "output"
450
+ folder_tts = "USER_" + str(random.randint(10000, 99999))
451
+
452
  os.makedirs(out_dir, exist_ok=True)
453
  os.makedirs(os.path.join(out_dir, folder_tts), exist_ok=True)
454
  out_path = os.path.join(out_dir, folder_tts, "tts.mp3")
455
+
456
  asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(out_path))
457
  if play_tts:
458
  return [out_path], out_path
 
470
  visible=value_active
471
  )
472
 
473
+
474
  def get_gui(theme):
475
  with gr.Blocks(theme=theme) as app:
476
  gr.Markdown(title)
 
515
  res_fc = respiration_filter_conf()
516
  envel_r = envelope_ratio_conf()
517
  const = consonant_protec_conf()
518
+ denoise = denoise_conf()
519
+ effects = effects_conf()
520
+ inference_button = button_conf()
521
+ output = output_conf()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
522
 
523
+ inference_button.click(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
524
  fn=run,
525
  inputs=[
526
  aud,
 
532
  res_fc,
533
  envel_r,
534
  const,
535
+ denoise,
536
+ effects,
537
  ],
538
+ outputs=[output],
 
539
  )
540
 
541
+ app.launch()
 
542
 
543
+ get_gui(theme=theme)