Spaces:
kevinwang676
/
Runtime error

ggoknar committed on
Commit
0db6209
1 Parent(s): 33aef52

use ffmpeg to denoise microphone input

Browse files
Files changed (3) hide show
  1. app.py +44 -5
  2. ffmpeg.zip +3 -0
  3. packages.txt +1 -0
app.py CHANGED
@@ -1,22 +1,36 @@
1
  import sys
2
- import os
 
 
 
3
  # By using XTTS you agree to CPML license https://coqui.ai/cpml
4
  os.environ["COQUI_TOS_AGREED"] = "1"
5
 
 
 
6
  import langid
7
 
8
  import gradio as gr
9
  from TTS.api import TTS
10
-
11
  HF_TOKEN = os.environ.get("HF_TOKEN")
12
  from huggingface_hub import HfApi
13
  # will use api to restart space on a unrecoverable error
14
  api = HfApi(token=HF_TOKEN)
15
  repo_id = "coqui/xtts"
16
 
 
 
 
 
 
 
 
 
17
  tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")
18
  tts.to("cuda")
19
 
 
 
20
  DEVICE_ASSERT_DETECTED=0
21
  DEVICE_ASSERT_PROMPT=None
22
  DEVICE_ASSERT_LANG=None
@@ -35,13 +49,17 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic,no_lang_aut
35
 
36
  language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
37
 
 
38
  if language_predicted == "zh":
39
  #we use zh-cn
40
  language_predicted = "zh-cn"
41
  print(f"Detected language:{language_predicted}, Chosen language:{language}")
42
 
 
43
  if len(prompt)>15:
44
- #allow any language for short text as some may be common
 
 
45
  if language_predicted != language and not no_lang_auto_detect:
46
  #Please duplicate and remove this check if you really want this
47
  #Or auto-detector fails to identify language (which it can on pretty short text or mixed text)
@@ -55,7 +73,26 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic,no_lang_aut
55
 
56
  if use_mic == True:
57
  if mic_file_path is not None:
58
- speaker_wav=mic_file_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  else:
60
  gr.Warning("Please record your voice with Microphone, or uncheck Use Microphone to use reference audios")
61
  return (
@@ -65,6 +102,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic,no_lang_aut
65
 
66
  else:
67
  speaker_wav=audio_file_pth
 
68
 
69
  if len(prompt)<2:
70
  gr.Warning("Please give a longer prompt text")
@@ -323,4 +361,5 @@ gr.Interface(
323
  description=description,
324
  article=article,
325
  examples=examples,
326
- ).queue().launch(debug=True)
 
 
1
  import sys
2
+ import os,stat
3
+ import subprocess
4
+ from zipfile import ZipFile
5
+
6
  # By using XTTS you agree to CPML license https://coqui.ai/cpml
7
  os.environ["COQUI_TOS_AGREED"] = "1"
8
 
9
+ # langid is used to detect language for longer text
10
+ # Most users expect the text to be in their own language; there is a checkbox to disable this check
11
  import langid
12
 
13
  import gradio as gr
14
  from TTS.api import TTS
 
15
  HF_TOKEN = os.environ.get("HF_TOKEN")
16
  from huggingface_hub import HfApi
17
  # will use api to restart space on a unrecoverable error
18
  api = HfApi(token=HF_TOKEN)
19
  repo_id = "coqui/xtts"
20
 
21
+ # Use newer ffmpeg binary for Ubuntu20 to use denoising for microphone input
22
+ print("Export newer ffmpeg binary for denoise filter")
23
+ ZipFile("ffmpeg.zip").extractall()
24
+ print("Make ffmpeg binary executable")
25
+ st = os.stat('ffmpeg')
26
+ os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)
27
+
28
+ # Load TTS
29
  tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")
30
  tts.to("cuda")
31
 
32
+
33
+ # This is for debugging purposes only
34
  DEVICE_ASSERT_DETECTED=0
35
  DEVICE_ASSERT_PROMPT=None
36
  DEVICE_ASSERT_LANG=None
 
49
 
50
  language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
51
 
52
+ # tts expects chinese as zh-cn
53
  if language_predicted == "zh":
54
  #we use zh-cn
55
  language_predicted = "zh-cn"
56
  print(f"Detected language:{language_predicted}, Chosen language:{language}")
57
 
58
+ # Trigger language detection only when the text is longer than 15 characters
59
  if len(prompt)>15:
60
+ # allow any language for short text as some may be common
61
+ # If user unchecks language autodetection it will not trigger
62
+ # You may remove this completely for own use
63
  if language_predicted != language and not no_lang_auto_detect:
64
  #Please duplicate and remove this check if you really want this
65
  #Or auto-detector fails to identify language (which it can on pretty short text or mixed text)
 
73
 
74
  if use_mic == True:
75
  if mic_file_path is not None:
76
+ try:
77
+ # Filtering for microphone input, as it has BG noise, maybe silence in beginning and end
78
+ # This is fast filtering, not perfect
79
+ lowpass_highpass="lowpass=1000,highpass=200" #too bass
80
+
81
+ fast_denoise="afftdn=nr=12:nf=-25"
82
+ # better to remove silence in beginning and end for microphone
83
+ trim_silence="areverse,atrim=start=0.2,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,atrim=start=0.2,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
84
+ out_filename = mic_file_path +".wav" #ffmpeg to know output format
85
+
86
+ #we will use newer ffmpeg as that has afftdn denoise filter
87
+ shell_command = f"./ffmpeg -y -i {mic_file_path} -af {lowpass_highpass}{fast_denoise},{trim_silence},loudnorm {out_filename}".split(" ")
88
+
89
+ command_result = subprocess.run([item for item in shell_command], capture_output=False,text=True, check=True)
90
+ speaker_wav=out_filename
91
+ print("Filtered microphone input")
92
+ except subprocess.CalledProcessError:
93
+ # There was an error - command exited with non-zero code
94
+ print("Error: failed filtering, use original microphone input")
95
+ speaker_wav=mic_file_path
96
  else:
97
  gr.Warning("Please record your voice with Microphone, or uncheck Use Microphone to use reference audios")
98
  return (
 
102
 
103
  else:
104
  speaker_wav=audio_file_pth
105
+
106
 
107
  if len(prompt)<2:
108
  gr.Warning("Please give a longer prompt text")
 
361
  description=description,
362
  article=article,
363
  examples=examples,
364
+ ).queue().launch(debug=True)
365
+
ffmpeg.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c04aa2958762686cf94a3bd1456b4738fd537d19bb0a9b622fc788a5e4ce723
3
+ size 29207056
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ unzip