ggoknar
committed on
Commit
·
0db6209
1
Parent(s):
33aef52
use ffmpeg to denoise microphone input
Browse files
- app.py +44 -5
- ffmpeg.zip +3 -0
- packages.txt +1 -0
app.py
CHANGED
@@ -1,22 +1,36 @@
|
|
1 |
import sys
|
2 |
-
import os
|
|
|
|
|
|
|
3 |
# By using XTTS you agree to CPML license https://coqui.ai/cpml
|
4 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
5 |
|
|
|
|
|
6 |
import langid
|
7 |
|
8 |
import gradio as gr
|
9 |
from TTS.api import TTS
|
10 |
-
|
11 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
12 |
from huggingface_hub import HfApi
|
13 |
# will use api to restart space on an unrecoverable error
|
14 |
api = HfApi(token=HF_TOKEN)
|
15 |
repo_id = "coqui/xtts"
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")
|
18 |
tts.to("cuda")
|
19 |
|
|
|
|
|
20 |
DEVICE_ASSERT_DETECTED=0
|
21 |
DEVICE_ASSERT_PROMPT=None
|
22 |
DEVICE_ASSERT_LANG=None
|
@@ -35,13 +49,17 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic,no_lang_aut
|
|
35 |
|
36 |
language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
|
37 |
|
|
|
38 |
if language_predicted == "zh":
|
39 |
#we use zh-cn
|
40 |
language_predicted = "zh-cn"
|
41 |
print(f"Detected language:{language_predicted}, Chosen language:{language}")
|
42 |
|
|
|
43 |
if len(prompt)>15:
|
44 |
-
#allow any language for short text as some may be common
|
|
|
|
|
45 |
if language_predicted != language and not no_lang_auto_detect:
|
46 |
#Please duplicate and remove this check if you really want this
|
47 |
#Or auto-detector fails to identify language (which it can on pretty short text or mixed text)
|
@@ -55,7 +73,26 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic,no_lang_aut
|
|
55 |
|
56 |
if use_mic == True:
|
57 |
if mic_file_path is not None:
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
else:
|
60 |
gr.Warning("Please record your voice with Microphone, or uncheck Use Microphone to use reference audios")
|
61 |
return (
|
@@ -65,6 +102,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic,no_lang_aut
|
|
65 |
|
66 |
else:
|
67 |
speaker_wav=audio_file_pth
|
|
|
68 |
|
69 |
if len(prompt)<2:
|
70 |
gr.Warning("Please give a longer prompt text")
|
@@ -323,4 +361,5 @@ gr.Interface(
|
|
323 |
description=description,
|
324 |
article=article,
|
325 |
examples=examples,
|
326 |
-
).queue().launch(debug=True)
|
|
|
|
1 |
import sys
|
2 |
+
import os,stat
|
3 |
+
import subprocess
|
4 |
+
from zipfile import ZipFile
|
5 |
+
|
6 |
# By using XTTS you agree to CPML license https://coqui.ai/cpml
|
7 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
8 |
|
9 |
+
# langid is used to detect language for longer text
|
10 |
+
# Most users expect text to be in their own language; there is a checkbox to disable it
|
11 |
import langid
|
12 |
|
13 |
import gradio as gr
|
14 |
from TTS.api import TTS
|
|
|
15 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
16 |
from huggingface_hub import HfApi
|
17 |
# will use api to restart space on an unrecoverable error
|
18 |
api = HfApi(token=HF_TOKEN)
|
19 |
repo_id = "coqui/xtts"
|
20 |
|
21 |
+
# Use a newer ffmpeg binary on Ubuntu 20 to enable denoising of microphone input
|
22 |
+
print("Export newer ffmpeg binary for denoise filter")
|
23 |
+
ZipFile("ffmpeg.zip").extractall()
|
24 |
+
print("Make ffmpeg binary executable")
|
25 |
+
st = os.stat('ffmpeg')
|
26 |
+
os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)
|
27 |
+
|
28 |
+
# Load TTS
|
29 |
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")
|
30 |
tts.to("cuda")
|
31 |
|
32 |
+
|
33 |
+
# This is for debugging purposes only
|
34 |
DEVICE_ASSERT_DETECTED=0
|
35 |
DEVICE_ASSERT_PROMPT=None
|
36 |
DEVICE_ASSERT_LANG=None
|
|
|
49 |
|
50 |
language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
|
51 |
|
52 |
+
# tts expects chinese as zh-cn
|
53 |
if language_predicted == "zh":
|
54 |
#we use zh-cn
|
55 |
language_predicted = "zh-cn"
|
56 |
print(f"Detected language:{language_predicted}, Chosen language:{language}")
|
57 |
|
58 |
+
# Only check the detected language when the text is longer than 15 characters
|
59 |
if len(prompt)>15:
|
60 |
+
# allow any language for short text as some may be common
|
61 |
+
# If user unchecks language autodetection it will not trigger
|
62 |
+
# You may remove this completely for own use
|
63 |
if language_predicted != language and not no_lang_auto_detect:
|
64 |
#Please duplicate and remove this check if you really want this
|
65 |
#Or auto-detector fails to identify language (which it can on pretty short text or mixed text)
|
|
|
73 |
|
74 |
if use_mic == True:
|
75 |
if mic_file_path is not None:
|
76 |
+
try:
|
77 |
+
# Filter microphone input, as it has background noise and possibly silence at the beginning and end
|
78 |
+
# This is fast filtering, not perfect
|
79 |
+
lowpass_highpass="lowpass=1000,highpass=200" #too bass
|
80 |
+
|
81 |
+
fast_denoise="afftdn=nr=12:nf=-25"
|
82 |
+
# better to remove silence in beginning and end for microphone
|
83 |
+
trim_silence="areverse,atrim=start=0.2,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,atrim=start=0.2,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
|
84 |
+
out_filename = mic_file_path +".wav" #ffmpeg to know output format
|
85 |
+
|
86 |
+
#we will use the newer ffmpeg as that has the afftdn denoise filter
|
87 |
+
shell_command = f"./ffmpeg -y -i {mic_file_path} -af {lowpass_highpass}{fast_denoise},{trim_silence},loudnorm {out_filename}".split(" ")
|
88 |
+
|
89 |
+
command_result = subprocess.run([item for item in shell_command], capture_output=False,text=True, check=True)
|
90 |
+
speaker_wav=out_filename
|
91 |
+
print("Filtered microphone input")
|
92 |
+
except subprocess.CalledProcessError:
|
93 |
+
# There was an error - command exited with non-zero code
|
94 |
+
print("Error: failed filtering, use original microphone input")
|
95 |
+
speaker_wav=mic_file_path
|
96 |
else:
|
97 |
gr.Warning("Please record your voice with Microphone, or uncheck Use Microphone to use reference audios")
|
98 |
return (
|
|
|
102 |
|
103 |
else:
|
104 |
speaker_wav=audio_file_pth
|
105 |
+
|
106 |
|
107 |
if len(prompt)<2:
|
108 |
gr.Warning("Please give a longer prompt text")
|
|
|
361 |
description=description,
|
362 |
article=article,
|
363 |
examples=examples,
|
364 |
+
).queue().launch(debug=True)
|
365 |
+
|
ffmpeg.zip
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6c04aa2958762686cf94a3bd1456b4738fd537d19bb0a9b622fc788a5e4ce723
|
3 |
+
size 29207056
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
unzip
|