Spaces:
Running
Running
save Nix-Stochastic as ogg
Browse files- .gitignore +1 -0
- app.py +8 -42
- dev.sh +1 -0
- libs/audio.py +55 -0
- packages.txt +2 -1
.gitignore
CHANGED
@@ -6,6 +6,7 @@ __pycache__
|
|
6 |
build
|
7 |
.ipynb_checkpoints
|
8 |
.*.swp
|
|
|
9 |
lfs/*
|
10 |
cache/*
|
11 |
nix-tts/*
|
|
|
6 |
build
|
7 |
.ipynb_checkpoints
|
8 |
.*.swp
|
9 |
+
*.so
|
10 |
lfs/*
|
11 |
cache/*
|
12 |
nix-tts/*
|
app.py
CHANGED
@@ -15,6 +15,7 @@ from scipy.io.wavfile import write
|
|
15 |
import gradio as gr
|
16 |
import scipy.io.wavfile
|
17 |
import numpy as np
|
|
|
18 |
|
19 |
def run_cmd(command):
|
20 |
try:
|
@@ -133,53 +134,18 @@ def load_checkpoints():
|
|
133 |
|
134 |
return model, hps, net_g_vctk, hps_vctk
|
135 |
|
136 |
-
|
137 |
-
def float2pcm(sig, dtype='int16'):
|
138 |
-
"""Convert floating point signal with a range from -1 to 1 to PCM.
|
139 |
-
Any signal values outside the interval [-1.0, 1.0) are clipped.
|
140 |
-
No dithering is used.
|
141 |
-
Note that there are different possibilities for scaling floating
|
142 |
-
point numbers to PCM numbers, this function implements just one of
|
143 |
-
them. For an overview of alternatives see
|
144 |
-
http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html
|
145 |
-
Parameters
|
146 |
-
----------
|
147 |
-
sig : array_like
|
148 |
-
Input array, must have floating point type.
|
149 |
-
dtype : data type, optional
|
150 |
-
Desired (integer) data type.
|
151 |
-
Returns
|
152 |
-
-------
|
153 |
-
numpy.ndarray
|
154 |
-
Integer data, scaled and clipped to the range of the given
|
155 |
-
*dtype*.
|
156 |
-
See Also
|
157 |
-
--------
|
158 |
-
pcm2float, dtype
|
159 |
-
"""
|
160 |
-
sig = np.asarray(sig)
|
161 |
-
if sig.dtype.kind != 'f':
|
162 |
-
raise TypeError("'sig' must be a float array")
|
163 |
-
dtype = np.dtype(dtype)
|
164 |
-
if dtype.kind not in 'iu':
|
165 |
-
raise TypeError("'dtype' must be an integer type")
|
166 |
-
|
167 |
-
i = np.iinfo(dtype)
|
168 |
-
abs_max = 2 ** (i.bits - 1)
|
169 |
-
offset = i.min + abs_max
|
170 |
-
return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
|
171 |
-
|
172 |
-
|
173 |
def inference(text, language, noise_scale, noise_scale_w, length_scale, voice):
|
174 |
if len(text.strip())==0:
|
175 |
return []
|
176 |
language = language.split()[0]
|
177 |
language = language_id_lookup[language] if bool(
|
178 |
language_id_lookup[language]) else "jbo"
|
|
|
179 |
if voice == 'Nix-Deterministic' and language == 'jbo':
|
180 |
-
|
181 |
elif voice == 'Nix-Stochastic' and language == 'jbo':
|
182 |
-
|
|
|
183 |
elif voice == 'LJS':
|
184 |
ipa_text, stn_tst = get_text(text, language, hps, mode="VITS")
|
185 |
with torch.no_grad():
|
@@ -187,7 +153,7 @@ def inference(text, language, noise_scale, noise_scale_w, length_scale, voice):
|
|
187 |
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
|
188 |
audio = model.infer(x_tst, x_tst_lengths, noise_scale=noise_scale,
|
189 |
noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy()
|
190 |
-
|
191 |
else:
|
192 |
ipa_text, stn_tst = get_text(text, language, hps_vctk, mode="VITS")
|
193 |
with torch.no_grad():
|
@@ -196,8 +162,8 @@ def inference(text, language, noise_scale, noise_scale_w, length_scale, voice):
|
|
196 |
sid = torch.LongTensor([voice])
|
197 |
audio = model_vctk.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
|
198 |
noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
|
199 |
-
|
200 |
-
|
201 |
|
202 |
# download_pretrained()
|
203 |
model, hps, model_vctk, hps_vctk = load_checkpoints()
|
|
|
15 |
import gradio as gr
|
16 |
import scipy.io.wavfile
|
17 |
import numpy as np
|
18 |
+
from libs.audio import wav2ogg, float2pcm
|
19 |
|
20 |
def run_cmd(command):
|
21 |
try:
|
|
|
134 |
|
135 |
return model, hps, net_g_vctk, hps_vctk
|
136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
def inference(text, language, noise_scale, noise_scale_w, length_scale, voice):
|
138 |
if len(text.strip())==0:
|
139 |
return []
|
140 |
language = language.split()[0]
|
141 |
language = language_id_lookup[language] if bool(
|
142 |
language_id_lookup[language]) else "jbo"
|
143 |
+
result = []
|
144 |
if voice == 'Nix-Deterministic' and language == 'jbo':
|
145 |
+
result = generate_voice(lojban2ipa(text,'nix'), current+"/pretrained/nix-tts/nix-ljspeech-v0.1")
|
146 |
elif voice == 'Nix-Stochastic' and language == 'jbo':
|
147 |
+
result = generate_voice(lojban2ipa(text,'nix'), current+"/pretrained/nix-tts/nix-ljspeech-sdp-v0.1")
|
148 |
+
result = [result[0], wav2ogg(result[1][1], result[1][0], text, language)]
|
149 |
elif voice == 'LJS':
|
150 |
ipa_text, stn_tst = get_text(text, language, hps, mode="VITS")
|
151 |
with torch.no_grad():
|
|
|
153 |
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
|
154 |
audio = model.infer(x_tst, x_tst_lengths, noise_scale=noise_scale,
|
155 |
noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy()
|
156 |
+
result = [ipa_text, (hps_vctk.data.sampling_rate, float2pcm(audio))]
|
157 |
else:
|
158 |
ipa_text, stn_tst = get_text(text, language, hps_vctk, mode="VITS")
|
159 |
with torch.no_grad():
|
|
|
162 |
sid = torch.LongTensor([voice])
|
163 |
audio = model_vctk.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
|
164 |
noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
|
165 |
+
result = [ipa_text, (hps_vctk.data.sampling_rate, float2pcm(audio))]
|
166 |
+
return result
|
167 |
|
168 |
# download_pretrained()
|
169 |
model, hps, model_vctk, hps_vctk = load_checkpoints()
|
dev.sh
CHANGED
@@ -8,6 +8,7 @@ docker rm -f jboselvoha 2> /dev/null
|
|
8 |
# -p 7860:7860 \
|
9 |
# jboselvoha
|
10 |
docker run -d -it --name jboselvoha \
|
|
|
11 |
-v $(pwd)/assets:/home/user/app/assets:Z \
|
12 |
-v $(pwd)/pretrained/nix-tts:/home/user/app/pretrained/nix-tts/:Z \
|
13 |
-v $(pwd)/lfs/vits:/home/user/app/pretrained/vits/:Z \
|
|
|
8 |
# -p 7860:7860 \
|
9 |
# jboselvoha
|
10 |
docker run -d -it --name jboselvoha \
|
11 |
+
-v $(pwd)/libs:/home/user/app/libs:Z \
|
12 |
-v $(pwd)/assets:/home/user/app/assets:Z \
|
13 |
-v $(pwd)/pretrained/nix-tts:/home/user/app/pretrained/nix-tts/:Z \
|
14 |
-v $(pwd)/lfs/vits:/home/user/app/pretrained/vits/:Z \
|
libs/audio.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import pydub
|
3 |
+
from re import sub
|
4 |
+
|
5 |
+
def float2pcm(sig, dtype='int16'):
    """Convert a floating-point signal in [-1.0, 1.0) to integer PCM.

    Values outside [-1.0, 1.0) are clipped to the target type's range and
    no dithering is applied.  Scaling follows one common convention; for an
    overview of the alternatives see
    http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html

    Parameters
    ----------
    sig : array_like
        Input samples; must have a floating-point dtype.
    dtype : data type, optional
        Desired (signed or unsigned) integer output type.

    Returns
    -------
    numpy.ndarray
        ``sig`` scaled and clipped to the full range of ``dtype``.

    Raises
    ------
    TypeError
        If ``sig`` is not a float array or ``dtype`` is not an integer type.

    See Also
    --------
    pcm2float, dtype
    """
    samples = np.asarray(sig)
    if samples.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    target = np.dtype(dtype)
    if target.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    info = np.iinfo(target)
    full_scale = 2 ** (info.bits - 1)
    # For signed targets the shift is 0; for unsigned targets it re-centres
    # the output so that -1.0 maps to info.min.
    shift = info.min + full_scale
    scaled = samples * full_scale + shift
    return scaled.clip(info.min, info.max).astype(target)
39 |
+
|
40 |
+
def strip_text(text: str) -> str:
    """Keep only ASCII letters, digits and spaces (used for safe file names)."""
    cleaned = sub(r"[^a-zA-Z0-9 ]", "", text)
    return cleaned
|
42 |
+
|
43 |
+
def wav2ogg(x, sr, text, language, normalized=True):
    """Encode a numpy audio signal as an Ogg Vorbis file under /tmp.

    Parameters
    ----------
    x : numpy.ndarray
        Audio samples; mono 1-D array or stereo array shaped (n, 2).
    sr : int
        Sample rate in Hz.
    text : str
        Source text; sanitized via strip_text() to build the file name.
    language : str
        Language tag; becomes the file-name prefix.
    normalized : bool, optional
        If True, ``x`` is treated as floats in [-1, 1) and scaled to
        16-bit PCM; otherwise ``x`` is cast to int16 as-is.

    Returns
    -------
    str
        Path of the written .ogg file.
    """
    # Stereo only when the array is explicitly shaped (n, 2); everything
    # else is treated as mono.
    channels = 2 if (x.ndim == 2 and x.shape[1] == 2) else 1
    if normalized:  # normalized array - each item should be a float in [-1, 1)
        # NOTE(review): int16 input with normalized=True would overflow this
        # scaling — confirm callers pass floats here (verify generate_voice output).
        y = np.int16(x * 2 ** 15)
    else:
        y = np.int16(x)
    song = pydub.AudioSegment(y.tobytes(), frame_rate=sr, sample_width=2, channels=channels)
    path = f"/tmp/{language}-{strip_text(text)}.ogg"
    song.export(path, format="ogg", codec="libvorbis")
    return path
|
packages.txt
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
libsndfile1
|
2 |
-
espeak
|
|
|
|
1 |
libsndfile1
|
2 |
+
espeak
|
3 |
+
ffmpeg
|