lojban committed on
Commit
e85d807
1 Parent(s): f9d0d4d

save Nix-Stochastic as ogg

Browse files
Files changed (5) hide show
  1. .gitignore +1 -0
  2. app.py +8 -42
  3. dev.sh +1 -0
  4. libs/audio.py +55 -0
  5. packages.txt +2 -1
.gitignore CHANGED
@@ -6,6 +6,7 @@ __pycache__
6
  build
7
  .ipynb_checkpoints
8
  .*.swp
 
9
  lfs/*
10
  cache/*
11
  nix-tts/*
 
6
  build
7
  .ipynb_checkpoints
8
  .*.swp
9
+ *.so
10
  lfs/*
11
  cache/*
12
  nix-tts/*
app.py CHANGED
@@ -15,6 +15,7 @@ from scipy.io.wavfile import write
15
  import gradio as gr
16
  import scipy.io.wavfile
17
  import numpy as np
 
18
 
19
  def run_cmd(command):
20
  try:
@@ -133,53 +134,18 @@ def load_checkpoints():
133
 
134
  return model, hps, net_g_vctk, hps_vctk
135
 
136
-
137
- def float2pcm(sig, dtype='int16'):
138
- """Convert floating point signal with a range from -1 to 1 to PCM.
139
- Any signal values outside the interval [-1.0, 1.0) are clipped.
140
- No dithering is used.
141
- Note that there are different possibilities for scaling floating
142
- point numbers to PCM numbers, this function implements just one of
143
- them. For an overview of alternatives see
144
- http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html
145
- Parameters
146
- ----------
147
- sig : array_like
148
- Input array, must have floating point type.
149
- dtype : data type, optional
150
- Desired (integer) data type.
151
- Returns
152
- -------
153
- numpy.ndarray
154
- Integer data, scaled and clipped to the range of the given
155
- *dtype*.
156
- See Also
157
- --------
158
- pcm2float, dtype
159
- """
160
- sig = np.asarray(sig)
161
- if sig.dtype.kind != 'f':
162
- raise TypeError("'sig' must be a float array")
163
- dtype = np.dtype(dtype)
164
- if dtype.kind not in 'iu':
165
- raise TypeError("'dtype' must be an integer type")
166
-
167
- i = np.iinfo(dtype)
168
- abs_max = 2 ** (i.bits - 1)
169
- offset = i.min + abs_max
170
- return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
171
-
172
-
173
  def inference(text, language, noise_scale, noise_scale_w, length_scale, voice):
174
  if len(text.strip())==0:
175
  return []
176
  language = language.split()[0]
177
  language = language_id_lookup[language] if bool(
178
  language_id_lookup[language]) else "jbo"
 
179
  if voice == 'Nix-Deterministic' and language == 'jbo':
180
- return generate_voice(lojban2ipa(text,'nix'), current+"/pretrained/nix-tts/nix-ljspeech-v0.1")
181
  elif voice == 'Nix-Stochastic' and language == 'jbo':
182
- return generate_voice(lojban2ipa(text,'nix'), current+"/pretrained/nix-tts/nix-ljspeech-sdp-v0.1")
 
183
  elif voice == 'LJS':
184
  ipa_text, stn_tst = get_text(text, language, hps, mode="VITS")
185
  with torch.no_grad():
@@ -187,7 +153,7 @@ def inference(text, language, noise_scale, noise_scale_w, length_scale, voice):
187
  x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
188
  audio = model.infer(x_tst, x_tst_lengths, noise_scale=noise_scale,
189
  noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy()
190
- return [ipa_text, (hps.data.sampling_rate, float2pcm(audio))]
191
  else:
192
  ipa_text, stn_tst = get_text(text, language, hps_vctk, mode="VITS")
193
  with torch.no_grad():
@@ -196,8 +162,8 @@ def inference(text, language, noise_scale, noise_scale_w, length_scale, voice):
196
  sid = torch.LongTensor([voice])
197
  audio = model_vctk.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
198
  noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
199
- return [ipa_text, (hps_vctk.data.sampling_rate, float2pcm(audio))]
200
-
201
 
202
  # download_pretrained()
203
  model, hps, model_vctk, hps_vctk = load_checkpoints()
 
15
  import gradio as gr
16
  import scipy.io.wavfile
17
  import numpy as np
18
+ from libs.audio import wav2ogg, float2pcm
19
 
20
  def run_cmd(command):
21
  try:
 
134
 
135
  return model, hps, net_g_vctk, hps_vctk
136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  def inference(text, language, noise_scale, noise_scale_w, length_scale, voice):
138
  if len(text.strip())==0:
139
  return []
140
  language = language.split()[0]
141
  language = language_id_lookup[language] if bool(
142
  language_id_lookup[language]) else "jbo"
143
+ result = []
144
  if voice == 'Nix-Deterministic' and language == 'jbo':
145
+ result = generate_voice(lojban2ipa(text,'nix'), current+"/pretrained/nix-tts/nix-ljspeech-v0.1")
146
  elif voice == 'Nix-Stochastic' and language == 'jbo':
147
+ result = generate_voice(lojban2ipa(text,'nix'), current+"/pretrained/nix-tts/nix-ljspeech-sdp-v0.1")
148
+ result = [result[0], wav2ogg(result[1][1], result[1][0], text, language)]
149
  elif voice == 'LJS':
150
  ipa_text, stn_tst = get_text(text, language, hps, mode="VITS")
151
  with torch.no_grad():
 
153
  x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
154
  audio = model.infer(x_tst, x_tst_lengths, noise_scale=noise_scale,
155
  noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy()
156
+ result = [ipa_text, (hps_vctk.data.sampling_rate, float2pcm(audio))]
157
  else:
158
  ipa_text, stn_tst = get_text(text, language, hps_vctk, mode="VITS")
159
  with torch.no_grad():
 
162
  sid = torch.LongTensor([voice])
163
  audio = model_vctk.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
164
  noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
165
+ result = [ipa_text, (hps_vctk.data.sampling_rate, float2pcm(audio))]
166
+ return result
167
 
168
  # download_pretrained()
169
  model, hps, model_vctk, hps_vctk = load_checkpoints()
dev.sh CHANGED
@@ -8,6 +8,7 @@ docker rm -f jboselvoha 2> /dev/null
8
  # -p 7860:7860 \
9
  # jboselvoha
10
  docker run -d -it --name jboselvoha \
 
11
  -v $(pwd)/assets:/home/user/app/assets:Z \
12
  -v $(pwd)/pretrained/nix-tts:/home/user/app/pretrained/nix-tts/:Z \
13
  -v $(pwd)/lfs/vits:/home/user/app/pretrained/vits/:Z \
 
8
  # -p 7860:7860 \
9
  # jboselvoha
10
  docker run -d -it --name jboselvoha \
11
+ -v $(pwd)/libs:/home/user/app/libs:Z \
12
  -v $(pwd)/assets:/home/user/app/assets:Z \
13
  -v $(pwd)/pretrained/nix-tts:/home/user/app/pretrained/nix-tts/:Z \
14
  -v $(pwd)/lfs/vits:/home/user/app/pretrained/vits/:Z \
libs/audio.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pydub
3
+ from re import sub
4
+
5
def float2pcm(sig, dtype='int16'):
    """Convert a floating point signal with a range from -1 to 1 to PCM.

    Any signal values outside the interval [-1.0, 1.0) are clipped.
    No dithering is used. Note that there are different possibilities for
    scaling floating point numbers to PCM numbers; this function implements
    just one of them. For an overview of alternatives see
    http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html

    Parameters
    ----------
    sig : array_like
        Input array, must have floating point type.
    dtype : data type, optional
        Desired (integer) data type.

    Returns
    -------
    numpy.ndarray
        Integer data, scaled and clipped to the range of the given *dtype*.

    Raises
    ------
    TypeError
        If *sig* is not a float array, or *dtype* is not an integer type.
    """
    samples = np.asarray(sig)
    if samples.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    target = np.dtype(dtype)
    if target.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    info = np.iinfo(target)
    # Half the target range; for signed types the offset is zero, for
    # unsigned types it re-centers the signal around the midpoint.
    half_range = 2 ** (info.bits - 1)
    midpoint = info.min + half_range
    scaled = samples * half_range + midpoint
    return scaled.clip(info.min, info.max).astype(target)
39
+
40
def strip_text(text: str) -> str:
    """Drop every character except ASCII letters, digits and spaces."""
    # ASCII-only isalnum() matches exactly [a-zA-Z0-9].
    kept = [ch for ch in text if ch == " " or (ch.isascii() and ch.isalnum())]
    return "".join(kept)
42
+
43
def wav2ogg(x, sr, text, language, normalized=True):
    """Encode a numpy audio signal as an Ogg Vorbis file and return its path.

    Parameters
    ----------
    x : numpy.ndarray
        Audio samples; treated as stereo only when shaped (n, 2),
        otherwise as mono.
    sr : int
        Sampling rate in Hz.
    text : str
        Source text; sanitized via strip_text() to build the file name.
    language : str
        Language code; used as the file-name prefix.
    normalized : bool, optional
        If True (default), *x* is assumed to hold floats in [-1.0, 1.0]
        and is scaled to 16-bit PCM; otherwise *x* already holds integer
        sample values and is only cast.

    Returns
    -------
    str
        Path of the written .ogg file under /tmp.
    """
    channels = 2 if (x.ndim == 2 and x.shape[1] == 2) else 1
    if normalized:  # normalized array - each item should be a float in [-1, 1]
        # Clip before casting: a bare np.int16(x * 2**15) wraps around for
        # samples at exactly +1.0 instead of saturating.
        y = np.int16(np.clip(x * 2 ** 15, -32768, 32767))
    else:
        y = np.int16(x)
    song = pydub.AudioSegment(y.tobytes(), frame_rate=sr, sample_width=2, channels=channels)
    path = f"/tmp/{language}-{strip_text(text)}.ogg"
    song.export(path, format="ogg", codec="libvorbis")
    return path
packages.txt CHANGED
@@ -1,2 +1,3 @@
1
  libsndfile1
2
- espeak
 
 
1
  libsndfile1
2
+ espeak
3
+ ffmpeg