NTT123 committed on
Commit 3dbfd73
1 Parent(s): 2157b01

Update the Tacotron model to use phonemes instead of raw text.

.gitattributes CHANGED
@@ -27,9 +27,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 bazelisk-linux-amd64 filter=lfs diff=lfs merge=lfs -text
 wavegru_mod.so filter=lfs diff=lfs merge=lfs -text
-pretrained_model_ljs_600k.ckpt filter=lfs diff=lfs merge=lfs -text
-wavegru_vocoder_1024_v3_1310000.ckpt filter=lfs diff=lfs merge=lfs -text
-wavegru_vocoder_1024_v3_1330000.ckpt filter=lfs diff=lfs merge=lfs -text
-wavegru_vocoder_1024_v3_1340000.ckpt filter=lfs diff=lfs merge=lfs -text
-wavegru_vocoder_1024_v3_1360000.ckpt filter=lfs diff=lfs merge=lfs -text
-wavegru_vocoder_1024_v3_1400000.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+.venv
alphabet.txt CHANGED
@@ -1,25 +1,18 @@
 _
+
 
 !
 "
-'
-(
-)
 ,
--
 .
 :
 ;
 ?
-[
-]
 a
 b
-c
 d
 e
 f
-g
 h
 i
 j
@@ -29,7 +22,6 @@ m
 n
 o
 p
-q
 r
 s
 t
@@ -37,5 +29,29 @@ u
 v
 w
 x
-y
 z
+æ
+ð
+ŋ
+ɐ
+ɑ
+ɔ
+ə
+ɚ
+ɛ
+ɜ
+ɡ
+ɪ
+ɹ
+ɾ
+ʃ
+ʊ
+ʌ
+ʒ
+ʔ
+ˈ
+ˌ
+ː
+̩
+θ
+
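alphabet.txt now lists one model symbol per line: the pad character, the punctuation kept by the phonemizer, the plain letters that still appear in espeak output, and the added IPA symbols (including the stress marks ˈ/ˌ, the length mark ː, and the syllabic diacritic ̩). A minimal sketch of how such a file could map characters to token ids, assuming the loader simply uses each symbol's line index (the real loader is load_tacotron_model in inference.py; the helper names below are illustrative):

```python
# Illustrative only: build a symbol -> id lookup from alphabet.txt.
def load_alphabet(path="./alphabet.txt"):
    with open(path, encoding="utf-8") as f:
        # keep the file order; strip only the newline so a space entry survives
        return [line.rstrip("\n") for line in f]

def tokenize(phonemes, alphabet):
    """Map every phoneme character to its line index in alphabet.txt."""
    return [alphabet.index(c) for c in phonemes]

alphabet = load_alphabet()
print(tokenize("həlˈoʊ", alphabet))  # ids follow the file's line order
```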
app.py CHANGED
@@ -3,6 +3,10 @@
 # os.system("./bazelisk-linux-amd64 clean --expunge")
 # os.system("./bazelisk-linux-amd64 build wavegru_mod -c opt --copt=-march=native")
 
+# install espeak
+import os
+
+os.system("bash ./install_espeak_ng.sh")
 
 import gradio as gr
 from inference import load_tacotron_model, load_wavegru_net, mel_to_wav, text_to_mel
@@ -11,7 +15,7 @@ from wavegru_cpp import extract_weight_mask, load_wavegru_cpp
 
 def speak(text):
     alphabet, tacotron_net, tacotron_config = load_tacotron_model(
-        "./alphabet.txt", "./tacotron.toml", "./pretrained_model_ljs_600k.ckpt"
+        "./alphabet.txt", "./tacotron.toml", "./tacotrons_ljs_24k_v1_0250000.ckpt"
     )
 
     wavegru_config, wavegru_net = load_wavegru_net(
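app.py now builds espeak-ng before importing inference, because inference.py constructs its EspeakBackend at import time and needs the shared library to exist; it also points load_tacotron_model at the renamed checkpoint. A hypothetical sketch of wrapping speak() in a Gradio interface, purely for illustration (the Space's actual interface definition is outside this hunk, and the labels are assumptions):

```python
# Hypothetical wiring, not the Space's own interface code.
import gradio as gr
from app import speak  # assumes speak(text) returns audio Gradio can play

demo = gr.Interface(
    fn=speak,
    inputs=gr.Textbox(label="Text"),
    outputs=gr.Audio(label="Synthesized speech"),
)
demo.launch()
```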
inference.py CHANGED
@@ -1,3 +1,5 @@
+import os
+
 import jax
 import jax.numpy as jnp
 import librosa
@@ -14,6 +16,11 @@ from utils import (
 )
 from wavegru import WaveGRU
 
+os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = "./espeak/usr/lib/libespeak-ng.so.1.1.51"
+from phonemizer.backend import EspeakBackend
+
+backend = EspeakBackend("en-us", preserve_punctuation=True, with_stress=True)
+
 
 def load_tacotron_model(alphabet_file, config_file, model_file):
     """load tacotron model to memory"""
@@ -34,6 +41,8 @@ tacotron_inference_fn = pax.pure(lambda net, text: net.inference(text, max_len=2
 def text_to_mel(net, text, alphabet, config):
     """convert text to mel spectrogram"""
     text = english_cleaners(text)
+    text = backend.phonemize([text], strip=True)[0]
+    text = text + config["END_CHARACTER"]
    text = text + config["PAD"] * (100 - (len(text) % 100))
     tokens = []
     for c in text:
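The new text_to_mel front end is: clean the text, phonemize it with espeak (en-us, punctuation and stress preserved), append END_CHARACTER, then pad with PAD up to the next multiple of 100 characters. A standalone sketch of that flow, assuming a system espeak-ng install (the Space instead points PHONEMIZER_ESPEAK_LIBRARY at its locally built copy) and using the PAD/END_CHARACTER values from tacotron.toml:

```python
# Standalone sketch of the phoneme front end; the output string is illustrative.
from phonemizer.backend import EspeakBackend

backend = EspeakBackend("en-us", preserve_punctuation=True, with_stress=True)

text = "Hello, world!"
phonemes = backend.phonemize([text], strip=True)[0]  # roughly "həlˈoʊ, wˈɜːld!"
phonemes += "■"                                      # END_CHARACTER
phonemes += "_" * (100 - (len(phonemes) % 100))      # pad to a multiple of 100
print(len(phonemes))  # 100 for any phoneme string shorter than 100 characters
```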
install_espeak_ng.sh ADDED
@@ -0,0 +1,10 @@
+rm -rf espeak
+mkdir -p espeak
+cd espeak
+wget https://github.com/espeak-ng/espeak-ng/archive/refs/tags/1.51.zip
+unzip -qq 1.51.zip
+cd espeak-ng-1.51
+./autogen.sh
+./configure --prefix=`pwd`/../usr
+make
+make install
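The script builds espeak-ng 1.51 from source into ./espeak/usr, which is why inference.py points PHONEMIZER_ESPEAK_LIBRARY at ./espeak/usr/lib/libespeak-ng.so.1.1.51. A hypothetical guard (not in the repo) that only rebuilds when the library is missing:

```python
# Hypothetical startup guard: build espeak-ng only if the library is absent.
import os

ESPEAK_LIB = "./espeak/usr/lib/libespeak-ng.so.1.1.51"
if not os.path.exists(ESPEAK_LIB):
    os.system("bash ./install_espeak_ng.sh")
os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = ESPEAK_LIB
```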
packages.txt CHANGED
@@ -1 +1,7 @@
 libsndfile1-dev
+make
+autoconf
+automake
+libtool
+pkg-config
+gcc
requirements.txt CHANGED
@@ -8,4 +8,5 @@ numpy==1.22.3
 pax3==0.5.6
 pyyaml==6.0
 toml==0.10.2
-unidecode==1.3.4
+unidecode==1.3.4
+phonemizer==3.1.1
tacotron.py CHANGED
@@ -371,7 +371,10 @@ class Tacotron(pax.Module):
         x = x[:, : self.rr, :]
         x = jnp.reshape(x, (N, self.rr, -1))
         mel = x[..., :-1]
-        eos = x[..., -1]
+        eos_logit = x[..., -1]
+        eos_pr = jax.nn.sigmoid(eos_logit[0, -1])
+        rng_key, eos_rng_key = jax.random.split(rng_key)
+        eos = jax.random.bernoulli(eos_rng_key, p=eos_pr)
         return attn_state, decoder_rnn_states, rng_key, (mel, eos)
 
     def inference(self, text, seed=42, max_len=1000):
@@ -381,6 +384,7 @@
         text = self.encode_text(text)
         text_key = self.text_key_fc(text)
         N, L, D = text.shape
+        assert N == 1
         mel = self.go_frame(N)
 
         attn_state, decoder_rnn_states = self.decoder_initial_state(N, L)
@@ -393,7 +397,7 @@
                 attn_state, decoder_rnn_states, rng_key, mel, text, text_key
             )
             mels.append(mel)
-            if eos[0, -1].item() > 0 or count > max_len:
+            if eos.item() or count > max_len:
                 break
 
             mel = mel[:, -1, :]
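The decoder's stop decision changes from a hard threshold on the raw end-of-sequence logit to sampling: the logit is squashed with a sigmoid and a Bernoulli draw decides whether to stop at each step, so frames near the decision boundary end stochastically instead of always continuing. The added assert N == 1 reflects that eos_pr is read from the single batch element. A small illustrative comparison of the two rules (not repo code):

```python
# Old rule vs. new rule for the end-of-sequence decision (illustrative values).
import jax
import jax.numpy as jnp

eos_logit = jnp.array(0.3)                # last channel of a decoder output frame
old_stop = eos_logit > 0                  # old: stop once the logit is positive
eos_pr = jax.nn.sigmoid(eos_logit)        # new: turn the logit into a probability...
new_stop = jax.random.bernoulli(jax.random.PRNGKey(42), p=eos_pr)  # ...and sample
print(bool(old_stop), float(eos_pr), bool(new_stop))
```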
tacotron.toml CHANGED
@@ -16,6 +16,7 @@ MEL_DIM = 80 # the dimension of melspectrogram features
 MEL_MIN = 1e-5
 PAD = "_" # padding character
 PAD_TOKEN = 0
+END_CHARACTER = "■" # to signal the end of the transcript
 TEST_DATA_SIZE = 1024
 
 # model
pretrained_model_ljs_600k.ckpt → tacotrons_ljs_24k_v1_0250000.ckpt RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6ec09df89ab9b0e1fd0e310e2888d8dd3590a6dd60d2c6a6ff5c378016f5f381
-size 53525995
+oid sha256:512b3af6ef95ccc53d3516256abae81b025e110fa886ec68f9f7033039013fc6
+size 53561547