chnk58hoang committed on
Commit db3dea6
1 Parent(s): 5ab552b

convert onnx

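The export step itself is not part of this diff; only its output (vits/coqui_vits.onnx) is committed. A minimal sketch of how the old checkpoint could have been converted with Coqui TTS 0.17.x is shown below; the use of export_onnx and the exact paths are assumptions, not code from this repository.

```python
# Hypothetical conversion script (not included in this commit).
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.models.vits import Vits

config = VitsConfig()
config.load_json("vits/vits_config.json")  # config name before the rename in this commit

vits = Vits.init_from_config(config)
vits.load_checkpoint(config, "vits/best_model_vits_22951.pth", eval=True)  # old .pth weights

# Export the inference graph; Coqui's exporter writes "coqui_vits.onnx" by default,
# which matches the file added in this commit.
vits.export_onnx(output_path="vits/coqui_vits.onnx")
```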
.gitignore CHANGED
@@ -1 +1,2 @@
-__pycache__
+__pycache__
+.idea
app.py CHANGED
@@ -2,16 +2,18 @@ import gradio as gr
 import numpy as np
 from utils import load_model, normalize_text
 
-
-vits_model = load_model()
-vits_model.tts('Alo')
+vits = load_model()
 
 
 def text_to_speech(text):
-    text = normalize_text(text)
-    audio = vits_model.tts(text)
-
-    audio = np.array(audio)
+    """ Text to speech
+    """
+    text_inputs = np.asarray(
+        vits.tokenizer.text_to_ids(text),
+        dtype=np.int64,
+    )[None, :]
+
+    audio = vits.inference_onnx(text_inputs)
     return 16000, audio
 
 
@@ -27,4 +29,3 @@ gr.Interface(
     ],
     theme="default",
 ).launch(debug=False)
-
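For reference, the same tokenize-then-ONNX path that the new text_to_speech uses can be exercised outside Gradio roughly as below. This is a sketch, not code from the commit: soundfile is an extra dependency assumed only for writing the wav, the squeeze is an assumption about inference_onnx returning a batched array, and the 16 kHz rate mirrors what app.py hard-codes.

```python
import numpy as np
import soundfile as sf  # assumed extra dependency, only used to write the wav

from utils import load_model, normalize_text

vits = load_model()  # loads vits/config.json and vits/coqui_vits.onnx

text = normalize_text("xin chào tất cả mọi người")
text_inputs = np.asarray(vits.tokenizer.text_to_ids(text), dtype=np.int64)[None, :]

audio = vits.inference_onnx(text_inputs)

# Squeeze the batch dimension before writing (shape assumption; adjust if the
# ONNX output is already 1-D).
sf.write("demo.wav", np.asarray(audio).squeeze(), 16000)
```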
requirements.txt CHANGED
@@ -1,4 +1,4 @@
-TTS
+TTS==0.17.5
 gradio
 numpy
 regex
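Pinning TTS matters here because the ONNX helpers the app relies on (Vits.load_onnx, Vits.inference_onnx) only exist in recent Coqui releases. A quick post-install sanity check, as a sketch (the __version__ attribute is assumed to be exposed by the TTS package):

```python
import TTS
from TTS.tts.models.vits import Vits

assert TTS.__version__ == "0.17.5", TTS.__version__
for name in ("load_onnx", "inference_onnx", "export_onnx"):
    assert hasattr(Vits, name), f"TTS {TTS.__version__} is missing Vits.{name}"
```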
utils.py CHANGED
@@ -1,8 +1,9 @@
-from TTS.api import TTS
+from TTS.tts.models.vits import Vits
+from TTS.tts.configs.vits_config import VitsConfig
+import numpy as np
 import unicodedata
 import regex
 
-
 num_re = regex.compile(r"([0-9.,]*[0-9])")
 digits = ["không", "một", "hai", "ba", "bốn", "năm", "sáu", "bảy", "tám", "chín"]
 
@@ -38,7 +39,7 @@ def read_number(num: str) -> str:
             return digits[n // 100] + " trăm lẻ " + digits[n % 100]
         else:
             return digits[n // 100] + " trăm " + read_number(num[1:])
-    elif len(num) >= 4 and len(num) <= 6 and num.isdigit():
+    elif 4 <= len(num) <= 6 and num.isdigit():
         n = int(num)
         n1 = n // 1000
         return read_number(str(n1)) + " ngàn " + read_number(num[-3:])
@@ -57,24 +58,31 @@ def read_number(num: str) -> str:
             return read_number(parts[0]) + " ngàn " + read_number(parts[1])
         elif len(parts) == 3:
             return (
-                read_number(parts[0])
-                + " triệu "
-                + read_number(parts[1])
-                + " ngàn "
-                + read_number(parts[2])
+                read_number(parts[0])
+                + " triệu "
+                + read_number(parts[1])
+                + " ngàn "
+                + read_number(parts[2])
             )
     return num
 
 
 def load_model():
-    config_path = 'vits/vits_config.json'
-    checkpoint_path = 'vits/best_model_vits_22951.pth'
+    config = VitsConfig()
+    config.load_json("vits/config.json")
+    vits = Vits.init_from_config(config)
+
+    vits.load_onnx("vits/coqui_vits.onnx")
+
+    text = "xin chào tôi là hoàng đây"
+    text_inputs = np.asarray(
+        vits.tokenizer.text_to_ids(text),
+        dtype=np.int64,
+    )[None, :]
 
-    tts = TTS(model_name='my_tts',
-              model_path=checkpoint_path,
-              config_path=config_path)
+    audio = vits.inference_onnx(text_inputs)
 
-    return tts
+    return vits
 
 
 def normalize_text(text):
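The warm-up inference at the end of the new load_model() runs the same tokenize-then-inference_onnx sequence that app.py repeats. Factored out, it would look roughly like the sketch below; synthesize is a hypothetical helper name, not something defined in the repository.

```python
import numpy as np


def synthesize(vits, text: str):
    """Tokenize text and run the ONNX graph, mirroring load_model()'s warm-up call."""
    text_inputs = np.asarray(
        vits.tokenizer.text_to_ids(text),
        dtype=np.int64,
    )[None, :]
    return vits.inference_onnx(text_inputs)
```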
vits/{vits_config.json → config.json} RENAMED
@@ -1,7 +1,7 @@
 {
     "output_path": "/kaggle/working/",
     "logger_uri": null,
-    "run_name": "vits_viet",
+    "run_name": "vits_viettts",
     "project_name": null,
     "run_description": "\ud83d\udc38Coqui trainer run.",
     "print_step": 25,
@@ -113,7 +113,21 @@
         }
     ],
     "test_sentences": [
-        "xin ch\u00e0o, t\u1ea5t c\u1ea3 m\u1ecdi ng\u01b0\u1eddi"
+        [
+            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."
+        ],
+        [
+            "Be a voice, not an echo."
+        ],
+        [
+            "I'm sorry Dave. I'm afraid I can't do that."
+        ],
+        [
+            "This cake is great. It's so delicious and moist."
+        ],
+        [
+            "Prior to November 22, 1963."
+        ]
     ],
     "eval_split_max_size": null,
     "eval_split_size": 0.01,
@@ -218,8 +232,8 @@
         "reinit_DP": false,
         "reinit_text_encoder": false
     },
-    "lr_gen": 0.0001,
-    "lr_disc": 0.0001,
+    "lr_gen": 0.0002,
+    "lr_disc": 0.0002,
     "lr_scheduler_gen": "ExponentialLR",
     "lr_scheduler_gen_params": {
         "gamma": 0.999875,
@@ -251,5 +265,6 @@
     "use_d_vector_file": false,
     "d_vector_file": null,
     "d_vector_dim": 0,
-    "github_branch": "* master"
+    "restore_path": "/kaggle/input/pretrain-glow/checkpoint_80000.pth",
+    "github_branch": "* hoang"
 }
vits/{best_model_vits_22951.pth → coqui_vits.onnx} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0dfc9c3865ccf2359258c5c6e2145365e643ba3be0208b57efedb6bfac20e428
-size 997817797
+oid sha256:debdbe5d25d926fae95180670253a23ffb65047d40da86f4cf7a6e205614d90b
+size 131520541
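Because the model is stored as a Git LFS object, only the pointer (oid and size) appears in the diff; the roughly 131 MB ONNX file is fetched on checkout. Once downloaded, the graph can be inspected with onnxruntime to confirm its input signature before wiring it into inference_onnx. A sketch, assuming onnxruntime is installed:

```python
import onnxruntime as ort

# Open the exported graph and list its inputs (names/shapes depend on the exporter).
sess = ort.InferenceSession("vits/coqui_vits.onnx", providers=["CPUExecutionProvider"])
for inp in sess.get_inputs():
    print(inp.name, inp.shape, inp.type)
```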