lazhrach committed on
Commit d595cc9
1 Parent(s): d05c508

Add TeraTTS and remove pyttsx3

.gitignore CHANGED
@@ -151,7 +151,7 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
-
+model/
 results/
 checkpoints/
 gradio_cached_examples/
app.py CHANGED
@@ -1,12 +1,31 @@
-import os, sys
+import os
 import uuid
-import tempfile
-import pyttsx3
 import gradio as gr
 from src.gradio_demo import SadTalker
-# from src.utils.text2speech import TTSTalker
+from infer_onnx import TTS
 from huggingface_hub import snapshot_download
 
+
+# List of TTS models available for selection
+models = ["TeraTTS/natasha-g2p-vits", "TeraTTS/glados2-g2p-vits", "TeraTTS/glados-g2p-vits", "TeraTTS/girl_nice-g2p-vits"]
+
+# Build a dict of models, initializing each one
+models = {k: TTS(k) for k in models}
+
+# Speech synthesis function
+def text_to_speech(model_name, length_scale, text):
+    time_tag = str(uuid.uuid4())
+    save_dir = './results/voice_input'
+    os.makedirs(save_dir, exist_ok=True)
+    file_name = os.path.join(save_dir, os.path.basename(time_tag + '.wav'))
+
+    open(file_name, "wb").close()
+
+    audio = models[model_name](text, length_scale=length_scale)
+    models[model_name].save_wav(audio, file_name, sample_rate=models[model_name].config["samplerate"])
+
+    return file_name
+
 def get_source_image(image):
     return image
 
@@ -33,92 +52,11 @@ def download_model():
     REPO_ID = 'vinthony/SadTalker-V002rc'
     snapshot_download(REPO_ID)
 
-# language : en_US, de_DE, ...
-# gender : VoiceGenderFemale, VoiceGenderMale
-def change_voice(engine, language='ru_ru', gender='male'):
-
-    selected_voices = []
-
-    language = language.lower() if language else ''
-    gender = gender.lower() if gender else ''
-
-    for voice in engine.getProperty('voices'):
-        voice_appended = False
-
-        for lang in voice.languages:
-
-            lang_str = str(lang, 'utf-8')
-            print("lang", lang_str)
-
-            if lang_str and language in lang_str.lower():
-                selected_voices.append(voice)
-                print("voice appended by lang", voice, lang_str)
-                voice_appended = True
-                break
-
-        if voice_appended:
-            continue
-
-        if voice.id and language in voice.id.lower():
-            selected_voices.append(voice)
-            print("voice appended by id", voice.id)
-            continue
-
-        if voice.name and language in voice.name.lower():
-            selected_voices.append(voice)
-            print("voice appended by name", voice.name)
-            continue
-
-    for voice in selected_voices:
-        if voice.gender and gender in voice.gender.lower():
-            engine.setProperty('voice', voice.id)
-            print("voice selected by gender", voice.gender)
-            return True
-
-        if voice.id and gender in voice.id.lower():
-            engine.setProperty('voice', voice.id)
-            print("voice selected by id", voice.id)
-            return True
-        if voice.name and gender in voice.name.lower():
-            engine.setProperty('voice', voice.id)
-            print("voice selected by name", voice.name)
-            return True
-
-    if len(selected_voices) > 0:
-        engine.setProperty('voice', selected_voices[0].id)
-        print("voice selected by default", selected_voices[0].id)
-        return True
-
-    return False
-
-def play_text_to_speech(text_input, voice_option):
-    engine = pyttsx3.init()
-
-    change_voice(engine, 'ru', voice_option)
-
-    print("text_input", text_input)
-    print("voice_option", voice_option)
-
-    time_tag = str(uuid.uuid4())
-    save_dir = './results/voice_input'
-    os.makedirs(save_dir, exist_ok=True)
-    file_name = os.path.join(save_dir, os.path.basename(time_tag + '.wav'))
-
-    open(file_name, "wb").close()
-    engine.say(text_input)
-    engine.save_to_file(text_input, file_name)
-    engine.runAndWait()
-
-    print("file saved to", file_name)
-
-    return file_name
-
 def sadtalker_demo():
 
     download_model()
 
     sad_talker = SadTalker(lazy_load=True)
-    # tts_talker = TTSTalker()
 
     with gr.Blocks(analytics_enabled=False) as sadtalker_interface:
         with gr.Row():
@@ -131,6 +69,12 @@ def sadtalker_demo():
 
                 with gr.Tabs(elem_id="sadtalker_driven_audio"):
                     with gr.TabItem('Driving Methods'):
+                        with gr.Row():
+                            model_choice = gr.Dropdown(choices=list(models.keys()), value="TeraTTS/natasha-g2p-vits", label="Choose TTS model")
+                        with gr.Row():
+                            length_scale = gr.Slider(minimum=0.1, maximum=2.0, label="Length scale (increase length of sound) Default: 1.2", value=1.2)
+                        with gr.Row():
+                            input_text = gr.Textbox(label="Enter text")
                         with gr.Row():
                             driven_audio = gr.Audio(label="Input audio", source="upload", type="filepath")
                             driven_audio_no = gr.Audio(label="Use IDLE mode, no audio is required", source="upload", type="filepath", visible=False)
@@ -139,15 +83,10 @@ def sadtalker_demo():
                             use_idle_mode = gr.Checkbox(label="Use Idle Animation", visible=False)
                            length_of_audio = gr.Number(value=5, label="The length(seconds) of the generated video.")
                             use_idle_mode.change(toggle_audio_file, inputs=use_idle_mode, outputs=[driven_audio, driven_audio_no]) # todo
-                        with gr.Row():
-                            text_input = gr.Textbox(label="Enter text", multiline=True)
-                            voice_option = gr.Radio(['Male', 'Female'], label='Voice Option', value='Female')
                         with gr.Row():
                             play_button = gr.Button('Text To Speech', variant='primary')
                             play_button.click(
-                                fn=play_text_to_speech,
-                                inputs=[text_input, voice_option],
-                                outputs=[driven_audio]
+                                fn=text_to_speech, inputs=[model_choice, length_scale, input_text], outputs=[driven_audio]
                             )
                         with gr.Row():
                             ref_video = gr.Video(label="Reference Video", source="upload", type="filepath", elem_id="vidref")
@@ -158,13 +97,10 @@ def sadtalker_demo():
 
                         ref_video.change(ref_video_fn, inputs=ref_video, outputs=[use_ref_video]) # todo
 
-
             with gr.Column(variant='panel'):
                 with gr.Tabs(elem_id="sadtalker_checkbox"):
                     with gr.TabItem('Settings'):
                         with gr.Column(variant='panel'):
-                            # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512) # img2img_width
-                            # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512) # img2img_width
                             with gr.Row():
                                 pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0) #
                                 exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1) #
@@ -187,8 +123,6 @@ def sadtalker_demo():
                 with gr.Tabs(elem_id="sadtalker_genearted"):
                     gen_video = gr.Video(label="Generated video", format="mp4")
 
-
-
         submit.click(
             fn=sad_talker.test,
             inputs=[source_image,
@@ -213,7 +147,6 @@ def sadtalker_demo():
 
     return sadtalker_interface
 
-
 if __name__ == "__main__":
 
     demo = sadtalker_demo()
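
Note: play_button.click feeds the file path returned by text_to_speech straight into the driven_audio component (type="filepath"), so synthesized speech can drive the talking head directly. A minimal sketch of the same call outside Gradio, assuming the TeraTTS weights download successfully on first use (the function and model list are the ones defined in app.py above):

    # hypothetical standalone use; importing app runs its module-level code,
    # which initializes all four TTS models
    from app import text_to_speech

    wav_path = text_to_speech("TeraTTS/natasha-g2p-vits", 1.2, "Привет, мир!")
    print(wav_path)  # ./results/voice_input/<uuid>.wav
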
infer_onnx.py ADDED
@@ -0,0 +1,90 @@
+import scipy.io.wavfile
+import os
+import onnxruntime
+import numpy as np
+from huggingface_hub import snapshot_download
+from num2words import num2words
+import re
+from transliterate import translit
+import json
+
+class TTS:
+    def __init__(self, model_name: str, save_path: str = "./model", add_time_to_end: float = 0.8) -> None:
+        if not os.path.exists(save_path):
+            os.mkdir(save_path)
+
+        model_dir = os.path.join(save_path, model_name)
+
+        if not os.path.exists(model_dir):
+            snapshot_download(repo_id=model_name,
+                              allow_patterns=["*.txt", "*.onnx", "*.json"],
+                              local_dir=model_dir,
+                              local_dir_use_symlinks=False
+                              )
+
+        self.model = onnxruntime.InferenceSession(os.path.join(model_dir, "exported/model.onnx"), providers=['CPUExecutionProvider'])
+        with open(os.path.join(model_dir, "exported/config.json")) as config_file:
+            self.config = json.load(config_file)["model_config"]
+
+        if os.path.exists(os.path.join(model_dir, "exported/dictionary.txt")):
+            from tokenizer import TokenizerG2P
+            print("Use g2p")
+            self.tokenizer = TokenizerG2P(os.path.join(model_dir, "exported"))
+
+        else:
+            from tokenizer import TokenizerGRUUT
+            print("Use gruut")
+            self.tokenizer = TokenizerGRUUT(os.path.join(model_dir, "exported"))
+
+        self.add_time_to_end = add_time_to_end
+
+
+    def _add_silent(self, audio, silence_duration: float = 1.0, sample_rate: int = 22050):
+        num_samples_silence = int(sample_rate * silence_duration)
+        silence_array = np.zeros(num_samples_silence, dtype=np.float32)
+        audio_with_silence = np.concatenate((audio, silence_array), axis=0)
+        return audio_with_silence
+
+
+    def save_wav(self, audio, path: str, sample_rate: int = 22050):
+        '''save audio to wav'''
+        scipy.io.wavfile.write(path, sample_rate, audio)
+
+
+    def _intersperse(self, lst, item):
+        result = [item] * (len(lst) * 2 + 1)
+        result[1::2] = lst
+        return result
+
+    def _get_seq(self, text):
+        phoneme_ids = self.tokenizer._get_seq(text)
+        phoneme_ids_inter = self._intersperse(phoneme_ids, 0)
+        return phoneme_ids_inter
+
+    def _num2wordsshor(self, match):
+        match = match.group()
+        ret = num2words(match, lang='ru')
+        return ret
+
+    def __call__(self, text: str, length_scale=1.2):
+        text = translit(text, 'ru')
+        text = re.sub(r'\d+', self._num2wordsshor, text)
+        phoneme_ids = self._get_seq(text)
+        text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
+        text_lengths = np.array([text.shape[1]], dtype=np.int64)
+        scales = np.array(
+            [0.667, length_scale, 0.8],
+            dtype=np.float32,
+        )
+        audio = self.model.run(
+            None,
+            {
+                "input": text,
+                "input_lengths": text_lengths,
+                "scales": scales,
+                "sid": None,
+            },
+        )[0][0, 0][0]
+
+        audio = self._add_silent(audio, silence_duration=self.add_time_to_end, sample_rate=self.config["samplerate"])
+        return audio
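
For reference, a minimal usage sketch of the TTS class above; the model name comes from the list in app.py, and the first instantiation downloads the ONNX export into ./model via snapshot_download:

    from infer_onnx import TTS

    tts = TTS("TeraTTS/natasha-g2p-vits")          # downloads *.onnx/*.json/*.txt on first run
    audio = tts("Привет, мир!", length_scale=1.2)  # float32 waveform with trailing silence
    tts.save_wav(audio, "hello.wav", sample_rate=tts.config["samplerate"])
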
packages.txt CHANGED
@@ -1,3 +1,2 @@
 ffmpeg
-libsndfile1
-libespeak1
+libsndfile1
requirements.txt CHANGED
@@ -22,4 +22,12 @@ dlib-bin
 gfpgan
 av
 safetensors
-pyttsx3==2.90
+gruut
+gruut-lang-ru
+onnxruntime
+huggingface-hub==0.15.1
+transformers
+sentencepiece
+ruaccent
+transliterate
+num2words
tokenizer/__init__.py ADDED
@@ -0,0 +1,2 @@
+from .gruut import Tokenizer as TokenizerGRUUT
+from .g2p import Tokenizer as TokenizerG2P
tokenizer/g2p/__init__.py ADDED
@@ -0,0 +1 @@
+from .tokenizer import Tokenizer
tokenizer/g2p/g2p.py ADDED
@@ -0,0 +1,94 @@
+
+softletters = set(u"яёюиье")
+startsyl = set(u"#ъьаяоёуюэеиы-")
+others = set(["#", "+", "-", u"ь", u"ъ"])
+
+softhard_cons = {
+    u"б": u"b",
+    u"в": u"v",
+    u"г": u"g",
+    u"Г": u"g",
+    u"д": u"d",
+    u"з": u"z",
+    u"к": u"k",
+    u"л": u"l",
+    u"м": u"m",
+    u"н": u"n",
+    u"п": u"p",
+    u"р": u"r",
+    u"с": u"s",
+    u"т": u"t",
+    u"ф": u"f",
+    u"х": u"h"
+}
+
+other_cons = {
+    u"ж": u"zh",
+    u"ц": u"c",
+    u"ч": u"ch",
+    u"ш": u"sh",
+    u"щ": u"sch",
+    u"й": u"j"
+}
+
+vowels = {
+    u"а": u"a",
+    u"я": u"a",
+    u"у": u"u",
+    u"ю": u"u",
+    u"о": u"o",
+    u"ё": u"o",
+    u"э": u"e",
+    u"е": u"e",
+    u"и": u"i",
+    u"ы": u"y",
+}
+
+def pallatize(phones):
+    for i, phone in enumerate(phones[:-1]):
+        if phone[0] in softhard_cons:
+            if phones[i+1][0] in softletters:
+                phones[i] = (softhard_cons[phone[0]] + "j", 0)
+            else:
+                phones[i] = (softhard_cons[phone[0]], 0)
+        if phone[0] in other_cons:
+            phones[i] = (other_cons[phone[0]], 0)
+
+def convert_vowels(phones):
+    new_phones = []
+    prev = ""
+    for phone in phones:
+        if prev in startsyl:
+            if phone[0] in set(u"яюеё"):
+                new_phones.append("j")
+        if phone[0] in vowels:
+            new_phones.append(vowels[phone[0]] + str(phone[1]))
+        else:
+            new_phones.append(phone[0])
+        prev = phone[0]
+
+    return new_phones
+
+def convert(stressword):
+    phones = ("#" + stressword + "#")
+
+
+    # Assign stress marks
+    stress_phones = []
+    stress = 0
+    for phone in phones:
+        if phone == "+":
+            stress = 1
+        else:
+            stress_phones.append((phone, stress))
+            stress = 0
+
+    # Pallatize consonants
+    pallatize(stress_phones)
+
+    # Convert vowels (carrying the stress digit)
+    phones = convert_vowels(stress_phones)
+
+    # Filter out service symbols
+    phones = [x for x in phones if x not in others]
+    return " ".join(phones)
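
To see what convert produces, trace it on a stress-annotated word ('+' marks stress on the vowel that follows; the output below is worked out by hand from the rules above, not taken from the repo):

    print(convert(u"приве+т"))  # -> "p rj i0 vj e1 t"
    # palatalized consonants gain a "j"; each vowel carries a 0/1 stress digit
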
tokenizer/g2p/tokenizer.py ADDED
@@ -0,0 +1,48 @@
+from __future__ import annotations
+import re
+from .g2p import *
+import json
+import os
+
+class Tokenizer():
+    def __init__(self, data_path: str) -> None:
+        self.dic = {}
+        for line in open(os.path.join(data_path, "dictionary.txt")):
+            items = line.split()
+            self.dic[items[0]] = " ".join(items[1:])
+
+        self.config = json.load(open(os.path.join(data_path, "config.json")))
+
+    def g2p(self, text):
+        text = re.sub("—", "-", text)
+        text = re.sub("([!'(),-.:;?])", r' \1 ', text)
+
+        phonemes = []
+        for word in text.split():
+            if re.match("[!'(),-.:;?]", word):
+                phonemes.append(word)
+                continue
+
+            word = word.lower()
+            if len(phonemes) > 0: phonemes.append(' ')
+
+            if word in self.dic:
+                phonemes.extend(self.dic[word].split())
+            else:
+                phonemes.extend(convert(word).split())
+
+        phoneme_id_map = self.config["phoneme_id_map"]
+        phoneme_ids = []
+        phoneme_ids.extend(phoneme_id_map["^"])
+        phoneme_ids.extend(phoneme_id_map["_"])
+        for p in phonemes:
+            if p in phoneme_id_map:
+                phoneme_ids.extend(phoneme_id_map[p])
+                phoneme_ids.extend(phoneme_id_map["_"])
+        phoneme_ids.extend(phoneme_id_map["$"])
+
+        return phoneme_ids, phonemes
+
+    def _get_seq(self, text: str) -> list[int]:
+        seq = self.g2p(text)[0]
+        return seq
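
A sketch of how this tokenizer is driven (TTS._get_seq in infer_onnx.py calls _get_seq here; the exported/ directory with dictionary.txt and config.json is laid out by the model download):

    tok = Tokenizer("./model/TeraTTS/natasha-g2p-vits/exported")
    ids = tok._get_seq("привет, мир")  # per-word dictionary lookup, falling back to convert()
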
tokenizer/gruut/__init__.py ADDED
@@ -0,0 +1 @@
+from .tokenizer import Tokenizer
tokenizer/gruut/tokenizer.py ADDED
@@ -0,0 +1,37 @@
+from __future__ import annotations
+from gruut import sentences
+import os
+import re
+
+class Tokenizer():
+    def __init__(self, path) -> None:
+        with open(os.path.join(path, "vocab.txt"), "r", encoding="utf-8") as vocab_file:
+            self.symbols = vocab_file.read().split("\n")
+            self.symbols = list(map(chr, list(map(int, self.symbols))))
+
+        self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
+
+    def _ru_phonems(self, text: str) -> str:
+        text = text.lower()
+        phonemes = ""
+        for sent in sentences(text, lang="ru"):
+            for word in sent:
+                if word.phonemes:
+                    phonemes += "".join(word.phonemes)
+        phonemes = re.sub(re.compile(r'\s+'), ' ', phonemes).lstrip().rstrip()
+        return phonemes
+
+
+    def _text_to_sequence(self, text: str) -> list[int]:
+        '''convert text to seq'''
+        sequence = []
+        clean_text = self._ru_phonems(text)
+        for symbol in clean_text:
+            symbol_id = self.symbol_to_id[symbol]
+            sequence += [symbol_id]
+        return sequence
+
+
+    def _get_seq(self, text: str) -> list[int]:
+        seq = self._text_to_sequence(text)
+        return seq
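
This gruut variant is selected when a model ships no dictionary.txt; vocab.txt holds one integer code point per line, decoded with chr() into the symbol table. A usage sketch under the same assumed directory layout (the model name here is a placeholder):

    tok = Tokenizer("./model/<some-gruut-model>/exported")  # hypothetical path
    seq = tok._get_seq("привет")  # gruut IPA phonemes mapped through symbol_to_id
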