# Usage
Clone the repo and install the dependencies:
```bash
git clone https://github.com/nguyenhoanganh2002/XTTSv2-Finetuning-for-New-Languages.git
cd XTTSv2-Finetuning-for-New-Languages
pip install -r requirements.txt
```

Download the model weights:
```python
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="anhnh2002/vnTTS",
    repo_type="model",
    local_dir="model/",
)
```
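
After the download, `model/` should contain the checkpoint, config, and vocabulary files used in the next step; a quick sanity check (assuming the default file names shown below):
```python
from pathlib import Path

# The loading step expects these three files under model/
for name in ["model.pth", "config.json", "vocab.json"]:
    assert (Path("model") / name).exists(), f"missing model/{name}"
```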

Load the model:
```python
from pprint import pprint
import torch
import torchaudio
from tqdm import tqdm
from underthesea import sent_tokenize
from vinorm import TTSnorm
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

device = "cuda:0"

xtts_checkpoint = "model/model.pth"
xtts_config = "model/config.json"
xtts_vocab = "model/vocab.json"

# Build the model from its config, then load the fine-tuned weights
config = XttsConfig()
config.load_json(xtts_config)
XTTS_MODEL = Xtts.init_from_config(config)
XTTS_MODEL.load_checkpoint(
    config,
    checkpoint_path=xtts_checkpoint,
    vocab_path=xtts_vocab,
    use_deepspeed=False,
)
XTTS_MODEL.to(device)
```
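
Optionally, confirm the checkpoint loaded and inspect the conditioning settings used later (a quick sketch; the exact parameter count depends on the checkpoint):
```python
# Rough size check plus the reference-audio settings used for voice cloning
n_params = sum(p.numel() for p in XTTS_MODEL.parameters())
print(f"{n_params / 1e6:.1f}M parameters")
print(XTTS_MODEL.config.gpt_cond_len, XTTS_MODEL.config.max_ref_len)
```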

Preprocess and chunk the text. The input is normalized, split into sentences, and merged into chunks of roughly 30 words each, so that every inference call stays short:
```python
def preprocess_text(text, language="vi"):
    # Normalize Vietnamese text (numbers, dates, abbreviations, etc.)
    if language == "vi":
        text = TTSnorm(text)

    # Split the text into sentences
    if language in ["ja", "zh-cn"]:
        sentences = text.split("。")
    else:
        sentences = sent_tokenize(text)

    # Greedily merge sentences until a chunk exceeds 30 words
    chunks = []
    chunk_i = ""
    len_chunk_i = 0
    for sentence in sentences:
        chunk_i += " " + sentence
        len_chunk_i += len(sentence.split())
        if len_chunk_i > 30:
            chunks.append(chunk_i.strip())
            chunk_i = ""
            len_chunk_i = 0

    # Fold a short trailing chunk into the previous one
    if (len(chunks) > 0) and (len_chunk_i < 15):
        chunks[-1] += chunk_i
    else:
        chunks.append(chunk_i.strip())

    return chunks
```
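
A quick look at what the chunker produces (the exact normalized text depends on the installed vinorm and underthesea versions):
```python
# "Hello. The weather is nice today." — both sentences are well under
# 30 words, so they should come back merged into a single chunk
chunks = preprocess_text("Xin chào. Hôm nay trời đẹp.", language="vi")
pprint(chunks)
```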

Generate the conditioning latents and speaker embedding from a reference audio clip of the target speaker:
```python
speaker_audio_file = "model/vi_man.wav"

gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
    audio_path=speaker_audio_file,
    gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
    max_ref_length=XTTS_MODEL.config.max_ref_len,
    sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
)
```
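
The latents depend only on the reference clip, so they can be cached and reloaded rather than recomputed on every run; a minimal sketch (the cache path is arbitrary):
```python
# Cache the speaker conditioning so the reference clip is processed once
torch.save(
    {"gpt_cond_latent": gpt_cond_latent, "speaker_embedding": speaker_embedding},
    "model/vi_man_latents.pt",
)

# Later runs can load the cache and skip get_conditioning_latents entirely
cached = torch.load("model/vi_man_latents.pt")
gpt_cond_latent = cached["gpt_cond_latent"]
speaker_embedding = cached["speaker_embedding"]
```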

Inference:
```python
def tts(
    model: Xtts,
    text: str,
    language: str,
    gpt_cond_latent: torch.Tensor,
    speaker_embedding: torch.Tensor,
    verbose: bool = False,
):
    # Normalize the text and split it into short chunks
    chunks = preprocess_text(text, language)
    if verbose:
        pprint(chunks)

    # Synthesize each chunk separately, then concatenate the waveforms
    wav_chunks = []
    for chunk in tqdm(chunks):
        if chunk.strip() == "":
            continue
        wav_chunk = model.inference(
            text=chunk,
            language=language,
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
            length_penalty=1.0,
            repetition_penalty=10.0,
            top_k=10,
            top_p=0.5,
        )
        wav_chunks.append(torch.tensor(wav_chunk["wav"]))

    # Shape (1, n_samples), on CPU, ready for playback or saving
    out_wav = torch.cat(wav_chunks, dim=0).unsqueeze(0).cpu()
    return out_wav


from IPython.display import Audio

audio = tts(
    model=XTTS_MODEL,
    text="Xin chào, tôi là một hệ thống chuyển đổi văn bản tiếng Việt thành giọng nói.",  # "Hello, I am a Vietnamese text-to-speech system."
    language="vi",
    gpt_cond_latent=gpt_cond_latent,
    speaker_embedding=speaker_embedding,
    verbose=True,
)

# XTTS outputs audio at a 24 kHz sample rate
Audio(audio, rate=24000)
```
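
Outside a notebook, the waveform can be written to disk instead of played inline; a minimal sketch using torchaudio (the output path is arbitrary):
```python
# audio has shape (1, n_samples), which is what torchaudio.save expects
torchaudio.save("output.wav", audio, 24000)
```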

# License
This project uses a model licensed under the Coqui Public Model License 1.0.0, which permits non-commercial use only, including personal research, testing, and charitable purposes. Commercial entities may use the model for non-commercial research and evaluation, but revenue-generating activities are prohibited. Users must include the license terms when distributing the model or its outputs. For full details, see: https://coqui.ai/cpml