Upload 72 files
- .gitattributes +1 -0
- AR/data/bucket_sampler.py +4 -3
- AR/models/t2s_lightning_module_onnx.py +106 -0
- AR/models/t2s_model.py +2 -0
- AR/models/t2s_model_onnx.py +337 -0
- AR/modules/activation_onnx.py +178 -0
- AR/modules/embedding_onnx.py +63 -0
- AR/modules/patched_mha_with_cache_onnx.py +92 -0
- AR/modules/transformer_onnx.py +292 -0
- app.py +234 -122
- module/attentions_onnx.py +365 -0
- module/models_onnx.py +920 -0
- onnx_export.py +314 -0
- prepare_datasets/1-get-text.py +131 -0
- prepare_datasets/2-get-hubert-wav32k.py +114 -0
- prepare_datasets/3-get-semantic.py +95 -0
- process_ckpt.py +23 -0
- text/tone_sandhi.py +29 -0
.gitattributes
CHANGED
@@ -44,3 +44,4 @@ audio/Taffy/t2~1_234.wav filter=lfs diff=lfs merge=lfs -text
 audio/Taffy/t2~1_260.wav filter=lfs diff=lfs merge=lfs -text
 audio/Taffy/Taffy_242.wav filter=lfs diff=lfs merge=lfs -text
 audio/Taffy/Taffy_250.wav filter=lfs diff=lfs merge=lfs -text
+text/cmudict_cache.pickle filter=lfs diff=lfs merge=lfs -text
AR/data/bucket_sampler.py
CHANGED
@@ -41,12 +41,13 @@ class DistributedBucketSampler(Sampler[T_co]):
         if num_replicas is None:
             if not dist.is_available():
                 raise RuntimeError("Requires distributed package to be available")
-            num_replicas = dist.get_world_size()
+            num_replicas = dist.get_world_size() if torch.cuda.is_available() else 1
         if rank is None:
             if not dist.is_available():
                 raise RuntimeError("Requires distributed package to be available")
-            rank = dist.get_rank()
-            torch.cuda.set_device(rank)
+            rank = dist.get_rank() if torch.cuda.is_available() else 0
+            if torch.cuda.is_available():
+                torch.cuda.set_device(rank)
         if rank >= num_replicas or rank < 0:
             raise ValueError(
                 "Invalid rank {}, rank should be in the interval"
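For context, a minimal standalone sketch (not part of this upload) of the guard pattern the hunk introduces: world size and rank fall back to single-process defaults when CUDA is unavailable, so the sampler can still be constructed on CPU-only hardware. `resolve_world` is a hypothetical helper name.

import torch
import torch.distributed as dist

def resolve_world(num_replicas=None, rank=None):
    # Mirror the patched logic: only query the distributed package (and pin a
    # CUDA device) when CUDA is actually available; otherwise act single-process.
    if num_replicas is None:
        num_replicas = dist.get_world_size() if torch.cuda.is_available() else 1
    if rank is None:
        rank = dist.get_rank() if torch.cuda.is_available() else 0
    if torch.cuda.is_available():
        torch.cuda.set_device(rank)
    return num_replicas, rank

print(resolve_world())  # on a CPU-only machine: (1, 0)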
AR/models/t2s_lightning_module_onnx.py
ADDED
@@ -0,0 +1,106 @@
# modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/model/t2s_lightning_module.py
import os, sys

now_dir = os.getcwd()
sys.path.append(now_dir)
from typing import Dict

import torch
from pytorch_lightning import LightningModule
from AR.models.t2s_model_onnx import Text2SemanticDecoder
from AR.modules.lr_schedulers import WarmupCosineLRSchedule
from AR.modules.optim import ScaledAdam


class Text2SemanticLightningModule(LightningModule):
    def __init__(self, config, output_dir, is_train=True):
        super().__init__()
        self.config = config
        self.top_k = 3
        self.model = Text2SemanticDecoder(config=config, top_k=self.top_k)
        pretrained_s1 = config.get("pretrained_s1")
        if pretrained_s1 and is_train:
            # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"]))
            print(
                self.load_state_dict(
                    torch.load(pretrained_s1, map_location="cpu")["weight"]
                )
            )
        if is_train:
            self.automatic_optimization = False
            self.save_hyperparameters()
            self.eval_dir = output_dir / "eval"
            self.eval_dir.mkdir(parents=True, exist_ok=True)

    def training_step(self, batch: Dict, batch_idx: int):
        opt = self.optimizers()
        scheduler = self.lr_schedulers()
        loss, acc = self.model.forward(
            batch["phoneme_ids"],
            batch["phoneme_ids_len"],
            batch["semantic_ids"],
            batch["semantic_ids_len"],
            batch["bert_feature"],
        )
        self.manual_backward(loss)
        if batch_idx > 0 and batch_idx % 4 == 0:
            opt.step()
            opt.zero_grad()
            scheduler.step()

        self.log(
            "total_loss",
            loss,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            sync_dist=True,
        )
        self.log(
            "lr",
            scheduler.get_last_lr()[0],
            on_epoch=True,
            prog_bar=True,
            sync_dist=True,
        )
        self.log(
            f"top_{self.top_k}_acc",
            acc,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            sync_dist=True,
        )

    def validation_step(self, batch: Dict, batch_idx: int):
        return

    def configure_optimizers(self):
        model_parameters = self.model.parameters()
        parameters_names = []
        parameters_names.append(
            [name_param_pair[0] for name_param_pair in self.model.named_parameters()]
        )
        lm_opt = ScaledAdam(
            model_parameters,
            lr=0.01,
            betas=(0.9, 0.95),
            clipping_scale=2.0,
            parameters_names=parameters_names,
            show_dominant_parameters=False,
            clipping_update_period=1000,
        )

        return {
            "optimizer": lm_opt,
            "lr_scheduler": {
                "scheduler": WarmupCosineLRSchedule(
                    lm_opt,
                    init_lr=self.config["optimizer"]["lr_init"],
                    peak_lr=self.config["optimizer"]["lr"],
                    end_lr=self.config["optimizer"]["lr_end"],
                    warmup_steps=self.config["optimizer"]["warmup_steps"],
                    total_steps=self.config["optimizer"]["decay_steps"],
                )
            },
        }
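The training_step above uses manual optimization and only steps the optimizer every fourth batch. A tiny self-contained stand-in of that accumulation pattern (plain SGD and StepLR instead of the repo's ScaledAdam and WarmupCosineLRSchedule, included only to illustrate the logic, not taken from the upload):

import torch

model = torch.nn.Linear(8, 1)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=10)

for batch_idx in range(8):
    x, target = torch.randn(4, 8), torch.randn(4, 1)
    loss = torch.nn.functional.mse_loss(model(x), target)
    loss.backward()                      # gradients accumulate across batches
    if batch_idx > 0 and batch_idx % 4 == 0:
        opt.step()                       # apply the accumulated gradient
        opt.zero_grad()
        sched.step()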
AR/models/t2s_model.py
CHANGED
@@ -302,6 +302,8 @@ class Text2SemanticDecoder(nn.Module):
             logits = self.ar_predict_layer(
                 xy_dec[:, -1]
             )  ## no need to change: with the cache enabled only one frame comes back by default, so taking the last frame gives the same result
             # samples = topk_sampling(logits, top_k=top_k, top_p=1.0, temperature=temperature)
+            if(idx==0):  ### the first step must not emit EOS, otherwise nothing is generated
+                logits = logits[:, :-1]  ### strip the probability of the 1024 stop token
             samples = sample(
                 logits[0], y, top_k=top_k, top_p=1.0, repetition_penalty=1.35
             )[0].unsqueeze(0)
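A toy illustration of what the two added lines do (assumed shapes, not repository code): because EOS is the last vocabulary id, slicing off the final logit column on the first step makes EOS unsampleable, so the first decoded token is always a real semantic code.

import torch

vocab_size = 1025          # 1024 audio codes + 1 EOS token (id 1024)
logits = torch.randn(1, vocab_size)
logits[0, -1] = 99.0       # pretend the model is overly confident in EOS

first_step = True
if first_step:
    logits = logits[:, :-1]  # drop the EOS column so step 0 must emit a real code

token = torch.argmax(logits, dim=-1)
assert token.item() != vocab_size - 1   # EOS can no longer be chosen
print(token)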
AR/models/t2s_model_onnx.py
ADDED
@@ -0,0 +1,337 @@
# modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/model/t2s_model.py
import torch
from tqdm import tqdm

from AR.modules.embedding_onnx import SinePositionalEmbedding
from AR.modules.embedding_onnx import TokenEmbedding
from AR.modules.transformer_onnx import LayerNorm
from AR.modules.transformer_onnx import TransformerEncoder
from AR.modules.transformer_onnx import TransformerEncoderLayer
from torch import nn
from torch.nn import functional as F
from torchmetrics.classification import MulticlassAccuracy

default_config = {
    "embedding_dim": 512,
    "hidden_dim": 512,
    "num_head": 8,
    "num_layers": 12,
    "num_codebook": 8,
    "p_dropout": 0.0,
    "vocab_size": 1024 + 1,
    "phoneme_vocab_size": 512,
    "EOS": 1024,
}

inf_tensor_value = torch.FloatTensor([-float("Inf")]).float()

def logits_to_probs(
    logits,
    previous_tokens = None,
    temperature: float = 1.0,
    top_k = None,
    top_p = None,
    repetition_penalty: float = 1.0,
):
    previous_tokens = previous_tokens.squeeze()
    if previous_tokens is not None and repetition_penalty != 1.0:
        previous_tokens = previous_tokens.long()
        score = torch.gather(logits, dim=0, index=previous_tokens)
        score = torch.where(
            score < 0, score * repetition_penalty, score / repetition_penalty
        )
        logits.scatter_(dim=0, index=previous_tokens, src=score)

    if top_p is not None and top_p < 1.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cum_probs = torch.cumsum(
            torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1
        )
        sorted_indices_to_remove = cum_probs > top_p
        sorted_indices_to_remove[0] = False  # keep at least one option
        indices_to_remove = sorted_indices_to_remove.scatter(
            dim=0, index=sorted_indices, src=sorted_indices_to_remove
        )
        logits = logits.masked_fill(indices_to_remove, -float("Inf"))

    logits = logits / max(temperature, 1e-5)

    if top_k is not None:
        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
        pivot = v.select(-1, -1).unsqueeze(-1)
        logits = torch.where(logits < pivot, inf_tensor_value, logits)

    probs = torch.nn.functional.softmax(logits, dim=-1)
    return probs


def multinomial_sample_one_no_sync(
    probs_sort
):  # Does multinomial sampling without a cuda synchronization
    q = torch.randn_like(probs_sort)
    return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)


def sample(
    logits,
    previous_tokens,
    **sampling_kwargs,
):
    probs = logits_to_probs(
        logits=logits, previous_tokens=previous_tokens, **sampling_kwargs
    )
    idx_next = multinomial_sample_one_no_sync(probs)
    return idx_next, probs


class OnnxEncoder(nn.Module):
    def __init__(self, ar_text_embedding, bert_proj, ar_text_position):
        super().__init__()
        self.ar_text_embedding = ar_text_embedding
        self.bert_proj = bert_proj
        self.ar_text_position = ar_text_position

    def forward(self, x, bert_feature):
        x = self.ar_text_embedding(x)
        x = x + self.bert_proj(bert_feature.transpose(1, 2))
        return self.ar_text_position(x)


class T2SFirstStageDecoder(nn.Module):
    def __init__(self, ar_audio_embedding, ar_audio_position, h, ar_predict_layer, loss_fct, ar_accuracy_metric,
            top_k, early_stop_num, num_layers):
        super().__init__()
        self.ar_audio_embedding = ar_audio_embedding
        self.ar_audio_position = ar_audio_position
        self.h = h
        self.ar_predict_layer = ar_predict_layer
        self.loss_fct = loss_fct
        self.ar_accuracy_metric = ar_accuracy_metric
        self.top_k = top_k
        self.early_stop_num = early_stop_num
        self.num_layers = num_layers

    def forward(self, x, prompt):
        y = prompt
        x_example = x[:,:,0] * 0.0
        # N, 1, 512
        cache = {
            "all_stage": self.num_layers,
            "k": None,
            "v": None,
            "y_emb": None,
            "first_infer": 1,
            "stage": 0,
        }

        y_emb = self.ar_audio_embedding(y)

        cache["y_emb"] = y_emb
        y_pos = self.ar_audio_position(y_emb)

        xy_pos = torch.concat([x, y_pos], dim=1)

        y_example = y_pos[:,:,0] * 0.0
        x_attn_mask = torch.matmul(x_example.transpose(0, 1), x_example).bool()
        y_attn_mask = torch.ones_like(torch.matmul(y_example.transpose(0, 1), y_example), dtype=torch.int64)
        y_attn_mask = torch.cumsum(y_attn_mask, dim=1) - torch.cumsum(
            torch.ones_like(y_example.transpose(0, 1), dtype=torch.int64), dim=0
        )
        y_attn_mask = y_attn_mask > 0

        x_y_pad = torch.matmul(x_example.transpose(0, 1), y_example).bool()
        y_x_pad = torch.matmul(y_example.transpose(0, 1), x_example).bool()
        x_attn_mask_pad = torch.cat([x_attn_mask, torch.ones_like(x_y_pad)], dim=1)
        y_attn_mask = torch.cat([y_x_pad, y_attn_mask], dim=1)
        xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0)
        cache["k"] = torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512)))\
            .unsqueeze(1).repeat(self.num_layers, 1, 1, 1)
        cache["v"] = torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512)))\
            .unsqueeze(1).repeat(self.num_layers, 1, 1, 1)

        xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache)
        logits = self.ar_predict_layer(xy_dec[:, -1])
        samples = sample(logits[0], y, top_k=self.top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0)

        y = torch.concat([y, samples], dim=1)

        return y, cache["k"], cache["v"], cache["y_emb"], x_example


class T2SStageDecoder(nn.Module):
    def __init__(self, ar_audio_embedding, ar_audio_position, h, ar_predict_layer, loss_fct, ar_accuracy_metric,
            top_k, early_stop_num, num_layers):
        super().__init__()
        self.ar_audio_embedding = ar_audio_embedding
        self.ar_audio_position = ar_audio_position
        self.h = h
        self.ar_predict_layer = ar_predict_layer
        self.loss_fct = loss_fct
        self.ar_accuracy_metric = ar_accuracy_metric
        self.top_k = top_k
        self.early_stop_num = early_stop_num
        self.num_layers = num_layers

    def forward(self, y, k, v, y_emb, x_example):
        cache = {
            "all_stage": self.num_layers,
            "k": torch.nn.functional.pad(k, (0, 0, 0, 0, 0, 1)),
            "v": torch.nn.functional.pad(v, (0, 0, 0, 0, 0, 1)),
            "y_emb": y_emb,
            "first_infer": 0,
            "stage": 0,
        }

        y_emb = torch.cat(
            [cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], 1
        )
        cache["y_emb"] = y_emb
        y_pos = self.ar_audio_position(y_emb)

        xy_pos = y_pos[:, -1:]

        y_example = y_pos[:,:,0] * 0.0

        xy_attn_mask = torch.cat([x_example, y_example], dim=1)
        xy_attn_mask = torch.zeros_like(xy_attn_mask, dtype=torch.bool)

        xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache)
        logits = self.ar_predict_layer(xy_dec[:, -1])
        samples = sample(logits[0], y, top_k=self.top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0)

        y = torch.concat([y, samples], dim=1)

        return y, cache["k"], cache["v"], cache["y_emb"], logits, samples


class Text2SemanticDecoder(nn.Module):
    def __init__(self, config, norm_first=False, top_k=3):
        super(Text2SemanticDecoder, self).__init__()
        self.model_dim = config["model"]["hidden_dim"]
        self.embedding_dim = config["model"]["embedding_dim"]
        self.num_head = config["model"]["head"]
        self.num_layers = config["model"]["n_layer"]
        self.norm_first = norm_first
        self.vocab_size = config["model"]["vocab_size"]
        self.phoneme_vocab_size = config["model"]["phoneme_vocab_size"]
        self.p_dropout = float(config["model"]["dropout"])
        self.EOS = config["model"]["EOS"]
        self.norm_first = norm_first
        assert self.EOS == self.vocab_size - 1
        self.bert_proj = nn.Linear(1024, self.embedding_dim)
        self.ar_text_embedding = TokenEmbedding(self.embedding_dim, self.phoneme_vocab_size, self.p_dropout)
        self.ar_text_position = SinePositionalEmbedding(self.embedding_dim, dropout=0.1, scale=False, alpha=True)
        self.ar_audio_embedding = TokenEmbedding(self.embedding_dim, self.vocab_size, self.p_dropout)
        self.ar_audio_position = SinePositionalEmbedding(self.embedding_dim, dropout=0.1, scale=False, alpha=True)
        self.h = TransformerEncoder(
            TransformerEncoderLayer(
                d_model=self.model_dim,
                nhead=self.num_head,
                dim_feedforward=self.model_dim * 4,
                dropout=0.1,
                batch_first=True,
                norm_first=norm_first,
            ),
            num_layers=self.num_layers,
            norm=LayerNorm(self.model_dim) if norm_first else None,
        )
        self.ar_predict_layer = nn.Linear(self.model_dim, self.vocab_size, bias=False)
        self.loss_fct = nn.CrossEntropyLoss(reduction="sum")
        self.ar_accuracy_metric = MulticlassAccuracy(
            self.vocab_size,
            top_k=top_k,
            average="micro",
            multidim_average="global",
            ignore_index=self.EOS,
        )
        self.top_k = torch.LongTensor([1])
        self.early_stop_num = torch.LongTensor([-1])

    def init_onnx(self):
        self.onnx_encoder = OnnxEncoder(self.ar_text_embedding, self.bert_proj, self.ar_text_position)
        self.first_stage_decoder = T2SFirstStageDecoder(self.ar_audio_embedding, self.ar_audio_position, self.h,
            self.ar_predict_layer, self.loss_fct, self.ar_accuracy_metric, self.top_k, self.early_stop_num,
            self.num_layers)
        self.stage_decoder = T2SStageDecoder(self.ar_audio_embedding, self.ar_audio_position, self.h,
            self.ar_predict_layer, self.loss_fct, self.ar_accuracy_metric, self.top_k, self.early_stop_num,
            self.num_layers)

    def forward(self, x, prompts, bert_feature):
        early_stop_num = self.early_stop_num
        prefix_len = prompts.shape[1]

        x = self.onnx_encoder(x, bert_feature)
        y, k, v, y_emb, stage, x_example = self.first_stage_decoder(x, prompts)

        stop = False
        for idx in range(1, 1500):
            enco = self.stage_decoder(y, k, v, y_emb, stage, x_example)
            y, k, v, y_emb, stage, logits, samples = enco
            if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
                stop = True
            if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS:
                stop = True
            if stop:
                break
        y[0, -1] = 0
        return y, idx

    def infer(self, x, prompts, bert_feature):
        top_k = self.top_k
        early_stop_num = self.early_stop_num

        x = self.onnx_encoder(x, bert_feature)

        y = prompts
        prefix_len = y.shape[1]
        x_len = x.shape[1]
        x_example = x[:,:,0] * 0.0
        x_attn_mask = torch.matmul(x_example.transpose(0, 1), x_example)
        x_attn_mask = torch.zeros_like(x_attn_mask, dtype=torch.bool)

        stop = False
        cache = {
            "all_stage": self.num_layers,
            "k": [None] * self.num_layers,
            "v": [None] * self.num_layers,
            "y_emb": None,
            "first_infer": 1,
            "stage": 0,
        }
        for idx in range(1500):
            if cache["first_infer"] == 1:
                y_emb = self.ar_audio_embedding(y)
            else:
                y_emb = torch.cat(
                    [cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], 1
                )
            cache["y_emb"] = y_emb
            y_pos = self.ar_audio_position(y_emb)
            if cache["first_infer"] == 1:
                xy_pos = torch.concat([x, y_pos], dim=1)
            else:
                xy_pos = y_pos[:, -1:]
            y_len = y_pos.shape[1]
            if cache["first_infer"] == 1:
                x_attn_mask_pad = F.pad(x_attn_mask, (0, y_len), value=True)
                y_attn_mask = F.pad(
                    torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
                    (x_len, 0), value=False
                )
                xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0)
            else:
                xy_attn_mask = torch.zeros((1, x_len + y_len), dtype=torch.bool)
            xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache)
            logits = self.ar_predict_layer(xy_dec[:, -1])
            samples = sample(logits[0], y, top_k=top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0)
            if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
                stop = True
            if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS:
                stop = True
            if stop:
                if prompts.shape[1] == y.shape[1]:
                    y = torch.concat([y, torch.zeros_like(samples)], dim=1)
                break
            y = torch.concat([y, samples], dim=1)
            cache["first_infer"] = 0
        return y, idx
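Note that `forward` above threads a `stage` value that `T2SFirstStageDecoder` and `T2SStageDecoder` neither return nor accept, so a driver loop that matches the sub-module signatures looks like the hypothetical sketch below. It assumes the GPT_SoVITS repo is on `sys.path` and that a `Text2SemanticDecoder` has already been constructed from a config; `run_t2s` is not a function in this upload.

import torch

def run_t2s(decoder, phoneme_ids, bert_feature, prompt_semantic, max_steps=1500):
    decoder.init_onnx()
    x = decoder.onnx_encoder(phoneme_ids, bert_feature)
    # First stage primes the per-layer KV cache and emits the first new token.
    y, k, v, y_emb, x_example = decoder.first_stage_decoder(x, prompt_semantic)
    for _ in range(1, max_steps):
        # Each subsequent stage consumes one token and the rolled-forward cache.
        y, k, v, y_emb, logits, samples = decoder.stage_decoder(y, k, v, y_emb, x_example)
        if torch.argmax(logits, dim=-1)[0] == decoder.EOS or samples[0, 0] == decoder.EOS:
            break
    return y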
AR/modules/activation_onnx.py
ADDED
@@ -0,0 +1,178 @@
# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py
from typing import Optional
from typing import Tuple
import torch
from torch import Tensor
from torch.nn import Linear
from torch.nn import Module
from torch.nn.init import constant_
from torch.nn.init import xavier_normal_
from torch.nn.init import xavier_uniform_
from torch.nn.modules.linear import NonDynamicallyQuantizableLinear
from torch.nn.parameter import Parameter

from torch.nn import functional as F
from AR.modules.patched_mha_with_cache_onnx import multi_head_attention_forward_patched


class MultiheadAttention(Module):
    __constants__ = ["batch_first"]
    bias_k: Optional[torch.Tensor]
    bias_v: Optional[torch.Tensor]

    def __init__(
        self,
        embed_dim,
        num_heads,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        kdim=None,
        vdim=None,
        batch_first=False,
        linear1_cls=Linear,
        linear2_cls=Linear,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super(MultiheadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout
        self.batch_first = batch_first
        self.head_dim = embed_dim // num_heads
        assert (
            self.head_dim * num_heads == self.embed_dim
        ), "embed_dim must be divisible by num_heads"

        if add_bias_kv:
            self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
            self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
        else:
            self.bias_k = self.bias_v = None

        if linear1_cls == Linear:
            if not self._qkv_same_embed_dim:
                self.q_proj_weight = Parameter(
                    torch.empty((embed_dim, embed_dim), **factory_kwargs)
                )
                self.k_proj_weight = Parameter(
                    torch.empty((embed_dim, self.kdim), **factory_kwargs)
                )
                self.v_proj_weight = Parameter(
                    torch.empty((embed_dim, self.vdim), **factory_kwargs)
                )
                self.register_parameter("in_proj_weight", None)
            else:
                self.in_proj_weight = Parameter(
                    torch.empty((3 * embed_dim, embed_dim), **factory_kwargs)
                )
                self.register_parameter("q_proj_weight", None)
                self.register_parameter("k_proj_weight", None)
                self.register_parameter("v_proj_weight", None)

            if bias:
                self.in_proj_bias = Parameter(
                    torch.empty(3 * embed_dim, **factory_kwargs)
                )
            else:
                self.register_parameter("in_proj_bias", None)
            self.out_proj = NonDynamicallyQuantizableLinear(
                embed_dim, embed_dim, bias=bias, **factory_kwargs
            )

            self._reset_parameters()
        else:
            if not self._qkv_same_embed_dim:
                raise NotImplementedError
            else:
                self.in_proj_linear = linear1_cls(
                    embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs
                )
                self.in_proj_weight = self.in_proj_linear.weight

                self.register_parameter("q_proj_weight", None)
                self.register_parameter("k_proj_weight", None)
                self.register_parameter("v_proj_weight", None)

                if bias:
                    self.in_proj_bias = self.in_proj_linear.bias
                else:
                    self.register_parameter("in_proj_bias", None)

                self.out_proj = linear2_cls(
                    embed_dim, embed_dim, bias=bias, **factory_kwargs
                )

                if self.bias_k is not None:
                    xavier_normal_(self.bias_k)
                if self.bias_v is not None:
                    xavier_normal_(self.bias_v)

        self.add_zero_attn = add_zero_attn

    def _reset_parameters(self):
        if self._qkv_same_embed_dim:
            xavier_uniform_(self.in_proj_weight)
        else:
            xavier_uniform_(self.q_proj_weight)
            xavier_uniform_(self.k_proj_weight)
            xavier_uniform_(self.v_proj_weight)

        if self.in_proj_bias is not None:
            constant_(self.in_proj_bias, 0.0)
            constant_(self.out_proj.bias, 0.0)

        if self.bias_k is not None:
            xavier_normal_(self.bias_k)
        if self.bias_v is not None:
            xavier_normal_(self.bias_v)

    def __setstate__(self, state):
        # Support loading old MultiheadAttention checkpoints generated by v1.1.0
        if "_qkv_same_embed_dim" not in state:
            state["_qkv_same_embed_dim"] = True

        super(MultiheadAttention, self).__setstate__(state)

    def forward(
        self,
        query: Tensor,
        key: Tensor,
        value: Tensor,
        key_padding_mask: Optional[Tensor] = None,
        need_weights: bool = True,
        attn_mask: Optional[Tensor] = None,
        average_attn_weights: bool = True,
        cache=None,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        any_nested = query.is_nested or key.is_nested or value.is_nested
        query = key = value = query.transpose(1, 0)
        attn_output = multi_head_attention_forward_patched(
            query,
            key,
            value,
            self.embed_dim,
            self.num_heads,
            self.in_proj_weight,
            self.in_proj_bias,
            self.bias_k,
            self.bias_v,
            self.add_zero_attn,
            self.dropout,
            self.out_proj.weight,
            self.out_proj.bias,
            training=self.training,
            key_padding_mask=key_padding_mask,
            need_weights=need_weights,
            attn_mask=attn_mask,
            average_attn_weights=average_attn_weights,
            cache=cache,
        )
        return attn_output.transpose(1, 0)
AR/modules/embedding_onnx.py
ADDED
@@ -0,0 +1,63 @@
# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py
import math

import torch
from torch import nn


class TokenEmbedding(nn.Module):
    def __init__(
        self,
        embedding_dim: int,
        vocab_size: int,
        dropout: float = 0.0,
    ):
        super().__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        self.dropout = torch.nn.Dropout(p=dropout)
        self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)

    @property
    def weight(self) -> torch.Tensor:
        return self.word_embeddings.weight

    def embedding(self, index: int) -> torch.Tensor:
        return self.word_embeddings.weight[index : index + 1]

    def forward(self, x: torch.Tensor):
        x = self.word_embeddings(x)
        x = self.dropout(x)
        return x


class SinePositionalEmbedding(nn.Module):
    def __init__(
        self,
        embedding_dim: int,
        dropout: float = 0.0,
        scale: bool = False,
        alpha: bool = False,
    ):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
        self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
        self.dropout = torch.nn.Dropout(p=dropout)
        self.reverse = False
        self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim))

    def extend_pe(self, x):
        position = torch.cumsum(torch.ones_like(x[:,:,0]), dim=1).transpose(0, 1)
        scpe = (position * self.div_term).unsqueeze(0)
        pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0)
        pe = pe.contiguous().view(1, -1, self.embedding_dim)
        return pe

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        pe = self.extend_pe(x)
        output = x.unsqueeze(-1) if x.ndim == 2 else x
        output = output * self.x_scale + self.alpha * pe
        return self.dropout(output)
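The ONNX-oriented detail worth noting in `extend_pe` is that positions come from a cumulative sum over a ones tensor shaped like the input rather than from `torch.arange(seq_len)`, which keeps the exported graph free of data-dependent shape ops (and makes positions start at 1). A self-contained illustration of just that trick, not taken from the upload:

import torch

# ONNX-friendly positions: cumsum over a ones tensor shaped like the input,
# so the graph never needs arange over a runtime-dependent length.
x = torch.zeros(2, 5, 8)                                      # (batch, time, dim)
position = torch.cumsum(torch.ones_like(x[:, :, 0]), dim=1)   # 1, 2, ..., T per row
print(position[0])   # tensor([1., 2., 3., 4., 5.])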
AR/modules/patched_mha_with_cache_onnx.py
ADDED
@@ -0,0 +1,92 @@
from torch.nn.functional import *
from torch.nn.functional import (
    _mha_shape_check,
    _canonical_mask,
    _none_or_dtype,
    _in_projection_packed,
)

def multi_head_attention_forward_patched(
    query,
    key,
    value,
    embed_dim_to_check: int,
    num_heads: int,
    in_proj_weight,
    in_proj_bias: Optional[Tensor],
    bias_k: Optional[Tensor],
    bias_v: Optional[Tensor],
    add_zero_attn: bool,
    dropout_p: float,
    out_proj_weight: Tensor,
    out_proj_bias: Optional[Tensor],
    training: bool = True,
    key_padding_mask: Optional[Tensor] = None,
    need_weights: bool = True,
    attn_mask: Optional[Tensor] = None,
    use_separate_proj_weight: bool = False,
    q_proj_weight: Optional[Tensor] = None,
    k_proj_weight: Optional[Tensor] = None,
    v_proj_weight: Optional[Tensor] = None,
    static_k: Optional[Tensor] = None,
    static_v: Optional[Tensor] = None,
    average_attn_weights: bool = True,
    is_causal: bool = False,
    cache=None,
) -> Tuple[Tensor, Optional[Tensor]]:

    # set up shape vars
    _, _, embed_dim = query.shape
    attn_mask = _canonical_mask(
        mask=attn_mask,
        mask_name="attn_mask",
        other_type=None,
        other_name="",
        target_type=query.dtype,
        check_other=False,
    )
    head_dim = embed_dim // num_heads

    proj_qkv = linear(query, in_proj_weight, in_proj_bias)
    proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous()
    q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2]

    if cache["first_infer"] == 1:
        cache["k"][cache["stage"]] = k
        cache["v"][cache["stage"]] = v
    else:
        cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0)
        cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0)
        k = cache["k"][cache["stage"]]
        v = cache["v"][cache["stage"]]
    cache["stage"] = (cache["stage"] + 1) % cache["all_stage"]

    attn_mask = _canonical_mask(
        mask=attn_mask,
        mask_name="attn_mask",
        other_type=None,
        other_name="",
        target_type=q.dtype,
        check_other=False,
    )
    attn_mask = attn_mask.unsqueeze(0)

    q = q.view(-1, num_heads, head_dim).transpose(0, 1)
    k = k.view(-1, num_heads, head_dim).transpose(0, 1)
    v = v.view(-1, num_heads, head_dim).transpose(0, 1)

    dropout_p = 0.0
    attn_mask = attn_mask.unsqueeze(0)
    q = q.view(num_heads, -1, head_dim).unsqueeze(0)
    k = k.view(num_heads, -1, head_dim).unsqueeze(0)
    v = v.view(num_heads, -1, head_dim).unsqueeze(0)
    attn_output = scaled_dot_product_attention(
        q, k, v, attn_mask, dropout_p, is_causal
    )
    attn_output = (
        attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim)
    )
    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
    attn_output = attn_output.view(-1, 1, attn_output.size(1))

    return attn_output
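A simplified, self-contained sketch of the cache bookkeeping this function implements (the attention math is replaced by a stand-in and shapes are assumptions, not the repo's values). On the first pass each layer stores the full prompt K/V in its `stage` slot; on later passes the caller pre-pads the cached tensors by one step, which the function then overwrites with the new token's K/V, mirroring how `T2SStageDecoder` pads before each call.

import torch

def fake_layer_attention(x, cache):
    # Stand-in for one call of multi_head_attention_forward_patched: only the
    # per-layer K/V cache bookkeeping is modelled here.
    k = v = x  # pretend projections
    if cache["first_infer"] == 1:
        cache["k"][cache["stage"]] = k
        cache["v"][cache["stage"]] = v
    else:
        # the caller already padded the cached K/V by one step; replace that pad
        cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0)
        cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0)
        k = cache["k"][cache["stage"]]
    cache["stage"] = (cache["stage"] + 1) % cache["all_stage"]
    return k

num_layers, dim = 3, 8
cache = {"all_stage": num_layers, "k": [None] * num_layers, "v": [None] * num_layers,
         "y_emb": None, "first_infer": 1, "stage": 0}

prompt = torch.randn(5, 1, dim)                  # (time, batch, dim) prompt
for _ in range(num_layers):
    fake_layer_attention(prompt, cache)
cache["first_infer"] = 0

# one incremental step: pad each cached tensor by one, then run the layers again
cache["k"] = [torch.nn.functional.pad(k, (0, 0, 0, 0, 0, 1)) for k in cache["k"]]
cache["v"] = [torch.nn.functional.pad(v, (0, 0, 0, 0, 0, 1)) for v in cache["v"]]
new_step = torch.randn(1, 1, dim)
for _ in range(num_layers):
    k = fake_layer_attention(new_step, cache)
print(k.shape)   # torch.Size([6, 1, 8]): prompt length 5 plus one new token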
AR/modules/transformer_onnx.py
ADDED
@@ -0,0 +1,292 @@
# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/transformer.py
import copy
import numbers
from functools import partial
from typing import Any
from typing import Callable
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

import torch
from AR.modules.activation_onnx import MultiheadAttention
from AR.modules.scaling import BalancedDoubleSwish
from torch import nn
from torch import Tensor
from torch.nn import functional as F

_shape_t = Union[int, List[int], torch.Size]


class LayerNorm(nn.Module):
    __constants__ = ["normalized_shape", "eps", "elementwise_affine"]
    normalized_shape: Tuple[int, ...]
    eps: float
    elementwise_affine: bool

    def __init__(
        self,
        normalized_shape: _shape_t,
        eps: float = 1e-5,
        elementwise_affine: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super(LayerNorm, self).__init__()
        if isinstance(normalized_shape, numbers.Integral):
            # mypy error: incompatible types in assignment
            normalized_shape = (normalized_shape,)  # type: ignore[assignment]
        self.normalized_shape = tuple(normalized_shape)  # type: ignore[arg-type]
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        if self.elementwise_affine:
            self.weight = nn.Parameter(
                torch.empty(self.normalized_shape, **factory_kwargs)
            )
            self.bias = nn.Parameter(
                torch.empty(self.normalized_shape, **factory_kwargs)
            )
        else:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)

        self.reset_parameters()

    def reset_parameters(self) -> None:
        if self.elementwise_affine:
            nn.init.ones_(self.weight)
            nn.init.zeros_(self.bias)

    def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
        if isinstance(input, tuple):
            input, embedding = input
            return (
                F.layer_norm(
                    input,
                    self.normalized_shape,
                    self.weight,
                    self.bias,
                    self.eps,
                ),
                embedding,
            )

        assert embedding is None
        return F.layer_norm(
            input, self.normalized_shape, self.weight, self.bias, self.eps
        )

    def extra_repr(self) -> str:
        return (
            "{normalized_shape}, eps={eps}, "
            "elementwise_affine={elementwise_affine}".format(**self.__dict__)
        )


class IdentityNorm(nn.Module):
    def __init__(
        self,
        d_model: int,
        eps: float = 1e-5,
        device=None,
        dtype=None,
    ) -> None:
        super(IdentityNorm, self).__init__()

    def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
        if isinstance(input, tuple):
            return input

        assert embedding is None
        return input


class TransformerEncoder(nn.Module):
    r"""TransformerEncoder is a stack of N encoder layers. Users can build the
    BERT(https://arxiv.org/abs/1810.04805) model with corresponding parameters.

    Args:
        encoder_layer: an instance of the TransformerEncoderLayer() class (required).
        num_layers: the number of sub-encoder-layers in the encoder (required).
        norm: the layer normalization component (optional).
        enable_nested_tensor: if True, input will automatically convert to nested tensor
            (and convert back on output). This will improve the overall performance of
            TransformerEncoder when padding rate is high. Default: ``True`` (enabled).

    Examples::
        >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8)
        >>> transformer_encoder = TransformerEncoder(encoder_layer, num_layers=6)
        >>> src = torch.rand(10, 32, 512)
        >>> out = transformer_encoder(src)
    """
    __constants__ = ["norm"]

    def __init__(self, encoder_layer, num_layers, norm=None):
        super(TransformerEncoder, self).__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(
        self,
        src: Tensor,
        mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        return_layer_states: bool = False,
        cache=None,
    ) -> Tensor:
        output = src
        for mod in self.layers:
            output = mod(
                output,
                src_mask=mask,
                src_key_padding_mask=src_key_padding_mask,
                cache=cache,
            )

        if self.norm is not None:
            output = self.norm(output)

        return output


class TransformerEncoderLayer(nn.Module):
    __constants__ = ["batch_first", "norm_first"]
    def __init__(
        self,
        d_model: int,
        nhead: int,
        dim_feedforward: int = 2048,
        dropout: float = 0.1,
        activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
        batch_first: bool = False,
        norm_first: bool = False,
        device=None,
        dtype=None,
        linear1_self_attention_cls: nn.Module = nn.Linear,
        linear2_self_attention_cls: nn.Module = nn.Linear,
        linear1_feedforward_cls: nn.Module = nn.Linear,
        linear2_feedforward_cls: nn.Module = nn.Linear,
        layer_norm_cls: nn.Module = LayerNorm,
        layer_norm_eps: float = 1e-5,
        adaptive_layer_norm=False,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(
            d_model,  # 512 16
            nhead,
            dropout=dropout,
            batch_first=batch_first,
            linear1_cls=linear1_self_attention_cls,
            linear2_cls=linear2_self_attention_cls,
            **factory_kwargs,
        )
        self.linear1 = linear1_feedforward_cls(
            d_model, dim_feedforward, **factory_kwargs
        )
        self.dropout = nn.Dropout(dropout)
        self.linear2 = linear2_feedforward_cls(
            dim_feedforward, d_model, **factory_kwargs
        )
        self.norm_first = norm_first
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        if isinstance(activation, str):
            activation = _get_activation_fn(activation)
        elif isinstance(activation, partial):
            activation = activation(d_model)
        elif activation == BalancedDoubleSwish:
            activation = BalancedDoubleSwish(d_model)
        self.activation = activation

        norm1 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs)
        if layer_norm_cls == IdentityNorm:
            norm2 = BalancedBasicNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
        else:
            norm2 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs)

        if adaptive_layer_norm:
            self.norm1 = AdaptiveLayerNorm(d_model, norm1)
            self.norm2 = AdaptiveLayerNorm(d_model, norm2)
        else:
            self.norm1 = norm1
            self.norm2 = norm2

    def __setstate__(self, state):
        super(TransformerEncoderLayer, self).__setstate__(state)
        if not hasattr(self, "activation"):
            self.activation = F.relu

    def forward(
        self,
        src: Tensor,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        cache=None,
    ) -> Tensor:
        x = src
        stage_embedding = None
        x = self.norm1(
            x + self._sa_block(x, src_mask, src_key_padding_mask, cache=cache),
            stage_embedding,
        )
        x = self.norm2(x + self._ff_block(x), stage_embedding)

        return x

    def _sa_block(
        self,
        x: Tensor,
        attn_mask: Optional[Tensor],
        key_padding_mask: Optional[Tensor],
        cache=None,
    ) -> Tensor:
        x = self.self_attn(
            x,
            x,
            x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            need_weights=False,
            cache=cache,
        )
        return self.dropout1(x)

    def _ff_block(self, x: Tensor) -> Tensor:
        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
        return self.dropout2(x)


class AdaptiveLayerNorm(nn.Module):
    r"""Adaptive Layer Normalization"""

    def __init__(self, d_model, norm) -> None:
        super(AdaptiveLayerNorm, self).__init__()
        self.project_layer = nn.Linear(d_model, 2 * d_model)
        self.norm = norm
        self.d_model = d_model
        self.eps = self.norm.eps

    def forward(self, input: Tensor, embedding: Tensor = None) -> Tensor:
        if isinstance(input, tuple):
            input, embedding = input
            weight, bias = torch.split(
                self.project_layer(embedding),
                split_size_or_sections=self.d_model,
                dim=-1,
            )
            return (weight * self.norm(input) + bias, embedding)

        weight, bias = torch.split(
            self.project_layer(embedding),
            split_size_or_sections=self.d_model,
            dim=-1,
        )
        return weight * self.norm(input) + bias


def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
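A minimal stand-in (dummy layers, not the repo's TransformerEncoderLayer) showing the one behavioural difference from torch.nn.TransformerEncoder that matters for this upload: the same mutable `cache` dict is threaded through every layer call, which is how the per-layer K/V slots above get filled.

import copy
import torch
from torch import nn

class DummyLayer(nn.Module):
    # stands in for TransformerEncoderLayer; just records that it saw the cache
    def forward(self, x, src_mask=None, src_key_padding_mask=None, cache=None):
        cache["calls"] = cache.get("calls", 0) + 1
        return x + 1.0

layers = nn.ModuleList([copy.deepcopy(DummyLayer()) for _ in range(3)])
cache = {}
out = torch.zeros(1, 2, 4)
for mod in layers:
    out = mod(out, src_mask=None, src_key_padding_mask=None, cache=cache)
print(out[0, 0, 0].item(), cache["calls"])   # 3.0 3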
app.py
CHANGED
@@ -1,10 +1,33 @@
-import os,re
-gpt_path = os.environ.get(
-    "gpt_path", "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
-)
-sovits_path = os.environ.get("sovits_path", "pretrained_models/s2G488k.pth")
+import os,re,logging
+logging.getLogger("markdown_it").setLevel(logging.ERROR)
+logging.getLogger("urllib3").setLevel(logging.ERROR)
+logging.getLogger("httpcore").setLevel(logging.ERROR)
+logging.getLogger("httpx").setLevel(logging.ERROR)
+logging.getLogger("asyncio").setLevel(logging.ERROR)
+
+logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
+logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
+import pdb
+
+if os.path.exists("./gweight.txt"):
+    with open("./gweight.txt", 'r',encoding="utf-8") as file:
+        gweight_data = file.read()
+        gpt_path = os.environ.get(
+            "gpt_path", gweight_data)
+else:
+    gpt_path = os.environ.get(
+        "gpt_path", "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt")
 
+if os.path.exists("./sweight.txt"):
+    with open("./sweight.txt", 'r',encoding="utf-8") as file:
+        sweight_data = file.read()
+        sovits_path = os.environ.get("sovits_path", sweight_data)
+else:
+    sovits_path = os.environ.get("sovits_path", "GPT_SoVITS/pretrained_models/s2G488k.pth")
+# gpt_path = os.environ.get(
+#     "gpt_path", "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+# )
+# sovits_path = os.environ.get("sovits_path", "pretrained_models/s2G488k.pth")
 cnhubert_base_path = os.environ.get(
     "cnhubert_base_path", "pretrained_models/chinese-hubert-base"
 )
@@ -13,6 +36,8 @@ bert_path = os.environ.get(
 )
 infer_ttswebui = os.environ.get("infer_ttswebui", 9872)
 infer_ttswebui = int(infer_ttswebui)
+is_share = os.environ.get("is_share", "False")
+is_share=eval(is_share)
 if "_CUDA_VISIBLE_DEVICES" in os.environ:
     os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
 is_half = eval(os.environ.get("is_half", "True"))
@@ -22,10 +47,6 @@ import numpy as np
 import librosa,torch
 from feature_extractor import cnhubert
 cnhubert.cnhubert_base_path=cnhubert_base_path
-import ssl
-ssl._create_default_https_context = ssl._create_unverified_context
-import nltk
-nltk.download('cmudict')
 
 from module.models import SynthesizerTrn
 from AR.models.t2s_lightning_module import Text2SemanticLightningModule
@@ -34,12 +55,17 @@ from text.cleaner import clean_text
 from time import time as ttime
 from module.mel_processing import spectrogram_torch
 from my_utils import load_audio
-)
+from tools.i18n.i18n import I18nAuto
+i18n = I18nAuto()
 
+os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'  # make sure this is also set when the inference webUI is launched directly
 
+if torch.cuda.is_available():
+    device = "cuda"
+elif torch.backends.mps.is_available():
+    device = "mps"
+else:
+    device = "cpu"
 
 tokenizer = AutoTokenizer.from_pretrained(bert_path)
 bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
@@ -48,13 +74,11 @@ if is_half == True:
 else:
     bert_model = bert_model.to(device)
 
-# bert_model=bert_model.to(device)
 def get_bert_feature(text, word2ph):
     with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt")
        for i in inputs:
-            inputs[i] = inputs[i].to(device)
+            inputs[i] = inputs[i].to(device)
        res = bert_model(**inputs, output_hidden_states=True)
        res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
    assert len(word2ph) == len(text)
@@ -63,15 +87,8 @@ def get_bert_feature(text, word2ph):
        repeat_feature = res[i].repeat(word2ph[i], 1)
        phone_level_feature.append(repeat_feature)
    phone_level_feature = torch.cat(phone_level_feature, dim=0)
-    # if(is_half==True):phone_level_feature=phone_level_feature.half()
    return phone_level_feature.T
 
-n_semantic = 1024
-dict_s2=torch.load(sovits_path,map_location="cpu")
-hps=dict_s2["config"]
 class DictToAttrRecursive(dict):
     def __init__(self, input_dict):
         super().__init__(input_dict)
@@ -100,11 +117,6 @@ class DictToAttrRecursive(dict):
         raise AttributeError(f"Attribute {item} not found")
 
 
-hps = DictToAttrRecursive(hps)
-hps.model.semantic_frame_rate = "25hz"
-dict_s1 = torch.load(gpt_path, map_location="cpu")
-config = dict_s1["config"]
 ssl_model = cnhubert.get_model()
 if is_half == True:
     ssl_model = ssl_model.half().to(device)
@@ -123,13 +135,15 @@ def change_sovits_weights(sovits_path):
         n_speakers=hps.data.n_speakers,
         **hps.model
     )
+    if("pretrained"not in sovits_path):
+        del vq_model.enc_q
     if is_half == True:
         vq_model = vq_model.half().to(device)
     else:
         vq_model = vq_model.to(device)
     vq_model.eval()
     print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
+    with open("./sweight.txt","w",encoding="utf-8")as f:f.write(sovits_path)
 change_sovits_weights(sovits_path)
 
 def change_gpt_weights(gpt_path):
@@ -146,9 +160,9 @@ def change_gpt_weights(gpt_path):
     t2s_model.eval()
     total = sum([param.nelement() for param in t2s_model.parameters()])
     print("Number of parameter: %.2fM" % (total / 1e6))
+    with open("./gweight.txt","w",encoding="utf-8")as f:f.write(gpt_path)
 change_gpt_weights(gpt_path)
 
 def get_spepc(hps, filename):
     audio = load_audio(filename, int(hps.data.sampling_rate))
     audio = torch.FloatTensor(audio)
@@ -165,14 +179,91 @@ def get_spepc(hps, filename):
     return spec
 
 
-dict_language
-def
     t0 = ttime()
     prompt_text = prompt_text.strip("\n")
     prompt_language, text = prompt_language, text.strip("\n")
@@ -201,28 +292,38 @@ def get_tts_wav(selected_text, prompt_text, prompt_language, text, text_language
     t1 = ttime()
     prompt_language = dict_language[prompt_language]
     text_language = dict_language[text_language]
-    phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language)
-    phones1 = cleaned_text_to_sequence(phones1)
-    texts = text.split("\n")
-    audio_opt = []
 
     for text in texts:
        # skip blank lines in the target text so they do not raise an error
        if (len(text.strip()) == 0):
            continue
-        if prompt_language == "zh":
-            bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
        else:
-        if text_language == "zh":
-            bert2 = get_bert_feature(norm_text2, word2ph2).to(device)
        else:
-            bert2 =
        bert = torch.cat([bert1, bert2], 1)
 
        all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
@@ -345,85 +446,96 @@ def cut2(inp):
 def cut3(inp):
     inp = inp.strip("\n")
     return "\n".join(["%s。" % item for item in inp.strip("。").split("。")])
-"""
 
 with gr.Blocks(title="GPT-SoVITS WebUI") as app:
-    gr.Markdown(
-        ### <center>模型作者:Xz乔希 https://space.bilibili.com/5859321\n
-        ### <center>数据集下载:https://huggingface.co/datasets/XzJosh/audiodataset\n
-        ### <center>声音归属:永雏塔菲 https://space.bilibili.com/1265680561\n
-        ### <center>GPT-SoVITS项目:https://github.com/RVC-Boss/GPT-SoVITS\n
-        ### <center>使用本模型请严格遵守法律法规!发布二创作品请标注本项目作者及链接、作品使用GPT-SoVITS AI生成!\n
-        ### <center>⚠️在线端不稳定且生成速度较慢,强烈建议下载模型本地推理!\n
-        """)
-    # with gr.Tabs():
-    #     with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")):
     with gr.Group():
-        gr.Markdown(value="
        with gr.Row():
-            return selected_text, audio_path
-        # bind the dropdown change to the update function
-        audio_select.change(update_ref_text_and_audio, [audio_select], [ref_text, ref_audio])
-        # other Gradio components and features
-        prompt_language = gr.Dropdown(
-            label="参考音频语种", choices=["中文", "英文", "日文"], value="中文"
-        )
-        gr.Markdown(value="*请填写需要合成的目标文本")
        with gr.Row():
            text_language = gr.Dropdown(
-                label="需要合成的语种",
            )
-            inference_button = gr.Button("合成语音", variant="primary")
-            output = gr.Audio(label="输出的语音")
        inference_button.click(
            get_tts_wav,
-            [
            [output],
        )
|
164 |
change_gpt_weights(gpt_path)
|
165 |
|
|
|
166 |
def get_spepc(hps, filename):
|
167 |
audio = load_audio(filename, int(hps.data.sampling_rate))
|
168 |
audio = torch.FloatTensor(audio)
|
|
|
179 |
return spec
|
180 |
|
181 |
|
182 |
+
dict_language={
|
183 |
+
i18n("中文"):"zh",
|
184 |
+
i18n("英文"):"en",
|
185 |
+
i18n("日文"):"ja"
|
186 |
+
}
|
187 |
+
|
188 |
+
|
189 |
+
def splite_en_inf(sentence, language):
|
190 |
+
pattern = re.compile(r'[a-zA-Z. ]+')
|
191 |
+
textlist = []
|
192 |
+
langlist = []
|
193 |
+
pos = 0
|
194 |
+
for match in pattern.finditer(sentence):
|
195 |
+
start, end = match.span()
|
196 |
+
if start > pos:
|
197 |
+
textlist.append(sentence[pos:start])
|
198 |
+
langlist.append(language)
|
199 |
+
textlist.append(sentence[start:end])
|
200 |
+
langlist.append("en")
|
201 |
+
pos = end
|
202 |
+
if pos < len(sentence):
|
203 |
+
textlist.append(sentence[pos:])
|
204 |
+
langlist.append(language)
|
205 |
+
|
206 |
+
return textlist, langlist
|
207 |
|
208 |
|
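`splite_en_inf` walks the `[a-zA-Z. ]+` matches and tags every span with either the caller's language or `"en"`. A quick illustrative sketch of its output for mixed Chinese/English input (assuming the function defined just above is in scope):

```python
# Illustrative only: segment a mixed zh/en sentence the way nonen_clean_text_inf later does.
textlist, langlist = splite_en_inf("我很喜欢GPT SoVITS这个项目", "zh")
print(textlist)  # ['我很喜欢', 'GPT SoVITS', '这个项目']
print(langlist)  # ['zh', 'en', 'zh']
```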
209 |
+
def clean_text_inf(text, language):
|
210 |
+
phones, word2ph, norm_text = clean_text(text, language)
|
211 |
+
phones = cleaned_text_to_sequence(phones)
|
212 |
+
|
213 |
+
return phones, word2ph, norm_text
|
214 |
+
|
215 |
+
|
216 |
+
def get_bert_inf(phones, word2ph, norm_text, language):
|
217 |
+
if language == "zh":
|
218 |
+
bert = get_bert_feature(norm_text, word2ph).to(device)
|
219 |
+
else:
|
220 |
+
bert = torch.zeros(
|
221 |
+
(1024, len(phones)),
|
222 |
+
dtype=torch.float16 if is_half == True else torch.float32,
|
223 |
+
).to(device)
|
224 |
+
|
225 |
+
return bert
|
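`get_bert_inf` keeps tensor shapes uniform across languages: Chinese segments get real phoneme-level BERT features, while `en`/`ja` segments get a zero matrix of the same `(1024, n_phones)` shape, so the later `torch.cat([bert1, bert2], 1)` stays aligned with `phones1 + phones2`. A shape-only sketch with illustrative sizes:

```python
import torch

bert_zh = torch.randn(1024, 7)           # stand-in for get_bert_feature(norm_text, word2ph)
bert_en = torch.zeros(1024, 5)           # what get_bert_inf returns for an "en" segment
bert = torch.cat([bert_zh, bert_en], 1)  # one column per phoneme of the combined text
print(bert.shape)                        # torch.Size([1024, 12])
```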
226 |
+
|
227 |
+
|
228 |
+
def nonen_clean_text_inf(text, language):
|
229 |
+
textlist, langlist = splite_en_inf(text, language)
|
230 |
+
phones_list = []
|
231 |
+
word2ph_list = []
|
232 |
+
norm_text_list = []
|
233 |
+
for i in range(len(textlist)):
|
234 |
+
lang = langlist[i]
|
235 |
+
phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
|
236 |
+
phones_list.append(phones)
|
237 |
+
if lang == "en" or lang == "ja":
|
238 |
+
pass
|
239 |
+
else:
|
240 |
+
word2ph_list.append(word2ph)
|
241 |
+
norm_text_list.append(norm_text)
|
242 |
+
print(word2ph_list)
|
243 |
+
phones = sum(phones_list, [])
|
244 |
+
word2ph = sum(word2ph_list, [])
|
245 |
+
norm_text = ' '.join(norm_text_list)
|
246 |
+
|
247 |
+
return phones, word2ph, norm_text
|
248 |
+
|
249 |
+
|
250 |
+
def nonen_get_bert_inf(text, language):
|
251 |
+
textlist, langlist = splite_en_inf(text, language)
|
252 |
+
print(textlist)
|
253 |
+
print(langlist)
|
254 |
+
bert_list = []
|
255 |
+
for i in range(len(textlist)):
|
256 |
+
text = textlist[i]
|
257 |
+
lang = langlist[i]
|
258 |
+
phones, word2ph, norm_text = clean_text_inf(text, lang)
|
259 |
+
bert = get_bert_inf(phones, word2ph, norm_text, lang)
|
260 |
+
bert_list.append(bert)
|
261 |
+
bert = torch.cat(bert_list, dim=1)
|
262 |
+
|
263 |
+
return bert
|
264 |
+
|
265 |
+
#i18n("不切"),i18n("凑五句一切"),i18n("凑50字一切"),i18n("按中文句号。切"),i18n("按英文句号.切")
|
266 |
+
def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,how_to_cut=i18n("不切")):
|
267 |
t0 = ttime()
|
268 |
prompt_text = prompt_text.strip("\n")
|
269 |
prompt_language, text = prompt_language, text.strip("\n")
|
|
|
292 |
t1 = ttime()
|
293 |
prompt_language = dict_language[prompt_language]
|
294 |
text_language = dict_language[text_language]
|
|
|
|
|
|
|
|
|
295 |
|
296 |
+
if prompt_language == "en":
|
297 |
+
phones1, word2ph1, norm_text1 = clean_text_inf(prompt_text, prompt_language)
|
298 |
+
else:
|
299 |
+
phones1, word2ph1, norm_text1 = nonen_clean_text_inf(prompt_text, prompt_language)
|
300 |
+
if(how_to_cut==i18n("凑五句一切")):text=cut1(text)
|
301 |
+
elif(how_to_cut==i18n("凑50字一切")):text=cut2(text)
|
302 |
+
elif(how_to_cut==i18n("按中文句号。切")):text=cut3(text)
|
303 |
+
elif(how_to_cut==i18n("按英文句号.切")):text=cut4(text)
|
304 |
+
text = text.replace("\n\n","\n").replace("\n\n","\n").replace("\n\n","\n")
|
305 |
+
if(text[-1]not in splits):text+="。"if text_language!="en"else "."
|
306 |
+
texts=text.split("\n")
|
307 |
+
audio_opt = []
|
308 |
+
if prompt_language == "en":
|
309 |
+
bert1 = get_bert_inf(phones1, word2ph1, norm_text1, prompt_language)
|
310 |
+
else:
|
311 |
+
bert1 = nonen_get_bert_inf(prompt_text, prompt_language)
|
312 |
+
|
313 |
for text in texts:
|
314 |
# 解决输入目标文本的空行导致报错的问题
|
315 |
if (len(text.strip()) == 0):
|
316 |
continue
|
317 |
+
if text_language == "en":
|
318 |
+
phones2, word2ph2, norm_text2 = clean_text_inf(text, text_language)
|
|
|
|
|
319 |
else:
|
320 |
+
phones2, word2ph2, norm_text2 = nonen_clean_text_inf(text, text_language)
|
321 |
+
|
322 |
+
if text_language == "en":
|
323 |
+
bert2 = get_bert_inf(phones2, word2ph2, norm_text2, text_language)
|
|
|
|
|
324 |
else:
|
325 |
+
bert2 = nonen_get_bert_inf(text, text_language)
|
326 |
+
|
327 |
bert = torch.cat([bert1, bert2], 1)
|
328 |
|
329 |
all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
|
|
|
446 |
def cut3(inp):
|
447 |
inp = inp.strip("\n")
|
448 |
return "\n".join(["%s。" % item for item in inp.strip("。").split("。")])
|
449 |
+
def cut4(inp):
|
450 |
+
inp = inp.strip("\n")
|
451 |
+
return "\n".join(["%s." % item for item in inp.strip(".").split(".")])
|
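The `cut1`–`cut4` helpers only re-join the input with `"\n"` separators; `get_tts_wav` then splits on newlines and synthesizes chunk by chunk. Illustrative calls for the two sentence-based splitters (assuming the functions defined above):

```python
# cut3 splits on the Chinese full stop, cut4 on the English period.
print(repr(cut3("第一句。第二句。第三句")))
# '第一句。\n第二句。\n第三句。'
print(repr(cut4("First sentence. Second one")))
# 'First sentence.\n Second one.'
```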
452 |
+
|
453 |
+
def custom_sort_key(s):
|
454 |
+
# 使用正则表达式提取字符串中的数字部分和非数字部分
|
455 |
+
parts = re.split(r'(\d+)', s)
|
456 |
+
# 将数字部分转换为整数,非数字部分保持不变
|
457 |
+
parts = [int(part) if part.isdigit() else part for part in parts]
|
458 |
+
return parts
|
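`custom_sort_key` gives the model dropdowns a natural, numeric-aware ordering, so epoch numbers sort the way humans expect rather than lexicographically. Illustrative example (file names are hypothetical):

```python
names = ["model_e10.ckpt", "model_e2.ckpt", "model_e1.ckpt"]
print(sorted(names, key=custom_sort_key))
# ['model_e1.ckpt', 'model_e2.ckpt', 'model_e10.ckpt']   (plain sorted() would put e10 before e2)
```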
459 |
+
|
460 |
+
def change_choices():
|
461 |
+
SoVITS_names, GPT_names = get_weights_names()
|
462 |
+
return {"choices": sorted(SoVITS_names,key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names,key=custom_sort_key), "__type__": "update"}
|
463 |
+
|
464 |
+
pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth"
|
465 |
+
pretrained_gpt_name="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
|
466 |
+
SoVITS_weight_root="SoVITS_weights"
|
467 |
+
GPT_weight_root="GPT_weights"
|
468 |
+
os.makedirs(SoVITS_weight_root,exist_ok=True)
|
469 |
+
os.makedirs(GPT_weight_root,exist_ok=True)
|
470 |
+
def get_weights_names():
|
471 |
+
SoVITS_names = [pretrained_sovits_name]
|
472 |
+
for name in os.listdir(SoVITS_weight_root):
|
473 |
+
if name.endswith(".pth"):SoVITS_names.append("%s/%s"%(SoVITS_weight_root,name))
|
474 |
+
GPT_names = [pretrained_gpt_name]
|
475 |
+
for name in os.listdir(GPT_weight_root):
|
476 |
+
if name.endswith(".ckpt"): GPT_names.append("%s/%s"%(GPT_weight_root,name))
|
477 |
+
return SoVITS_names,GPT_names
|
478 |
+
SoVITS_names,GPT_names = get_weights_names()
|
479 |
|
480 |
with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
481 |
+
gr.Markdown(
|
482 |
+
value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.")
|
483 |
+
)
|
484 |
with gr.Group():
|
485 |
+
gr.Markdown(value=i18n("模型切换"))
|
486 |
with gr.Row():
|
487 |
+
GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path,interactive=True)
|
488 |
+
SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path,interactive=True)
|
489 |
+
refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
|
490 |
+
refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
|
491 |
+
SoVITS_dropdown.change(change_sovits_weights,[SoVITS_dropdown],[])
|
492 |
+
GPT_dropdown.change(change_gpt_weights,[GPT_dropdown],[])
|
493 |
+
gr.Markdown(value=i18n("*请上传并填写参考信息"))
|
494 |
with gr.Row():
|
495 |
+
inp_ref = gr.Audio(label=i18n("请上传参考音频"), type="filepath")
|
496 |
+
prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="")
|
497 |
+
prompt_language = gr.Dropdown(
|
498 |
+
label=i18n("参考音频的语种"),choices=[i18n("中文"),i18n("英文"),i18n("日文")],value=i18n("中文")
|
499 |
+
)
|
500 |
+
gr.Markdown(value=i18n("*请填写需要合成的目标文本。中英混合选中文,日英混合选日文,中日混合暂不支持,非目标语言文本自动遗弃。"))
|
501 |
+
with gr.Row():
|
502 |
+
text = gr.Textbox(label=i18n("需要合成的文本"), value="")
|
503 |
text_language = gr.Dropdown(
|
504 |
+
label=i18n("需要合成的语种"),choices=[i18n("中文"),i18n("英文"),i18n("日文")],value=i18n("中文")
|
505 |
+
)
|
506 |
+
how_to_cut = gr.Radio(
|
507 |
+
label=i18n("怎么切"),
|
508 |
+
choices=[i18n("不切"),i18n("凑五句一切"),i18n("凑50字一切"),i18n("按中文句号。切"),i18n("按英文句号.切"),],
|
509 |
+
value=i18n("凑50字一切"),
|
510 |
+
interactive=True,
|
511 |
)
|
512 |
+
inference_button = gr.Button(i18n("合成语音"), variant="primary")
|
513 |
+
output = gr.Audio(label=i18n("输出的语音"))
|
514 |
+
|
515 |
inference_button.click(
|
516 |
get_tts_wav,
|
517 |
+
[inp_ref, prompt_text, prompt_language, text, text_language,how_to_cut],
|
518 |
[output],
|
519 |
)
|
520 |
|
521 |
+
gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
|
522 |
+
with gr.Row():
|
523 |
+
text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"),value="")
|
524 |
+
button1 = gr.Button(i18n("凑五句一切"), variant="primary")
|
525 |
+
button2 = gr.Button(i18n("凑50字一切"), variant="primary")
|
526 |
+
button3 = gr.Button(i18n("按中文句号。切"), variant="primary")
|
527 |
+
button4 = gr.Button(i18n("按英文句号.切"), variant="primary")
|
528 |
+
text_opt = gr.Textbox(label=i18n("切分后文本"), value="")
|
529 |
+
button1.click(cut1, [text_inp], [text_opt])
|
530 |
+
button2.click(cut2, [text_inp], [text_opt])
|
531 |
+
button3.click(cut3, [text_inp], [text_opt])
|
532 |
+
button4.click(cut4, [text_inp], [text_opt])
|
533 |
+
gr.Markdown(value=i18n("后续将支持混合语种编码文本输入。"))
|
534 |
+
|
535 |
+
app.queue(concurrency_count=511, max_size=1022).launch(
|
536 |
+
server_name="0.0.0.0",
|
537 |
+
inbrowser=True,
|
538 |
+
share=is_share,
|
539 |
+
server_port=infer_ttswebui,
|
540 |
+
quiet=True,
|
541 |
+
)
|
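All of the launch parameters above come from environment variables read near the top of the file. A hypothetical way to override them before starting the app (the values shown are examples only):

```python
# Example only: configure the WebUI via the environment variables it reads.
import os
os.environ["infer_ttswebui"] = "9872"   # port passed to launch(server_port=...)
os.environ["is_share"] = "False"        # "True" requests a public gradio share link
os.environ["is_half"] = "True"          # fp16 weights for the SoVITS/GPT models
# then run:  python app.py
```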
module/attentions_onnx.py
ADDED
@@ -0,0 +1,365 @@
|
1 |
+
import math
|
2 |
+
import torch
|
3 |
+
from torch import nn
|
4 |
+
from torch.nn import functional as F
|
5 |
+
|
6 |
+
from module import commons
|
7 |
+
from module.modules import LayerNorm
|
8 |
+
|
9 |
+
|
10 |
+
class LayerNorm(nn.Module):
|
11 |
+
def __init__(self, channels, eps=1e-5):
|
12 |
+
super().__init__()
|
13 |
+
self.channels = channels
|
14 |
+
self.eps = eps
|
15 |
+
|
16 |
+
self.gamma = nn.Parameter(torch.ones(channels))
|
17 |
+
self.beta = nn.Parameter(torch.zeros(channels))
|
18 |
+
|
19 |
+
def forward(self, x):
|
20 |
+
x = x.transpose(1, -1)
|
21 |
+
x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
|
22 |
+
return x.transpose(1, -1)
|
23 |
+
|
24 |
+
|
25 |
+
@torch.jit.script
|
26 |
+
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
|
27 |
+
n_channels_int = n_channels[0]
|
28 |
+
in_act = input_a + input_b
|
29 |
+
t_act = torch.tanh(in_act[:, :n_channels_int, :])
|
30 |
+
s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
|
31 |
+
acts = t_act * s_act
|
32 |
+
return acts
|
33 |
+
|
34 |
+
|
35 |
+
class Encoder(nn.Module):
|
36 |
+
def __init__(
|
37 |
+
self,
|
38 |
+
hidden_channels,
|
39 |
+
filter_channels,
|
40 |
+
n_heads,
|
41 |
+
n_layers,
|
42 |
+
kernel_size=1,
|
43 |
+
p_dropout=0.0,
|
44 |
+
window_size=4,
|
45 |
+
isflow=True,
|
46 |
+
**kwargs
|
47 |
+
):
|
48 |
+
super().__init__()
|
49 |
+
self.hidden_channels = hidden_channels
|
50 |
+
self.filter_channels = filter_channels
|
51 |
+
self.n_heads = n_heads
|
52 |
+
self.n_layers = n_layers
|
53 |
+
self.kernel_size = kernel_size
|
54 |
+
self.p_dropout = p_dropout
|
55 |
+
self.window_size = window_size
|
56 |
+
# if isflow:
|
57 |
+
# cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1)
|
58 |
+
# self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1)
|
59 |
+
# self.cond_layer = weight_norm(cond_layer, name='weight')
|
60 |
+
# self.gin_channels = 256
|
61 |
+
self.cond_layer_idx = self.n_layers
|
62 |
+
if "gin_channels" in kwargs:
|
63 |
+
self.gin_channels = kwargs["gin_channels"]
|
64 |
+
if self.gin_channels != 0:
|
65 |
+
self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
|
66 |
+
# vits2 says 3rd block, so idx is 2 by default
|
67 |
+
self.cond_layer_idx = (
|
68 |
+
kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
|
69 |
+
)
|
70 |
+
logging.debug(self.gin_channels, self.cond_layer_idx)
|
71 |
+
assert (
|
72 |
+
self.cond_layer_idx < self.n_layers
|
73 |
+
), "cond_layer_idx should be less than n_layers"
|
74 |
+
self.drop = nn.Dropout(p_dropout)
|
75 |
+
self.attn_layers = nn.ModuleList()
|
76 |
+
self.norm_layers_1 = nn.ModuleList()
|
77 |
+
self.ffn_layers = nn.ModuleList()
|
78 |
+
self.norm_layers_2 = nn.ModuleList()
|
79 |
+
for i in range(self.n_layers):
|
80 |
+
self.attn_layers.append(
|
81 |
+
MultiHeadAttention(
|
82 |
+
hidden_channels,
|
83 |
+
hidden_channels,
|
84 |
+
n_heads,
|
85 |
+
p_dropout=p_dropout,
|
86 |
+
window_size=window_size,
|
87 |
+
)
|
88 |
+
)
|
89 |
+
self.norm_layers_1.append(LayerNorm(hidden_channels))
|
90 |
+
self.ffn_layers.append(
|
91 |
+
FFN(
|
92 |
+
hidden_channels,
|
93 |
+
hidden_channels,
|
94 |
+
filter_channels,
|
95 |
+
kernel_size,
|
96 |
+
p_dropout=p_dropout,
|
97 |
+
)
|
98 |
+
)
|
99 |
+
self.norm_layers_2.append(LayerNorm(hidden_channels))
|
100 |
+
|
101 |
+
def forward(self, x, x_mask, g=None):
|
102 |
+
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
|
103 |
+
x = x * x_mask
|
104 |
+
for i in range(self.n_layers):
|
105 |
+
if i == self.cond_layer_idx and g is not None:
|
106 |
+
g = self.spk_emb_linear(g.transpose(1, 2))
|
107 |
+
g = g.transpose(1, 2)
|
108 |
+
x = x + g
|
109 |
+
x = x * x_mask
|
110 |
+
y = self.attn_layers[i](x, x, attn_mask)
|
111 |
+
y = self.drop(y)
|
112 |
+
x = self.norm_layers_1[i](x + y)
|
113 |
+
|
114 |
+
y = self.ffn_layers[i](x, x_mask)
|
115 |
+
y = self.drop(y)
|
116 |
+
x = self.norm_layers_2[i](x + y)
|
117 |
+
x = x * x_mask
|
118 |
+
return x
|
119 |
+
|
120 |
+
|
121 |
+
class MultiHeadAttention(nn.Module):
|
122 |
+
def __init__(
|
123 |
+
self,
|
124 |
+
channels,
|
125 |
+
out_channels,
|
126 |
+
n_heads,
|
127 |
+
p_dropout=0.0,
|
128 |
+
window_size=None,
|
129 |
+
heads_share=True,
|
130 |
+
block_length=None,
|
131 |
+
proximal_bias=False,
|
132 |
+
proximal_init=False,
|
133 |
+
):
|
134 |
+
super().__init__()
|
135 |
+
assert channels % n_heads == 0
|
136 |
+
|
137 |
+
self.channels = channels
|
138 |
+
self.out_channels = out_channels
|
139 |
+
self.n_heads = n_heads
|
140 |
+
self.p_dropout = p_dropout
|
141 |
+
self.window_size = window_size
|
142 |
+
self.heads_share = heads_share
|
143 |
+
self.block_length = block_length
|
144 |
+
self.proximal_bias = proximal_bias
|
145 |
+
self.proximal_init = proximal_init
|
146 |
+
self.attn = None
|
147 |
+
|
148 |
+
self.k_channels = channels // n_heads
|
149 |
+
self.conv_q = nn.Conv1d(channels, channels, 1)
|
150 |
+
self.conv_k = nn.Conv1d(channels, channels, 1)
|
151 |
+
self.conv_v = nn.Conv1d(channels, channels, 1)
|
152 |
+
self.conv_o = nn.Conv1d(channels, out_channels, 1)
|
153 |
+
self.drop = nn.Dropout(p_dropout)
|
154 |
+
|
155 |
+
if window_size is not None:
|
156 |
+
n_heads_rel = 1 if heads_share else n_heads
|
157 |
+
rel_stddev = self.k_channels**-0.5
|
158 |
+
self.emb_rel_k = nn.Parameter(
|
159 |
+
torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
|
160 |
+
* rel_stddev
|
161 |
+
)
|
162 |
+
self.emb_rel_v = nn.Parameter(
|
163 |
+
torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
|
164 |
+
* rel_stddev
|
165 |
+
)
|
166 |
+
|
167 |
+
nn.init.xavier_uniform_(self.conv_q.weight)
|
168 |
+
nn.init.xavier_uniform_(self.conv_k.weight)
|
169 |
+
nn.init.xavier_uniform_(self.conv_v.weight)
|
170 |
+
if proximal_init:
|
171 |
+
with torch.no_grad():
|
172 |
+
self.conv_k.weight.copy_(self.conv_q.weight)
|
173 |
+
self.conv_k.bias.copy_(self.conv_q.bias)
|
174 |
+
|
175 |
+
def forward(self, x, c, attn_mask=None):
|
176 |
+
q = self.conv_q(x)
|
177 |
+
k = self.conv_k(c)
|
178 |
+
v = self.conv_v(c)
|
179 |
+
|
180 |
+
x, self.attn = self.attention(q, k, v, mask=attn_mask)
|
181 |
+
|
182 |
+
x = self.conv_o(x)
|
183 |
+
return x
|
184 |
+
|
185 |
+
def attention(self, query, key, value, mask=None):
|
186 |
+
# reshape [b, d, t] -> [b, n_h, t, d_k]
|
187 |
+
b, d, t_s, _ = (*key.size(), query.size(2))
|
188 |
+
query = query.view(b, self.n_heads, self.k_channels, -1).transpose(2, 3)
|
189 |
+
key = key.view(b, self.n_heads, self.k_channels, -1).transpose(2, 3)
|
190 |
+
value = value.view(b, self.n_heads, self.k_channels, -1).transpose(2, 3)
|
191 |
+
|
192 |
+
scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
|
193 |
+
if self.window_size is not None:
|
194 |
+
key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
|
195 |
+
rel_logits = self._matmul_with_relative_keys(
|
196 |
+
query / math.sqrt(self.k_channels), key_relative_embeddings
|
197 |
+
)
|
198 |
+
scores_local = self._relative_position_to_absolute_position(rel_logits)
|
199 |
+
scores = scores + scores_local
|
200 |
+
if mask is not None:
|
201 |
+
scores = scores.masked_fill(mask == 0, -1e4)
|
202 |
+
if self.block_length is not None:
|
203 |
+
block_mask = (
|
204 |
+
torch.ones_like(scores)
|
205 |
+
.triu(-self.block_length)
|
206 |
+
.tril(self.block_length)
|
207 |
+
)
|
208 |
+
scores = scores.masked_fill(block_mask == 0, -1e4)
|
209 |
+
p_attn = F.softmax(scores, dim=-1)
|
210 |
+
p_attn = self.drop(p_attn)
|
211 |
+
output = torch.matmul(p_attn, value)
|
212 |
+
if self.window_size is not None:
|
213 |
+
relative_weights = self._absolute_position_to_relative_position(p_attn)
|
214 |
+
value_relative_embeddings = self._get_relative_embeddings(
|
215 |
+
self.emb_rel_v, t_s
|
216 |
+
)
|
217 |
+
output = output + self._matmul_with_relative_values(
|
218 |
+
relative_weights, value_relative_embeddings
|
219 |
+
)
|
220 |
+
output = (
|
221 |
+
output.transpose(2, 3).contiguous().view(b, d, -1)
|
222 |
+
)
|
223 |
+
return output, p_attn
|
224 |
+
|
225 |
+
def _matmul_with_relative_values(self, x, y):
|
226 |
+
"""
|
227 |
+
x: [b, h, l, m]
|
228 |
+
y: [h or 1, m, d]
|
229 |
+
ret: [b, h, l, d]
|
230 |
+
"""
|
231 |
+
ret = torch.matmul(x, y.unsqueeze(0))
|
232 |
+
return ret
|
233 |
+
|
234 |
+
def _matmul_with_relative_keys(self, x, y):
|
235 |
+
"""
|
236 |
+
x: [b, h, l, d]
|
237 |
+
y: [h or 1, m, d]
|
238 |
+
ret: [b, h, l, m]
|
239 |
+
"""
|
240 |
+
ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
|
241 |
+
return ret
|
242 |
+
|
243 |
+
def _get_relative_embeddings(self, relative_embeddings, length):
|
244 |
+
max_relative_position = 2 * self.window_size + 1
|
245 |
+
# Pad first before slice to avoid using cond ops.
|
246 |
+
pad_length = max(length - (self.window_size + 1), 0)
|
247 |
+
slice_start_position = max((self.window_size + 1) - length, 0)
|
248 |
+
slice_end_position = slice_start_position + 2 * length - 1
|
249 |
+
if pad_length > 0:
|
250 |
+
padded_relative_embeddings = F.pad(
|
251 |
+
relative_embeddings,
|
252 |
+
commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
|
253 |
+
)
|
254 |
+
else:
|
255 |
+
padded_relative_embeddings = relative_embeddings
|
256 |
+
used_relative_embeddings = padded_relative_embeddings[
|
257 |
+
:, slice_start_position:slice_end_position
|
258 |
+
]
|
259 |
+
return used_relative_embeddings
|
260 |
+
|
261 |
+
def _relative_position_to_absolute_position(self, x):
|
262 |
+
"""
|
263 |
+
x: [b, h, l, 2*l-1]
|
264 |
+
ret: [b, h, l, l]
|
265 |
+
"""
|
266 |
+
batch, heads, length, _ = x.size()
|
267 |
+
# Concat columns of pad to shift from relative to absolute indexing.
|
268 |
+
x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
|
269 |
+
|
270 |
+
# Concat extra elements so to add up to shape (len+1, 2*len-1).
|
271 |
+
x_flat = x.view([batch, heads, length * 2 * length])
|
272 |
+
x_flat = F.pad(
|
273 |
+
x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
|
274 |
+
)
|
275 |
+
|
276 |
+
# Reshape and slice out the padded elements.
|
277 |
+
x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
|
278 |
+
:, :, :length, length - 1 :
|
279 |
+
]
|
280 |
+
return x_final
|
281 |
+
|
282 |
+
def _absolute_position_to_relative_position(self, x):
|
283 |
+
"""
|
284 |
+
x: [b, h, l, l]
|
285 |
+
ret: [b, h, l, 2*l-1]
|
286 |
+
"""
|
287 |
+
batch, heads, length, _ = x.size()
|
288 |
+
# padd along column
|
289 |
+
x = F.pad(
|
290 |
+
x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
|
291 |
+
)
|
292 |
+
x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
|
293 |
+
# add 0's in the beginning that will skew the elements after reshape
|
294 |
+
x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
|
295 |
+
x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
|
296 |
+
return x_final
|
297 |
+
|
298 |
+
def _attention_bias_proximal(self, length):
|
299 |
+
"""Bias for self-attention to encourage attention to close positions.
|
300 |
+
Args:
|
301 |
+
length: an integer scalar.
|
302 |
+
Returns:
|
303 |
+
a Tensor with shape [1, 1, length, length]
|
304 |
+
"""
|
305 |
+
r = torch.arange(length, dtype=torch.float32)
|
306 |
+
diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
|
307 |
+
return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
|
308 |
+
|
309 |
+
|
310 |
+
class FFN(nn.Module):
|
311 |
+
def __init__(
|
312 |
+
self,
|
313 |
+
in_channels,
|
314 |
+
out_channels,
|
315 |
+
filter_channels,
|
316 |
+
kernel_size,
|
317 |
+
p_dropout=0.0,
|
318 |
+
activation=None,
|
319 |
+
causal=False,
|
320 |
+
):
|
321 |
+
super().__init__()
|
322 |
+
self.in_channels = in_channels
|
323 |
+
self.out_channels = out_channels
|
324 |
+
self.filter_channels = filter_channels
|
325 |
+
self.kernel_size = kernel_size
|
326 |
+
self.p_dropout = p_dropout
|
327 |
+
self.activation = activation
|
328 |
+
self.causal = causal
|
329 |
+
|
330 |
+
if causal:
|
331 |
+
self.padding = self._causal_padding
|
332 |
+
else:
|
333 |
+
self.padding = self._same_padding
|
334 |
+
|
335 |
+
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
|
336 |
+
self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
|
337 |
+
self.drop = nn.Dropout(p_dropout)
|
338 |
+
|
339 |
+
def forward(self, x, x_mask):
|
340 |
+
x = self.conv_1(self.padding(x * x_mask))
|
341 |
+
if self.activation == "gelu":
|
342 |
+
x = x * torch.sigmoid(1.702 * x)
|
343 |
+
else:
|
344 |
+
x = torch.relu(x)
|
345 |
+
x = self.drop(x)
|
346 |
+
x = self.conv_2(self.padding(x * x_mask))
|
347 |
+
return x * x_mask
|
348 |
+
|
349 |
+
def _causal_padding(self, x):
|
350 |
+
if self.kernel_size == 1:
|
351 |
+
return x
|
352 |
+
pad_l = self.kernel_size - 1
|
353 |
+
pad_r = 0
|
354 |
+
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
|
355 |
+
x = F.pad(x, commons.convert_pad_shape(padding))
|
356 |
+
return x
|
357 |
+
|
358 |
+
def _same_padding(self, x):
|
359 |
+
if self.kernel_size == 1:
|
360 |
+
return x
|
361 |
+
pad_l = (self.kernel_size - 1) // 2
|
362 |
+
pad_r = self.kernel_size // 2
|
363 |
+
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
|
364 |
+
x = F.pad(x, commons.convert_pad_shape(padding))
|
365 |
+
return x
|
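The windowed relative-position attention above turns `[b, h, l, 2*l-1]` relative scores into `[b, h, l, l]` absolute scores with a pad-and-reshape trick, avoiding data-dependent indexing. A self-contained sketch of the same indexing trick (using `F.pad` directly in place of `commons.convert_pad_shape`):

```python
import torch
import torch.nn.functional as F

def rel_to_abs(x):
    # x: [b, h, l, 2*l-1] relative-position scores -> [b, h, l, l] absolute scores.
    b, h, l, _ = x.size()
    x = F.pad(x, (0, 1))                                  # append one zero column: [b, h, l, 2*l]
    x_flat = F.pad(x.view(b, h, l * 2 * l), (0, l - 1))   # flatten, pad to (l+1)*(2*l-1) elements
    return x_flat.view(b, h, l + 1, 2 * l - 1)[:, :, :l, l - 1:]

scores = torch.randn(1, 2, 4, 7)                          # l = 4, so 2*l-1 = 7
print(rel_to_abs(scores).shape)                           # torch.Size([1, 2, 4, 4])
```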
module/models_onnx.py
ADDED
@@ -0,0 +1,920 @@
|
1 |
+
import copy
|
2 |
+
import math
|
3 |
+
import torch
|
4 |
+
from torch import nn
|
5 |
+
from torch.nn import functional as F
|
6 |
+
|
7 |
+
from module import commons
|
8 |
+
from module import modules
|
9 |
+
from module import attentions_onnx as attentions
|
10 |
+
|
11 |
+
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
12 |
+
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
13 |
+
from module.commons import init_weights, get_padding
|
14 |
+
from module.mrte_model import MRTE
|
15 |
+
from module.quantize import ResidualVectorQuantizer
|
16 |
+
from text import symbols
|
17 |
+
from torch.cuda.amp import autocast
|
18 |
+
|
19 |
+
|
20 |
+
class StochasticDurationPredictor(nn.Module):
|
21 |
+
def __init__(
|
22 |
+
self,
|
23 |
+
in_channels,
|
24 |
+
filter_channels,
|
25 |
+
kernel_size,
|
26 |
+
p_dropout,
|
27 |
+
n_flows=4,
|
28 |
+
gin_channels=0,
|
29 |
+
):
|
30 |
+
super().__init__()
|
31 |
+
filter_channels = in_channels # it needs to be removed from future version.
|
32 |
+
self.in_channels = in_channels
|
33 |
+
self.filter_channels = filter_channels
|
34 |
+
self.kernel_size = kernel_size
|
35 |
+
self.p_dropout = p_dropout
|
36 |
+
self.n_flows = n_flows
|
37 |
+
self.gin_channels = gin_channels
|
38 |
+
|
39 |
+
self.log_flow = modules.Log()
|
40 |
+
self.flows = nn.ModuleList()
|
41 |
+
self.flows.append(modules.ElementwiseAffine(2))
|
42 |
+
for i in range(n_flows):
|
43 |
+
self.flows.append(
|
44 |
+
modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
|
45 |
+
)
|
46 |
+
self.flows.append(modules.Flip())
|
47 |
+
|
48 |
+
self.post_pre = nn.Conv1d(1, filter_channels, 1)
|
49 |
+
self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
|
50 |
+
self.post_convs = modules.DDSConv(
|
51 |
+
filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
|
52 |
+
)
|
53 |
+
self.post_flows = nn.ModuleList()
|
54 |
+
self.post_flows.append(modules.ElementwiseAffine(2))
|
55 |
+
for i in range(4):
|
56 |
+
self.post_flows.append(
|
57 |
+
modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
|
58 |
+
)
|
59 |
+
self.post_flows.append(modules.Flip())
|
60 |
+
|
61 |
+
self.pre = nn.Conv1d(in_channels, filter_channels, 1)
|
62 |
+
self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
|
63 |
+
self.convs = modules.DDSConv(
|
64 |
+
filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
|
65 |
+
)
|
66 |
+
if gin_channels != 0:
|
67 |
+
self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
|
68 |
+
|
69 |
+
def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
|
70 |
+
x = torch.detach(x)
|
71 |
+
x = self.pre(x)
|
72 |
+
if g is not None:
|
73 |
+
g = torch.detach(g)
|
74 |
+
x = x + self.cond(g)
|
75 |
+
x = self.convs(x, x_mask)
|
76 |
+
x = self.proj(x) * x_mask
|
77 |
+
|
78 |
+
if not reverse:
|
79 |
+
flows = self.flows
|
80 |
+
assert w is not None
|
81 |
+
|
82 |
+
logdet_tot_q = 0
|
83 |
+
h_w = self.post_pre(w)
|
84 |
+
h_w = self.post_convs(h_w, x_mask)
|
85 |
+
h_w = self.post_proj(h_w) * x_mask
|
86 |
+
e_q = (
|
87 |
+
torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype)
|
88 |
+
* x_mask
|
89 |
+
)
|
90 |
+
z_q = e_q
|
91 |
+
for flow in self.post_flows:
|
92 |
+
z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
|
93 |
+
logdet_tot_q += logdet_q
|
94 |
+
z_u, z1 = torch.split(z_q, [1, 1], 1)
|
95 |
+
u = torch.sigmoid(z_u) * x_mask
|
96 |
+
z0 = (w - u) * x_mask
|
97 |
+
logdet_tot_q += torch.sum(
|
98 |
+
(F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]
|
99 |
+
)
|
100 |
+
logq = (
|
101 |
+
torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2])
|
102 |
+
- logdet_tot_q
|
103 |
+
)
|
104 |
+
|
105 |
+
logdet_tot = 0
|
106 |
+
z0, logdet = self.log_flow(z0, x_mask)
|
107 |
+
logdet_tot += logdet
|
108 |
+
z = torch.cat([z0, z1], 1)
|
109 |
+
for flow in flows:
|
110 |
+
z, logdet = flow(z, x_mask, g=x, reverse=reverse)
|
111 |
+
logdet_tot = logdet_tot + logdet
|
112 |
+
nll = (
|
113 |
+
torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2])
|
114 |
+
- logdet_tot
|
115 |
+
)
|
116 |
+
return nll + logq # [b]
|
117 |
+
else:
|
118 |
+
flows = list(reversed(self.flows))
|
119 |
+
flows = flows[:-2] + [flows[-1]] # remove a useless vflow
|
120 |
+
z = (
|
121 |
+
torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype)
|
122 |
+
* noise_scale
|
123 |
+
)
|
124 |
+
for flow in flows:
|
125 |
+
z = flow(z, x_mask, g=x, reverse=reverse)
|
126 |
+
z0, z1 = torch.split(z, [1, 1], 1)
|
127 |
+
logw = z0
|
128 |
+
return logw
|
129 |
+
|
130 |
+
|
131 |
+
class DurationPredictor(nn.Module):
|
132 |
+
def __init__(
|
133 |
+
self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
|
134 |
+
):
|
135 |
+
super().__init__()
|
136 |
+
|
137 |
+
self.in_channels = in_channels
|
138 |
+
self.filter_channels = filter_channels
|
139 |
+
self.kernel_size = kernel_size
|
140 |
+
self.p_dropout = p_dropout
|
141 |
+
self.gin_channels = gin_channels
|
142 |
+
|
143 |
+
self.drop = nn.Dropout(p_dropout)
|
144 |
+
self.conv_1 = nn.Conv1d(
|
145 |
+
in_channels, filter_channels, kernel_size, padding=kernel_size // 2
|
146 |
+
)
|
147 |
+
self.norm_1 = modules.LayerNorm(filter_channels)
|
148 |
+
self.conv_2 = nn.Conv1d(
|
149 |
+
filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
|
150 |
+
)
|
151 |
+
self.norm_2 = modules.LayerNorm(filter_channels)
|
152 |
+
self.proj = nn.Conv1d(filter_channels, 1, 1)
|
153 |
+
|
154 |
+
if gin_channels != 0:
|
155 |
+
self.cond = nn.Conv1d(gin_channels, in_channels, 1)
|
156 |
+
|
157 |
+
def forward(self, x, x_mask, g=None):
|
158 |
+
x = torch.detach(x)
|
159 |
+
if g is not None:
|
160 |
+
g = torch.detach(g)
|
161 |
+
x = x + self.cond(g)
|
162 |
+
x = self.conv_1(x * x_mask)
|
163 |
+
x = torch.relu(x)
|
164 |
+
x = self.norm_1(x)
|
165 |
+
x = self.drop(x)
|
166 |
+
x = self.conv_2(x * x_mask)
|
167 |
+
x = torch.relu(x)
|
168 |
+
x = self.norm_2(x)
|
169 |
+
x = self.drop(x)
|
170 |
+
x = self.proj(x * x_mask)
|
171 |
+
return x * x_mask
|
172 |
+
|
173 |
+
|
174 |
+
class TextEncoder(nn.Module):
|
175 |
+
def __init__(
|
176 |
+
self,
|
177 |
+
out_channels,
|
178 |
+
hidden_channels,
|
179 |
+
filter_channels,
|
180 |
+
n_heads,
|
181 |
+
n_layers,
|
182 |
+
kernel_size,
|
183 |
+
p_dropout,
|
184 |
+
latent_channels=192,
|
185 |
+
):
|
186 |
+
super().__init__()
|
187 |
+
self.out_channels = out_channels
|
188 |
+
self.hidden_channels = hidden_channels
|
189 |
+
self.filter_channels = filter_channels
|
190 |
+
self.n_heads = n_heads
|
191 |
+
self.n_layers = n_layers
|
192 |
+
self.kernel_size = kernel_size
|
193 |
+
self.p_dropout = p_dropout
|
194 |
+
self.latent_channels = latent_channels
|
195 |
+
|
196 |
+
self.ssl_proj = nn.Conv1d(768, hidden_channels, 1)
|
197 |
+
|
198 |
+
self.encoder_ssl = attentions.Encoder(
|
199 |
+
hidden_channels,
|
200 |
+
filter_channels,
|
201 |
+
n_heads,
|
202 |
+
n_layers // 2,
|
203 |
+
kernel_size,
|
204 |
+
p_dropout,
|
205 |
+
)
|
206 |
+
|
207 |
+
self.encoder_text = attentions.Encoder(
|
208 |
+
hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
|
209 |
+
)
|
210 |
+
self.text_embedding = nn.Embedding(len(symbols), hidden_channels)
|
211 |
+
|
212 |
+
self.mrte = MRTE()
|
213 |
+
|
214 |
+
self.encoder2 = attentions.Encoder(
|
215 |
+
hidden_channels,
|
216 |
+
filter_channels,
|
217 |
+
n_heads,
|
218 |
+
n_layers // 2,
|
219 |
+
kernel_size,
|
220 |
+
p_dropout,
|
221 |
+
)
|
222 |
+
|
223 |
+
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
|
224 |
+
|
225 |
+
def forward(self, y, text, ge):
|
226 |
+
y_mask = torch.ones_like(y[:1,:1,:])
|
227 |
+
|
228 |
+
y = self.ssl_proj(y * y_mask) * y_mask
|
229 |
+
y = self.encoder_ssl(y * y_mask, y_mask)
|
230 |
+
|
231 |
+
text_mask = torch.ones_like(text).to(y.dtype).unsqueeze(0)
|
232 |
+
|
233 |
+
text = self.text_embedding(text).transpose(1, 2)
|
234 |
+
text = self.encoder_text(text * text_mask, text_mask)
|
235 |
+
y = self.mrte(y, y_mask, text, text_mask, ge)
|
236 |
+
|
237 |
+
y = self.encoder2(y * y_mask, y_mask)
|
238 |
+
|
239 |
+
stats = self.proj(y) * y_mask
|
240 |
+
m, logs = torch.split(stats, self.out_channels, dim=1)
|
241 |
+
return y, m, logs, y_mask
|
242 |
+
|
243 |
+
def extract_latent(self, x):
|
244 |
+
x = self.ssl_proj(x)
|
245 |
+
quantized, codes, commit_loss, quantized_list = self.quantizer(x)
|
246 |
+
return codes.transpose(0, 1)
|
247 |
+
|
248 |
+
def decode_latent(self, codes, y_mask, refer, refer_mask, ge):
|
249 |
+
quantized = self.quantizer.decode(codes)
|
250 |
+
|
251 |
+
y = self.vq_proj(quantized) * y_mask
|
252 |
+
y = self.encoder_ssl(y * y_mask, y_mask)
|
253 |
+
|
254 |
+
y = self.mrte(y, y_mask, refer, refer_mask, ge)
|
255 |
+
|
256 |
+
y = self.encoder2(y * y_mask, y_mask)
|
257 |
+
|
258 |
+
stats = self.proj(y) * y_mask
|
259 |
+
m, logs = torch.split(stats, self.out_channels, dim=1)
|
260 |
+
return y, m, logs, y_mask, quantized
|
261 |
+
|
262 |
+
|
263 |
+
class ResidualCouplingBlock(nn.Module):
|
264 |
+
def __init__(
|
265 |
+
self,
|
266 |
+
channels,
|
267 |
+
hidden_channels,
|
268 |
+
kernel_size,
|
269 |
+
dilation_rate,
|
270 |
+
n_layers,
|
271 |
+
n_flows=4,
|
272 |
+
gin_channels=0,
|
273 |
+
):
|
274 |
+
super().__init__()
|
275 |
+
self.channels = channels
|
276 |
+
self.hidden_channels = hidden_channels
|
277 |
+
self.kernel_size = kernel_size
|
278 |
+
self.dilation_rate = dilation_rate
|
279 |
+
self.n_layers = n_layers
|
280 |
+
self.n_flows = n_flows
|
281 |
+
self.gin_channels = gin_channels
|
282 |
+
|
283 |
+
self.flows = nn.ModuleList()
|
284 |
+
for i in range(n_flows):
|
285 |
+
self.flows.append(
|
286 |
+
modules.ResidualCouplingLayer(
|
287 |
+
channels,
|
288 |
+
hidden_channels,
|
289 |
+
kernel_size,
|
290 |
+
dilation_rate,
|
291 |
+
n_layers,
|
292 |
+
gin_channels=gin_channels,
|
293 |
+
mean_only=True,
|
294 |
+
)
|
295 |
+
)
|
296 |
+
self.flows.append(modules.Flip())
|
297 |
+
|
298 |
+
def forward(self, x, x_mask, g=None, reverse=False):
|
299 |
+
if not reverse:
|
300 |
+
for flow in self.flows:
|
301 |
+
x, _ = flow(x, x_mask, g=g, reverse=reverse)
|
302 |
+
else:
|
303 |
+
for flow in reversed(self.flows):
|
304 |
+
x = flow(x, x_mask, g=g, reverse=reverse)
|
305 |
+
return x
|
306 |
+
|
307 |
+
|
308 |
+
class PosteriorEncoder(nn.Module):
|
309 |
+
def __init__(
|
310 |
+
self,
|
311 |
+
in_channels,
|
312 |
+
out_channels,
|
313 |
+
hidden_channels,
|
314 |
+
kernel_size,
|
315 |
+
dilation_rate,
|
316 |
+
n_layers,
|
317 |
+
gin_channels=0,
|
318 |
+
):
|
319 |
+
super().__init__()
|
320 |
+
self.in_channels = in_channels
|
321 |
+
self.out_channels = out_channels
|
322 |
+
self.hidden_channels = hidden_channels
|
323 |
+
self.kernel_size = kernel_size
|
324 |
+
self.dilation_rate = dilation_rate
|
325 |
+
self.n_layers = n_layers
|
326 |
+
self.gin_channels = gin_channels
|
327 |
+
|
328 |
+
self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
|
329 |
+
self.enc = modules.WN(
|
330 |
+
hidden_channels,
|
331 |
+
kernel_size,
|
332 |
+
dilation_rate,
|
333 |
+
n_layers,
|
334 |
+
gin_channels=gin_channels,
|
335 |
+
)
|
336 |
+
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
|
337 |
+
|
338 |
+
def forward(self, x, x_lengths, g=None):
|
339 |
+
if g != None:
|
340 |
+
g = g.detach()
|
341 |
+
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
|
342 |
+
x.dtype
|
343 |
+
)
|
344 |
+
x = self.pre(x) * x_mask
|
345 |
+
x = self.enc(x, x_mask, g=g)
|
346 |
+
stats = self.proj(x) * x_mask
|
347 |
+
m, logs = torch.split(stats, self.out_channels, dim=1)
|
348 |
+
z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
|
349 |
+
return z, m, logs, x_mask
|
350 |
+
|
351 |
+
|
352 |
+
class WNEncoder(nn.Module):
|
353 |
+
def __init__(
|
354 |
+
self,
|
355 |
+
in_channels,
|
356 |
+
out_channels,
|
357 |
+
hidden_channels,
|
358 |
+
kernel_size,
|
359 |
+
dilation_rate,
|
360 |
+
n_layers,
|
361 |
+
gin_channels=0,
|
362 |
+
):
|
363 |
+
super().__init__()
|
364 |
+
self.in_channels = in_channels
|
365 |
+
self.out_channels = out_channels
|
366 |
+
self.hidden_channels = hidden_channels
|
367 |
+
self.kernel_size = kernel_size
|
368 |
+
self.dilation_rate = dilation_rate
|
369 |
+
self.n_layers = n_layers
|
370 |
+
self.gin_channels = gin_channels
|
371 |
+
|
372 |
+
self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
|
373 |
+
self.enc = modules.WN(
|
374 |
+
hidden_channels,
|
375 |
+
kernel_size,
|
376 |
+
dilation_rate,
|
377 |
+
n_layers,
|
378 |
+
gin_channels=gin_channels,
|
379 |
+
)
|
380 |
+
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
|
381 |
+
self.norm = modules.LayerNorm(out_channels)
|
382 |
+
|
383 |
+
def forward(self, x, x_lengths, g=None):
|
384 |
+
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
|
385 |
+
x.dtype
|
386 |
+
)
|
387 |
+
x = self.pre(x) * x_mask
|
388 |
+
x = self.enc(x, x_mask, g=g)
|
389 |
+
out = self.proj(x) * x_mask
|
390 |
+
out = self.norm(out)
|
391 |
+
return out
|
392 |
+
|
393 |
+
|
394 |
+
class Generator(torch.nn.Module):
|
395 |
+
def __init__(
|
396 |
+
self,
|
397 |
+
initial_channel,
|
398 |
+
resblock,
|
399 |
+
resblock_kernel_sizes,
|
400 |
+
resblock_dilation_sizes,
|
401 |
+
upsample_rates,
|
402 |
+
upsample_initial_channel,
|
403 |
+
upsample_kernel_sizes,
|
404 |
+
gin_channels=0,
|
405 |
+
):
|
406 |
+
super(Generator, self).__init__()
|
407 |
+
self.num_kernels = len(resblock_kernel_sizes)
|
408 |
+
self.num_upsamples = len(upsample_rates)
|
409 |
+
self.conv_pre = Conv1d(
|
410 |
+
initial_channel, upsample_initial_channel, 7, 1, padding=3
|
411 |
+
)
|
412 |
+
resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
|
413 |
+
|
414 |
+
self.ups = nn.ModuleList()
|
415 |
+
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
|
416 |
+
self.ups.append(
|
417 |
+
weight_norm(
|
418 |
+
ConvTranspose1d(
|
419 |
+
upsample_initial_channel // (2**i),
|
420 |
+
upsample_initial_channel // (2 ** (i + 1)),
|
421 |
+
k,
|
422 |
+
u,
|
423 |
+
padding=(k - u) // 2,
|
424 |
+
)
|
425 |
+
)
|
426 |
+
)
|
427 |
+
|
428 |
+
self.resblocks = nn.ModuleList()
|
429 |
+
for i in range(len(self.ups)):
|
430 |
+
ch = upsample_initial_channel // (2 ** (i + 1))
|
431 |
+
for j, (k, d) in enumerate(
|
432 |
+
zip(resblock_kernel_sizes, resblock_dilation_sizes)
|
433 |
+
):
|
434 |
+
self.resblocks.append(resblock(ch, k, d))
|
435 |
+
|
436 |
+
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
|
437 |
+
self.ups.apply(init_weights)
|
438 |
+
|
439 |
+
if gin_channels != 0:
|
440 |
+
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
|
441 |
+
|
442 |
+
def forward(self, x, g=None):
|
443 |
+
x = self.conv_pre(x)
|
444 |
+
if g is not None:
|
445 |
+
x = x + self.cond(g)
|
446 |
+
|
447 |
+
for i in range(self.num_upsamples):
|
448 |
+
x = F.leaky_relu(x, modules.LRELU_SLOPE)
|
449 |
+
x = self.ups[i](x)
|
450 |
+
xs = None
|
451 |
+
for j in range(self.num_kernels):
|
452 |
+
if xs is None:
|
453 |
+
xs = self.resblocks[i * self.num_kernels + j](x)
|
454 |
+
else:
|
455 |
+
xs += self.resblocks[i * self.num_kernels + j](x)
|
456 |
+
x = xs / self.num_kernels
|
457 |
+
x = F.leaky_relu(x)
|
458 |
+
x = self.conv_post(x)
|
459 |
+
x = torch.tanh(x)
|
460 |
+
|
461 |
+
return x
|
462 |
+
|
463 |
+
def remove_weight_norm(self):
|
464 |
+
print("Removing weight norm...")
|
465 |
+
for l in self.ups:
|
466 |
+
remove_weight_norm(l)
|
467 |
+
for l in self.resblocks:
|
468 |
+
l.remove_weight_norm()
|
469 |
+
|
470 |
+
|
471 |
+
class DiscriminatorP(torch.nn.Module):
|
472 |
+
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
|
473 |
+
super(DiscriminatorP, self).__init__()
|
474 |
+
self.period = period
|
475 |
+
self.use_spectral_norm = use_spectral_norm
|
476 |
+
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
477 |
+
self.convs = nn.ModuleList(
|
478 |
+
[
|
479 |
+
norm_f(
|
480 |
+
Conv2d(
|
481 |
+
1,
|
482 |
+
32,
|
483 |
+
(kernel_size, 1),
|
484 |
+
(stride, 1),
|
485 |
+
padding=(get_padding(kernel_size, 1), 0),
|
486 |
+
)
|
487 |
+
),
|
488 |
+
norm_f(
|
489 |
+
Conv2d(
|
490 |
+
32,
|
491 |
+
128,
|
492 |
+
(kernel_size, 1),
|
493 |
+
(stride, 1),
|
494 |
+
padding=(get_padding(kernel_size, 1), 0),
|
495 |
+
)
|
496 |
+
),
|
497 |
+
norm_f(
|
498 |
+
Conv2d(
|
499 |
+
128,
|
500 |
+
512,
|
501 |
+
(kernel_size, 1),
|
502 |
+
(stride, 1),
|
503 |
+
padding=(get_padding(kernel_size, 1), 0),
|
504 |
+
)
|
505 |
+
),
|
506 |
+
norm_f(
|
507 |
+
Conv2d(
|
508 |
+
512,
|
509 |
+
1024,
|
510 |
+
(kernel_size, 1),
|
511 |
+
(stride, 1),
|
512 |
+
padding=(get_padding(kernel_size, 1), 0),
|
513 |
+
)
|
514 |
+
),
|
515 |
+
norm_f(
|
516 |
+
Conv2d(
|
517 |
+
1024,
|
518 |
+
1024,
|
519 |
+
(kernel_size, 1),
|
520 |
+
1,
|
521 |
+
padding=(get_padding(kernel_size, 1), 0),
|
522 |
+
)
|
523 |
+
),
|
524 |
+
]
|
525 |
+
)
|
526 |
+
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
|
527 |
+
|
528 |
+
def forward(self, x):
|
529 |
+
fmap = []
|
530 |
+
|
531 |
+
# 1d to 2d
|
532 |
+
b, c, t = x.shape
|
533 |
+
if t % self.period != 0: # pad first
|
534 |
+
n_pad = self.period - (t % self.period)
|
535 |
+
x = F.pad(x, (0, n_pad), "reflect")
|
536 |
+
t = t + n_pad
|
537 |
+
x = x.view(b, c, t // self.period, self.period)
|
538 |
+
|
539 |
+
for l in self.convs:
|
540 |
+
x = l(x)
|
541 |
+
x = F.leaky_relu(x, modules.LRELU_SLOPE)
|
542 |
+
fmap.append(x)
|
543 |
+
x = self.conv_post(x)
|
544 |
+
fmap.append(x)
|
545 |
+
x = torch.flatten(x, 1, -1)
|
546 |
+
|
547 |
+
return x, fmap
|
548 |
+
|
549 |
+
|
550 |
+
class DiscriminatorS(torch.nn.Module):
|
551 |
+
def __init__(self, use_spectral_norm=False):
|
552 |
+
super(DiscriminatorS, self).__init__()
|
553 |
+
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
554 |
+
self.convs = nn.ModuleList(
|
555 |
+
[
|
556 |
+
norm_f(Conv1d(1, 16, 15, 1, padding=7)),
|
557 |
+
norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
|
558 |
+
norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
|
559 |
+
norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
|
560 |
+
norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
|
561 |
+
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
|
562 |
+
]
|
563 |
+
)
|
564 |
+
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
|
565 |
+
|
566 |
+
def forward(self, x):
|
567 |
+
fmap = []
|
568 |
+
|
569 |
+
for l in self.convs:
|
570 |
+
x = l(x)
|
571 |
+
x = F.leaky_relu(x, modules.LRELU_SLOPE)
|
572 |
+
fmap.append(x)
|
573 |
+
x = self.conv_post(x)
|
574 |
+
fmap.append(x)
|
575 |
+
x = torch.flatten(x, 1, -1)
|
576 |
+
|
577 |
+
return x, fmap
|
578 |
+
|
579 |
+
|
580 |
+
class MultiPeriodDiscriminator(torch.nn.Module):
|
581 |
+
def __init__(self, use_spectral_norm=False):
|
582 |
+
super(MultiPeriodDiscriminator, self).__init__()
|
583 |
+
periods = [2, 3, 5, 7, 11]
|
584 |
+
|
585 |
+
discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
|
586 |
+
discs = discs + [
|
587 |
+
DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
|
588 |
+
]
|
589 |
+
self.discriminators = nn.ModuleList(discs)
|
590 |
+
|
591 |
+
def forward(self, y, y_hat):
|
592 |
+
y_d_rs = []
|
593 |
+
y_d_gs = []
|
594 |
+
fmap_rs = []
|
595 |
+
fmap_gs = []
|
596 |
+
for i, d in enumerate(self.discriminators):
|
597 |
+
y_d_r, fmap_r = d(y)
|
598 |
+
y_d_g, fmap_g = d(y_hat)
|
599 |
+
y_d_rs.append(y_d_r)
|
600 |
+
y_d_gs.append(y_d_g)
|
601 |
+
fmap_rs.append(fmap_r)
|
602 |
+
fmap_gs.append(fmap_g)
|
603 |
+
|
604 |
+
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
605 |
+
|
606 |
+
|
607 |
+
class ReferenceEncoder(nn.Module):
|
608 |
+
"""
|
609 |
+
inputs --- [N, Ty/r, n_mels*r] mels
|
610 |
+
outputs --- [N, ref_enc_gru_size]
|
611 |
+
"""
|
612 |
+
|
613 |
+
def __init__(self, spec_channels, gin_channels=0):
|
614 |
+
super().__init__()
|
615 |
+
self.spec_channels = spec_channels
|
616 |
+
ref_enc_filters = [32, 32, 64, 64, 128, 128]
|
617 |
+
K = len(ref_enc_filters)
|
618 |
+
filters = [1] + ref_enc_filters
|
619 |
+
convs = [
|
620 |
+
weight_norm(
|
621 |
+
nn.Conv2d(
|
622 |
+
in_channels=filters[i],
|
623 |
+
out_channels=filters[i + 1],
|
624 |
+
kernel_size=(3, 3),
|
625 |
+
stride=(2, 2),
|
626 |
+
padding=(1, 1),
|
627 |
+
)
|
628 |
+
)
|
629 |
+
for i in range(K)
|
630 |
+
]
|
631 |
+
self.convs = nn.ModuleList(convs)
|
632 |
+
# self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)])
|
633 |
+
|
634 |
+
out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K)
|
635 |
+
self.gru = nn.GRU(
|
636 |
+
input_size=ref_enc_filters[-1] * out_channels,
|
637 |
+
hidden_size=256 // 2,
|
638 |
+
batch_first=True,
|
639 |
+
)
|
640 |
+
self.proj = nn.Linear(128, gin_channels)
|
641 |
+
|
642 |
+
def forward(self, inputs):
|
643 |
+
N = inputs.size(0)
|
644 |
+
out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs]
|
645 |
+
for conv in self.convs:
|
646 |
+
out = conv(out)
|
647 |
+
# out = wn(out)
|
648 |
+
        out = F.relu(out)  # [N, 128, Ty//2^K, n_mels//2^K]

        out = out.transpose(1, 2)  # [N, Ty//2^K, 128, n_mels//2^K]
        T = out.size(1)
        N = out.size(0)
        out = out.contiguous().view(N, T, -1)  # [N, Ty//2^K, 128*n_mels//2^K]

        self.gru.flatten_parameters()
        memory, out = self.gru(out)  # out --- [1, N, 128]

        return self.proj(out.squeeze(0)).unsqueeze(-1)

    def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
        for i in range(n_convs):
            L = (L - kernel_size + 2 * pad) // stride + 1
        return L


class Quantizer_module(torch.nn.Module):
    def __init__(self, n_e, e_dim):
        super(Quantizer_module, self).__init__()
        self.embedding = nn.Embedding(n_e, e_dim)
        self.embedding.weight.data.uniform_(-1.0 / n_e, 1.0 / n_e)

    def forward(self, x):
        d = (
            torch.sum(x**2, 1, keepdim=True)
            + torch.sum(self.embedding.weight**2, 1)
            - 2 * torch.matmul(x, self.embedding.weight.T)
        )
        min_indicies = torch.argmin(d, 1)
        z_q = self.embedding(min_indicies)
        return z_q, min_indicies


class Quantizer(torch.nn.Module):
    def __init__(self, embed_dim=512, n_code_groups=4, n_codes=160):
        super(Quantizer, self).__init__()
        assert embed_dim % n_code_groups == 0
        self.quantizer_modules = nn.ModuleList(
            [
                Quantizer_module(n_codes, embed_dim // n_code_groups)
                for _ in range(n_code_groups)
            ]
        )
        self.n_code_groups = n_code_groups
        self.embed_dim = embed_dim

    def forward(self, xin):
        # B, C, T
        B, C, T = xin.shape
        xin = xin.transpose(1, 2)
        x = xin.reshape(-1, self.embed_dim)
        x = torch.split(x, self.embed_dim // self.n_code_groups, dim=-1)
        min_indicies = []
        z_q = []
        for _x, m in zip(x, self.quantizer_modules):
            _z_q, _min_indicies = m(_x)
            z_q.append(_z_q)
            min_indicies.append(_min_indicies)  # B * T,
        z_q = torch.cat(z_q, -1).reshape(xin.shape)
        loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean(
            (z_q - xin.detach()) ** 2
        )
        z_q = xin + (z_q - xin).detach()
        z_q = z_q.transpose(1, 2)
        codes = torch.stack(min_indicies, -1).reshape(B, T, self.n_code_groups)
        return z_q, loss, codes.transpose(1, 2)

    def embed(self, x):
        # idx: N, 4, T
        x = x.transpose(1, 2)
        x = torch.split(x, 1, 2)
        ret = []
        for q, embed in zip(x, self.quantizer_modules):
            q = embed.embedding(q.squeeze(-1))
            ret.append(q)
        ret = torch.cat(ret, -1)
        return ret.transpose(1, 2)  # N, C, T


class CodePredictor(nn.Module):
    def __init__(
        self,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        n_q=8,
        dims=1024,
        ssl_dim=768,
    ):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout

        self.vq_proj = nn.Conv1d(ssl_dim, hidden_channels, 1)
        self.ref_enc = modules.MelStyleEncoder(
            ssl_dim, style_vector_dim=hidden_channels
        )

        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )

        self.out_proj = nn.Conv1d(hidden_channels, (n_q - 1) * dims, 1)
        self.n_q = n_q
        self.dims = dims

    def forward(self, x, x_mask, refer, codes, infer=False):
        x = x.detach()
        x = self.vq_proj(x * x_mask) * x_mask
        g = self.ref_enc(refer, x_mask)
        x = x + g
        x = self.encoder(x * x_mask, x_mask)
        x = self.out_proj(x * x_mask) * x_mask
        logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose(
            2, 3
        )
        target = codes[1:].transpose(0, 1)
        if not infer:
            logits = logits.reshape(-1, self.dims)
            target = target.reshape(-1)
            loss = torch.nn.functional.cross_entropy(logits, target)
            return loss
        else:
            _, top10_preds = torch.topk(logits, 10, dim=-1)
            correct_top10 = torch.any(top10_preds == target.unsqueeze(-1), dim=-1)
            top3_acc = 100 * torch.mean(correct_top10.float()).detach().cpu().item()

            print("Top-10 Accuracy:", top3_acc, "%")

            pred_codes = torch.argmax(logits, dim=-1)
            acc = 100 * torch.mean((pred_codes == target).float()).detach().cpu().item()
            print("Top-1 Accuracy:", acc, "%")

            return pred_codes.transpose(0, 1)


class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        n_speakers=0,
        gin_channels=0,
        use_sdp=True,
        semantic_frame_rate=None,
        freeze_quantizer=None,
        **kwargs
    ):
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.n_speakers = n_speakers
        self.gin_channels = gin_channels

        self.use_sdp = use_sdp
        self.enc_p = TextEncoder(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels
        )

        self.ref_enc = modules.MelStyleEncoder(
            spec_channels, style_vector_dim=gin_channels
        )

        ssl_dim = 768
        self.ssl_dim = ssl_dim
        assert semantic_frame_rate in ["25hz", "50hz"]
        self.semantic_frame_rate = semantic_frame_rate
        if semantic_frame_rate == "25hz":
            self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2)
        else:
            self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1)

        self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024)
        if freeze_quantizer:
            self.ssl_proj.requires_grad_(False)
            self.quantizer.requires_grad_(False)
            # self.enc_p.text_embedding.requires_grad_(False)
            # self.enc_p.encoder_text.requires_grad_(False)
            # self.enc_p.mrte.requires_grad_(False)

    def forward(self, codes, text, refer):
        refer_mask = torch.ones_like(refer[:1, :1, :])
        ge = self.ref_enc(refer * refer_mask, refer_mask)

        y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device)
        text_lengths = torch.LongTensor([text.size(-1)]).to(text.device)

        quantized = self.quantizer.decode(codes)
        if self.semantic_frame_rate == "25hz":
            dquantized = torch.cat([quantized, quantized]).permute(1, 2, 0)
            quantized = dquantized.contiguous().view(1, self.ssl_dim, -1)

        x, m_p, logs_p, y_mask = self.enc_p(
            quantized, text, ge
        )
        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p)

        z = self.flow(z_p, y_mask, g=ge, reverse=True)

        o = self.dec((z * y_mask)[:, :, :], g=ge)
        return o

    def extract_latent(self, x):
        ssl = self.ssl_proj(x)
        quantized, codes, commit_loss, quantized_list = self.quantizer(ssl)
        return codes.transpose(0, 1)
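A minimal sketch of how the grouped quantizer above can be exercised on its own, assuming the repository root is on sys.path and using the default constructor arguments; the batch and frame sizes are illustrative only:

import torch
from module.models_onnx import Quantizer  # class defined in the file above

quantizer = Quantizer(embed_dim=512, n_code_groups=4, n_codes=160)
features = torch.randn(2, 512, 100)          # [B, C, T], the layout forward() expects
z_q, commit_loss, codes = quantizer(features)
print(z_q.shape)    # torch.Size([2, 512, 100]) -- straight-through quantized features
print(codes.shape)  # torch.Size([2, 4, 100])   -- one code index per group per frame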
onnx_export.py
ADDED
@@ -0,0 +1,314 @@
from module.models_onnx import SynthesizerTrn, symbols
from AR.models.t2s_lightning_module_onnx import Text2SemanticLightningModule
import torch
import torchaudio
from torch import nn
from feature_extractor import cnhubert
cnhubert_base_path = "pretrained_models/chinese-hubert-base"
cnhubert.cnhubert_base_path = cnhubert_base_path
ssl_model = cnhubert.get_model()
from text import cleaned_text_to_sequence
import soundfile
from my_utils import load_audio
import os
import json


def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    hann_window = torch.hann_window(win_size).to(
        dtype=y.dtype, device=y.device
    )
    y = torch.nn.functional.pad(
        y.unsqueeze(1),
        (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
        mode="reflect",
    )
    y = y.squeeze(1)
    spec = torch.stft(
        y,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window,
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=False,
    )
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    return spec


class DictToAttrRecursive(dict):
    def __init__(self, input_dict):
        super().__init__(input_dict)
        for key, value in input_dict.items():
            if isinstance(value, dict):
                value = DictToAttrRecursive(value)
            self[key] = value
            setattr(self, key, value)

    def __getattr__(self, item):
        try:
            return self[item]
        except KeyError:
            raise AttributeError(f"Attribute {item} not found")

    def __setattr__(self, key, value):
        if isinstance(value, dict):
            value = DictToAttrRecursive(value)
        super(DictToAttrRecursive, self).__setitem__(key, value)
        super().__setattr__(key, value)

    def __delattr__(self, item):
        try:
            del self[item]
        except KeyError:
            raise AttributeError(f"Attribute {item} not found")


class T2SEncoder(nn.Module):
    def __init__(self, t2s, vits):
        super().__init__()
        self.encoder = t2s.onnx_encoder
        self.vits = vits

    def forward(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content):
        codes = self.vits.extract_latent(ssl_content)
        prompt_semantic = codes[0, 0]
        bert = torch.cat([ref_bert.transpose(0, 1), text_bert.transpose(0, 1)], 1)
        all_phoneme_ids = torch.cat([ref_seq, text_seq], 1)
        bert = bert.unsqueeze(0)
        prompt = prompt_semantic.unsqueeze(0)
        return self.encoder(all_phoneme_ids, bert), prompt


class T2SModel(nn.Module):
    def __init__(self, t2s_path, vits_model):
        super().__init__()
        dict_s1 = torch.load(t2s_path, map_location="cpu")
        self.config = dict_s1["config"]
        self.t2s_model = Text2SemanticLightningModule(self.config, "ojbk", is_train=False)
        self.t2s_model.load_state_dict(dict_s1["weight"])
        self.t2s_model.eval()
        self.vits_model = vits_model.vq_model
        self.hz = 50
        self.max_sec = self.config["data"]["max_sec"]
        self.t2s_model.model.top_k = torch.LongTensor([self.config["inference"]["top_k"]])
        self.t2s_model.model.early_stop_num = torch.LongTensor([self.hz * self.max_sec])
        self.t2s_model = self.t2s_model.model
        self.t2s_model.init_onnx()
        self.onnx_encoder = T2SEncoder(self.t2s_model, self.vits_model)
        self.first_stage_decoder = self.t2s_model.first_stage_decoder
        self.stage_decoder = self.t2s_model.stage_decoder
        # self.t2s_model = torch.jit.script(self.t2s_model)

    def forward(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content):
        early_stop_num = self.t2s_model.early_stop_num

        # [1, N] [1, N] [N, 1024] [N, 1024] [1, 768, N]
        x, prompts = self.onnx_encoder(ref_seq, text_seq, ref_bert, text_bert, ssl_content)

        prefix_len = prompts.shape[1]

        # [1, N, 512] [1, N]
        y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts)

        stop = False
        for idx in range(1, 1500):
            # [1, N] [N_layer, N, 1, 512] [N_layer, N, 1, 512] [1, N, 512] [1] [1, N, 512] [1, N]
            enco = self.stage_decoder(y, k, v, y_emb, x_example)
            y, k, v, y_emb, logits, samples = enco
            if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
                stop = True
            if torch.argmax(logits, dim=-1)[0] == self.t2s_model.EOS or samples[0, 0] == self.t2s_model.EOS:
                stop = True
            if stop:
                break
        y[0, -1] = 0

        return y[:, -idx:].unsqueeze(0)

    def export(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content, project_name, dynamo=False):
        # self.onnx_encoder = torch.jit.script(self.onnx_encoder)
        if dynamo:
            export_options = torch.onnx.ExportOptions(dynamic_shapes=True)
            onnx_encoder_export_output = torch.onnx.dynamo_export(
                self.onnx_encoder,
                (ref_seq, text_seq, ref_bert, text_bert, ssl_content),
                export_options=export_options
            )
            onnx_encoder_export_output.save(f"onnx/{project_name}/{project_name}_t2s_encoder.onnx")
            return
        torch.onnx.export(
            self.onnx_encoder,
            (ref_seq, text_seq, ref_bert, text_bert, ssl_content),
            f"onnx/{project_name}/{project_name}_t2s_encoder.onnx",
            input_names=["ref_seq", "text_seq", "ref_bert", "text_bert", "ssl_content"],
            output_names=["x", "prompts"],
            dynamic_axes={
                "ref_seq": [1],
                "text_seq": [1],
                "ref_bert": [0],
                "text_bert": [0],
                "ssl_content": [2],
            },
            opset_version=16
        )
        x, prompts = self.onnx_encoder(ref_seq, text_seq, ref_bert, text_bert, ssl_content)

        torch.onnx.export(
            self.first_stage_decoder,
            (x, prompts),
            f"onnx/{project_name}/{project_name}_t2s_fsdec.onnx",
            input_names=["x", "prompts"],
            output_names=["y", "k", "v", "y_emb", "x_example"],
            dynamic_axes={
                "x": [1],
                "prompts": [1],
            },
            verbose=True,
            opset_version=16
        )
        y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts)

        torch.onnx.export(
            self.stage_decoder,
            (y, k, v, y_emb, x_example),
            f"onnx/{project_name}/{project_name}_t2s_sdec.onnx",
            input_names=["iy", "ik", "iv", "iy_emb", "ix_example"],
            output_names=["y", "k", "v", "y_emb", "logits", "samples"],
            dynamic_axes={
                "iy": [1],
                "ik": [1],
                "iv": [1],
                "iy_emb": [1],
                "ix_example": [1],
            },
            verbose=True,
            opset_version=16
        )


class VitsModel(nn.Module):
    def __init__(self, vits_path):
        super().__init__()
        dict_s2 = torch.load(vits_path, map_location="cpu")
        self.hps = dict_s2["config"]
        self.hps = DictToAttrRecursive(self.hps)
        self.hps.model.semantic_frame_rate = "25hz"
        self.vq_model = SynthesizerTrn(
            self.hps.data.filter_length // 2 + 1,
            self.hps.train.segment_size // self.hps.data.hop_length,
            n_speakers=self.hps.data.n_speakers,
            **self.hps.model
        )
        self.vq_model.eval()
        self.vq_model.load_state_dict(dict_s2["weight"], strict=False)

    def forward(self, text_seq, pred_semantic, ref_audio):
        refer = spectrogram_torch(
            ref_audio,
            self.hps.data.filter_length,
            self.hps.data.sampling_rate,
            self.hps.data.hop_length,
            self.hps.data.win_length,
            center=False
        )
        return self.vq_model(pred_semantic, text_seq, refer)[0, 0]


class GptSoVits(nn.Module):
    def __init__(self, vits, t2s):
        super().__init__()
        self.vits = vits
        self.t2s = t2s

    def forward(self, ref_seq, text_seq, ref_bert, text_bert, ref_audio, ssl_content):
        pred_semantic = self.t2s(ref_seq, text_seq, ref_bert, text_bert, ssl_content)
        return self.vits(text_seq, pred_semantic, ref_audio)

    def export(self, ref_seq, text_seq, ref_bert, text_bert, ref_audio, ssl_content, project_name):
        self.t2s.export(ref_seq, text_seq, ref_bert, text_bert, ssl_content, project_name)
        pred_semantic = self.t2s(ref_seq, text_seq, ref_bert, text_bert, ssl_content)
        torch.onnx.export(
            self.vits,
            (text_seq, pred_semantic, ref_audio),
            f"onnx/{project_name}/{project_name}_vits.onnx",
            input_names=["text_seq", "pred_semantic", "ref_audio"],
            output_names=["audio"],
            dynamic_axes={
                "text_seq": [1],
                "pred_semantic": [2],
                "ref_audio": [1],
            },
            opset_version=17
        )


class SSLModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.ssl = ssl_model

    def forward(self, ref_audio_16k):
        return self.ssl.model(ref_audio_16k)["last_hidden_state"].transpose(1, 2)


def export(vits_path, gpt_path, project_name):
    vits = VitsModel(vits_path)
    gpt = T2SModel(gpt_path, vits)
    gpt_sovits = GptSoVits(vits, gpt)
    ssl = SSLModel()
    ref_seq = torch.LongTensor([cleaned_text_to_sequence(["n", "i2", "h", "ao3", ",", "w", "o3", "sh", "i4", "b", "ai2", "y", "e4"])])
    text_seq = torch.LongTensor([cleaned_text_to_sequence(["w", "o3", "sh", "i4", "b", "ai2", "y", "e4"])])
    ref_bert = torch.randn((ref_seq.shape[1], 1024)).float()
    text_bert = torch.randn((text_seq.shape[1], 1024)).float()
    ref_audio = torch.randn((1, 48000 * 5)).float()
    # ref_audio = torch.tensor([load_audio("rec.wav", 48000)]).float()
    ref_audio_16k = torchaudio.functional.resample(ref_audio, 48000, 16000).float()
    ref_audio_sr = torchaudio.functional.resample(ref_audio, 48000, vits.hps.data.sampling_rate).float()

    os.makedirs(f"onnx/{project_name}", exist_ok=True)

    ssl_content = ssl(ref_audio_16k).float()

    a = gpt_sovits(ref_seq, text_seq, ref_bert, text_bert, ref_audio_sr, ssl_content).detach().cpu().numpy()

    # soundfile.write("out.wav", a, vits.hps.data.sampling_rate)

    gpt_sovits.export(ref_seq, text_seq, ref_bert, text_bert, ref_audio_sr, ssl_content, project_name)

    MoeVSConf = {
        "Folder": f"{project_name}",
        "Name": f"{project_name}",
        "Type": "GPT-SoVits",
        "Rate": vits.hps.data.sampling_rate,
        "NumLayers": gpt.t2s_model.num_layers,
        "EmbeddingDim": gpt.t2s_model.embedding_dim,
        "Dict": "BasicDict",
        "BertPath": "chinese-roberta-wwm-ext-large",
        "Symbol": symbols,
        "AddBlank": False
    }

    with open(f"onnx/{project_name}.json", "w") as MoeVsConfFile:
        json.dump(MoeVSConf, MoeVsConfFile, indent=4)


if __name__ == "__main__":
    os.makedirs("onnx", exist_ok=True)

    gpt_path = "pt_model/koharu-e20.ckpt"
    vits_path = "pt_model/koharu_e20_s4960.pth"
    exp_path = "koharu"
    export(vits_path, gpt_path, exp_path)

    # soundfile.write("out.wav", a, vits.hps.data.sampling_rate)
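For reference, the script's export() entry point only needs the two fine-tuned checkpoints and a project name; a hedged invocation sketch, where both checkpoint paths are placeholders rather than files shipped with this Space:

# Hypothetical call into onnx_export.export(); replace the paths with your own
# fine-tuned GPT (text-to-semantic) and SoVITS (VITS generator) weights.
from onnx_export import export

export(
    vits_path="pt_model/my_voice_s2.pth",   # SoVITS checkpoint (loaded by VitsModel)
    gpt_path="pt_model/my_voice_s1.ckpt",   # GPT checkpoint (loaded by T2SModel)
    project_name="my_voice",                # output folder created under onnx/
)
# Expected outputs: onnx/my_voice/my_voice_t2s_encoder.onnx, ..._t2s_fsdec.onnx,
# ..._t2s_sdec.onnx, ..._vits.onnx, plus the onnx/my_voice.json MoeVS config.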
prepare_datasets/1-get-text.py
ADDED
@@ -0,0 +1,131 @@
# -*- coding: utf-8 -*-

import os

inp_text = os.environ.get("inp_text")
inp_wav_dir = os.environ.get("inp_wav_dir")
exp_name = os.environ.get("exp_name")
i_part = os.environ.get("i_part")
all_parts = os.environ.get("all_parts")
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("_CUDA_VISIBLE_DEVICES")
opt_dir = os.environ.get("opt_dir")
bert_pretrained_dir = os.environ.get("bert_pretrained_dir")
is_half = eval(os.environ.get("is_half", "True"))
import sys, numpy as np, traceback, pdb
import os.path
from glob import glob
from tqdm import tqdm
from text.cleaner import clean_text
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
import numpy as np

# inp_text=sys.argv[1]
# inp_wav_dir=sys.argv[2]
# exp_name=sys.argv[3]
# i_part=sys.argv[4]
# all_parts=sys.argv[5]
# os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6]#i_gpu
# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
# bert_pretrained_dir="/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large"

from time import time as ttime
import shutil


def my_save(fea, path):  #####fix issue: torch.save doesn't support chinese path
    dir = os.path.dirname(path)
    name = os.path.basename(path)
    tmp_path = "%s/%s%s.pth" % (dir, ttime(), i_part)
    torch.save(fea, tmp_path)
    shutil.move(tmp_path, "%s/%s" % (dir, name))


txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
if os.path.exists(txt_path) == False:
    bert_dir = "%s/3-bert" % (opt_dir)
    os.makedirs(opt_dir, exist_ok=True)
    os.makedirs(bert_dir, exist_ok=True)
    if torch.cuda.is_available():
        device = "cuda:0"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir)
    bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir)
    if is_half == True:
        bert_model = bert_model.half().to(device)
    else:
        bert_model = bert_model.to(device)

    def get_bert_feature(text, word2ph):
        with torch.no_grad():
            inputs = tokenizer(text, return_tensors="pt")
            for i in inputs:
                inputs[i] = inputs[i].to(device)
            res = bert_model(**inputs, output_hidden_states=True)
            res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]

        assert len(word2ph) == len(text)
        phone_level_feature = []
        for i in range(len(word2ph)):
            repeat_feature = res[i].repeat(word2ph[i], 1)
            phone_level_feature.append(repeat_feature)

        phone_level_feature = torch.cat(phone_level_feature, dim=0)

        return phone_level_feature.T

    def process(data, res):
        for name, text, lan in data:
            try:
                name = os.path.basename(name)
                phones, word2ph, norm_text = clean_text(
                    text.replace("%", "-").replace("¥", ","), lan
                )
                path_bert = "%s/%s.pt" % (bert_dir, name)
                if os.path.exists(path_bert) == False and lan == "zh":
                    bert_feature = get_bert_feature(norm_text, word2ph)
                    assert bert_feature.shape[-1] == len(phones)
                    # torch.save(bert_feature, path_bert)
                    my_save(bert_feature, path_bert)
                phones = " ".join(phones)
                # res.append([name,phones])
                res.append([name, phones, word2ph, norm_text])
            except:
                print(name, text, traceback.format_exc())

    todo = []
    res = []
    with open(inp_text, "r", encoding="utf8") as f:
        lines = f.read().strip("\n").split("\n")

    language_v1_to_language_v2 = {
        "ZH": "zh",
        "zh": "zh",
        "JP": "ja",
        "jp": "ja",
        "JA": "ja",
        "ja": "ja",
        "EN": "en",
        "en": "en",
        "En": "en",
    }
    for line in lines[int(i_part) :: int(all_parts)]:
        try:
            wav_name, spk_name, language, text = line.split("|")
            # todo.append([name,text,"zh"])
            todo.append(
                [wav_name, text, language_v1_to_language_v2.get(language, language)]
            )
        except:
            print(line, traceback.format_exc())

    process(todo, res)
    opt = []
    for name, phones, word2ph, norm_text in res:
        opt.append("%s\t%s\t%s\t%s" % (name, phones, word2ph, norm_text))
    with open(txt_path, "w", encoding="utf8") as f:
        f.write("\n".join(opt) + "\n")
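The script above is driven entirely by environment variables (it is normally launched by the training WebUI); a hedged sketch of a manual single-shard run, with every path a placeholder:

# Hypothetical manual launch of prepare_datasets/1-get-text.py for one shard.
# The variable names match what the script reads; all values are placeholders.
import os
import subprocess

env = dict(
    os.environ,
    inp_text="filelists/train.list",        # wav|speaker|language|text lines
    inp_wav_dir="audio/my_speaker",
    exp_name="my_speaker",
    i_part="0",                             # shard index
    all_parts="1",                          # total number of shards
    _CUDA_VISIBLE_DEVICES="0",
    opt_dir="logs/my_speaker",
    bert_pretrained_dir="pretrained_models/chinese-roberta-wwm-ext-large",
    is_half="True",
)
subprocess.run(["python", "prepare_datasets/1-get-text.py"], env=env, check=True)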
prepare_datasets/2-get-hubert-wav32k.py
ADDED
@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-

import sys, os

inp_text = os.environ.get("inp_text")
inp_wav_dir = os.environ.get("inp_wav_dir")
exp_name = os.environ.get("exp_name")
i_part = os.environ.get("i_part")
all_parts = os.environ.get("all_parts")
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("_CUDA_VISIBLE_DEVICES")
from feature_extractor import cnhubert
opt_dir = os.environ.get("opt_dir")
cnhubert.cnhubert_base_path = os.environ.get("cnhubert_base_dir")
is_half = eval(os.environ.get("is_half", "True"))

import pdb, traceback, numpy as np, logging
from scipy.io import wavfile
import librosa, torch
now_dir = os.getcwd()
sys.path.append(now_dir)
from my_utils import load_audio

# from config import cnhubert_base_path
# cnhubert.cnhubert_base_path=cnhubert_base_path
# inp_text=sys.argv[1]
# inp_wav_dir=sys.argv[2]
# exp_name=sys.argv[3]
# i_part=sys.argv[4]
# all_parts=sys.argv[5]
# os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6]
# cnhubert.cnhubert_base_path=sys.argv[7]
# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name

from time import time as ttime
import shutil


def my_save(fea, path):  #####fix issue: torch.save doesn't support chinese path
    dir = os.path.dirname(path)
    name = os.path.basename(path)
    tmp_path = "%s/%s%s.pth" % (dir, ttime(), i_part)
    torch.save(fea, tmp_path)
    shutil.move(tmp_path, "%s/%s" % (dir, name))


hubert_dir = "%s/4-cnhubert" % (opt_dir)
wav32dir = "%s/5-wav32k" % (opt_dir)
os.makedirs(opt_dir, exist_ok=True)
os.makedirs(hubert_dir, exist_ok=True)
os.makedirs(wav32dir, exist_ok=True)

maxx = 0.95
alpha = 0.5
if torch.cuda.is_available():
    device = "cuda:0"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
model = cnhubert.get_model()
# is_half=False
if is_half == True:
    model = model.half().to(device)
else:
    model = model.to(device)

nan_fails = []


def name2go(wav_name):
    hubert_path = "%s/%s.pt" % (hubert_dir, wav_name)
    if os.path.exists(hubert_path):
        return
    wav_path = "%s/%s" % (inp_wav_dir, wav_name)
    tmp_audio = load_audio(wav_path, 32000)
    tmp_max = np.abs(tmp_audio).max()
    if tmp_max > 2.2:
        print("%s-filtered,%s" % (wav_name, tmp_max))
        return
    tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha * 32768)) + ((1 - alpha) * 32768) * tmp_audio
    tmp_audio32b = (tmp_audio / tmp_max * (maxx * alpha * 1145.14)) + ((1 - alpha) * 1145.14) * tmp_audio
    tmp_audio = librosa.resample(
        tmp_audio32b, orig_sr=32000, target_sr=16000
    )  # resampling is not the source of the issue
    tensor_wav16 = torch.from_numpy(tmp_audio)
    if is_half == True:
        tensor_wav16 = tensor_wav16.half().to(device)
    else:
        tensor_wav16 = tensor_wav16.to(device)
    ssl = model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1, 2).cpu()  # torch.Size([1, 768, 215])
    if np.isnan(ssl.detach().numpy()).sum() != 0:
        nan_fails.append(wav_name)
        print("nan filtered:%s" % wav_name)
        return
    wavfile.write(
        "%s/%s" % (wav32dir, wav_name),
        32000,
        tmp_audio32.astype("int16"),
    )
    my_save(ssl, hubert_path)


with open(inp_text, "r", encoding="utf8") as f:
    lines = f.read().strip("\n").split("\n")

for line in lines[int(i_part) :: int(all_parts)]:
    try:
        # wav_name,text=line.split("\t")
        wav_name, spk_name, language, text = line.split("|")
        wav_name = os.path.basename(wav_name)
        name2go(wav_name)
    except:
        print(line, traceback.format_exc())

if len(nan_fails) > 0 and is_half == True:
    is_half = False
    model = model.float()
    for wav_name in nan_fails:
        try:
            name2go(wav_name)
        except:
            print(wav_name, traceback.format_exc())
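Each processed clip ends up as a 32 kHz wav under 5-wav32k and a cnhubert feature tensor under 4-cnhubert; a small sketch for inspecting one of the saved features, where the opt_dir and file name are illustrative placeholders:

# Hypothetical check of one feature file written by the script above.
import torch

ssl = torch.load("logs/my_speaker/4-cnhubert/clip_0001.wav.pt", map_location="cpu")
print(ssl.shape, ssl.dtype)  # e.g. torch.Size([1, 768, 215]); float16 when is_half=True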
prepare_datasets/3-get-semantic.py
ADDED
@@ -0,0 +1,95 @@
import os

inp_text = os.environ.get("inp_text")
exp_name = os.environ.get("exp_name")
i_part = os.environ.get("i_part")
all_parts = os.environ.get("all_parts")
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("_CUDA_VISIBLE_DEVICES")
opt_dir = os.environ.get("opt_dir")
pretrained_s2G = os.environ.get("pretrained_s2G")
s2config_path = os.environ.get("s2config_path")
is_half = eval(os.environ.get("is_half", "True"))
import math, traceback
import multiprocessing
import sys, pdb

now_dir = os.getcwd()
sys.path.append(now_dir)
from random import shuffle
import torch.multiprocessing as mp
from glob import glob
from tqdm import tqdm
import logging, librosa, utils, torch
from module.models import SynthesizerTrn

logging.getLogger("numba").setLevel(logging.WARNING)
# from config import pretrained_s2G

# inp_text=sys.argv[1]
# exp_name=sys.argv[2]
# i_part=sys.argv[3]
# all_parts=sys.argv[4]
# os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[5]
# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name


hubert_dir = "%s/4-cnhubert" % (opt_dir)
semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
if os.path.exists(semantic_path) == False:
    os.makedirs(opt_dir, exist_ok=True)

    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    hps = utils.get_hparams_from_file(s2config_path)
    vq_model = SynthesizerTrn(
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model
    )
    if is_half == True:
        vq_model = vq_model.half().to(device)
    else:
        vq_model = vq_model.to(device)
    vq_model.eval()
    # utils.load_checkpoint(utils.latest_checkpoint_path(hps.s2_ckpt_dir, "G_*.pth"), vq_model, None, True)
    # utils.load_checkpoint(pretrained_s2G, vq_model, None, True)
    print(
        vq_model.load_state_dict(
            torch.load(pretrained_s2G, map_location="cpu")["weight"], strict=False
        )
    )

    def name2go(wav_name, lines):
        hubert_path = "%s/%s.pt" % (hubert_dir, wav_name)
        if os.path.exists(hubert_path) == False:
            return
        ssl_content = torch.load(hubert_path, map_location="cpu")
        if is_half == True:
            ssl_content = ssl_content.half().to(device)
        else:
            ssl_content = ssl_content.to(device)
        codes = vq_model.extract_latent(ssl_content)
        semantic = " ".join([str(i) for i in codes[0, 0, :].tolist()])
        lines.append("%s\t%s" % (wav_name, semantic))

    with open(inp_text, "r", encoding="utf8") as f:
        lines = f.read().strip("\n").split("\n")

    lines1 = []
    for line in lines[int(i_part) :: int(all_parts)]:
        # print(line)
        try:
            # wav_name,text=line.split("\t")
            wav_name, spk_name, language, text = line.split("|")
            wav_name = os.path.basename(wav_name)
            # name2go(name,lines1)
            name2go(wav_name, lines1)
        except:
            print(line, traceback.format_exc())
    with open(semantic_path, "w", encoding="utf8") as f:
        f.write("\n".join(lines1))
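Each shard writes its own 6-name2semantic-<i_part>.tsv of "wav_name<TAB>semantic tokens"; a sketch of gluing the shards back together, where the merged file name is an assumption to be matched to whatever the training step actually reads:

# Hypothetical merge of the per-shard tsv files written by 3-get-semantic.py.
from glob import glob

opt_dir = "logs/my_speaker"  # same opt_dir as used by the shard runs
parts = sorted(glob(f"{opt_dir}/6-name2semantic-*.tsv"))
with open(f"{opt_dir}/6-name2semantic.tsv", "w", encoding="utf8") as out:
    for p in parts:
        with open(p, "r", encoding="utf8") as f:
            out.write(f.read().strip("\n") + "\n")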
process_ckpt.py
ADDED
@@ -0,0 +1,23 @@
import traceback
from collections import OrderedDict

import torch
from tools.i18n.i18n import I18nAuto

i18n = I18nAuto()


def savee(ckpt, name, epoch, steps, hps):
    try:
        opt = OrderedDict()
        opt["weight"] = {}
        for key in ckpt.keys():
            if "enc_q" in key:
                continue
            opt["weight"][key] = ckpt[key].half()
        opt["config"] = hps
        opt["info"] = "%sepoch_%siteration" % (epoch, steps)
        torch.save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
        return "Success."
    except:
        return traceback.format_exc()
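A checkpoint written by savee() is a plain dict with "weight", "config", and "info" entries; a small inspection sketch, with the path a placeholder for a file under your own hps.save_weight_dir:

# Hypothetical inspection of a checkpoint produced by process_ckpt.savee().
import torch

ckpt = torch.load("SoVITS_weights/my_voice_e8_s200.pth", map_location="cpu")
print(ckpt["info"])          # "<epoch>epoch_<steps>iteration"
print(type(ckpt["config"]))  # the hps object stored alongside the weights
print(len(ckpt["weight"]))   # fp16 tensors, with enc_q.* keys stripped out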
text/tone_sandhi.py
CHANGED
@@ -455,6 +455,35 @@ class ToneSandhi:
         "电子",
         "人人",
         "虎虎",
+        "幺幺",
+        "干嘛",
+        "学子",
+        "哈哈",
+        "数数",
+        "袅袅",
+        "局地",
+        "以下",
+        "娃哈哈",
+        "花花草草",
+        "留得",
+        "耕地",
+        "想想",
+        "熙熙",
+        "攘攘",
+        "卵子",
+        "死死",
+        "冉冉",
+        "恳恳",
+        "佼佼",
+        "吵吵",
+        "打打",
+        "考考",
+        "整整",
+        "莘莘",
+        "落地",
+        "算子",
+        "家家户户",
+        "青青",
     }
     self.punc = ":,;。?!“”‘’':,;.?!"
 