diff --git a/AR/__init__.py b/AR/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/AR/__pycache__/__init__.cpython-39.pyc b/AR/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..872855cfd30ad60f6fa2cbde7f8cd24211c6022e Binary files /dev/null and b/AR/__pycache__/__init__.cpython-39.pyc differ diff --git a/AR/data/__init__.py b/AR/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/AR/data/bucket_sampler.py b/AR/data/bucket_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..45f91d8eb4eaee7867fee29e89971b843fa23f38 --- /dev/null +++ b/AR/data/bucket_sampler.py @@ -0,0 +1,163 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/bucket_sampler.py +# reference: https://github.com/lifeiteng/vall-e +import itertools +import math +import random +from random import shuffle +from typing import Iterator +from typing import Optional +from typing import TypeVar + +import torch +import torch.distributed as dist +from torch.utils.data import Dataset +from torch.utils.data import Sampler + +__all__ = [ + "DistributedBucketSampler", +] + +T_co = TypeVar("T_co", covariant=True) + + +class DistributedBucketSampler(Sampler[T_co]): + r""" + sort the dataset wrt. input length + divide samples into buckets + sort within buckets + divide buckets into batches + sort batches + """ + + def __init__( + self, + dataset: Dataset, + num_replicas: Optional[int] = None, + rank: Optional[int] = None, + shuffle: bool = True, + seed: int = 0, + drop_last: bool = False, + batch_size: int = 32, + ) -> None: + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() if torch.cuda.is_available() else 1 + if rank is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() if torch.cuda.is_available() else 0 + if torch.cuda.is_available(): + torch.cuda.set_device(rank) + if rank >= num_replicas or rank < 0: + raise ValueError( + "Invalid rank {}, rank should be in the interval" + " [0, {}]".format(rank, num_replicas - 1) + ) + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.drop_last = drop_last + # If the dataset length is evenly divisible by # of replicas, then there + # is no need to drop any data, since the dataset will be split equally. + if ( + self.drop_last and len(self.dataset) % self.num_replicas != 0 + ): # type: ignore[arg-type] + # Split to nearest available length that is evenly divisible. + # This is to ensure each rank receives the same amount of data when + # using this Sampler. 
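+            # Mirrors torch.utils.data.DistributedSampler's drop_last handling: round up over
+            # (dataset length - num_replicas) so every replica receives the same number of samples.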
+ self.num_samples = math.ceil( + (len(self.dataset) - self.num_replicas) + / self.num_replicas # type: ignore[arg-type] + ) + else: + self.num_samples = math.ceil( + len(self.dataset) / self.num_replicas + ) # type: ignore[arg-type] + self.total_size = self.num_samples * self.num_replicas + self.shuffle = shuffle + self.seed = seed + self.batch_size = batch_size + self.id_with_length = self._get_sample_lengths() + self.id_buckets = self.make_buckets(bucket_width=2.0) + + def _get_sample_lengths(self): + id_with_lengths = [] + for i in range(len(self.dataset)): + id_with_lengths.append((i, self.dataset.get_sample_length(i))) + id_with_lengths.sort(key=lambda x: x[1]) + return id_with_lengths + + def make_buckets(self, bucket_width: float = 2.0): + buckets = [] + cur = [] + max_sec = bucket_width + for id, sec in self.id_with_length: + if sec < max_sec: + cur.append(id) + else: + buckets.append(cur) + cur = [id] + max_sec += bucket_width + if len(cur) > 0: + buckets.append(cur) + return buckets + + def __iter__(self) -> Iterator[T_co]: + if self.shuffle: + # deterministically shuffle based on epoch and seed + g = torch.Generator() + g.manual_seed(self.seed + self.epoch) + random.seed(self.epoch + self.seed) + shuffled_bucket = [] + for buc in self.id_buckets: + buc_copy = buc.copy() + shuffle(buc_copy) + shuffled_bucket.append(buc_copy) + grouped_batch_size = self.batch_size * self.num_replicas + shuffled_bucket = list(itertools.chain(*shuffled_bucket)) + n_batch = int(math.ceil(len(shuffled_bucket) / grouped_batch_size)) + batches = [ + shuffled_bucket[b * grouped_batch_size : (b + 1) * grouped_batch_size] + for b in range(n_batch) + ] + shuffle(batches) + indices = list(itertools.chain(*batches)) + else: + # type: ignore[arg-type] + indices = list(range(len(self.dataset))) + + if not self.drop_last: + # add extra samples to make it evenly divisible + padding_size = self.total_size - len(indices) + if padding_size <= len(indices): + indices += indices[:padding_size] + else: + indices += (indices * math.ceil(padding_size / len(indices)))[ + :padding_size + ] + else: + # remove tail of data to make it evenly divisible. + indices = indices[: self.total_size] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank : self.total_size : self.num_replicas] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self) -> int: + return self.num_samples + + def set_epoch(self, epoch: int) -> None: + r""" + Sets the epoch for this sampler. When :attr:`shuffle=True`, this ensures all replicas + use a different random ordering for each epoch. Otherwise, the next iteration of this + sampler will yield the same ordering. + + Args: + epoch (int): Epoch number. 
+ """ + self.epoch = epoch diff --git a/AR/data/data_module.py b/AR/data/data_module.py new file mode 100644 index 0000000000000000000000000000000000000000..cb947959b3f2fdb0951e6a6383399a9fb3826536 --- /dev/null +++ b/AR/data/data_module.py @@ -0,0 +1,76 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py +# reference: https://github.com/lifeiteng/vall-e +from pytorch_lightning import LightningDataModule +from AR.data.bucket_sampler import DistributedBucketSampler +from AR.data.dataset import Text2SemanticDataset +from torch.utils.data import DataLoader + + +class Text2SemanticDataModule(LightningDataModule): + def __init__( + self, + config, + train_semantic_path, + train_phoneme_path, + dev_semantic_path=None, + dev_phoneme_path=None, + ): + super().__init__() + self.config = config + self.train_semantic_path = train_semantic_path + self.train_phoneme_path = train_phoneme_path + self.dev_semantic_path = dev_semantic_path + self.dev_phoneme_path = dev_phoneme_path + self.num_workers = self.config["data"]["num_workers"] + + def prepare_data(self): + pass + + def setup(self, stage=None, output_logs=False): + self._train_dataset = Text2SemanticDataset( + phoneme_path=self.train_phoneme_path, + semantic_path=self.train_semantic_path, + max_sec=self.config["data"]["max_sec"], + pad_val=self.config["data"]["pad_val"], + ) + self._dev_dataset = self._train_dataset + # self._dev_dataset = Text2SemanticDataset( + # phoneme_path=self.dev_phoneme_path, + # semantic_path=self.dev_semantic_path, + # max_sample=self.config['data']['max_eval_sample'], + # max_sec=self.config['data']['max_sec'], + # pad_val=self.config['data']['pad_val']) + + def train_dataloader(self): + batch_size=self.config["train"]["batch_size"]//2 if self.config["train"].get("if_dpo",False)==True else self.config["train"]["batch_size"] + batch_size = max(min(batch_size,len(self._train_dataset)//4),1)#防止不保存 + sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size) + return DataLoader( + self._train_dataset, + batch_size=batch_size, + sampler=sampler, + collate_fn=self._train_dataset.collate, + num_workers=self.num_workers, + persistent_workers=True, + prefetch_factor=16, + ) + + def val_dataloader(self): + return DataLoader( + self._dev_dataset, + batch_size=1, + shuffle=False, + collate_fn=self._train_dataset.collate, + num_workers=max(self.num_workers, 12), + persistent_workers=True, + prefetch_factor=16, + ) + + # 这个会使用到嘛? 
+    def test_dataloader(self):
+        return DataLoader(
+            self._dev_dataset,
+            batch_size=1,
+            shuffle=False,
+            collate_fn=self._train_dataset.collate,
+        )
diff --git a/AR/data/dataset.py b/AR/data/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a2ffef17cafa906cbc65c1c610b5bc8f5f6c3da
--- /dev/null
+++ b/AR/data/dataset.py
@@ -0,0 +1,321 @@
+# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/dataset.py
+# reference: https://github.com/lifeiteng/vall-e
+import pdb
+import sys
+
+# sys.path.append("/data/docker/liujing04/gpt-vits/mq-vits-s1bert_no_bert")
+import traceback, os
+from typing import Dict
+from typing import List
+
+import numpy as np
+import pandas as pd
+import torch, json
+from torch.utils.data import DataLoader
+from torch.utils.data import Dataset
+from transformers import AutoTokenizer
+
+from text import cleaned_text_to_sequence
+
+# from config import exp_dir
+
+
+def batch_sequences(sequences: List[np.array], axis: int = 0, pad_value: int = 0):
+    seq = sequences[0]
+    ndim = seq.ndim
+    if axis < 0:
+        axis += ndim
+    dtype = seq.dtype
+    pad_value = dtype.type(pad_value)
+    seq_lengths = [seq.shape[axis] for seq in sequences]
+    max_length = np.max(seq_lengths)
+
+    padded_sequences = []
+    for seq, length in zip(sequences, seq_lengths):
+        padding = (
+            [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * (ndim - axis - 1)
+        )
+        padded_seq = np.pad(seq, padding, mode="constant", constant_values=pad_value)
+        padded_sequences.append(padded_seq)
+    batch = np.stack(padded_sequences)
+    return batch
+
+
+class Text2SemanticDataset(Dataset):
+    """dataset class for text tokens to semantic model training."""
+
+    def __init__(
+        self,
+        phoneme_path: str,
+        semantic_path: str,
+        max_sample: int = None,
+        max_sec: int = 100,
+        pad_val: int = 1024,
+        # min value of phoneme/sec
+        min_ps_ratio: int = 3,
+        # max value of phoneme/sec
+        max_ps_ratio: int = 25,
+    ) -> None:
+        super().__init__()
+
+        self.semantic_data = pd.read_csv(
+            semantic_path, delimiter="\t", encoding="utf-8"
+        )
+        # get dict
+        self.path2 = phoneme_path  # "%s/2-name2text.txt"%exp_dir#phoneme_path
+        self.path3 = "%s/3-bert" % (
+            os.path.dirname(phoneme_path)
+        )  # "%s/3-bert"%exp_dir#bert_dir
+        self.path6 = semantic_path  # "%s/6-name2semantic.tsv"%exp_dir#semantic_path
+        assert os.path.exists(self.path2)
+        assert os.path.exists(self.path6)
+        self.phoneme_data = {}
+        with open(self.path2, "r", encoding="utf8") as f:
+            lines = f.read().strip("\n").split("\n")
+
+        for line in lines:
+            tmp = line.split("\t")
+            if len(tmp) != 4:
+                continue
+            self.phoneme_data[tmp[0]] = [tmp[1], tmp[2], tmp[3]]
+
+        # self.phoneme_data = np.load(phoneme_path, allow_pickle=True).item()
+        # pad for semantic tokens
+        self.PAD: int = pad_val
+        # self.hz = 25
+        # with open("/data/docker/liujing04/gpt-vits/mq-vits-s1bert_no_bert/configs/s2.json", "r") as f:data = f.read()
+        # data=json.loads(data)["model"]["semantic_frame_rate"]#50hz
+        # self.hz=int(data[:-2])#
+        self.hz = int(os.environ.get("hz", "25hz")[:-2])
+
+        # max seconds of semantic token
+        self.max_sec = max_sec
+        self.min_ps_ratio = min_ps_ratio
+        self.max_ps_ratio = max_ps_ratio
+
+        if max_sample is not None:
+            self.semantic_data = self.semantic_data[:max_sample]
+
+        # {idx: (semantic, phoneme)}
+        # semantic list, phoneme list
+        self.semantic_phoneme = []
+        self.item_names = []
+
+        self.inited = False
+
+        if not self.inited:
+            # call the initialization routine
+            self.init_batch()
+            self.inited = True
+            del self.semantic_data
+            del self.phoneme_data
+        # self.tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext-large")
+        # self.tokenizer = AutoTokenizer.from_pretrained("/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large")
+
+    def init_batch(self):
+        semantic_data_len = len(self.semantic_data)
+        phoneme_data_len = len(self.phoneme_data.keys())
+        print("semantic_data_len:", semantic_data_len)
+        print("phoneme_data_len:", phoneme_data_len)
+        print(self.semantic_data)
+        idx = 0
+        num_not_in = 0
+        num_deleted_bigger = 0
+        num_deleted_ps = 0
+        for i in range(semantic_data_len):
+            # iterate through the rows one by one
+            # get str
+            item_name = self.semantic_data.iloc[i,0]
+            # print(self.phoneme_data)
+            try:
+                phoneme, word2ph, text = self.phoneme_data[item_name]
+            except Exception:
+                traceback.print_exc()
+                # print(f"{item_name} not in self.phoneme_data !")
+                num_not_in += 1
+                continue
+
+            semantic_str = self.semantic_data.iloc[i,1]
+            # get token list
+            semantic_ids = [int(idx) for idx in semantic_str.split(" ")]
+            # shape (T); no need to make it (1, T), since we only need its length
+            # filter out samples that are too long
+            if (
+                len(semantic_ids) > self.max_sec * self.hz
+            ):  # filter 1: estimate total duration from the token count and drop clips longer than max_sec (60 s in the config); 40 s * 25 Hz = 1k tokens
+                num_deleted_bigger += 1
+                continue
+            # shape (T,); this is fast enough to do up front, no need to process items one by one in __getitem__
+            phoneme = phoneme.split(" ")
+
+            try:
+                phoneme_ids = cleaned_text_to_sequence(phoneme)
+            except:
+                traceback.print_exc()
+                # print(f"{item_name} not in self.phoneme_data !")
+                num_not_in += 1
+                continue
+            # if len(phoneme_ids) > 400:  # filter 2: changed to a fixed limit of semantic length / 2.5
+            if (
+                len(phoneme_ids) > self.max_sec * self.hz / 2.5
+            ):  # filter 2: cap the phoneme length at a fixed semantic length / 2.5
+                num_deleted_ps += 1
+                continue
+            # if len(semantic_ids) > 1000:  # filter 3
+            #     num_deleted_bigger += 1
+            #     continue
+
+            ps_ratio = len(phoneme_ids) / (len(semantic_ids) / self.hz)
+
+            if (
+                ps_ratio > self.max_ps_ratio or ps_ratio < self.min_ps_ratio
+            ):  # filter 4: phonemes per second must stay within 3~25
+                num_deleted_ps += 1
+                # print(item_name)
+                continue
+
+            self.semantic_phoneme.append((semantic_ids, phoneme_ids))
+            idx += 1
+            self.item_names.append(item_name)
+
+        min_num = 100  # with ~20 items nothing is replicated; with ~30, even replicated, the checkpoint is still not saved
+        leng = len(self.semantic_phoneme)
+        if leng < min_num:
+            tmp1 = self.semantic_phoneme
+            tmp2 = self.item_names
+            self.semantic_phoneme = []
+            self.item_names = []
+            for _ in range(max(2, int(min_num / leng))):
+                self.semantic_phoneme += tmp1
+                self.item_names += tmp2
+        if num_not_in > 0:
+            print(f"there are {num_not_in} semantic datas not in phoneme datas")
+        if num_deleted_bigger > 0:
+            print(
+                f"deleted {num_deleted_bigger} audios who's duration are bigger than {self.max_sec} seconds"
+            )
+        if num_deleted_ps > 0:
+            # 4702 for LibriTTS; LibriTTS is labeled data, so does it need filtering? => yes, there are extreme values as high as 100
+            print(
+                f"deleted {num_deleted_ps} audios who's phoneme/sec are bigger than {self.max_ps_ratio} or smaller than {self.min_ps_ratio}"
+            )
+        """
+        there are 31 semantic datas not in phoneme datas
+        deleted 34 audios who's duration are bigger than 54 seconds
+        deleted 3190 audios who's phoneme/sec are bigger than 25 or smaller than 3
+        dataset.__len__(): 366463
+
+        """
+        # 345410 for LibriTTS
+        print("dataset.__len__():", self.__len__())
+
+    def __get_item_names__(self) -> List[str]:
+        return self.item_names
+
+    def __len__(self) -> int:
+        return len(self.semantic_phoneme)
+
+    def __getitem__(self, idx: int) -> Dict:
+        semantic_ids, phoneme_ids = self.semantic_phoneme[idx]
+        item_name = self.item_names[idx]
+        phoneme_ids_len = len(phoneme_ids)
+        # semantic tokens target
+        semantic_ids_len = len(semantic_ids)
+
+
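+        # Load the cached BERT feature for this item if it exists; otherwise return None and let collate() fall back to zeros.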
flag = 0 + path_bert = "%s/%s.pt" % (self.path3, item_name) + if os.path.exists(path_bert) == True: + bert_feature = torch.load(path_bert, map_location="cpu") + else: + flag = 1 + if flag == 1: + # bert_feature=torch.zeros_like(phoneme_ids,dtype=torch.float32) + bert_feature = None + else: + assert bert_feature.shape[-1] == len(phoneme_ids) + return { + "idx": idx, + "phoneme_ids": phoneme_ids, + "phoneme_ids_len": phoneme_ids_len, + "semantic_ids": semantic_ids, + "semantic_ids_len": semantic_ids_len, + "bert_feature": bert_feature, + } + + def get_sample_length(self, idx: int): + semantic_ids = self.semantic_phoneme[idx][0] + sec = 1.0 * len(semantic_ids) / self.hz + return sec + + def collate(self, examples: List[Dict]) -> Dict: + sample_index: List[int] = [] + phoneme_ids: List[torch.Tensor] = [] + phoneme_ids_lens: List[int] = [] + semantic_ids: List[torch.Tensor] = [] + semantic_ids_lens: List[int] = [] + # return + + for item in examples: + sample_index.append(item["idx"]) + phoneme_ids.append(np.array(item["phoneme_ids"], dtype=np.int64)) + semantic_ids.append(np.array(item["semantic_ids"], dtype=np.int64)) + phoneme_ids_lens.append(item["phoneme_ids_len"]) + semantic_ids_lens.append(item["semantic_ids_len"]) + + # pad 0 + phoneme_ids = batch_sequences(phoneme_ids) + semantic_ids = batch_sequences(semantic_ids, pad_value=self.PAD) + + # # convert each batch to torch.tensor + phoneme_ids = torch.tensor(phoneme_ids) + semantic_ids = torch.tensor(semantic_ids) + phoneme_ids_lens = torch.tensor(phoneme_ids_lens) + semantic_ids_lens = torch.tensor(semantic_ids_lens) + bert_padded = torch.FloatTensor(len(examples), 1024, max(phoneme_ids_lens)) + bert_padded.zero_() + + for idx, item in enumerate(examples): + bert = item["bert_feature"] + if bert != None: + bert_padded[idx, :, : bert.shape[-1]] = bert + + return { + # List[int] + "ids": sample_index, + # torch.Tensor (B, max_phoneme_length) + "phoneme_ids": phoneme_ids, + # torch.Tensor (B) + "phoneme_ids_len": phoneme_ids_lens, + # torch.Tensor (B, max_semantic_ids_length) + "semantic_ids": semantic_ids, + # torch.Tensor (B) + "semantic_ids_len": semantic_ids_lens, + # torch.Tensor (B, 1024, max_phoneme_length) + "bert_feature": bert_padded, + } + + +if __name__ == "__main__": + root_dir = "/data/docker/liujing04/gpt-vits/prepare/dump_mix/" + dataset = Text2SemanticDataset( + phoneme_path=root_dir + "phoneme_train.npy", + semantic_path=root_dir + "semantic_train.tsv", + ) + + batch_size = 12 + dataloader = DataLoader( + dataset, batch_size=batch_size, collate_fn=dataset.collate, shuffle=False + ) + for i, batch in enumerate(dataloader): + if i % 1000 == 0: + print(i) + # if i == 0: + # print('batch["ids"]:', batch["ids"]) + # print('batch["phoneme_ids"]:', batch["phoneme_ids"], + # batch["phoneme_ids"].shape) + # print('batch["phoneme_ids_len"]:', batch["phoneme_ids_len"], + # batch["phoneme_ids_len"].shape) + # print('batch["semantic_ids"]:', batch["semantic_ids"], + # batch["semantic_ids"].shape) + # print('batch["semantic_ids_len"]:', batch["semantic_ids_len"], + # batch["semantic_ids_len"].shape) diff --git a/AR/models/__init__.py b/AR/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/AR/models/__pycache__/__init__.cpython-39.pyc b/AR/models/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ff3db58258e65b573b3c98b3eaa064f45877a9c Binary files /dev/null and 
b/AR/models/__pycache__/__init__.cpython-39.pyc differ diff --git a/AR/models/__pycache__/t2s_lightning_module.cpython-39.pyc b/AR/models/__pycache__/t2s_lightning_module.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c84dd1d1fada29a0348d1b66e151f3898e72c82 Binary files /dev/null and b/AR/models/__pycache__/t2s_lightning_module.cpython-39.pyc differ diff --git a/AR/models/__pycache__/t2s_model.cpython-39.pyc b/AR/models/__pycache__/t2s_model.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..110ab8303d313cef87a7157bee451d7c1d4f3101 Binary files /dev/null and b/AR/models/__pycache__/t2s_model.cpython-39.pyc differ diff --git a/AR/models/__pycache__/utils.cpython-39.pyc b/AR/models/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eac9d1e40e28ac941ddf21ca0e81da3f46e47323 Binary files /dev/null and b/AR/models/__pycache__/utils.cpython-39.pyc differ diff --git a/AR/models/t2s_lightning_module.py b/AR/models/t2s_lightning_module.py new file mode 100644 index 0000000000000000000000000000000000000000..2dd3f392893f1ea08a6e848f2ff2d9be1a425f15 --- /dev/null +++ b/AR/models/t2s_lightning_module.py @@ -0,0 +1,141 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py +# reference: https://github.com/lifeiteng/vall-e +import os, sys + +now_dir = os.getcwd() +sys.path.append(now_dir) +from typing import Dict + +import torch +from pytorch_lightning import LightningModule +from AR.models.t2s_model import Text2SemanticDecoder +from AR.modules.lr_schedulers import WarmupCosineLRSchedule +from AR.modules.optim import ScaledAdam + +class Text2SemanticLightningModule(LightningModule): + def __init__(self, config, output_dir, is_train=True): + super().__init__() + self.config = config + self.top_k = 3 + self.model = Text2SemanticDecoder(config=config, top_k=self.top_k) + pretrained_s1 = config.get("pretrained_s1") + if pretrained_s1 and is_train: + # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) + print( + self.load_state_dict( + torch.load(pretrained_s1, map_location="cpu")["weight"] + ) + ) + if is_train: + self.automatic_optimization = False + self.save_hyperparameters() + self.eval_dir = output_dir / "eval" + self.eval_dir.mkdir(parents=True, exist_ok=True) + + def training_step(self, batch: Dict, batch_idx: int): + opt = self.optimizers() + scheduler = self.lr_schedulers() + forward=self.model.forward if self.config["train"].get("if_dpo",False)==True else self.model.forward_old + loss, acc = forward( + batch["phoneme_ids"], + batch["phoneme_ids_len"], + batch["semantic_ids"], + batch["semantic_ids_len"], + batch["bert_feature"], + ) + self.manual_backward(loss) + if batch_idx > 0 and batch_idx % 4 == 0: + opt.step() + opt.zero_grad() + scheduler.step() + + self.log( + "total_loss", + loss, + on_step=True, + on_epoch=True, + prog_bar=True, + sync_dist=True, + ) + self.log( + "lr", + scheduler.get_last_lr()[0], + on_epoch=True, + prog_bar=True, + sync_dist=True, + ) + self.log( + f"top_{self.top_k}_acc", + acc, + on_step=True, + on_epoch=True, + prog_bar=True, + sync_dist=True, + ) + + def validation_step(self, batch: Dict, batch_idx: int): + return + + # # get loss + # loss, acc = self.model.forward( + # batch['phoneme_ids'], batch['phoneme_ids_len'], + # batch['semantic_ids'], batch['semantic_ids_len'], + # batch['bert_feature'] + # ) + # + # self.log( + # "val_total_loss", + 
# loss, + # on_step=True, + # on_epoch=True, + # prog_bar=True, + # sync_dist=True) + # self.log( + # f"val_top_{self.top_k}_acc", + # acc, + # on_step=True, + # on_epoch=True, + # prog_bar=True, + # sync_dist=True) + # + # # get infer output + # semantic_len = batch['semantic_ids'].size(1) + # prompt_len = min(int(semantic_len * 0.5), 150) + # prompt = batch['semantic_ids'][:, :prompt_len] + # pred_semantic = self.model.infer(batch['phoneme_ids'], + # batch['phoneme_ids_len'], prompt, + # batch['bert_feature'] + # ) + # save_name = f'semantic_toks_{batch_idx}.pt' + # save_path = os.path.join(self.eval_dir, save_name) + # torch.save(pred_semantic.detach().cpu(), save_path) + + def configure_optimizers(self): + model_parameters = self.model.parameters() + parameters_names = [] + parameters_names.append( + [name_param_pair[0] for name_param_pair in self.model.named_parameters()] + ) + lm_opt = ScaledAdam( + model_parameters, + lr=0.01, + betas=(0.9, 0.95), + clipping_scale=2.0, + parameters_names=parameters_names, + show_dominant_parameters=False, + clipping_update_period=1000, + ) + + return { + "optimizer": lm_opt, + "lr_scheduler": { + "scheduler": WarmupCosineLRSchedule( + lm_opt, + init_lr=self.config["optimizer"]["lr_init"], + peak_lr=self.config["optimizer"]["lr"], + end_lr=self.config["optimizer"]["lr_end"], + warmup_steps=self.config["optimizer"]["warmup_steps"], + total_steps=self.config["optimizer"]["decay_steps"], + ) + }, + } diff --git a/AR/models/t2s_lightning_module_onnx.py b/AR/models/t2s_lightning_module_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..487edb015203ebd70f9f6d3475ef892e28c57927 --- /dev/null +++ b/AR/models/t2s_lightning_module_onnx.py @@ -0,0 +1,107 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py +# reference: https://github.com/lifeiteng/vall-e +import os, sys + +now_dir = os.getcwd() +sys.path.append(now_dir) +from typing import Dict + +import torch +from pytorch_lightning import LightningModule +from AR.models.t2s_model_onnx import Text2SemanticDecoder +from AR.modules.lr_schedulers import WarmupCosineLRSchedule +from AR.modules.optim import ScaledAdam + + +class Text2SemanticLightningModule(LightningModule): + def __init__(self, config, output_dir, is_train=True): + super().__init__() + self.config = config + self.top_k = 3 + self.model = Text2SemanticDecoder(config=config, top_k=self.top_k) + pretrained_s1 = config.get("pretrained_s1") + if pretrained_s1 and is_train: + # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) + print( + self.load_state_dict( + torch.load(pretrained_s1, map_location="cpu")["weight"] + ) + ) + if is_train: + self.automatic_optimization = False + self.save_hyperparameters() + self.eval_dir = output_dir / "eval" + self.eval_dir.mkdir(parents=True, exist_ok=True) + + def training_step(self, batch: Dict, batch_idx: int): + opt = self.optimizers() + scheduler = self.lr_schedulers() + loss, acc = self.model.forward( + batch["phoneme_ids"], + batch["phoneme_ids_len"], + batch["semantic_ids"], + batch["semantic_ids_len"], + batch["bert_feature"], + ) + self.manual_backward(loss) + if batch_idx > 0 and batch_idx % 4 == 0: + opt.step() + opt.zero_grad() + scheduler.step() + + self.log( + "total_loss", + loss, + on_step=True, + on_epoch=True, + prog_bar=True, + sync_dist=True, + ) + self.log( + "lr", + scheduler.get_last_lr()[0], + on_epoch=True, + prog_bar=True, + sync_dist=True, + ) + self.log( + 
f"top_{self.top_k}_acc", + acc, + on_step=True, + on_epoch=True, + prog_bar=True, + sync_dist=True, + ) + + def validation_step(self, batch: Dict, batch_idx: int): + return + + def configure_optimizers(self): + model_parameters = self.model.parameters() + parameters_names = [] + parameters_names.append( + [name_param_pair[0] for name_param_pair in self.model.named_parameters()] + ) + lm_opt = ScaledAdam( + model_parameters, + lr=0.01, + betas=(0.9, 0.95), + clipping_scale=2.0, + parameters_names=parameters_names, + show_dominant_parameters=False, + clipping_update_period=1000, + ) + + return { + "optimizer": lm_opt, + "lr_scheduler": { + "scheduler": WarmupCosineLRSchedule( + lm_opt, + init_lr=self.config["optimizer"]["lr_init"], + peak_lr=self.config["optimizer"]["lr"], + end_lr=self.config["optimizer"]["lr_end"], + warmup_steps=self.config["optimizer"]["warmup_steps"], + total_steps=self.config["optimizer"]["decay_steps"], + ) + }, + } diff --git a/AR/models/t2s_model.py b/AR/models/t2s_model.py new file mode 100644 index 0000000000000000000000000000000000000000..bc1261d4c503490510a4fd3ccd0ce02d2aa8c57b --- /dev/null +++ b/AR/models/t2s_model.py @@ -0,0 +1,588 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py +# reference: https://github.com/lifeiteng/vall-e +from typing import List + +import torch +from tqdm import tqdm + +from AR.models.utils import make_pad_mask +from AR.models.utils import ( + topk_sampling, + sample, + logits_to_probs, + multinomial_sample_one_no_sync, + dpo_loss, + make_reject_y, + get_batch_logps +) +from AR.modules.embedding import SinePositionalEmbedding +from AR.modules.embedding import TokenEmbedding +from AR.modules.transformer import LayerNorm +from AR.modules.transformer import TransformerEncoder +from AR.modules.transformer import TransformerEncoderLayer +from torch import nn +from torch.nn import functional as F +from torchmetrics.classification import MulticlassAccuracy + +default_config = { + "embedding_dim": 512, + "hidden_dim": 512, + "num_head": 8, + "num_layers": 12, + "num_codebook": 8, + "p_dropout": 0.0, + "vocab_size": 1024 + 1, + "phoneme_vocab_size": 512, + "EOS": 1024, +} + + +@torch.jit.script +class T2SMLP: + def __init__(self, w1, b1, w2, b2): + self.w1 = w1 + self.b1 = b1 + self.w2 = w2 + self.b2 = b2 + + def forward(self, x): + x = F.relu(F.linear(x, self.w1, self.b1)) + x = F.linear(x, self.w2, self.b2) + return x + + +@torch.jit.script +class T2SBlock: + def __init__( + self, + num_heads, + hidden_dim: int, + mlp: T2SMLP, + qkv_w, + qkv_b, + out_w, + out_b, + norm_w1, + norm_b1, + norm_eps1, + norm_w2, + norm_b2, + norm_eps2, + ): + self.num_heads = num_heads + self.mlp = mlp + self.hidden_dim: int = hidden_dim + self.qkv_w = qkv_w + self.qkv_b = qkv_b + self.out_w = out_w + self.out_b = out_b + self.norm_w1 = norm_w1 + self.norm_b1 = norm_b1 + self.norm_eps1 = norm_eps1 + self.norm_w2 = norm_w2 + self.norm_b2 = norm_b2 + self.norm_eps2 = norm_eps2 + + def process_prompt(self, x, attn_mask : torch.Tensor): + q, k, v = F.linear(x, self.qkv_w, self.qkv_b).chunk(3, dim=-1) + + batch_size = q.shape[0] + q_len = q.shape[1] + kv_len = k.shape[1] + + k_cache = k + v_cache = v + + q = q.view(batch_size, q_len, self.num_heads, -1).transpose(1, 2) + k = k_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) + v = v_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) + + attn = F.scaled_dot_product_attention(q, k, v, ~attn_mask) + + attn = attn.permute(2, 
0, 1, 3).reshape(batch_size, -1, self.hidden_dim) + attn = F.linear(attn, self.out_w, self.out_b) + + x = F.layer_norm( + x + attn, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1 + ) + x = F.layer_norm( + x + self.mlp.forward(x), + [self.hidden_dim], + self.norm_w2, + self.norm_b2, + self.norm_eps2, + ) + return x, k_cache, v_cache + + def decode_next_token(self, x, k_cache, v_cache): + q, k, v = F.linear(x, self.qkv_w, self.qkv_b).chunk(3, dim=-1) + + k_cache = torch.cat([k_cache, k], dim=1) + v_cache = torch.cat([v_cache, v], dim=1) + kv_len = k_cache.shape[1] + + batch_size = q.shape[0] + q_len = q.shape[1] + + q = q.view(batch_size, q_len, self.num_heads, -1).transpose(1, 2) + k = k_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) + v = v_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) + + + attn = F.scaled_dot_product_attention(q, k, v) + + attn = attn.permute(2, 0, 1, 3).reshape(batch_size, -1, self.hidden_dim) + attn = F.linear(attn, self.out_w, self.out_b) + + x = F.layer_norm( + x + attn, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1 + ) + x = F.layer_norm( + x + self.mlp.forward(x), + [self.hidden_dim], + self.norm_w2, + self.norm_b2, + self.norm_eps2, + ) + return x, k_cache, v_cache + + +@torch.jit.script +class T2STransformer: + def __init__(self, num_blocks : int, blocks: List[T2SBlock]): + self.num_blocks : int = num_blocks + self.blocks = blocks + + def process_prompt( + self, x, attn_mask : torch.Tensor): + k_cache : List[torch.Tensor] = [] + v_cache : List[torch.Tensor] = [] + for i in range(self.num_blocks): + x, k_cache_, v_cache_ = self.blocks[i].process_prompt(x, attn_mask) + k_cache.append(k_cache_) + v_cache.append(v_cache_) + return x, k_cache, v_cache + + def decode_next_token( + self, x, k_cache: List[torch.Tensor], v_cache: List[torch.Tensor] + ): + for i in range(self.num_blocks): + x, k_cache[i], v_cache[i] = self.blocks[i].decode_next_token(x, k_cache[i], v_cache[i]) + return x, k_cache, v_cache + + +class Text2SemanticDecoder(nn.Module): + def __init__(self, config, norm_first=False, top_k=3): + super(Text2SemanticDecoder, self).__init__() + self.model_dim = config["model"]["hidden_dim"] + self.embedding_dim = config["model"]["embedding_dim"] + self.num_head = config["model"]["head"] + self.num_layers = config["model"]["n_layer"] + self.norm_first = norm_first + self.vocab_size = config["model"]["vocab_size"] + self.phoneme_vocab_size = config["model"]["phoneme_vocab_size"] + self.p_dropout = config["model"]["dropout"] + self.EOS = config["model"]["EOS"] + self.norm_first = norm_first + assert self.EOS == self.vocab_size - 1 + # should be same as num of kmeans bin + # assert self.EOS == 1024 + self.bert_proj = nn.Linear(1024, self.embedding_dim) + self.ar_text_embedding = TokenEmbedding( + self.embedding_dim, self.phoneme_vocab_size, self.p_dropout + ) + self.ar_text_position = SinePositionalEmbedding( + self.embedding_dim, dropout=0.1, scale=False, alpha=True + ) + self.ar_audio_embedding = TokenEmbedding( + self.embedding_dim, self.vocab_size, self.p_dropout + ) + self.ar_audio_position = SinePositionalEmbedding( + self.embedding_dim, dropout=0.1, scale=False, alpha=True + ) + + self.h = TransformerEncoder( + TransformerEncoderLayer( + d_model=self.model_dim, + nhead=self.num_head, + dim_feedforward=self.model_dim * 4, + dropout=0.1, + batch_first=True, + norm_first=norm_first, + ), + num_layers=self.num_layers, + norm=LayerNorm(self.model_dim) if norm_first else None, + ) + + 
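+        # Prediction head over the semantic-token vocabulary; the top-k accuracy metric below ignores the EOS index.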
self.ar_predict_layer = nn.Linear(self.model_dim, self.vocab_size, bias=False) + self.loss_fct = nn.CrossEntropyLoss(reduction="sum") + + self.ar_accuracy_metric = MulticlassAccuracy( + self.vocab_size, + top_k=top_k, + average="micro", + multidim_average="global", + ignore_index=self.EOS, + ) + + blocks = [] + + for i in range(self.num_layers): + layer = self.h.layers[i] + t2smlp = T2SMLP( + layer.linear1.weight, + layer.linear1.bias, + layer.linear2.weight, + layer.linear2.bias + ) + + block = T2SBlock( + self.num_head, + self.model_dim, + t2smlp, + layer.self_attn.in_proj_weight, + layer.self_attn.in_proj_bias, + layer.self_attn.out_proj.weight, + layer.self_attn.out_proj.bias, + layer.norm1.weight, + layer.norm1.bias, + layer.norm1.eps, + layer.norm2.weight, + layer.norm2.bias, + layer.norm2.eps + ) + + blocks.append(block) + + self.t2s_transformer = T2STransformer(self.num_layers, blocks) + + # self.t2s_transformer.process_prompt = torch.compile(self.t2s_transformer.process_prompt,mode="reduce-overhead", fullgraph=True) + # self.t2s_transformer.decode_next_token = torch.compile(self.t2s_transformer.decode_next_token,mode="reduce-overhead", fullgraph=True) + + def make_input_data(self, x, x_lens, y, y_lens, bert_feature): + x = self.ar_text_embedding(x) + x = x + self.bert_proj(bert_feature.transpose(1, 2)) + x = self.ar_text_position(x) + x_mask = make_pad_mask(x_lens) + + y_mask = make_pad_mask(y_lens) + y_mask_int = y_mask.type(torch.int64) + codes = y.type(torch.int64) * (1 - y_mask_int) + + # Training + # AR Decoder + y, targets = self.pad_y_eos(codes, y_mask_int, eos_id=self.EOS) + x_len = x_lens.max() + y_len = y_lens.max() + y_emb = self.ar_audio_embedding(y) + y_pos = self.ar_audio_position(y_emb) + + xy_padding_mask = torch.concat([x_mask, y_mask], dim=1) + + ar_xy_padding_mask = xy_padding_mask + + x_attn_mask = F.pad( + torch.zeros((x_len, x_len), dtype=torch.bool, device=x.device), + (0, y_len), + value=True, + ) + + y_attn_mask = F.pad( + torch.triu( + torch.ones(y_len, y_len, dtype=torch.bool, device=x.device), + diagonal=1, + ), + (x_len, 0), + value=False, + ) + + xy_attn_mask = torch.concat([x_attn_mask, y_attn_mask], dim=0) + bsz, src_len = x.shape[0], x_len + y_len + _xy_padding_mask = ( + ar_xy_padding_mask.view(bsz, 1, 1, src_len) + .expand(-1, self.num_head, -1, -1) + .reshape(bsz * self.num_head, 1, src_len) + ) + xy_attn_mask = xy_attn_mask.logical_or(_xy_padding_mask) + new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype) + new_attn_mask.masked_fill_(xy_attn_mask, float("-inf")) + xy_attn_mask = new_attn_mask + # x 和完整的 y 一次性输入模型 + xy_pos = torch.concat([x, y_pos], dim=1) + + return xy_pos, xy_attn_mask, targets + + def forward(self, x, x_lens, y, y_lens, bert_feature): + """ + x: phoneme_ids + y: semantic_ids + """ + + reject_y, reject_y_lens = make_reject_y(y, y_lens) + + xy_pos, xy_attn_mask, targets = self.make_input_data(x, x_lens, y, y_lens, bert_feature) + + xy_dec, _ = self.h( + (xy_pos, None), + mask=xy_attn_mask, + ) + x_len = x_lens.max() + logits = self.ar_predict_layer(xy_dec[:, x_len:]) + + ###### DPO ############# + reject_xy_pos, reject_xy_attn_mask, reject_targets = self.make_input_data(x, x_lens, reject_y, reject_y_lens, bert_feature) + + reject_xy_dec, _ = self.h( + (reject_xy_pos, None), + mask=reject_xy_attn_mask, + ) + x_len = x_lens.max() + reject_logits = self.ar_predict_layer(reject_xy_dec[:, x_len:]) + + # loss + # from feiteng: 每次 duration 越多, 梯度更新也应该更多, 所以用 sum + + loss_1 = F.cross_entropy(logits.permute(0, 2, 1), targets, 
reduction="sum") + acc = self.ar_accuracy_metric(logits.permute(0, 2, 1).detach(), targets).item() + + A_logits, R_logits = get_batch_logps(logits, reject_logits, targets, reject_targets) + loss_2, _, _ = dpo_loss(A_logits, R_logits, 0, 0, 0.2, reference_free=True) + + loss = loss_1 + loss_2 + + return loss, acc + + def forward_old(self, x, x_lens, y, y_lens, bert_feature): + """ + x: phoneme_ids + y: semantic_ids + """ + x = self.ar_text_embedding(x) + x = x + self.bert_proj(bert_feature.transpose(1, 2)) + x = self.ar_text_position(x) + x_mask = make_pad_mask(x_lens) + + y_mask = make_pad_mask(y_lens) + y_mask_int = y_mask.type(torch.int64) + codes = y.type(torch.int64) * (1 - y_mask_int) + + # Training + # AR Decoder + y, targets = self.pad_y_eos(codes, y_mask_int, eos_id=self.EOS) + x_len = x_lens.max() + y_len = y_lens.max() + y_emb = self.ar_audio_embedding(y) + y_pos = self.ar_audio_position(y_emb) + + xy_padding_mask = torch.concat([x_mask, y_mask], dim=1) + ar_xy_padding_mask = xy_padding_mask + + x_attn_mask = F.pad( + torch.zeros((x_len, x_len), dtype=torch.bool, device=x.device), + (0, y_len), + value=True, + ) + y_attn_mask = F.pad( + torch.triu( + torch.ones(y_len, y_len, dtype=torch.bool, device=x.device), + diagonal=1, + ), + (x_len, 0), + value=False, + ) + xy_attn_mask = torch.concat([x_attn_mask, y_attn_mask], dim=0) + bsz, src_len = x.shape[0], x_len + y_len + _xy_padding_mask = ( + ar_xy_padding_mask.view(bsz, 1, 1, src_len) + .expand(-1, self.num_head, -1, -1) + .reshape(bsz * self.num_head, 1, src_len) + ) + xy_attn_mask = xy_attn_mask.logical_or(_xy_padding_mask) + new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype) + new_attn_mask.masked_fill_(xy_attn_mask, float("-inf")) + xy_attn_mask = new_attn_mask + # x 和完整的 y 一次性输入模型 + xy_pos = torch.concat([x, y_pos], dim=1) + xy_dec, _ = self.h( + (xy_pos, None), + mask=xy_attn_mask, + ) + logits = self.ar_predict_layer(xy_dec[:, x_len:]).permute(0, 2, 1) + # loss + # from feiteng: 每次 duration 越多, 梯度更新也应该更多, 所以用 sum + loss = F.cross_entropy(logits, targets, reduction="sum") + acc = self.ar_accuracy_metric(logits.detach(), targets).item() + return loss, acc + + # 需要看下这个函数和 forward 的区别以及没有 semantic 的时候 prompts 输入什么 + def infer( + self, + x, + x_lens, + prompts, + bert_feature, + top_k: int = -100, + early_stop_num: int = -1, + temperature: float = 1.0, + ): + x = self.ar_text_embedding(x) + x = x + self.bert_proj(bert_feature.transpose(1, 2)) + x = self.ar_text_position(x) + + # AR Decoder + y = prompts + prefix_len = y.shape[1] + x_len = x.shape[1] + x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool) + stop = False + for _ in tqdm(range(1500)): + y_emb = self.ar_audio_embedding(y) + y_pos = self.ar_audio_position(y_emb) + # x 和逐渐增长的 y 一起输入给模型 + xy_pos = torch.concat([x, y_pos], dim=1) + y_len = y.shape[1] + x_attn_mask_pad = F.pad( + x_attn_mask, + (0, y_len), + value=True, + ) + y_attn_mask = F.pad( + torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1), + (x_len, 0), + value=False, + ) + xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to( + y.device + ) + + xy_dec, _ = self.h( + (xy_pos, None), + mask=xy_attn_mask, + ) + logits = self.ar_predict_layer(xy_dec[:, -1]) + samples = topk_sampling( + logits, top_k=top_k, top_p=1.0, temperature=temperature + ) + + if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: + print("use early stop num:", early_stop_num) + stop = True + + if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: + # 
print(torch.argmax(logits, dim=-1)[0] == self.EOS, samples[0, 0] == self.EOS) + stop = True + if stop: + if prompts.shape[1] == y.shape[1]: + y = torch.concat([y, torch.zeros_like(samples)], dim=1) + print("bad zero prediction") + print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]") + break + # 本次生成的 semantic_ids 和之前的 y 构成新的 y + # print(samples.shape)#[1,1]#第一个1是bs + # import os + # os._exit(2333) + y = torch.concat([y, samples], dim=1) + return y + + def pad_y_eos(self, y, y_mask_int, eos_id): + targets = F.pad(y, (0, 1), value=0) + eos_id * F.pad( + y_mask_int, (0, 1), value=1 + ) + # 错位 + return targets[:, :-1], targets[:, 1:] + + def infer_panel( + self, + x, #####全部文本token + x_lens, + prompts, ####参考音频token + bert_feature, + top_k: int = -100, + top_p: int = 100, + early_stop_num: int = -1, + temperature: float = 1.0, + ): + x = self.ar_text_embedding(x) + x = x + self.bert_proj(bert_feature.transpose(1, 2)) + x = self.ar_text_position(x) + + # AR Decoder + y = prompts + + x_len = x.shape[1] + x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool) + stop = False + # print(1111111,self.num_layers) + + k_cache = None + v_cache = None + ################### first step ########################## + if y is not None: + y_emb = self.ar_audio_embedding(y) + y_len = y_emb.shape[1] + prefix_len = y.shape[1] + y_pos = self.ar_audio_position(y_emb) + xy_pos = torch.concat([x, y_pos], dim=1) + ref_free = False + else: + y_emb = None + y_len = 0 + prefix_len = 0 + y_pos = None + xy_pos = x + y = torch.zeros(x.shape[0], 0, dtype=torch.int, device=x.device) + ref_free = True + + x_attn_mask_pad = F.pad( + x_attn_mask, + (0, y_len), ###xx的纯0扩展到xx纯0+xy纯1,(x,x+y) + value=True, + ) + y_attn_mask = F.pad( ###yy的右上1扩展到左边xy的0,(y,x+y) + torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1), + (x_len, 0), + value=False, + ) + xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to( + x.device + ) + + for idx in tqdm(range(1500)): + if xy_attn_mask is not None: + xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask) + else: + xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache) + + logits = self.ar_predict_layer( + xy_dec[:, -1] + ) + + if idx == 0: + xy_attn_mask = None + logits = logits[:, :-1] + samples = sample( + logits[0], y, top_k=top_k, top_p=top_p, repetition_penalty=1.35, temperature=temperature + )[0].unsqueeze(0) + + y = torch.concat([y, samples], dim=1) + + if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: + print("use early stop num:", early_stop_num) + stop = True + + if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: + stop = True + if stop: + if y.shape[1]==0: + y = torch.concat([y, torch.zeros_like(samples)], dim=1) + print("bad zero prediction") + print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]") + break + + ####################### update next step ################################### + y_emb = self.ar_audio_embedding(y[:, -1:]) + xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to(dtype=y_emb.dtype,device=y_emb.device) + + if ref_free: + return y[:, :-1], 0 + return y[:, :-1], idx - 1 diff --git a/AR/models/t2s_model_onnx.py b/AR/models/t2s_model_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..7834297d5561c87021c946cbe0671424b4a3466f --- /dev/null +++ b/AR/models/t2s_model_onnx.py @@ -0,0 +1,338 @@ +# modified from 
https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py +# reference: https://github.com/lifeiteng/vall-e +import torch +from tqdm import tqdm + +from AR.modules.embedding_onnx import SinePositionalEmbedding +from AR.modules.embedding_onnx import TokenEmbedding +from AR.modules.transformer_onnx import LayerNorm +from AR.modules.transformer_onnx import TransformerEncoder +from AR.modules.transformer_onnx import TransformerEncoderLayer +from torch import nn +from torch.nn import functional as F +from torchmetrics.classification import MulticlassAccuracy + +default_config = { + "embedding_dim": 512, + "hidden_dim": 512, + "num_head": 8, + "num_layers": 12, + "num_codebook": 8, + "p_dropout": 0.0, + "vocab_size": 1024 + 1, + "phoneme_vocab_size": 512, + "EOS": 1024, +} + +inf_tensor_value = torch.FloatTensor([-float("Inf")]).float() + +def logits_to_probs( + logits, + previous_tokens = None, + temperature: float = 1.0, + top_k = None, + top_p = None, + repetition_penalty: float = 1.0, +): + previous_tokens = previous_tokens.squeeze() + if previous_tokens is not None and repetition_penalty != 1.0: + previous_tokens = previous_tokens.long() + score = torch.gather(logits, dim=0, index=previous_tokens) + score = torch.where( + score < 0, score * repetition_penalty, score / repetition_penalty + ) + logits.scatter_(dim=0, index=previous_tokens, src=score) + + if top_p is not None and top_p < 1.0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cum_probs = torch.cumsum( + torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1 + ) + sorted_indices_to_remove = cum_probs > top_p + sorted_indices_to_remove[0] = False # keep at least one option + indices_to_remove = sorted_indices_to_remove.scatter( + dim=0, index=sorted_indices, src=sorted_indices_to_remove + ) + logits = logits.masked_fill(indices_to_remove, -float("Inf")) + + logits = logits / max(temperature, 1e-5) + + if top_k is not None: + v, _ = torch.topk(logits, top_k) + pivot = v.select(-1, -1).unsqueeze(-1) + logits = torch.where(logits < pivot, inf_tensor_value, logits) + + probs = torch.nn.functional.softmax(logits, dim=-1) + return probs + + +def multinomial_sample_one_no_sync( + probs_sort +): # Does multinomial sampling without a cuda synchronization + q = torch.randn_like(probs_sort) + return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) + + +def sample( + logits, + previous_tokens, + **sampling_kwargs, +): + probs = logits_to_probs( + logits=logits, previous_tokens=previous_tokens, **sampling_kwargs + ) + idx_next = multinomial_sample_one_no_sync(probs) + return idx_next, probs + + +class OnnxEncoder(nn.Module): + def __init__(self, ar_text_embedding, bert_proj, ar_text_position): + super().__init__() + self.ar_text_embedding = ar_text_embedding + self.bert_proj = bert_proj + self.ar_text_position = ar_text_position + + def forward(self, x, bert_feature): + x = self.ar_text_embedding(x) + x = x + self.bert_proj(bert_feature.transpose(1, 2)) + return self.ar_text_position(x) + + +class T2SFirstStageDecoder(nn.Module): + def __init__(self, ar_audio_embedding, ar_audio_position, h, ar_predict_layer, loss_fct, ar_accuracy_metric, + top_k, early_stop_num, num_layers): + super().__init__() + self.ar_audio_embedding = ar_audio_embedding + self.ar_audio_position = ar_audio_position + self.h = h + self.ar_predict_layer = ar_predict_layer + self.loss_fct = loss_fct + self.ar_accuracy_metric = ar_accuracy_metric + self.top_k = top_k + self.early_stop_num = 
early_stop_num + self.num_layers = num_layers + + def forward(self, x, prompt): + y = prompt + x_example = x[:,:,0] * 0.0 + #N, 1, 512 + cache = { + "all_stage": self.num_layers, + "k": None, + "v": None, + "y_emb": None, + "first_infer": 1, + "stage": 0, + } + + y_emb = self.ar_audio_embedding(y) + + cache["y_emb"] = y_emb + y_pos = self.ar_audio_position(y_emb) + + xy_pos = torch.concat([x, y_pos], dim=1) + + y_example = y_pos[:,:,0] * 0.0 + x_attn_mask = torch.matmul(x_example.transpose(0, 1) , x_example).bool() + y_attn_mask = torch.ones_like(torch.matmul(y_example.transpose(0, 1), y_example), dtype=torch.int64) + y_attn_mask = torch.cumsum(y_attn_mask, dim=1) - torch.cumsum( + torch.ones_like(y_example.transpose(0, 1), dtype=torch.int64), dim=0 + ) + y_attn_mask = y_attn_mask > 0 + + x_y_pad = torch.matmul(x_example.transpose(0, 1), y_example).bool() + y_x_pad = torch.matmul(y_example.transpose(0, 1), x_example).bool() + x_attn_mask_pad = torch.cat([x_attn_mask, torch.ones_like(x_y_pad)], dim=1) + y_attn_mask = torch.cat([y_x_pad, y_attn_mask], dim=1) + xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0) + cache["k"] = torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512)))\ + .unsqueeze(1).repeat(self.num_layers, 1, 1, 1) + cache["v"] = torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512)))\ + .unsqueeze(1).repeat(self.num_layers, 1, 1, 1) + + xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache) + logits = self.ar_predict_layer(xy_dec[:, -1]) + samples = sample(logits[0], y, top_k=self.top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0) + + y = torch.concat([y, samples], dim=1) + + return y, cache["k"], cache["v"], cache["y_emb"], x_example + + +class T2SStageDecoder(nn.Module): + def __init__(self, ar_audio_embedding, ar_audio_position, h, ar_predict_layer, loss_fct, ar_accuracy_metric, + top_k, early_stop_num, num_layers): + super().__init__() + self.ar_audio_embedding = ar_audio_embedding + self.ar_audio_position = ar_audio_position + self.h = h + self.ar_predict_layer = ar_predict_layer + self.loss_fct = loss_fct + self.ar_accuracy_metric = ar_accuracy_metric + self.top_k = top_k + self.early_stop_num = early_stop_num + self.num_layers = num_layers + + def forward(self, y, k, v, y_emb, x_example): + cache = { + "all_stage": self.num_layers, + "k": torch.nn.functional.pad(k, (0, 0, 0, 0, 0, 1)), + "v": torch.nn.functional.pad(v, (0, 0, 0, 0, 0, 1)), + "y_emb": y_emb, + "first_infer": 0, + "stage": 0, + } + + y_emb = torch.cat( + [cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], 1 + ) + cache["y_emb"] = y_emb + y_pos = self.ar_audio_position(y_emb) + + xy_pos = y_pos[:, -1:] + + y_example = y_pos[:,:,0] * 0.0 + + xy_attn_mask = torch.cat([x_example, y_example], dim=1) + xy_attn_mask = torch.zeros_like(xy_attn_mask, dtype=torch.bool) + + xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache) + logits = self.ar_predict_layer(xy_dec[:, -1]) + samples = sample(logits[0], y, top_k=self.top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0) + + y = torch.concat([y, samples], dim=1) + + return y, cache["k"], cache["v"], cache["y_emb"], logits, samples + + +class Text2SemanticDecoder(nn.Module): + def __init__(self, config, norm_first=False, top_k=3): + super(Text2SemanticDecoder, self).__init__() + self.model_dim = config["model"]["hidden_dim"] + self.embedding_dim = config["model"]["embedding_dim"] + self.num_head = config["model"]["head"] + self.num_layers = config["model"]["n_layer"] + self.norm_first = 
norm_first + self.vocab_size = config["model"]["vocab_size"] + self.phoneme_vocab_size = config["model"]["phoneme_vocab_size"] + self.p_dropout = float(config["model"]["dropout"]) + self.EOS = config["model"]["EOS"] + self.norm_first = norm_first + assert self.EOS == self.vocab_size - 1 + self.bert_proj = nn.Linear(1024, self.embedding_dim) + self.ar_text_embedding = TokenEmbedding(self.embedding_dim, self.phoneme_vocab_size, self.p_dropout) + self.ar_text_position = SinePositionalEmbedding(self.embedding_dim, dropout=0.1, scale=False, alpha=True) + self.ar_audio_embedding = TokenEmbedding(self.embedding_dim, self.vocab_size, self.p_dropout) + self.ar_audio_position = SinePositionalEmbedding(self.embedding_dim, dropout=0.1, scale=False, alpha=True) + self.h = TransformerEncoder( + TransformerEncoderLayer( + d_model=self.model_dim, + nhead=self.num_head, + dim_feedforward=self.model_dim * 4, + dropout=0.1, + batch_first=True, + norm_first=norm_first, + ), + num_layers=self.num_layers, + norm=LayerNorm(self.model_dim) if norm_first else None, + ) + self.ar_predict_layer = nn.Linear(self.model_dim, self.vocab_size, bias=False) + self.loss_fct = nn.CrossEntropyLoss(reduction="sum") + self.ar_accuracy_metric = MulticlassAccuracy( + self.vocab_size, + top_k=top_k, + average="micro", + multidim_average="global", + ignore_index=self.EOS, + ) + self.top_k = torch.LongTensor([1]) + self.early_stop_num = torch.LongTensor([-1]) + + def init_onnx(self): + self.onnx_encoder = OnnxEncoder(self.ar_text_embedding, self.bert_proj, self.ar_text_position) + self.first_stage_decoder = T2SFirstStageDecoder(self.ar_audio_embedding, self.ar_audio_position, self.h, + self.ar_predict_layer, self.loss_fct, self.ar_accuracy_metric, self.top_k, self.early_stop_num, + self.num_layers) + self.stage_decoder = T2SStageDecoder(self.ar_audio_embedding, self.ar_audio_position, self.h, + self.ar_predict_layer, self.loss_fct, self.ar_accuracy_metric, self.top_k, self.early_stop_num, + self.num_layers) + + def forward(self, x, prompts, bert_feature): + early_stop_num = self.early_stop_num + prefix_len = prompts.shape[1] + + x = self.onnx_encoder(x, bert_feature) + y, k, v, y_emb, stage, x_example = self.first_stage_decoder(x, prompts) + + stop = False + for idx in range(1, 1500): + enco = self.stage_decoder(y, k, v, y_emb, stage, x_example) + y, k, v, y_emb, stage, logits, samples = enco + if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: + stop = True + if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: + stop = True + if stop: + break + y[0, -1] = 0 + return y, idx + + def infer(self, x, prompts, bert_feature): + top_k = self.top_k + early_stop_num = self.early_stop_num + + x = self.onnx_encoder(x, bert_feature) + + y = prompts + prefix_len = y.shape[1] + x_len = x.shape[1] + x_example = x[:,:,0] * 0.0 + x_attn_mask = torch.matmul(x_example.transpose(0, 1), x_example) + x_attn_mask = torch.zeros_like(x_attn_mask, dtype=torch.bool) + + stop = False + cache = { + "all_stage": self.num_layers, + "k": [None] * self.num_layers, + "v": [None] * self.num_layers, + "y_emb": None, + "first_infer": 1, + "stage": 0, + } + for idx in range(1500): + if cache["first_infer"] == 1: + y_emb = self.ar_audio_embedding(y) + else: + y_emb = torch.cat( + [cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], 1 + ) + cache["y_emb"] = y_emb + y_pos = self.ar_audio_position(y_emb) + if cache["first_infer"] == 1: + xy_pos = torch.concat([x, y_pos], dim=1) + else: + xy_pos = y_pos[:, -1:] + y_len = 
y_pos.shape[1] + if cache["first_infer"] == 1: + x_attn_mask_pad = F.pad(x_attn_mask, (0, y_len), value=True) + y_attn_mask = F.pad( + torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1), + (x_len, 0), value=False + ) + xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0) + else: + xy_attn_mask = torch.zeros((1, x_len + y_len), dtype=torch.bool) + xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache) + logits = self.ar_predict_layer(xy_dec[:, -1]) + samples = sample(logits[0], y, top_k=top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0) + if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: + stop = True + if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: + stop = True + if stop: + if prompts.shape[1] == y.shape[1]: + y = torch.concat([y, torch.zeros_like(samples)], dim=1) + break + y = torch.concat([y, samples], dim=1) + cache["first_infer"] = 0 + return y, idx \ No newline at end of file diff --git a/AR/models/utils.py b/AR/models/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9678c7e13d81a6e885187c86d9d11e49cf707957 --- /dev/null +++ b/AR/models/utils.py @@ -0,0 +1,229 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/utils.py +# reference: https://github.com/lifeiteng/vall-e +import torch +import torch.nn.functional as F +from typing import Tuple + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor: + """ + Args: + lengths: + A 1-D tensor containing sentence lengths. + max_len: + The length of masks. + Returns: + Return a 2-D bool tensor, where masked positions + are filled with `True` and non-masked positions are + filled with `False`. + + #>>> lengths = torch.tensor([1, 3, 2, 5]) + #>>> make_pad_mask(lengths) + tensor([[False, True, True, True, True], + [False, False, False, True, True], + [False, False, True, True, True], + [False, False, False, False, False]]) + """ + assert lengths.ndim == 1, lengths.ndim + max_len = max(max_len, lengths.max()) + n = lengths.size(0) + seq_range = torch.arange(0, max_len, device=lengths.device) + expaned_lengths = seq_range.unsqueeze(0).expand(n, max_len) + + return expaned_lengths >= lengths.unsqueeze(-1) + + +# https://github.com/microsoft/unilm/blob/master/xtune/src/transformers/modeling_utils.py +def top_k_top_p_filtering( + logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1 +): + """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering + Args: + logits: logits distribution shape (batch size, vocabulary size) + if top_k > 0: keep only top k tokens with highest probability (top-k filtering). + if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). + Nucleus filtering is described in Holtzman et al. 
(http://arxiv.org/abs/1904.09751) + Make sure we keep at least min_tokens_to_keep per batch example in the output + From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 + """ + if top_k > 0: + top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits[indices_to_remove] = filter_value + + if top_p < 1.0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold (token with 0 are kept) + sorted_indices_to_remove = cumulative_probs > top_p + if min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter( + 1, sorted_indices, sorted_indices_to_remove + ) + logits[indices_to_remove] = filter_value + return logits + + +def topk_sampling(logits, top_k=10, top_p=1.0, temperature=1.0): + # temperature: (`optional`) float + # The value used to module the next token probabilities. Must be strictly positive. Default to 1.0. + # top_k: (`optional`) int + # The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. + # top_p: (`optional`) float + # The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. 
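+    # In short: topk_sampling() divides the logits by `temperature`, masks out (sets to -inf)
+    # tokens that fail the top-k / top-p filters via top_k_top_p_filtering(), and then draws
+    # one token index per row from the renormalized softmax with torch.multinomial.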
+ + # Temperature (higher temperature => more likely to sample low probability tokens) + if temperature != 1.0: + logits = logits / temperature + # Top-p/top-k filtering + logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p) + # Sample + token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1) + return token + + +from typing import Optional, Tuple + + +def multinomial_sample_one_no_sync( + probs_sort, +): # Does multinomial sampling without a cuda synchronization + q = torch.empty_like(probs_sort).exponential_(1) + return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) + + +def logits_to_probs( + logits, + previous_tokens: Optional[torch.Tensor] = None, + temperature: float = 1.0, + top_k: Optional[int] = None, + top_p: Optional[int] = None, + repetition_penalty: float = 1.0, +): + if previous_tokens is not None: + previous_tokens = previous_tokens.squeeze() + # print(logits.shape,previous_tokens.shape) + # pdb.set_trace() + if previous_tokens is not None and repetition_penalty != 1.0: + previous_tokens = previous_tokens.long() + score = torch.gather(logits, dim=0, index=previous_tokens) + score = torch.where( + score < 0, score * repetition_penalty, score / repetition_penalty + ) + logits.scatter_(dim=0, index=previous_tokens, src=score) + + if top_p is not None and top_p < 1.0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cum_probs = torch.cumsum( + torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1 + ) + sorted_indices_to_remove = cum_probs > top_p + sorted_indices_to_remove[0] = False # keep at least one option + indices_to_remove = sorted_indices_to_remove.scatter( + dim=0, index=sorted_indices, src=sorted_indices_to_remove + ) + logits = logits.masked_fill(indices_to_remove, -float("Inf")) + + logits = logits / max(temperature, 1e-5) + + if top_k is not None: + v, _ = torch.topk(logits, min(top_k, logits.size(-1))) + pivot = v.select(-1, -1).unsqueeze(-1) + logits = torch.where(logits < pivot, -float("Inf"), logits) + + probs = torch.nn.functional.softmax(logits, dim=-1) + return probs + + +def sample( + logits, + previous_tokens: Optional[torch.Tensor] = None, + **sampling_kwargs, +) -> Tuple[torch.Tensor, torch.Tensor]: + probs = logits_to_probs( + logits=logits, previous_tokens=previous_tokens, **sampling_kwargs + ) + idx_next = multinomial_sample_one_no_sync(probs) + return idx_next, probs + +def dpo_loss(policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + reference_chosen_logps: torch.FloatTensor, + reference_rejected_logps: torch.FloatTensor, + beta: float, + reference_free: bool = False) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + pi_logratios = policy_chosen_logps - policy_rejected_logps + ref_logratios = reference_chosen_logps - reference_rejected_logps + + if reference_free: + ref_logratios = 0 + + logits = pi_logratios - ref_logratios + + losses = -F.logsigmoid(beta * logits) + chosen_rewards = beta * (policy_chosen_logps - reference_chosen_logps).detach() + rejected_rewards = beta * (policy_rejected_logps - reference_rejected_logps).detach() + + return losses.mean(), chosen_rewards, rejected_rewards + +def get_batch_logps(logits_target: torch.FloatTensor, logits_reject: torch.FloatTensor, labels_target: torch.LongTensor, labels_reject: torch.LongTensor, average_log_prob: bool = False) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + + # dummy token; we'll ignore the losses on these tokens later + + per_token_logps_target = 
torch.gather(logits_target.log_softmax(-1), dim=2, index=labels_target.unsqueeze(2)).squeeze(2) + per_token_logps_reject = torch.gather(logits_reject.log_softmax(-1), dim=2, index=labels_reject.unsqueeze(2)).squeeze(2) + + return per_token_logps_target.sum(-1), per_token_logps_reject.sum(-1) + +def make_reject_y(y_o, y_lens): + def repeat_P(y): + range_idx, _ = torch.randint(0, len(y), size=(2,)).sort() + pre = y[:range_idx[0]] + shf = y[range_idx[1]:] + range_text = y[range_idx[0]:range_idx[1]] + new_y = torch.cat([pre, range_text, range_text, shf]) + return new_y + def lost_P(y): + range_idx, _ = torch.randint(0, len(y), size=(2,)).sort() + pre = y[:range_idx[0]] + shf = y[range_idx[1]:] + range_text = y[range_idx[0]:range_idx[1]] + new_y = torch.cat([pre, shf]) + return new_y + bs = len(y_lens) + reject_y = [] + reject_y_lens = [] + for b in range(bs): + process_item_idx = torch.randint(0, 1, size=(1, ))[0] + if process_item_idx == 0: + new_y = repeat_P(y_o[b]) + reject_y.append(new_y) + reject_y_lens.append(len(new_y)) + elif process_item_idx==1: + new_y = lost_P(y_o[b]) + reject_y.append(new_y) + reject_y_lens.append(len(new_y)) + max_length = max(reject_y_lens) + for b in range(bs): + pad_length = max_length - reject_y_lens[b] + reject_y[b] = torch.cat([reject_y[b], torch.zeros(pad_length, dtype=y_o.dtype, device=y_o.device)], dim=0) + + reject_y = torch.stack(reject_y, dim = 0) + reject_y_lens = torch.tensor(reject_y_lens, device=y_lens.device) + + return reject_y, reject_y_lens diff --git a/AR/modules/__init__.py b/AR/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/AR/modules/__pycache__/__init__.cpython-39.pyc b/AR/modules/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..978573d946e8b1b6a5967a2099738bebd7c767c3 Binary files /dev/null and b/AR/modules/__pycache__/__init__.cpython-39.pyc differ diff --git a/AR/modules/__pycache__/activation.cpython-39.pyc b/AR/modules/__pycache__/activation.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bac8d2c9e204632845b1d7deb2a581c493168503 Binary files /dev/null and b/AR/modules/__pycache__/activation.cpython-39.pyc differ diff --git a/AR/modules/__pycache__/embedding.cpython-39.pyc b/AR/modules/__pycache__/embedding.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d1d49aa7b8e996805bdb41b31ad3a50473a4e180 Binary files /dev/null and b/AR/modules/__pycache__/embedding.cpython-39.pyc differ diff --git a/AR/modules/__pycache__/lr_schedulers.cpython-39.pyc b/AR/modules/__pycache__/lr_schedulers.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ddcb6a8760f1ecc2d4781e839f261a645681264 Binary files /dev/null and b/AR/modules/__pycache__/lr_schedulers.cpython-39.pyc differ diff --git a/AR/modules/__pycache__/optim.cpython-39.pyc b/AR/modules/__pycache__/optim.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b3775b4c97b0cd799cbc1f5d8b0da7043f1bc42 Binary files /dev/null and b/AR/modules/__pycache__/optim.cpython-39.pyc differ diff --git a/AR/modules/__pycache__/patched_mha_with_cache.cpython-39.pyc b/AR/modules/__pycache__/patched_mha_with_cache.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19d3a09bda5d55d1b04b4da7b94985133853aa0f Binary files /dev/null and b/AR/modules/__pycache__/patched_mha_with_cache.cpython-39.pyc differ 
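A minimal usage sketch of the sampling helpers defined above in AR/models/utils.py: sample() applies the repetition penalty over previously generated tokens, then temperature scaling and top-k / top-p filtering via logits_to_probs(), and finally draws one token index without forcing a CUDA sync. The sizes and hyperparameter values below are illustrative placeholders, not values taken from this repository.

import torch

from AR.models.utils import sample

vocab_size = 1025                                      # hypothetical codebook size (placeholder)
logits = torch.randn(vocab_size)                       # 1-D next-token scores, as sample() expects
previous_tokens = torch.randint(0, vocab_size, (32,))  # tokens generated so far (for repetition penalty)

next_token, probs = sample(
    logits,
    previous_tokens=previous_tokens,
    top_k=15,                 # keep only the 15 highest-scoring tokens
    top_p=1.0,                # nucleus filtering is skipped when top_p == 1.0
    repetition_penalty=1.35,  # down-weight tokens already present in previous_tokens
    temperature=1.0,
)
# next_token has shape (1,); append it to the running sequence and call sample() again
# on the next step's logits, as the autoregressive decode loop above does.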
diff --git a/AR/modules/__pycache__/scaling.cpython-39.pyc b/AR/modules/__pycache__/scaling.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b445dd9114861e611a990d4c0d48c8b9f818f52e Binary files /dev/null and b/AR/modules/__pycache__/scaling.cpython-39.pyc differ diff --git a/AR/modules/__pycache__/transformer.cpython-39.pyc b/AR/modules/__pycache__/transformer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d47eb13ecac2880c4ae27b9293e98940a18a0d6 Binary files /dev/null and b/AR/modules/__pycache__/transformer.cpython-39.pyc differ diff --git a/AR/modules/activation.py b/AR/modules/activation.py new file mode 100644 index 0000000000000000000000000000000000000000..5ca888b5e557777be5a09482a84ce00967f67236 --- /dev/null +++ b/AR/modules/activation.py @@ -0,0 +1,428 @@ +# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py +from typing import Optional +from typing import Tuple +import torch +from torch import Tensor +from torch.nn import Linear +from torch.nn import Module +from torch.nn.init import constant_ +from torch.nn.init import xavier_normal_ +from torch.nn.init import xavier_uniform_ +from torch.nn.modules.linear import NonDynamicallyQuantizableLinear +from torch.nn.parameter import Parameter + +from torch.nn import functional as F +from AR.modules.patched_mha_with_cache import multi_head_attention_forward_patched + +F.multi_head_attention_forward = multi_head_attention_forward_patched + + +class MultiheadAttention(Module): + r"""Allows the model to jointly attend to information + from different representation subspaces as described in the paper: + `Attention Is All You Need `_. + + Multi-Head Attention is defined as: + + .. math:: + \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O + + where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`. + + ``forward()`` will use a special optimized implementation if all of the following + conditions are met: + + - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor. This + restriction will be loosened in the future.) + - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor argument ``requires_grad`` + - training is disabled (using ``.eval()``) + - dropout is 0 + - ``add_bias_kv`` is ``False`` + - ``add_zero_attn`` is ``False`` + - ``batch_first`` is ``True`` and the input is batched + - ``kdim`` and ``vdim`` are equal to ``embed_dim`` + - at most one of ``key_padding_mask`` or ``attn_mask`` is passed + - if a `NestedTensor `_ is passed, neither ``key_padding_mask`` + nor ``attn_mask`` is passed + + If the optimized implementation is in use, a + `NestedTensor `_ can be passed for + ``query``/``key``/``value`` to represent padding more efficiently than using a + padding mask. In this case, a `NestedTensor `_ + will be returned, and an additional speedup proportional to the fraction of the input + that is padding can be expected. + + Args: + embed_dim: Total dimension of the model. + num_heads: Number of parallel attention heads. Note that ``embed_dim`` will be split + across ``num_heads`` (i.e. each head will have dimension ``embed_dim // num_heads``). + dropout: Dropout probability on ``attn_output_weights``. Default: ``0.0`` (no dropout). + bias: If specified, adds bias to input / output projection layers. Default: ``True``. + add_bias_kv: If specified, adds bias to the key and value sequences at dim=0. Default: ``False``. 
+ add_zero_attn: If specified, adds a new batch of zeros to the key and value sequences at dim=1. + Default: ``False``. + kdim: Total number of features for keys. Default: ``None`` (uses ``kdim=embed_dim``). + vdim: Total number of features for values. Default: ``None`` (uses ``vdim=embed_dim``). + batch_first: If ``True``, then the input and output tensors are provided + as (batch, seq, feature). Default: ``False`` (seq, batch, feature). + + Examples:: + + >>> # xdoctest: +SKIP + >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) + >>> attn_output, attn_output_weights = multihead_attn(query, key, value) + + """ + __constants__ = ["batch_first"] + bias_k: Optional[torch.Tensor] + bias_v: Optional[torch.Tensor] + + def __init__( + self, + embed_dim, + num_heads, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + kdim=None, + vdim=None, + batch_first=False, + linear1_cls=Linear, + linear2_cls=Linear, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super(MultiheadAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout = dropout + self.batch_first = batch_first + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), "embed_dim must be divisible by num_heads" + + if add_bias_kv: + self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) + self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) + else: + self.bias_k = self.bias_v = None + + if linear1_cls == Linear: + if not self._qkv_same_embed_dim: + self.q_proj_weight = Parameter( + torch.empty((embed_dim, embed_dim), **factory_kwargs) + ) + self.k_proj_weight = Parameter( + torch.empty((embed_dim, self.kdim), **factory_kwargs) + ) + self.v_proj_weight = Parameter( + torch.empty((embed_dim, self.vdim), **factory_kwargs) + ) + self.register_parameter("in_proj_weight", None) + else: + self.in_proj_weight = Parameter( + torch.empty((3 * embed_dim, embed_dim), **factory_kwargs) + ) + self.register_parameter("q_proj_weight", None) + self.register_parameter("k_proj_weight", None) + self.register_parameter("v_proj_weight", None) + + if bias: + self.in_proj_bias = Parameter( + torch.empty(3 * embed_dim, **factory_kwargs) + ) + else: + self.register_parameter("in_proj_bias", None) + self.out_proj = NonDynamicallyQuantizableLinear( + embed_dim, embed_dim, bias=bias, **factory_kwargs + ) + + self._reset_parameters() + else: + if not self._qkv_same_embed_dim: + raise NotImplementedError + else: + self.in_proj_linear = linear1_cls( + embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs + ) + self.in_proj_weight = self.in_proj_linear.weight + + self.register_parameter("q_proj_weight", None) + self.register_parameter("k_proj_weight", None) + self.register_parameter("v_proj_weight", None) + + if bias: + self.in_proj_bias = self.in_proj_linear.bias + else: + self.register_parameter("in_proj_bias", None) + + self.out_proj = linear2_cls( + embed_dim, embed_dim, bias=bias, **factory_kwargs + ) + + if self.bias_k is not None: + xavier_normal_(self.bias_k) + if self.bias_v is not None: + xavier_normal_(self.bias_v) + + self.add_zero_attn = add_zero_attn + + def _reset_parameters(self): + if self._qkv_same_embed_dim: + xavier_uniform_(self.in_proj_weight) + else: + 
xavier_uniform_(self.q_proj_weight) + xavier_uniform_(self.k_proj_weight) + xavier_uniform_(self.v_proj_weight) + + if self.in_proj_bias is not None: + constant_(self.in_proj_bias, 0.0) + constant_(self.out_proj.bias, 0.0) + + if self.bias_k is not None: + xavier_normal_(self.bias_k) + if self.bias_v is not None: + xavier_normal_(self.bias_v) + + def __setstate__(self, state): + # Support loading old MultiheadAttention checkpoints generated by v1.1.0 + if "_qkv_same_embed_dim" not in state: + state["_qkv_same_embed_dim"] = True + + super(MultiheadAttention, self).__setstate__(state) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + average_attn_weights: bool = True, + cache=None, + ) -> Tuple[Tensor, Optional[Tensor]]: + r""" + Args: + query: Query embeddings of shape :math:`(L, E_q)` for unbatched input, :math:`(L, N, E_q)` when ``batch_first=False`` + or :math:`(N, L, E_q)` when ``batch_first=True``, where :math:`L` is the target sequence length, + :math:`N` is the batch size, and :math:`E_q` is the query embedding dimension ``embed_dim``. + Queries are compared against key-value pairs to produce the output. + See "Attention Is All You Need" for more details. + key: Key embeddings of shape :math:`(S, E_k)` for unbatched input, :math:`(S, N, E_k)` when ``batch_first=False`` + or :math:`(N, S, E_k)` when ``batch_first=True``, where :math:`S` is the source sequence length, + :math:`N` is the batch size, and :math:`E_k` is the key embedding dimension ``kdim``. + See "Attention Is All You Need" for more details. + value: Value embeddings of shape :math:`(S, E_v)` for unbatched input, :math:`(S, N, E_v)` when + ``batch_first=False`` or :math:`(N, S, E_v)` when ``batch_first=True``, where :math:`S` is the source + sequence length, :math:`N` is the batch size, and :math:`E_v` is the value embedding dimension ``vdim``. + See "Attention Is All You Need" for more details. + key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key`` + to ignore for the purpose of attention (i.e. treat as "padding"). For unbatched `query`, shape should be :math:`(S)`. + Binary and byte masks are supported. + For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for + the purpose of attention. For a float mask, it will be directly added to the corresponding ``key`` value. + need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``. + Default: ``True``. + attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape + :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size, + :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be + broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch. + Binary, byte, and float masks are supported. For a binary mask, a ``True`` value indicates that the + corresponding position is not allowed to attend. For a byte mask, a non-zero value indicates that the + corresponding position is not allowed to attend. For a float mask, the mask values will be added to + the attention weight. + average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across + heads. Otherwise, ``attn_weights`` are provided separately per head. 
Note that this flag only has an + effect when ``need_weights=True``. Default: ``True`` (i.e. average weights across heads) + + Outputs: + - **attn_output** - Attention outputs of shape :math:`(L, E)` when input is unbatched, + :math:`(L, N, E)` when ``batch_first=False`` or :math:`(N, L, E)` when ``batch_first=True``, + where :math:`L` is the target sequence length, :math:`N` is the batch size, and :math:`E` is the + embedding dimension ``embed_dim``. + - **attn_output_weights** - Only returned when ``need_weights=True``. If ``average_attn_weights=True``, + returns attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or + :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and + :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per + head of shape :math:`(\text{num\_heads}, L, S)` when input is unbatched or :math:`(N, \text{num\_heads}, L, S)`. + + .. note:: + `batch_first` argument is ignored for unbatched inputs. + """ + is_batched = query.dim() == 3 + if key_padding_mask is not None: + _kpm_dtype = key_padding_mask.dtype + if _kpm_dtype != torch.bool and not torch.is_floating_point( + key_padding_mask + ): + raise AssertionError( + "only bool and floating types of key_padding_mask are supported" + ) + why_not_fast_path = "" + if not is_batched: + why_not_fast_path = ( + f"input not batched; expected query.dim() of 3 but got {query.dim()}" + ) + elif query is not key or key is not value: + # When lifting this restriction, don't forget to either + # enforce that the dtypes all match or test cases where + # they don't! + why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)" + elif self.in_proj_bias is not None and query.dtype != self.in_proj_bias.dtype: + why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match" + elif ( + self.in_proj_weight is not None and query.dtype != self.in_proj_weight.dtype + ): + # this case will fail anyway, but at least they'll get a useful error message. + why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match" + elif self.training: + why_not_fast_path = "training is enabled" + elif not self.batch_first: + why_not_fast_path = "batch_first was not True" + elif self.bias_k is not None: + why_not_fast_path = "self.bias_k was not None" + elif self.bias_v is not None: + why_not_fast_path = "self.bias_v was not None" + elif self.dropout: + why_not_fast_path = f"dropout was {self.dropout}, required zero" + elif self.add_zero_attn: + why_not_fast_path = "add_zero_attn was enabled" + elif not self._qkv_same_embed_dim: + why_not_fast_path = "_qkv_same_embed_dim was not True" + elif attn_mask is not None: + why_not_fast_path = "attn_mask was not None" + elif query.is_nested and key_padding_mask is not None: + why_not_fast_path = ( + "key_padding_mask is not supported with NestedTensor input" + ) + elif self.num_heads % 2 == 1: + why_not_fast_path = "num_heads is odd" + elif torch.is_autocast_enabled(): + why_not_fast_path = "autocast is enabled" + + if not why_not_fast_path: + tensor_args = ( + query, + key, + value, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj.weight, + self.out_proj.bias, + ) + # We have to use list comprehensions below because TorchScript does not support + # generator expressions. 
+ if torch.overrides.has_torch_function(tensor_args): + why_not_fast_path = "some Tensor argument has_torch_function" + elif not all( + [ + (x is None or x.is_cuda or "cpu" in str(x.device)) + for x in tensor_args + ] + ): + why_not_fast_path = "some Tensor argument is neither CUDA nor CPU" + elif torch.is_grad_enabled() and any( + [x is not None and x.requires_grad for x in tensor_args] + ): + why_not_fast_path = ( + "grad is enabled and at least one of query or the " + "input/output projection weights or biases requires_grad" + ) + if not why_not_fast_path: + return torch._native_multi_head_attention( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj.weight, + self.out_proj.bias, + key_padding_mask if key_padding_mask is not None else attn_mask, + need_weights, + average_attn_weights, + 1 + if key_padding_mask is not None + else 0 + if attn_mask is not None + else None, + ) + + any_nested = query.is_nested or key.is_nested or value.is_nested + assert not any_nested, ( + "MultiheadAttention does not support NestedTensor outside of its fast path. " + + f"The fast path was not hit because {why_not_fast_path}" + ) + + if self.batch_first and is_batched: + # make sure that the transpose op does not affect the "is" property + if key is value: + if query is key: + query = key = value = query.transpose(1, 0) + else: + query, key = [x.transpose(1, 0) for x in (query, key)] + value = key + else: + query, key, value = [x.transpose(1, 0) for x in (query, key, value)] + + if not self._qkv_same_embed_dim: + attn_output, attn_output_weights = F.multi_head_attention_forward( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout, + self.out_proj.weight, + self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + use_separate_proj_weight=True, + q_proj_weight=self.q_proj_weight, + k_proj_weight=self.k_proj_weight, + v_proj_weight=self.v_proj_weight, + average_attn_weights=average_attn_weights, + cache=cache, + ) + else: + attn_output, attn_output_weights = F.multi_head_attention_forward( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout, + self.out_proj.weight, + self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + average_attn_weights=average_attn_weights, + cache=cache, + ) + if self.batch_first and is_batched: + return attn_output.transpose(1, 0), attn_output_weights + else: + return attn_output, attn_output_weights diff --git a/AR/modules/activation_onnx.py b/AR/modules/activation_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..b54acd999f165cb7d47a9388461e3f31164cd380 --- /dev/null +++ b/AR/modules/activation_onnx.py @@ -0,0 +1,178 @@ +# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py +from typing import Optional +from typing import Tuple +import torch +from torch import Tensor +from torch.nn import Linear +from torch.nn import Module +from torch.nn.init import constant_ +from torch.nn.init import xavier_normal_ +from torch.nn.init import xavier_uniform_ +from torch.nn.modules.linear import NonDynamicallyQuantizableLinear +from torch.nn.parameter import Parameter + 
+from torch.nn import functional as F +from AR.modules.patched_mha_with_cache_onnx import multi_head_attention_forward_patched + + +class MultiheadAttention(Module): + __constants__ = ["batch_first"] + bias_k: Optional[torch.Tensor] + bias_v: Optional[torch.Tensor] + + def __init__( + self, + embed_dim, + num_heads, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + kdim=None, + vdim=None, + batch_first=False, + linear1_cls=Linear, + linear2_cls=Linear, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super(MultiheadAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout = dropout + self.batch_first = batch_first + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), "embed_dim must be divisible by num_heads" + + if add_bias_kv: + self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) + self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) + else: + self.bias_k = self.bias_v = None + + if linear1_cls == Linear: + if not self._qkv_same_embed_dim: + self.q_proj_weight = Parameter( + torch.empty((embed_dim, embed_dim), **factory_kwargs) + ) + self.k_proj_weight = Parameter( + torch.empty((embed_dim, self.kdim), **factory_kwargs) + ) + self.v_proj_weight = Parameter( + torch.empty((embed_dim, self.vdim), **factory_kwargs) + ) + self.register_parameter("in_proj_weight", None) + else: + self.in_proj_weight = Parameter( + torch.empty((3 * embed_dim, embed_dim), **factory_kwargs) + ) + self.register_parameter("q_proj_weight", None) + self.register_parameter("k_proj_weight", None) + self.register_parameter("v_proj_weight", None) + + if bias: + self.in_proj_bias = Parameter( + torch.empty(3 * embed_dim, **factory_kwargs) + ) + else: + self.register_parameter("in_proj_bias", None) + self.out_proj = NonDynamicallyQuantizableLinear( + embed_dim, embed_dim, bias=bias, **factory_kwargs + ) + + self._reset_parameters() + else: + if not self._qkv_same_embed_dim: + raise NotImplementedError + else: + self.in_proj_linear = linear1_cls( + embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs + ) + self.in_proj_weight = self.in_proj_linear.weight + + self.register_parameter("q_proj_weight", None) + self.register_parameter("k_proj_weight", None) + self.register_parameter("v_proj_weight", None) + + if bias: + self.in_proj_bias = self.in_proj_linear.bias + else: + self.register_parameter("in_proj_bias", None) + + self.out_proj = linear2_cls( + embed_dim, embed_dim, bias=bias, **factory_kwargs + ) + + if self.bias_k is not None: + xavier_normal_(self.bias_k) + if self.bias_v is not None: + xavier_normal_(self.bias_v) + + self.add_zero_attn = add_zero_attn + + def _reset_parameters(self): + if self._qkv_same_embed_dim: + xavier_uniform_(self.in_proj_weight) + else: + xavier_uniform_(self.q_proj_weight) + xavier_uniform_(self.k_proj_weight) + xavier_uniform_(self.v_proj_weight) + + if self.in_proj_bias is not None: + constant_(self.in_proj_bias, 0.0) + constant_(self.out_proj.bias, 0.0) + + if self.bias_k is not None: + xavier_normal_(self.bias_k) + if self.bias_v is not None: + xavier_normal_(self.bias_v) + + def __setstate__(self, state): + # Support loading old MultiheadAttention checkpoints generated by v1.1.0 + if 
"_qkv_same_embed_dim" not in state: + state["_qkv_same_embed_dim"] = True + + super(MultiheadAttention, self).__setstate__(state) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + average_attn_weights: bool = True, + cache=None, + ) -> Tuple[Tensor, Optional[Tensor]]: + any_nested = query.is_nested or key.is_nested or value.is_nested + query = key = value = query.transpose(1, 0) + attn_output = multi_head_attention_forward_patched( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout, + self.out_proj.weight, + self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + average_attn_weights=average_attn_weights, + cache=cache, + ) + return attn_output.transpose(1, 0) diff --git a/AR/modules/embedding.py b/AR/modules/embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..3a382f93d8e5c87d6d52354937494703dc81523b --- /dev/null +++ b/AR/modules/embedding.py @@ -0,0 +1,81 @@ +# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py +import math + +import torch +from torch import nn + + +class TokenEmbedding(nn.Module): + def __init__( + self, + embedding_dim: int, + vocab_size: int, + dropout: float = 0.0, + ): + super().__init__() + + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + + self.dropout = torch.nn.Dropout(p=dropout) + self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) + + @property + def weight(self) -> torch.Tensor: + return self.word_embeddings.weight + + def embedding(self, index: int) -> torch.Tensor: + return self.word_embeddings.weight[index : index + 1] + + def forward(self, x: torch.Tensor): + x = self.word_embeddings(x) + x = self.dropout(x) + return x + + +class SinePositionalEmbedding(nn.Module): + def __init__( + self, + embedding_dim: int, + dropout: float = 0.0, + scale: bool = False, + alpha: bool = False, + ): + super().__init__() + self.embedding_dim = embedding_dim + self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 + self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) + self.dropout = torch.nn.Dropout(p=dropout) + + self.reverse = False + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, 4000)) + + def extend_pe(self, x): + """Reset the positional encodings.""" + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.embedding_dim) + if self.reverse: + position = torch.arange( + x.size(1) - 1, -1, -1.0, dtype=torch.float32 + ).unsqueeze(1) + else: + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) + * -(math.log(10000.0) / self.embedding_dim) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype).detach() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + self.extend_pe(x) + output = x.unsqueeze(-1) if x.ndim == 2 else x + output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)] + return self.dropout(output) 
diff --git a/AR/modules/embedding_onnx.py b/AR/modules/embedding_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..b93405b45005d6101da1a3947d3d188fc460ae4e --- /dev/null +++ b/AR/modules/embedding_onnx.py @@ -0,0 +1,63 @@ +# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py +import math + +import torch +from torch import nn + + +class TokenEmbedding(nn.Module): + def __init__( + self, + embedding_dim: int, + vocab_size: int, + dropout: float = 0.0, + ): + super().__init__() + + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + + self.dropout = torch.nn.Dropout(p=dropout) + self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) + + @property + def weight(self) -> torch.Tensor: + return self.word_embeddings.weight + + def embedding(self, index: int) -> torch.Tensor: + return self.word_embeddings.weight[index : index + 1] + + def forward(self, x: torch.Tensor): + x = self.word_embeddings(x) + x = self.dropout(x) + return x + + +class SinePositionalEmbedding(nn.Module): + def __init__( + self, + embedding_dim: int, + dropout: float = 0.0, + scale: bool = False, + alpha: bool = False, + ): + super().__init__() + self.embedding_dim = embedding_dim + self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 + self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) + self.dropout = torch.nn.Dropout(p=dropout) + self.reverse = False + self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim)) + + def extend_pe(self, x): + position = torch.cumsum(torch.ones_like(x[:,:,0]), dim=1).transpose(0, 1) + scpe = (position * self.div_term).unsqueeze(0) + pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0) + pe = pe.contiguous().view(1, -1, self.embedding_dim) + return pe + + def forward(self, x: torch.Tensor) -> torch.Tensor: + pe = self.extend_pe(x) + output = x.unsqueeze(-1) if x.ndim == 2 else x + output = output * self.x_scale + self.alpha * pe + return self.dropout(output) diff --git a/AR/modules/lr_schedulers.py b/AR/modules/lr_schedulers.py new file mode 100644 index 0000000000000000000000000000000000000000..b8867467381cd58e14ad09ac4434512bc46186fc --- /dev/null +++ b/AR/modules/lr_schedulers.py @@ -0,0 +1,83 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py +# reference: https://github.com/lifeiteng/vall-e +import math + +import torch +from matplotlib import pyplot as plt +from torch import nn +from torch.optim import Adam + + +class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): + """ + Implements Warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple optimizers. 
+ """ + + def __init__( + self, + optimizer, + init_lr, + peak_lr, + end_lr, + warmup_steps=10000, + total_steps=400000, + current_step=0, + ): + self.init_lr = init_lr + self.peak_lr = peak_lr + self.end_lr = end_lr + self.optimizer = optimizer + self._warmup_rate = (peak_lr - init_lr) / warmup_steps + self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps) + self._current_step = current_step + self.lr = init_lr + self.warmup_steps = warmup_steps + self.total_steps = total_steps + self._last_lr = [self.lr] + + def set_lr(self, lr): + self._last_lr = [g["lr"] for g in self.optimizer.param_groups] + for g in self.optimizer.param_groups: + # g['lr'] = lr + g["lr"] = self.end_lr ###锁定用线性 + + def step(self): + if self._current_step < self.warmup_steps: + lr = self.init_lr + self._warmup_rate * self._current_step + + elif self._current_step > self.total_steps: + lr = self.end_lr + + else: + decay_ratio = (self._current_step - self.warmup_steps) / ( + self.total_steps - self.warmup_steps + ) + if decay_ratio < 0.0 or decay_ratio > 1.0: + raise RuntimeError( + "Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings." + ) + coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) + lr = self.end_lr + coeff * (self.peak_lr - self.end_lr) + + self.lr = lr = self.end_lr = 0.002 ###锁定用线性###不听话,直接锁定! + self.set_lr(lr) + self.lr = lr + self._current_step += 1 + return self.lr + + +if __name__ == "__main__": + m = nn.Linear(10, 10) + opt = Adam(m.parameters(), lr=1e-4) + s = WarmupCosineLRSchedule( + opt, 1e-6, 2e-4, 1e-6, warmup_steps=2000, total_steps=20000, current_step=0 + ) + lrs = [] + for i in range(25000): + s.step() + lrs.append(s.lr) + print(s.lr) + + plt.plot(lrs) + plt.plot(range(0, 25000), lrs) + plt.show() diff --git a/AR/modules/optim.py b/AR/modules/optim.py new file mode 100644 index 0000000000000000000000000000000000000000..98785f05b2e3aa123ae22955d677b9f68efec256 --- /dev/null +++ b/AR/modules/optim.py @@ -0,0 +1,622 @@ +# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey) +# +# See ../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import contextlib +import logging +from collections import defaultdict +from typing import List +from typing import Tuple + +import torch +from torch import Tensor +from torch.optim import Optimizer + + +class BatchedOptimizer(Optimizer): + """ + This class adds to class Optimizer the capability to optimize parameters in batches: + it will stack the parameters and their grads for you so the optimizer can work + on tensors with an extra leading dimension. This is intended for speed with GPUs, + as it reduces the number of kernels launched in the optimizer. 
+ + Args: + params: + """ + + def __init__(self, params, defaults): + super(BatchedOptimizer, self).__init__(params, defaults) + + @contextlib.contextmanager + def batched_params(self, param_group, group_params_names): + """ + This function returns (technically, yields) a list of + of tuples (p, state), where + p is a `fake` parameter that is stacked (over axis 0) from real parameters + that share the same shape, and its gradient is also stacked; + `state` is the state corresponding to this batch of parameters + (it will be physically located in the "state" for one of the real + parameters, the last one that has any particular shape and dtype). + + This function is decorated as a context manager so that it can + write parameters back to their "real" locations. + + The idea is, instead of doing: + + for p in group["params"]: + state = self.state[p] + ... + + you can do: + + with self.batched_params(group["params"]) as batches: + for p, state, p_names in batches: + ... + + + Args: + group: a parameter group, which is a list of parameters; should be + one of self.param_groups. + group_params_names: name for each parameter in group, + which is List[str]. + """ + batches = defaultdict( + list + ) # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter + batches_names = defaultdict( + list + ) # `batches` maps from tuple (dtype_as_str,*shape) to list of str + + assert len(param_group) == len(group_params_names) + for p, named_p in zip(param_group, group_params_names): + key = (str(p.dtype), *p.shape) + batches[key].append(p) + batches_names[key].append(named_p) + + batches_names_keys = list(batches_names.keys()) + sorted_idx = sorted( + range(len(batches_names)), key=lambda i: batches_names_keys[i]) + batches_names = [ + batches_names[batches_names_keys[idx]] for idx in sorted_idx + ] + batches = [batches[batches_names_keys[idx]] for idx in sorted_idx] + + stacked_params_dict = dict() + + # turn batches into a list, in deterministic order. + # tuples will contain tuples of (stacked_param, state, stacked_params_names), + # one for each batch in `batches`. + tuples = [] + + for batch, batch_names in zip(batches, batches_names): + p = batch[0] + # we arbitrarily store the state in the + # state corresponding to the 1st parameter in the + # group. class Optimizer will take care of saving/loading state. + state = self.state[p] + p_stacked = torch.stack(batch) + grad = torch.stack([ + torch.zeros_like(p) if p.grad is None else p.grad for p in batch + ]) + p_stacked.grad = grad + stacked_params_dict[key] = p_stacked + tuples.append((p_stacked, state, batch_names)) + + yield tuples # <-- calling code will do the actual optimization here! + + for ((stacked_params, _state, _names), batch) in zip(tuples, batches): + for i, p in enumerate(batch): # batch is list of Parameter + p.copy_(stacked_params[i]) + + +class ScaledAdam(BatchedOptimizer): + """ + Implements 'Scaled Adam', a variant of Adam where we scale each parameter's update + proportional to the norm of that parameter; and also learn the scale of the parameter, + in log space, subject to upper and lower limits (as if we had factored each parameter as + param = underlying_param * log_scale.exp()) + + + Args: + params: The parameters or param_groups to optimize (like other Optimizer subclasses) + lr: The learning rate. We will typically use a learning rate schedule that starts + at 0.03 and decreases over time, i.e. much higher than other common + optimizers. + clipping_scale: (e.g. 
2.0) + A scale for gradient-clipping: if specified, the normalized gradients + over the whole model will be clipped to have 2-norm equal to + `clipping_scale` times the median 2-norm over the most recent period + of `clipping_update_period` minibatches. By "normalized gradients", + we mean after multiplying by the rms parameter value for this tensor + [for non-scalars]; this is appropriate because our update is scaled + by this quantity. + betas: beta1,beta2 are momentum constants for regular momentum, and moving sum-sq grad. + Must satisfy 0 < beta <= beta2 < 1. + scalar_lr_scale: A scaling factor on the learning rate, that we use to update the + scale of each parameter tensor and scalar parameters of the mode.. + If each parameter were decomposed + as p * p_scale.exp(), where (p**2).mean().sqrt() == 1.0, scalar_lr_scale + would be a the scaling factor on the learning rate of p_scale. + eps: A general-purpose epsilon to prevent division by zero + param_min_rms: Minimum root-mean-square value of parameter tensor, for purposes of + learning the scale on the parameters (we'll constrain the rms of each non-scalar + parameter tensor to be >= this value) + param_max_rms: Maximum root-mean-square value of parameter tensor, for purposes of + learning the scale on the parameters (we'll constrain the rms of each non-scalar + parameter tensor to be <= this value) + scalar_max: Maximum absolute value for scalar parameters (applicable if your + model has any parameters with numel() == 1). + size_update_period: The periodicity, in steps, with which we update the size (scale) + of the parameter tensor. This is provided to save a little time + in the update. + clipping_update_period: if clipping_scale is specified, this is the period + """ + + def __init__( + self, + params, + lr=3e-02, + clipping_scale=None, + betas=(0.9, 0.98), + scalar_lr_scale=0.1, + eps=1.0e-08, + param_min_rms=1.0e-05, + param_max_rms=3.0, + scalar_max=10.0, + size_update_period=4, + clipping_update_period=100, + parameters_names=None, + show_dominant_parameters=True, ): + + assert parameters_names is not None, ( + "Please prepare parameters_names," + "which is a List[List[str]]. Each List[str] is for a group" + "and each str is for a parameter") + defaults = dict( + lr=lr, + clipping_scale=clipping_scale, + betas=betas, + scalar_lr_scale=scalar_lr_scale, + eps=eps, + param_min_rms=param_min_rms, + param_max_rms=param_max_rms, + scalar_max=scalar_max, + size_update_period=size_update_period, + clipping_update_period=clipping_update_period, ) + + super(ScaledAdam, self).__init__(params, defaults) + assert len(self.param_groups) == len(parameters_names) + self.parameters_names = parameters_names + self.show_dominant_parameters = show_dominant_parameters + + def __setstate__(self, state): + super(ScaledAdam, self).__setstate__(state) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + batch = True + + for group, group_params_names in zip(self.param_groups, + self.parameters_names): + + with self.batched_params(group["params"], + group_params_names) as batches: + + # batches is list of pairs (stacked_param, state). stacked_param is like + # a regular parameter, and will have a .grad, but the 1st dim corresponds to + # a stacking dim, it is not a real dim. 
+ + if (len(batches[0][1]) == + 0): # if len(first state) == 0: not yet initialized + clipping_scale = 1 + else: + clipping_scale = self._get_clipping_scale(group, batches) + + for p, state, _ in batches: + # Perform optimization step. + # grad is not going to be None, we handled that when creating the batches. + grad = p.grad + if grad.is_sparse: + raise RuntimeError( + "ScaledAdam optimizer does not support sparse gradients" + ) + # State initialization + if len(state) == 0: + self._init_state(group, p, state) + + self._step_one_batch(group, p, state, clipping_scale) + + return loss + + def _init_state(self, group: dict, p: Tensor, state: dict): + """ + Initializes state dict for parameter 'p'. Assumes that dim 0 of tensor p + is actually the batch dimension, corresponding to batched-together + parameters of a given shape. + + + Args: + group: Dict to look up configuration values. + p: The parameter that we are initializing the state for + state: Dict from string to whatever state we are initializing + """ + size_update_period = group["size_update_period"] + + state["step"] = 0 + + kwargs = {"device": p.device, "dtype": p.dtype} + + # 'delta' implements conventional momentum. There are + # several different kinds of update going on, so rather than + # compute "exp_avg" like in Adam, we store and decay a + # parameter-change "delta", which combines all forms of + # update. this is equivalent to how it's done in Adam, + # except for the first few steps. + state["delta"] = torch.zeros_like( + p, memory_format=torch.preserve_format) + + batch_size = p.shape[0] + numel = p.numel() // batch_size + numel = p.numel() + + if numel > 1: + # "param_rms" just periodically records the scalar root-mean-square value of + # the parameter tensor. + # it has a shape like (batch_size, 1, 1, 1, 1) + param_rms = ( + (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()) + state["param_rms"] = param_rms + + state["scale_exp_avg_sq"] = torch.zeros_like(param_rms) + state["scale_grads"] = torch.zeros(size_update_period, + *param_rms.shape, **kwargs) + + # exp_avg_sq is the weighted sum of scaled gradients. as in Adam. + state["exp_avg_sq"] = torch.zeros_like( + p, memory_format=torch.preserve_format) + + def _get_clipping_scale(self, + group: dict, + tuples: List[Tuple[Tensor, dict, List[str]]] + ) -> float: + """ + Returns a scalar factor <= 1.0 that dictates gradient clipping, i.e. we will scale the gradients + by this amount before applying the rest of the update. + + Args: + group: the parameter group, an item in self.param_groups + tuples: a list of tuples of (param, state, param_names) + where param is a batched set of parameters, + with a .grad (1st dim is batch dim) + and state is the state-dict where optimization parameters are kept. + param_names is a List[str] while each str is name for a parameter + in batched set of parameters "param". + """ + assert len(tuples) >= 1 + clipping_scale = group["clipping_scale"] + (first_p, first_state, _) = tuples[0] + step = first_state["step"] + if clipping_scale is None or step == 0: + # no clipping. return early on step == 0 because the other + # parameters' state won't have been initialized yet. 
+ return 1.0 + clipping_update_period = group["clipping_update_period"] + + tot_sumsq = torch.tensor(0.0, device=first_p.device) + for (p, state, param_names) in tuples: + grad = p.grad + if grad.is_sparse: + raise RuntimeError( + "ScaledAdam optimizer does not support sparse gradients") + if p.numel() == p.shape[0]: # a batch of scalars + tot_sumsq += (grad**2).sum() # sum() to change shape [1] to [] + else: + tot_sumsq += ((grad * state["param_rms"])**2).sum() + + tot_norm = tot_sumsq.sqrt() + if "model_norms" not in first_state: + first_state["model_norms"] = torch.zeros( + clipping_update_period, device=p.device) + first_state["model_norms"][step % clipping_update_period] = tot_norm + + if step % clipping_update_period == 0: + # Print some stats. + # We don't reach here if step == 0 because we would have returned + # above. + sorted_norms = first_state["model_norms"].sort()[0].to("cpu") + quartiles = [] + for n in range(0, 5): + index = min( + clipping_update_period - 1, + (clipping_update_period // 4) * n, ) + quartiles.append(sorted_norms[index].item()) + + median = quartiles[2] + threshold = clipping_scale * median + first_state["model_norm_threshold"] = threshold + percent_clipped = (first_state["num_clipped"] * 100.0 / + clipping_update_period + if "num_clipped" in first_state else 0.0) + first_state["num_clipped"] = 0 + quartiles = " ".join(["%.3e" % x for x in quartiles]) + logging.info( + f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, " + f"threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}" + ) + + if step < clipping_update_period: + return 1.0 # We have not yet estimated a norm to clip to. + else: + try: + model_norm_threshold = first_state["model_norm_threshold"] + except KeyError: + logging.info( + "Warning: model_norm_threshold not in state: possibly " + "you changed config when restarting, adding clipping_scale option?" + ) + return 1.0 + ans = min(1.0, (model_norm_threshold / (tot_norm + 1.0e-20)).item()) + if ans < 1.0: + first_state["num_clipped"] += 1 + if ans < 0.1: + logging.warn( + f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}" + ) + if self.show_dominant_parameters: + assert p.shape[0] == len(param_names) + self._show_gradient_dominating_parameter(tuples, tot_sumsq) + return ans + + def _show_gradient_dominating_parameter( + self, tuples: List[Tuple[Tensor, dict, List[str]]], + tot_sumsq: Tensor): + """ + Show information of parameter wihch dominanting tot_sumsq. + + Args: + tuples: a list of tuples of (param, state, param_names) + where param is a batched set of parameters, + with a .grad (1st dim is batch dim) + and state is the state-dict where optimization parameters are kept. + param_names is a List[str] while each str is name for a parameter + in batched set of parameters "param". + tot_sumsq: sumsq of all parameters. Though it's could be calculated + from tuples, we still pass it to save some time. + """ + all_sumsq_orig = {} + for (p, state, batch_param_names) in tuples: + # p is a stacked batch parameters. + batch_grad = p.grad + if p.numel() == p.shape[0]: # a batch of scalars + batch_sumsq_orig = batch_grad**2 + # Dummpy values used by following `zip` statement. 
+ batch_rms_orig = torch.ones(p.shape[0]) + else: + batch_rms_orig = state["param_rms"] + batch_sumsq_orig = ((batch_grad * batch_rms_orig)**2).sum( + dim=list(range(1, batch_grad.ndim))) + + for name, sumsq_orig, rms, grad in zip(batch_param_names, + batch_sumsq_orig, + batch_rms_orig, batch_grad): + + proportion_orig = sumsq_orig / tot_sumsq + all_sumsq_orig[name] = (proportion_orig, sumsq_orig, rms, grad) + + assert torch.isclose( + sum([value[0] for value in all_sumsq_orig.values()]).cpu(), + torch.tensor(1.0), ) + sorted_by_proportion = { + k: v + for k, v in sorted( + all_sumsq_orig.items(), + key=lambda item: item[1][0], + reverse=True, ) + } + dominant_param_name = next(iter(sorted_by_proportion)) + (dominant_proportion, dominant_sumsq, dominant_rms, + dominant_grad, ) = sorted_by_proportion[dominant_param_name] + logging.info(f"Parameter Dominanting tot_sumsq {dominant_param_name}" + f" with proportion {dominant_proportion:.2f}," + f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)" + f"={dominant_sumsq:.3e}," + f" grad_sumsq = {(dominant_grad**2).sum():.3e}," + f" orig_rms_sq={(dominant_rms**2).item():.3e}") + + def _step_one_batch(self, + group: dict, + p: Tensor, + state: dict, + clipping_scale: float): + """ + Do the step for one parameter, which is actually going to be a batch of + `real` parameters, with dim 0 as the batch dim. + Args: + group: dict to look up configuration values + p: parameter to update (actually multiple parameters stacked together + as a batch) + state: state-dict for p, to look up the optimizer state + """ + lr = group["lr"] + size_update_period = group["size_update_period"] + beta1 = group["betas"][0] + + grad = p.grad + if clipping_scale != 1.0: + grad = grad * clipping_scale + step = state["step"] + delta = state["delta"] + + delta.mul_(beta1) + batch_size = p.shape[0] + numel = p.numel() // batch_size + if numel > 1: + # Update the size/scale of p, and set param_rms + scale_grads = state["scale_grads"] + scale_grads[step % size_update_period] = (p * grad).sum( + dim=list(range(1, p.ndim)), keepdim=True) + if step % size_update_period == size_update_period - 1: + param_rms = state["param_rms"] # shape: (batch_size, 1, 1, ..) + param_rms.copy_((p**2) + .mean(dim=list(range(1, p.ndim)), keepdim=True) + .sqrt()) + if step > 0: + # self._size_update() learns the overall scale on the + # parameter, by shrinking or expanding it. + self._size_update(group, scale_grads, p, state) + + if numel == 1: + # For parameters with 1 element we just use regular Adam. + # Updates delta. + self._step_scalar(group, p, state) + else: + self._step(group, p, state) + + state["step"] = step + 1 + + def _size_update(self, + group: dict, + scale_grads: Tensor, + p: Tensor, + state: dict) -> None: + """ + Called only where p.numel() > 1, this updates the scale of the parameter. + If we imagine: p = underlying_param * scale.exp(), and we are doing + gradient descent on underlying param and on scale, this function does the update + on `scale`. + + Args: + group: dict to look up configuration values + scale_grads: a tensor of shape (size_update_period, batch_size, 1, 1,...) containing + grads w.r.t. the scales. 
+ p: The parameter to update + state: The state-dict of p + """ + + param_rms = state["param_rms"] + beta1, beta2 = group["betas"] + size_lr = group["lr"] * group["scalar_lr_scale"] + param_min_rms = group["param_min_rms"] + param_max_rms = group["param_max_rms"] + eps = group["eps"] + step = state["step"] + batch_size = p.shape[0] + + size_update_period = scale_grads.shape[0] + # correct beta2 for the size update period: we will have + # faster decay at this level. + beta2_corr = beta2**size_update_period + + scale_exp_avg_sq = state[ + "scale_exp_avg_sq"] # shape: (batch_size, 1, 1, ..) + scale_exp_avg_sq.mul_(beta2_corr).add_( + (scale_grads**2).mean(dim=0), # mean over dim `size_update_period` + alpha=1 - beta2_corr, ) # shape is (batch_size, 1, 1, ...) + + # The 1st time we reach here is when size_step == 1. + size_step = (step + 1) // size_update_period + bias_correction2 = 1 - beta2_corr**size_step + # we don't bother with bias_correction1; this will help prevent divergence + # at the start of training. + + denom = scale_exp_avg_sq.sqrt() + eps + + scale_step = (-size_lr * (bias_correction2**0.5) * + scale_grads.sum(dim=0) / denom) + + is_too_small = param_rms < param_min_rms + is_too_large = param_rms > param_max_rms + + # when the param gets too small, just don't shrink it any further. + scale_step.masked_fill_(is_too_small, 0.0) + # when it gets too large, stop it from getting any larger. + scale_step.masked_fill_(is_too_large, -size_lr * size_update_period) + delta = state["delta"] + # the factor of (1-beta1) relates to momentum. + delta.add_(p * scale_step, alpha=(1 - beta1)) + + def _step(self, group: dict, p: Tensor, state: dict): + """ + This function does the core update of self.step(), in the case where the members of + the batch have more than 1 element. + + Args: + group: A dict which will be used to look up configuration values + p: The parameter to be updated + grad: The grad of p + state: The state-dict corresponding to parameter p + + This function modifies p. + """ + grad = p.grad + lr = group["lr"] + beta1, beta2 = group["betas"] + eps = group["eps"] + param_min_rms = group["param_min_rms"] + step = state["step"] + + exp_avg_sq = state["exp_avg_sq"] + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2)) + + this_step = state["step"] - (state["zero_step"] + if "zero_step" in state else 0) + bias_correction2 = 1 - beta2**(this_step + 1) + if bias_correction2 < 0.99: + # note: not in-place. + exp_avg_sq = exp_avg_sq * (1.0 / bias_correction2) + + denom = exp_avg_sq.sqrt() + denom += eps + grad = grad / denom + + alpha = -lr * (1 - beta1) * state["param_rms"].clamp(min=param_min_rms) + + delta = state["delta"] + delta.add_(grad * alpha) + p.add_(delta) + + def _step_scalar(self, group: dict, p: Tensor, state: dict): + """ + A simplified form of the core update for scalar tensors, where we cannot get a good + estimate of the parameter rms. + """ + beta1, beta2 = group["betas"] + scalar_max = group["scalar_max"] + eps = group["eps"] + lr = group["lr"] * group["scalar_lr_scale"] + grad = p.grad + + exp_avg_sq = state["exp_avg_sq"] # shape: (batch_size,) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + + # bias_correction2 is like in Adam. Don't bother with bias_correction1; + # slower update at the start will help stability anyway. 
+ bias_correction2 = 1 - beta2**(state["step"] + 1) + denom = (exp_avg_sq / bias_correction2).sqrt() + eps + + delta = state["delta"] + delta.add_(grad / denom, alpha=-lr * (1 - beta1)) + p.clamp_(min=-scalar_max, max=scalar_max) + p.add_(delta) diff --git a/AR/modules/patched_mha_with_cache.py b/AR/modules/patched_mha_with_cache.py new file mode 100644 index 0000000000000000000000000000000000000000..7be241dadd378fc9312916f60433ba4b7aa7c764 --- /dev/null +++ b/AR/modules/patched_mha_with_cache.py @@ -0,0 +1,465 @@ +from torch.nn.functional import * +from torch.nn.functional import ( + _mha_shape_check, + _canonical_mask, + _none_or_dtype, + _in_projection_packed, +) +from torch.nn import functional as F +import torch +# Tensor = torch.Tensor +# from typing import Callable, List, Optional, Tuple, Union + + +def multi_head_attention_forward_patched( + query: Tensor, + key: Tensor, + value: Tensor, + embed_dim_to_check: int, + num_heads: int, + in_proj_weight: Optional[Tensor], + in_proj_bias: Optional[Tensor], + bias_k: Optional[Tensor], + bias_v: Optional[Tensor], + add_zero_attn: bool, + dropout_p: float, + out_proj_weight: Tensor, + out_proj_bias: Optional[Tensor], + training: bool = True, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + use_separate_proj_weight: bool = False, + q_proj_weight: Optional[Tensor] = None, + k_proj_weight: Optional[Tensor] = None, + v_proj_weight: Optional[Tensor] = None, + static_k: Optional[Tensor] = None, + static_v: Optional[Tensor] = None, + average_attn_weights: bool = True, + is_causal: bool = False, + cache=None, +) -> Tuple[Tensor, Optional[Tensor]]: + r""" + Args: + query, key, value: map a query and a set of key-value pairs to an output. + See "Attention Is All You Need" for more details. + embed_dim_to_check: total dimension of the model. + num_heads: parallel attention heads. + in_proj_weight, in_proj_bias: input projection weight and bias. + bias_k, bias_v: bias of the key and value sequences to be added at dim=0. + add_zero_attn: add a new batch of zeros to the key and + value sequences at dim=1. + dropout_p: probability of an element to be zeroed. + out_proj_weight, out_proj_bias: the output projection weight and bias. + training: apply dropout if is ``True``. + key_padding_mask: if provided, specified padding elements in the key will + be ignored by the attention. This is an binary mask. When the value is True, + the corresponding value on the attention layer will be filled with -inf. + need_weights: output attn_output_weights. + Default: `True` + Note: `needs_weight` defaults to `True`, but should be set to `False` + For best performance when attention weights are not nedeeded. + *Setting needs_weights to `True` + leads to a significant performance degradation.* + attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all + the batches while a 3D mask allows to specify a different mask for the entries of each batch. + is_causal: If specified, applies a causal mask as attention mask, and ignores + attn_mask for computing scaled dot product attention. + Default: ``False``. + .. warning:: + is_causal is provides a hint that the attn_mask is the + causal mask.Providing incorrect hints can result in + incorrect execution, including forward and backward + compatibility. + use_separate_proj_weight: the function accept the proj. weights for query, key, + and value in different forms. 
If false, in_proj_weight will be used, which is + a combination of q_proj_weight, k_proj_weight, v_proj_weight. + q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias. + static_k, static_v: static key and value used for attention operators. + average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across heads. + Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an effect + when ``need_weights=True.``. Default: True + + + Shape: + Inputs: + - query: :math:`(L, E)` or :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is + the embedding dimension. + - key: :math:`(S, E)` or :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is + the embedding dimension. + - value: :math:`(S, E)` or :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is + the embedding dimension. + - key_padding_mask: :math:`(S)` or :math:`(N, S)` where N is the batch size, S is the source sequence length. + If a FloatTensor is provided, it will be directly added to the value. + If a BoolTensor is provided, the positions with the + value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. + - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length. + 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length, + S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked + positions. If a BoolTensor is provided, positions with ``True`` + are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor + is provided, it will be added to the attention weight. + - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length, + N is the batch size, E is the embedding dimension. E/num_heads is the head dimension. + - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length, + N is the batch size, E is the embedding dimension. E/num_heads is the head dimension. + + Outputs: + - attn_output: :math:`(L, E)` or :math:`(L, N, E)` where L is the target sequence length, N is the batch size, + E is the embedding dimension. + - attn_output_weights: Only returned when ``need_weights=True``. If ``average_attn_weights=True``, returns + attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or + :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and + :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per + head of shape :math:`(num_heads, L, S)` when input is unbatched or :math:`(N, num_heads, L, S)`. 
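+ - cache: optional dict used by this patched forward for incremental decoding; the
+ function expects per-layer "k"/"v" entries plus "stage", "all_stage" and
+ "first_infer", and updates them in place with the concatenated keys/values.
+ Examples::
+ >>> # hypothetical cache layout, inferred from how this function indexes it
+ >>> n_layers = 2 # illustrative only
+ >>> cache = {"all_stage": n_layers, "stage": 0, "first_infer": 1,
+ ... "k": [None] * n_layers, "v": [None] * n_layers}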
+ """ + tens_ops = ( + query, + key, + value, + in_proj_weight, + in_proj_bias, + bias_k, + bias_v, + out_proj_weight, + out_proj_bias, + ) + if has_torch_function(tens_ops): + return handle_torch_function( + multi_head_attention_forward, + tens_ops, + query, + key, + value, + embed_dim_to_check, + num_heads, + in_proj_weight, + in_proj_bias, + bias_k, + bias_v, + add_zero_attn, + dropout_p, + out_proj_weight, + out_proj_bias, + training=training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + is_causal=is_causal, + use_separate_proj_weight=use_separate_proj_weight, + q_proj_weight=q_proj_weight, + k_proj_weight=k_proj_weight, + v_proj_weight=v_proj_weight, + static_k=static_k, + static_v=static_v, + average_attn_weights=average_attn_weights, + cache=cache, + ) + + is_batched = _mha_shape_check( + query, key, value, key_padding_mask, attn_mask, num_heads + ) + + # For unbatched input, we unsqueeze at the expected batch-dim to pretend that the input + # is batched, run the computation and before returning squeeze the + # batch dimension so that the output doesn't carry this temporary batch dimension. + if not is_batched: + # unsqueeze if the input is unbatched + query = query.unsqueeze(1) + key = key.unsqueeze(1) + value = value.unsqueeze(1) + if key_padding_mask is not None: + key_padding_mask = key_padding_mask.unsqueeze(0) + + # set up shape vars + tgt_len, bsz, embed_dim = query.shape + src_len, _, _ = key.shape + + key_padding_mask = _canonical_mask( + mask=key_padding_mask, + mask_name="key_padding_mask", + other_type=_none_or_dtype(attn_mask), + other_name="attn_mask", + target_type=query.dtype, + ) + + if is_causal and attn_mask is None: + raise RuntimeError( + "Need attn_mask if specifying the is_causal hint. " + "You may use the Transformer module method " + "`generate_square_subsequent_mask` to create this mask." + ) + + if is_causal and key_padding_mask is None and not need_weights: + # when we have a kpm or need weights, we need attn_mask + # Otherwise, we use the is_causal hint go as is_causal + # indicator to SDPA. + attn_mask = None + else: + attn_mask = _canonical_mask( + mask=attn_mask, + mask_name="attn_mask", + other_type=None, + other_name="", + target_type=query.dtype, + check_other=False, + ) + + if key_padding_mask is not None: + # We have the attn_mask, and use that to merge kpm into it. + # Turn off use of is_causal hint, as the merged mask is no + # longer causal. 
+ is_causal = False + + assert ( + embed_dim == embed_dim_to_check + ), f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}" + if isinstance(embed_dim, torch.Tensor): + # embed_dim can be a tensor when JIT tracing + head_dim = embed_dim.div(num_heads, rounding_mode="trunc") + else: + head_dim = embed_dim // num_heads + assert ( + head_dim * num_heads == embed_dim + ), f"embed_dim {embed_dim} not divisible by num_heads {num_heads}" + if use_separate_proj_weight: + # allow MHA to have different embedding dimensions when separate projection weights are used + assert ( + key.shape[:2] == value.shape[:2] + ), f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}" + else: + assert ( + key.shape == value.shape + ), f"key shape {key.shape} does not match value shape {value.shape}" + + # + # compute in-projection + # + if not use_separate_proj_weight: + assert ( + in_proj_weight is not None + ), "use_separate_proj_weight is False but in_proj_weight is None" + q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias) + else: + assert ( + q_proj_weight is not None + ), "use_separate_proj_weight is True but q_proj_weight is None" + assert ( + k_proj_weight is not None + ), "use_separate_proj_weight is True but k_proj_weight is None" + assert ( + v_proj_weight is not None + ), "use_separate_proj_weight is True but v_proj_weight is None" + if in_proj_bias is None: + b_q = b_k = b_v = None + else: + b_q, b_k, b_v = in_proj_bias.chunk(3) + q, k, v = _in_projection( + query, + key, + value, + q_proj_weight, + k_proj_weight, + v_proj_weight, + b_q, + b_k, + b_v, + ) + if cache != None: + if cache["first_infer"] == 1: + cache["k"][cache["stage"]] = k + # print(0,cache["k"].shape) + cache["v"][cache["stage"]] = v + else: ###12个layer每个都要留自己的cache_kv + # print(1,cache["k"].shape) + cache["k"][cache["stage"]] = torch.cat( + [cache["k"][cache["stage"]], k], 0 + ) ##本来时序是1,但是proj的时候可能transpose了所以时序到0维了 + cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]], v], 0) + # print(2, cache["k"].shape) + src_len = cache["k"][cache["stage"]].shape[0] + k = cache["k"][cache["stage"]] + v = cache["v"][cache["stage"]] + # if attn_mask is not None: + # attn_mask=attn_mask[-1:,] + # print(attn_mask.shape,attn_mask) + cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] + # print(2333,cache) + # prep attention mask + + attn_mask = _canonical_mask( + mask=attn_mask, + mask_name="attn_mask", + other_type=None, + other_name="", + target_type=q.dtype, + check_other=False, + ) + + if attn_mask is not None: + # ensure attn_mask's dim is 3 + if attn_mask.dim() == 2: + correct_2d_size = (tgt_len, src_len) + if attn_mask.shape != correct_2d_size: + raise RuntimeError( + f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}." + ) + attn_mask = attn_mask.unsqueeze(0) + elif attn_mask.dim() == 3: + correct_3d_size = (bsz * num_heads, tgt_len, src_len) + if attn_mask.shape != correct_3d_size: + raise RuntimeError( + f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}." + ) + else: + raise RuntimeError( + f"attn_mask's dimension {attn_mask.dim()} is not supported" + ) + + # add bias along batch dimension (currently second) + if bias_k is not None and bias_v is not None: + assert static_k is None, "bias cannot be added to static key." + assert static_v is None, "bias cannot be added to static value." 
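+ # bias_k / bias_v are single learned positions of shape (1, 1, embed_dim);
+ # repeating them over the batch appends one extra key/value timestep, which is
+ # why the masks are padded by one column just below.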
+ k = torch.cat([k, bias_k.repeat(1, bsz, 1)]) + v = torch.cat([v, bias_v.repeat(1, bsz, 1)]) + if attn_mask is not None: + attn_mask = pad(attn_mask, (0, 1)) + if key_padding_mask is not None: + key_padding_mask = pad(key_padding_mask, (0, 1)) + else: + assert bias_k is None + assert bias_v is None + + # + # reshape q, k, v for multihead attention and make em batch first + # + q = q.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1) + if static_k is None: + k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1) + else: + # TODO finish disentangling control flow so we don't do in-projections when statics are passed + assert ( + static_k.size(0) == bsz * num_heads + ), f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}" + assert ( + static_k.size(2) == head_dim + ), f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}" + k = static_k + if static_v is None: + v = v.view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1) + else: + # TODO finish disentangling control flow so we don't do in-projections when statics are passed + assert ( + static_v.size(0) == bsz * num_heads + ), f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}" + assert ( + static_v.size(2) == head_dim + ), f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}" + v = static_v + + # add zero attention along batch dimension (now first) + if add_zero_attn: + zero_attn_shape = (bsz * num_heads, 1, head_dim) + k = torch.cat( + [k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1 + ) + v = torch.cat( + [v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1 + ) + if attn_mask is not None: + attn_mask = pad(attn_mask, (0, 1)) + if key_padding_mask is not None: + key_padding_mask = pad(key_padding_mask, (0, 1)) + + # update source sequence length after adjustments + src_len = k.size(1) + + # merge key padding and attention masks + if key_padding_mask is not None: + assert key_padding_mask.shape == ( + bsz, + src_len, + ), f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}" + key_padding_mask = ( + key_padding_mask.view(bsz, 1, 1, src_len) + .expand(-1, num_heads, -1, -1) + .reshape(bsz * num_heads, 1, src_len) + ) + if attn_mask is None: + attn_mask = key_padding_mask + else: + attn_mask = attn_mask + key_padding_mask + + # adjust dropout probability + if not training: + dropout_p = 0.0 + + # + # (deep breath) calculate attention and out projection + # + + if need_weights: + B, Nt, E = q.shape + q_scaled = q / math.sqrt(E) + + assert not ( + is_causal and attn_mask is None + ), "FIXME: is_causal not implemented for need_weights" + + if attn_mask is not None: + attn_output_weights = torch.baddbmm( + attn_mask, q_scaled, k.transpose(-2, -1) + ) + else: + attn_output_weights = torch.bmm(q_scaled, k.transpose(-2, -1)) + attn_output_weights = softmax(attn_output_weights, dim=-1) + if dropout_p > 0.0: + attn_output_weights = dropout(attn_output_weights, p=dropout_p) + + attn_output = torch.bmm(attn_output_weights, v) + + attn_output = ( + attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim) + ) + attn_output = linear(attn_output, out_proj_weight, out_proj_bias) + attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1)) + + # optionally average attention weights over heads + attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len) + if average_attn_weights: + attn_output_weights = 
attn_output_weights.mean(dim=1) + + if not is_batched: + # squeeze the output if input was unbatched + attn_output = attn_output.squeeze(1) + attn_output_weights = attn_output_weights.squeeze(0) + return attn_output, attn_output_weights + else: + # attn_mask can be either (L,S) or (N*num_heads, L, S) + # if attn_mask's shape is (1, L, S) we need to unsqueeze to (1, 1, L, S) + # in order to match the input for SDPA of (N, num_heads, L, S) + if attn_mask is not None: + if attn_mask.size(0) == 1 and attn_mask.dim() == 3: + attn_mask = attn_mask.unsqueeze(0) + else: + attn_mask = attn_mask.view(bsz, num_heads, -1, src_len) + + q = q.view(bsz, num_heads, tgt_len, head_dim) + k = k.view(bsz, num_heads, src_len, head_dim) + v = v.view(bsz, num_heads, src_len, head_dim) + + # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): + attn_output = scaled_dot_product_attention( + q, k, v, attn_mask, dropout_p, is_causal + ) + + attn_output = ( + attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim) + ) + + attn_output = linear(attn_output, out_proj_weight, out_proj_bias) + attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1)) + if not is_batched: + # squeeze the output if input was unbatched + attn_output = attn_output.squeeze(1) + return attn_output, None diff --git a/AR/modules/patched_mha_with_cache_onnx.py b/AR/modules/patched_mha_with_cache_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..14bdb550a09a2f1dac610ea653848689e6443b4d --- /dev/null +++ b/AR/modules/patched_mha_with_cache_onnx.py @@ -0,0 +1,92 @@ +from torch.nn.functional import * +from torch.nn.functional import ( + _mha_shape_check, + _canonical_mask, + _none_or_dtype, + _in_projection_packed, +) + +def multi_head_attention_forward_patched( + query, + key, + value, + embed_dim_to_check: int, + num_heads: int, + in_proj_weight, + in_proj_bias: Optional[Tensor], + bias_k: Optional[Tensor], + bias_v: Optional[Tensor], + add_zero_attn: bool, + dropout_p: float, + out_proj_weight: Tensor, + out_proj_bias: Optional[Tensor], + training: bool = True, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + use_separate_proj_weight: bool = False, + q_proj_weight: Optional[Tensor] = None, + k_proj_weight: Optional[Tensor] = None, + v_proj_weight: Optional[Tensor] = None, + static_k: Optional[Tensor] = None, + static_v: Optional[Tensor] = None, + average_attn_weights: bool = True, + is_causal: bool = False, + cache=None, +) -> Tuple[Tensor, Optional[Tensor]]: + + # set up shape vars + _, _, embed_dim = query.shape + attn_mask = _canonical_mask( + mask=attn_mask, + mask_name="attn_mask", + other_type=None, + other_name="", + target_type=query.dtype, + check_other=False, + ) + head_dim = embed_dim // num_heads + + proj_qkv = linear(query, in_proj_weight, in_proj_bias) + proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous() + q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2] + + if cache["first_infer"] == 1: + cache["k"][cache["stage"]] = k + cache["v"][cache["stage"]] = v + else: + cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0) + cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0) + k = cache["k"][cache["stage"]] + v = cache["v"][cache["stage"]] + cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] + + attn_mask = _canonical_mask( + mask=attn_mask, + 
mask_name="attn_mask", + other_type=None, + other_name="", + target_type=q.dtype, + check_other=False, + ) + attn_mask = attn_mask.unsqueeze(0) + + q = q.view(-1, num_heads, head_dim).transpose(0, 1) + k = k.view(-1, num_heads, head_dim).transpose(0, 1) + v = v.view(-1, num_heads, head_dim).transpose(0, 1) + + dropout_p = 0.0 + attn_mask = attn_mask.unsqueeze(0) + q = q.view(num_heads, -1, head_dim).unsqueeze(0) + k = k.view(num_heads, -1, head_dim).unsqueeze(0) + v = v.view(num_heads, -1, head_dim).unsqueeze(0) + attn_output = scaled_dot_product_attention( + q, k, v, attn_mask, dropout_p, is_causal + ) + attn_output = ( + attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim) + ) + attn_output = linear(attn_output, out_proj_weight, out_proj_bias) + attn_output = attn_output.view(-1, 1, attn_output.size(1)) + + return attn_output diff --git a/AR/modules/scaling.py b/AR/modules/scaling.py new file mode 100644 index 0000000000000000000000000000000000000000..9256a8cbf342b6a259c48fb8821fed0492c649fd --- /dev/null +++ b/AR/modules/scaling.py @@ -0,0 +1,335 @@ +# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import math +import random +from typing import Optional +from typing import Tuple +from typing import Union + +import torch +import torch.nn as nn +from torch import Tensor + + +class DoubleSwishFunction(torch.autograd.Function): + """ + double_swish(x) = x * torch.sigmoid(x-1) + This is a definition, originally motivated by its close numerical + similarity to swish(swish(x)), where swish(x) = x * sigmoid(x). + + Memory-efficient derivative computation: + double_swish(x) = x * s, where s(x) = torch.sigmoid(x-1) + double_swish'(x) = d/dx double_swish(x) = x * s'(x) + x' * s(x) = x * s'(x) + s(x). + Now, s'(x) = s(x) * (1-s(x)). + double_swish'(x) = x * s'(x) + s(x). + = x * s(x) * (1-s(x)) + s(x). + = double_swish(x) * (1-s(x)) + s(x) + ... so we just need to remember s(x) but not x itself. + """ + + @staticmethod + def forward(ctx, x: Tensor) -> Tensor: + requires_grad = x.requires_grad + x_dtype = x.dtype + if x.dtype == torch.float16: + x = x.to(torch.float32) + + s = torch.sigmoid(x - 1.0) + y = x * s + + if requires_grad: + deriv = y * (1 - s) + s + # notes on derivative of x * sigmoid(x - 1): + # https://www.wolframalpha.com/input?i=d%2Fdx+%28x+*+sigmoid%28x-1%29%29 + # min \simeq -0.043638. Take floor as -0.043637 so it's a lower bund + # max \simeq 1.1990. Take ceil to be 1.2 so it's an upper bound. + # the combination of "+ torch.rand_like(deriv)" and casting to torch.uint8 (which + # floors), should be expectation-preserving. + floor = -0.043637 + ceil = 1.2 + d_scaled = (deriv - floor) * (255.0 / (ceil - floor)) + torch.rand_like( + deriv + ) + if __name__ == "__main__": + # for self-testing only. 
+ assert d_scaled.min() >= 0.0 + assert d_scaled.max() < 256.0 + d_int = d_scaled.to(torch.uint8) + ctx.save_for_backward(d_int) + if x.dtype == torch.float16 or torch.is_autocast_enabled(): + y = y.to(torch.float16) + return y + + @staticmethod + def backward(ctx, y_grad: Tensor) -> Tensor: + (d,) = ctx.saved_tensors + # the same constants as used in forward pass. + floor = -0.043637 + ceil = 1.2 + d = d * ((ceil - floor) / 255.0) + floor + return y_grad * d + + +class DoubleSwish(torch.nn.Module): + def forward(self, x: Tensor) -> Tensor: + """Return double-swish activation function which is an approximation to Swish(Swish(x)), + that we approximate closely with x * sigmoid(x-1). + """ + if torch.jit.is_scripting() or torch.jit.is_tracing(): + return x * torch.sigmoid(x - 1.0) + return DoubleSwishFunction.apply(x) + + +class ActivationBalancerFunction(torch.autograd.Function): + @staticmethod + def forward( + ctx, + x: Tensor, + scale_factor: Tensor, + sign_factor: Optional[Tensor], + channel_dim: int, + ) -> Tensor: + if channel_dim < 0: + channel_dim += x.ndim + ctx.channel_dim = channel_dim + xgt0 = x > 0 + if sign_factor is None: + ctx.save_for_backward(xgt0, scale_factor) + else: + ctx.save_for_backward(xgt0, scale_factor, sign_factor) + return x + + @staticmethod + def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None]: + if len(ctx.saved_tensors) == 3: + xgt0, scale_factor, sign_factor = ctx.saved_tensors + for _ in range(ctx.channel_dim, x_grad.ndim - 1): + scale_factor = scale_factor.unsqueeze(-1) + sign_factor = sign_factor.unsqueeze(-1) + factor = sign_factor + scale_factor * (xgt0.to(x_grad.dtype) - 0.5) + else: + xgt0, scale_factor = ctx.saved_tensors + for _ in range(ctx.channel_dim, x_grad.ndim - 1): + scale_factor = scale_factor.unsqueeze(-1) + factor = scale_factor * (xgt0.to(x_grad.dtype) - 0.5) + neg_delta_grad = x_grad.abs() * factor + return ( + x_grad - neg_delta_grad, + None, + None, + None, + ) + + +def _compute_scale_factor( + x: Tensor, + channel_dim: int, + min_abs: float, + max_abs: float, + gain_factor: float, + max_factor: float, +) -> Tensor: + if channel_dim < 0: + channel_dim += x.ndim + sum_dims = [d for d in range(x.ndim) if d != channel_dim] + x_abs_mean = torch.mean(x.abs(), dim=sum_dims).to(torch.float32) + + if min_abs == 0.0: + below_threshold = 0.0 + else: + # below_threshold is 0 if x_abs_mean > min_abs, can be at most max_factor if + # x_abs)_mean , min_abs. + below_threshold = ((min_abs - x_abs_mean) * (gain_factor / min_abs)).clamp( + min=0, max=max_factor + ) + + above_threshold = ((x_abs_mean - max_abs) * (gain_factor / max_abs)).clamp( + min=0, max=max_factor + ) + + return below_threshold - above_threshold + + +def _compute_sign_factor( + x: Tensor, + channel_dim: int, + min_positive: float, + max_positive: float, + gain_factor: float, + max_factor: float, +) -> Tensor: + if channel_dim < 0: + channel_dim += x.ndim + sum_dims = [d for d in range(x.ndim) if d != channel_dim] + proportion_positive = torch.mean((x > 0).to(torch.float32), dim=sum_dims) + if min_positive == 0.0: + factor1 = 0.0 + else: + # 0 if proportion_positive >= min_positive, else can be + # as large as max_factor. + factor1 = ( + (min_positive - proportion_positive) * (gain_factor / min_positive) + ).clamp_(min=0, max=max_factor) + + if max_positive == 1.0: + factor2 = 0.0 + else: + # 0 if self.proportion_positive <= max_positive, else can be + # as large as -max_factor. 
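+ # e.g. with max_positive = 0.95, a channel that is positive 100% of the time
+ # gets factor2 = (1.0 - 0.95) * gain_factor / 0.05 = gain_factor (then clamped
+ # to max_factor), nudging that channel back toward the max_positive bound.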
+ factor2 = ( + (proportion_positive - max_positive) * (gain_factor / (1.0 - max_positive)) + ).clamp_(min=0, max=max_factor) + sign_factor = factor1 - factor2 + # require min_positive != 0 or max_positive != 1: + assert not isinstance(sign_factor, float) + return sign_factor + + +class ActivationBalancer(torch.nn.Module): + """ + Modifies the backpropped derivatives of a function to try to encourage, for + each channel, that it is positive at least a proportion `threshold` of the + time. It does this by multiplying negative derivative values by up to + (1+max_factor), and positive derivative values by up to (1-max_factor), + interpolated from 1 at the threshold to those extremal values when none + of the inputs are positive. + + Args: + num_channels: the number of channels + channel_dim: the dimension/axis corresponding to the channel, e.g. + -1, 0, 1, 2; will be interpreted as an offset from x.ndim if negative. + min_positive: the minimum, per channel, of the proportion of the time + that (x > 0), below which we start to modify the derivatives. + max_positive: the maximum, per channel, of the proportion of the time + that (x > 0), above which we start to modify the derivatives. + max_factor: the maximum factor by which we modify the derivatives for + either the sign constraint or the magnitude constraint; + e.g. with max_factor=0.02, the the derivatives would be multiplied by + values in the range [0.98..1.02]. + sign_gain_factor: determines the 'gain' with which we increase the + change in gradient once the constraints on min_positive and max_positive + are violated. + scale_gain_factor: determines the 'gain' with which we increase the + change in gradient once the constraints on min_abs and max_abs + are violated. + min_abs: the minimum average-absolute-value difference from the mean + value per channel, which we allow, before we start to modify + the derivatives to prevent this. + max_abs: the maximum average-absolute-value difference from the mean + value per channel, which we allow, before we start to modify + the derivatives to prevent this. + min_prob: determines the minimum probability with which we modify the + gradients for the {min,max}_positive and {min,max}_abs constraints, + on each forward(). This is done randomly to prevent all layers + from doing it at the same time. Early in training we may use + higher probabilities than this; it will decay to this value. + """ + + def __init__( + self, + num_channels: int, + channel_dim: int, + min_positive: float = 0.05, + max_positive: float = 0.95, + max_factor: float = 0.04, + sign_gain_factor: float = 0.01, + scale_gain_factor: float = 0.02, + min_abs: float = 0.2, + max_abs: float = 100.0, + min_prob: float = 0.1, + ): + super(ActivationBalancer, self).__init__() + self.num_channels = num_channels + self.channel_dim = channel_dim + self.min_positive = min_positive + self.max_positive = max_positive + self.max_factor = max_factor + self.min_abs = min_abs + self.max_abs = max_abs + self.min_prob = min_prob + self.sign_gain_factor = sign_gain_factor + self.scale_gain_factor = scale_gain_factor + + # count measures how many times the forward() function has been called. + # We occasionally sync this to a tensor called `count`, that exists to + # make sure it is synced to disk when we load and save the model. 
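+ # (`count` is registered as a buffer below so it is saved/restored with the
+ # state_dict; `cpu_count` is the cheap Python-side counter bumped every call.)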
+ self.cpu_count = 0 + self.register_buffer("count", torch.tensor(0, dtype=torch.int64)) + + def forward(self, x: Tensor) -> Tensor: + if torch.jit.is_scripting() or not x.requires_grad or torch.jit.is_tracing(): + return _no_op(x) + + count = self.cpu_count + self.cpu_count += 1 + + if random.random() < 0.01: + # Occasionally sync self.cpu_count with self.count. + # count affects the decay of 'prob'. don't do this on every iter, + # because syncing with the GPU is slow. + self.cpu_count = max(self.cpu_count, self.count.item()) + self.count.fill_(self.cpu_count) + + # the prob of doing some work exponentially decreases from 0.5 till it hits + # a floor at min_prob (==0.1, by default) + prob = max(self.min_prob, 0.5 ** (1 + (count / 4000.0))) + + if random.random() < prob: + sign_gain_factor = 0.5 + if self.min_positive != 0.0 or self.max_positive != 1.0: + sign_factor = _compute_sign_factor( + x, + self.channel_dim, + self.min_positive, + self.max_positive, + gain_factor=self.sign_gain_factor / prob, + max_factor=self.max_factor, + ) + else: + sign_factor = None + + scale_factor = _compute_scale_factor( + x.detach(), + self.channel_dim, + min_abs=self.min_abs, + max_abs=self.max_abs, + gain_factor=self.scale_gain_factor / prob, + max_factor=self.max_factor, + ) + return ActivationBalancerFunction.apply( + x, + scale_factor, + sign_factor, + self.channel_dim, + ) + else: + return _no_op(x) + + +def BalancedDoubleSwish( + d_model, channel_dim=-1, max_abs=10.0, min_prob=0.25 +) -> nn.Sequential: + """ + ActivationBalancer -> DoubleSwish + """ + balancer = ActivationBalancer( + d_model, channel_dim=channel_dim, max_abs=max_abs, min_prob=min_prob + ) + return nn.Sequential( + balancer, + DoubleSwish(), + ) diff --git a/AR/modules/transformer.py b/AR/modules/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..7921f48e70bd6849897389d8e0a39d1ac4062b97 --- /dev/null +++ b/AR/modules/transformer.py @@ -0,0 +1,378 @@ +# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/transformer.py +import copy +import numbers +from functools import partial +from typing import Any +from typing import Callable +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch +from AR.modules.activation import MultiheadAttention +from AR.modules.scaling import BalancedDoubleSwish +from torch import nn +from torch import Tensor +from torch.nn import functional as F + +_shape_t = Union[int, List[int], torch.Size] + + +class LayerNorm(nn.Module): + __constants__ = ["normalized_shape", "eps", "elementwise_affine"] + normalized_shape: Tuple[int, ...] 
+ eps: float + elementwise_affine: bool + + def __init__( + self, + normalized_shape: _shape_t, + eps: float = 1e-5, + elementwise_affine: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super(LayerNorm, self).__init__() + if isinstance(normalized_shape, numbers.Integral): + # mypy error: incompatible types in assignment + normalized_shape = (normalized_shape,) # type: ignore[assignment] + self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type] + self.eps = eps + self.elementwise_affine = elementwise_affine + if self.elementwise_affine: + self.weight = nn.Parameter( + torch.empty(self.normalized_shape, **factory_kwargs) + ) + self.bias = nn.Parameter( + torch.empty(self.normalized_shape, **factory_kwargs) + ) + else: + self.register_parameter("weight", None) + self.register_parameter("bias", None) + + self.reset_parameters() + + def reset_parameters(self) -> None: + if self.elementwise_affine: + nn.init.ones_(self.weight) + nn.init.zeros_(self.bias) + + def forward(self, input: Tensor, embedding: Any = None) -> Tensor: + if isinstance(input, tuple): + input, embedding = input + return ( + F.layer_norm( + input, + self.normalized_shape, + self.weight, + self.bias, + self.eps, + ), + embedding, + ) + + assert embedding is None + return F.layer_norm( + input, self.normalized_shape, self.weight, self.bias, self.eps + ) + + def extra_repr(self) -> str: + return ( + "{normalized_shape}, eps={eps}, " + "elementwise_affine={elementwise_affine}".format(**self.__dict__) + ) + + +class IdentityNorm(nn.Module): + def __init__( + self, + d_model: int, + eps: float = 1e-5, + device=None, + dtype=None, + ) -> None: + super(IdentityNorm, self).__init__() + + def forward(self, input: Tensor, embedding: Any = None) -> Tensor: + if isinstance(input, tuple): + return input + + assert embedding is None + return input + + +class TransformerEncoder(nn.Module): + r"""TransformerEncoder is a stack of N encoder layers. Users can build the + BERT(https://arxiv.org/abs/1810.04805) model with corresponding parameters. + + Args: + encoder_layer: an instance of the TransformerEncoderLayer() class (required). + num_layers: the number of sub-encoder-layers in the encoder (required). + norm: the layer normalization component (optional). + enable_nested_tensor: if True, input will automatically convert to nested tensor + (and convert back on output). This will improve the overall performance of + TransformerEncoder when padding rate is high. Default: ``True`` (enabled). + + Examples:: + >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8) + >>> transformer_encoder = TransformerEncoder(encoder_layer, num_layers=6) + >>> src = torch.rand(10, 32, 512) + >>> out = transformer_encoder(src) + """ + __constants__ = ["norm"] + + def __init__(self, encoder_layer, num_layers, norm=None): + super(TransformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward( + self, + src: Tensor, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + return_layer_states: bool = False, + cache=None, + ) -> Tensor: + r"""Pass the input through the encoder layers in turn. + + Args: + src: the sequence to the encoder (required). + mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + return_layer_states: return layers' state (optional). 
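+ cache: dict used by the patched self-attention layers for incremental
+ (KV-cached) decoding (optional).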
+ + Shape: + see the docs in Transformer class. + """ + if return_layer_states: + layer_states = [] # layers' output + output = src + for mod in self.layers: + output = mod( + output, + src_mask=mask, + src_key_padding_mask=src_key_padding_mask, + cache=cache, + ) + layer_states.append(output[0]) + + if self.norm is not None: + output = self.norm(output) + + return layer_states, output + + output = src + for mod in self.layers: + output = mod( + output, + src_mask=mask, + src_key_padding_mask=src_key_padding_mask, + cache=cache, + ) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerEncoderLayer(nn.Module): + __constants__ = ["batch_first", "norm_first"] + + def __init__( + self, + d_model: int, + nhead: int, + dim_feedforward: int = 2048, + dropout: float = 0.1, + activation: Union[str, Callable[[Tensor], Tensor]] = F.relu, + batch_first: bool = False, + norm_first: bool = False, + device=None, + dtype=None, + linear1_self_attention_cls: nn.Module = nn.Linear, + linear2_self_attention_cls: nn.Module = nn.Linear, + linear1_feedforward_cls: nn.Module = nn.Linear, + linear2_feedforward_cls: nn.Module = nn.Linear, + layer_norm_cls: nn.Module = LayerNorm, + layer_norm_eps: float = 1e-5, + adaptive_layer_norm=False, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super(TransformerEncoderLayer, self).__init__() + # print(233333333333,d_model,nhead) + # import os + # os._exit(2333333) + self.self_attn = MultiheadAttention( + d_model, # 512 16 + nhead, + dropout=dropout, + batch_first=batch_first, + linear1_cls=linear1_self_attention_cls, + linear2_cls=linear2_self_attention_cls, + **factory_kwargs, + ) + + # Implementation of Feedforward model + self.linear1 = linear1_feedforward_cls( + d_model, dim_feedforward, **factory_kwargs + ) + self.dropout = nn.Dropout(dropout) + self.linear2 = linear2_feedforward_cls( + dim_feedforward, d_model, **factory_kwargs + ) + + self.norm_first = norm_first + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + # Legacy string support for activation function. + if isinstance(activation, str): + activation = _get_activation_fn(activation) + elif isinstance(activation, partial): + activation = activation(d_model) + elif activation == BalancedDoubleSwish: + activation = BalancedDoubleSwish(d_model) + + # # We can't test self.activation in forward() in TorchScript, + # # so stash some information about it instead. 
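+ # # (that stashing is disabled in this fork; self.activation is simply
+ # # called directly inside _ff_block.)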
+ # if activation is F.relu or isinstance(activation, torch.nn.ReLU): + # self.activation_relu_or_gelu = 1 + # elif activation is F.gelu or isinstance(activation, torch.nn.GELU): + # self.activation_relu_or_gelu = 2 + # else: + # self.activation_relu_or_gelu = 0 + self.activation = activation + + norm1 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs) + if layer_norm_cls == IdentityNorm: + norm2 = BalancedBasicNorm(d_model, eps=layer_norm_eps, **factory_kwargs) + else: + norm2 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs) + + if adaptive_layer_norm: + self.norm1 = AdaptiveLayerNorm(d_model, norm1) + self.norm2 = AdaptiveLayerNorm(d_model, norm2) + else: + self.norm1 = norm1 + self.norm2 = norm2 + + def __setstate__(self, state): + super(TransformerEncoderLayer, self).__setstate__(state) + if not hasattr(self, "activation"): + self.activation = F.relu + + def forward( + self, + src: Tensor, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + cache=None, + ) -> Tensor: + r"""Pass the input through the encoder layer. + + Args: + src: the sequence to the encoder layer (required). + src_mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + + Shape: + see the docs in Transformer class. + """ + x, stage_embedding = src, None + is_src_tuple = False + if isinstance(src, tuple): + x, stage_embedding = src + is_src_tuple = True + + if src_key_padding_mask is not None: + _skpm_dtype = src_key_padding_mask.dtype + if _skpm_dtype != torch.bool and not torch.is_floating_point( + src_key_padding_mask + ): + raise AssertionError( + "only bool and floating types of key_padding_mask are supported" + ) + + if self.norm_first: + x = x + self._sa_block( + self.norm1(x, stage_embedding), + src_mask, + src_key_padding_mask, + cache=cache, + ) + x = x + self._ff_block(self.norm2(x, stage_embedding)) + else: + x = self.norm1( + x + self._sa_block(x, src_mask, src_key_padding_mask, cache=cache), + stage_embedding, + ) + x = self.norm2(x + self._ff_block(x), stage_embedding) + + if is_src_tuple: + return (x, stage_embedding) + return x + + # self-attention block + def _sa_block( + self, + x: Tensor, + attn_mask: Optional[Tensor], + key_padding_mask: Optional[Tensor], + cache=None, + ) -> Tensor: + # print(x.shape,attn_mask.shape,key_padding_mask) + # torch.Size([1, 188, 512]) torch.Size([188, 188]) None + # import os + # os._exit(23333) + x = self.self_attn( + x, + x, + x, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask, + need_weights=False, + cache=cache, + )[0] + return self.dropout1(x) + + # feed forward block + def _ff_block(self, x: Tensor) -> Tensor: + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + return self.dropout2(x) + + +class AdaptiveLayerNorm(nn.Module): + r"""Adaptive Layer Normalization""" + + def __init__(self, d_model, norm) -> None: + super(AdaptiveLayerNorm, self).__init__() + self.project_layer = nn.Linear(d_model, 2 * d_model) + self.norm = norm + self.d_model = d_model + self.eps = self.norm.eps + + def forward(self, input: Tensor, embedding: Tensor = None) -> Tensor: + if isinstance(input, tuple): + input, embedding = input + weight, bias = torch.split( + self.project_layer(embedding), + split_size_or_sections=self.d_model, + dim=-1, + ) + return (weight * self.norm(input) + bias, embedding) + + weight, bias = torch.split( + self.project_layer(embedding), + split_size_or_sections=self.d_model, + dim=-1, + ) + return weight * 
self.norm(input) + bias + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) diff --git a/AR/modules/transformer_onnx.py b/AR/modules/transformer_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..a3f68b43e7a4f3c8d989140009477e07a2d44d19 --- /dev/null +++ b/AR/modules/transformer_onnx.py @@ -0,0 +1,292 @@ +# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/transformer.py +import copy +import numbers +from functools import partial +from typing import Any +from typing import Callable +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch +from AR.modules.activation_onnx import MultiheadAttention +from AR.modules.scaling import BalancedDoubleSwish +from torch import nn +from torch import Tensor +from torch.nn import functional as F + +_shape_t = Union[int, List[int], torch.Size] + + +class LayerNorm(nn.Module): + __constants__ = ["normalized_shape", "eps", "elementwise_affine"] + normalized_shape: Tuple[int, ...] + eps: float + elementwise_affine: bool + + def __init__( + self, + normalized_shape: _shape_t, + eps: float = 1e-5, + elementwise_affine: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super(LayerNorm, self).__init__() + if isinstance(normalized_shape, numbers.Integral): + # mypy error: incompatible types in assignment + normalized_shape = (normalized_shape,) # type: ignore[assignment] + self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type] + self.eps = eps + self.elementwise_affine = elementwise_affine + if self.elementwise_affine: + self.weight = nn.Parameter( + torch.empty(self.normalized_shape, **factory_kwargs) + ) + self.bias = nn.Parameter( + torch.empty(self.normalized_shape, **factory_kwargs) + ) + else: + self.register_parameter("weight", None) + self.register_parameter("bias", None) + + self.reset_parameters() + + def reset_parameters(self) -> None: + if self.elementwise_affine: + nn.init.ones_(self.weight) + nn.init.zeros_(self.bias) + + def forward(self, input: Tensor, embedding: Any = None) -> Tensor: + if isinstance(input, tuple): + input, embedding = input + return ( + F.layer_norm( + input, + self.normalized_shape, + self.weight, + self.bias, + self.eps, + ), + embedding, + ) + + assert embedding is None + return F.layer_norm( + input, self.normalized_shape, self.weight, self.bias, self.eps + ) + + def extra_repr(self) -> str: + return ( + "{normalized_shape}, eps={eps}, " + "elementwise_affine={elementwise_affine}".format(**self.__dict__) + ) + + +class IdentityNorm(nn.Module): + def __init__( + self, + d_model: int, + eps: float = 1e-5, + device=None, + dtype=None, + ) -> None: + super(IdentityNorm, self).__init__() + + def forward(self, input: Tensor, embedding: Any = None) -> Tensor: + if isinstance(input, tuple): + return input + + assert embedding is None + return input + + +class TransformerEncoder(nn.Module): + r"""TransformerEncoder is a stack of N encoder layers. Users can build the + BERT(https://arxiv.org/abs/1810.04805) model with corresponding parameters. + + Args: + encoder_layer: an instance of the TransformerEncoderLayer() class (required). + num_layers: the number of sub-encoder-layers in the encoder (required). + norm: the layer normalization component (optional). + enable_nested_tensor: if True, input will automatically convert to nested tensor + (and convert back on output). 
This will improve the overall performance of + TransformerEncoder when padding rate is high. Default: ``True`` (enabled). + + Examples:: + >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8) + >>> transformer_encoder = TransformerEncoder(encoder_layer, num_layers=6) + >>> src = torch.rand(10, 32, 512) + >>> out = transformer_encoder(src) + """ + __constants__ = ["norm"] + + def __init__(self, encoder_layer, num_layers, norm=None): + super(TransformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward( + self, + src: Tensor, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + return_layer_states: bool = False, + cache=None, + ) -> Tensor: + output = src + for mod in self.layers: + output = mod( + output, + src_mask=mask, + src_key_padding_mask=src_key_padding_mask, + cache=cache, + ) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerEncoderLayer(nn.Module): + __constants__ = ["batch_first", "norm_first"] + def __init__( + self, + d_model: int, + nhead: int, + dim_feedforward: int = 2048, + dropout: float = 0.1, + activation: Union[str, Callable[[Tensor], Tensor]] = F.relu, + batch_first: bool = False, + norm_first: bool = False, + device=None, + dtype=None, + linear1_self_attention_cls: nn.Module = nn.Linear, + linear2_self_attention_cls: nn.Module = nn.Linear, + linear1_feedforward_cls: nn.Module = nn.Linear, + linear2_feedforward_cls: nn.Module = nn.Linear, + layer_norm_cls: nn.Module = LayerNorm, + layer_norm_eps: float = 1e-5, + adaptive_layer_norm=False, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super(TransformerEncoderLayer, self).__init__() + self.self_attn = MultiheadAttention( + d_model, # 512 16 + nhead, + dropout=dropout, + batch_first=batch_first, + linear1_cls=linear1_self_attention_cls, + linear2_cls=linear2_self_attention_cls, + **factory_kwargs, + ) + self.linear1 = linear1_feedforward_cls( + d_model, dim_feedforward, **factory_kwargs + ) + self.dropout = nn.Dropout(dropout) + self.linear2 = linear2_feedforward_cls( + dim_feedforward, d_model, **factory_kwargs + ) + self.norm_first = norm_first + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + if isinstance(activation, str): + activation = _get_activation_fn(activation) + elif isinstance(activation, partial): + activation = activation(d_model) + elif activation == BalancedDoubleSwish: + activation = BalancedDoubleSwish(d_model) + self.activation = activation + + norm1 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs) + if layer_norm_cls == IdentityNorm: + norm2 = BalancedBasicNorm(d_model, eps=layer_norm_eps, **factory_kwargs) + else: + norm2 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs) + + if adaptive_layer_norm: + self.norm1 = AdaptiveLayerNorm(d_model, norm1) + self.norm2 = AdaptiveLayerNorm(d_model, norm2) + else: + self.norm1 = norm1 + self.norm2 = norm2 + + def __setstate__(self, state): + super(TransformerEncoderLayer, self).__setstate__(state) + if not hasattr(self, "activation"): + self.activation = F.relu + + def forward( + self, + src: Tensor, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + cache=None, + ) -> Tensor: + x = src + stage_embedding = None + x = self.norm1( + x + self._sa_block(x, src_mask, src_key_padding_mask, cache=cache), + stage_embedding, + ) + x = self.norm2(x + self._ff_block(x), 
stage_embedding) + + return x + + def _sa_block( + self, + x: Tensor, + attn_mask: Optional[Tensor], + key_padding_mask: Optional[Tensor], + cache=None, + ) -> Tensor: + x = self.self_attn( + x, + x, + x, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask, + need_weights=False, + cache=cache, + ) + return self.dropout1(x) + + def _ff_block(self, x: Tensor) -> Tensor: + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + return self.dropout2(x) + + +class AdaptiveLayerNorm(nn.Module): + r"""Adaptive Layer Normalization""" + + def __init__(self, d_model, norm) -> None: + super(AdaptiveLayerNorm, self).__init__() + self.project_layer = nn.Linear(d_model, 2 * d_model) + self.norm = norm + self.d_model = d_model + self.eps = self.norm.eps + + def forward(self, input: Tensor, embedding: Tensor = None) -> Tensor: + if isinstance(input, tuple): + input, embedding = input + weight, bias = torch.split( + self.project_layer(embedding), + split_size_or_sections=self.d_model, + dim=-1, + ) + return (weight * self.norm(input) + bias, embedding) + + weight, bias = torch.split( + self.project_layer(embedding), + split_size_or_sections=self.d_model, + dim=-1, + ) + return weight * self.norm(input) + bias + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) diff --git a/AR/text_processing/__init__.py b/AR/text_processing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/AR/text_processing/phonemizer.py b/AR/text_processing/phonemizer.py new file mode 100644 index 0000000000000000000000000000000000000000..9c5f58fb74da836764cc9d71b8556e979f2b2830 --- /dev/null +++ b/AR/text_processing/phonemizer.py @@ -0,0 +1,79 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py +# reference: https://github.com/lifeiteng/vall-e +import itertools +import re +from typing import Dict +from typing import List + +import regex +from gruut import sentences +from gruut.const import Sentence +from gruut.const import Word +from AR.text_processing.symbols import SYMBOL_TO_ID + + +class GruutPhonemizer: + def __init__(self, language: str): + self._phonemizer = sentences + self.lang = language + self.symbol_to_id = SYMBOL_TO_ID + self._special_cases_dict: Dict[str] = { + r"\.\.\.": "... ", + ";": "; ", + ":": ": ", + ",": ", ", + r"\.": ". ", + "!": "! ", + r"\?": "? 
", + "—": "—", + "…": "… ", + "«": "«", + "»": "»", + } + self._punctuation_regexp: str = ( + rf"([{''.join(self._special_cases_dict.keys())}])" + ) + + def _normalize_punctuation(self, text: str) -> str: + text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text) + text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text) + text = regex.sub(r"\pZ+", r" ", text) + return text.strip() + + def _convert_punctuation(self, word: Word) -> str: + if not word.phonemes: + return "" + if word.phonemes[0] in ["‖", "|"]: + return word.text.strip() + + phonemes = "".join(word.phonemes) + # remove modifier characters ˈˌː with regex + phonemes = re.sub(r"[ˈˌː͡]", "", phonemes) + return phonemes.strip() + + def phonemize(self, text: str, espeak: bool = False) -> str: + text_to_phonemize: str = self._normalize_punctuation(text) + sents: List[Sentence] = [ + sent + for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak) + ] + words: List[str] = [ + self._convert_punctuation(word) for word in itertools.chain(*sents) + ] + return " ".join(words) + + def transform(self, phonemes): + # convert phonemes to ids + # dictionary is in symbols.py + return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()] + + +if __name__ == "__main__": + phonemizer = GruutPhonemizer("en-us") + # text -> IPA + phonemes = phonemizer.phonemize("Hello, wor-ld ?") + print("phonemes:", phonemes) + print("len(phonemes):", len(phonemes)) + phoneme_ids = phonemizer.transform(phonemes) + print("phoneme_ids:", phoneme_ids) + print("len(phoneme_ids):", len(phoneme_ids)) diff --git a/AR/text_processing/symbols.py b/AR/text_processing/symbols.py new file mode 100644 index 0000000000000000000000000000000000000000..7d754a78b1fd5b3d89768585e1891404bb318118 --- /dev/null +++ b/AR/text_processing/symbols.py @@ -0,0 +1,10 @@ +# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py +# reference: https://github.com/lifeiteng/vall-e +PAD = "_" +PUNCTUATION = ';:,.!?¡¿—…"«»“” ' +LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" +IPA_LETTERS = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" +SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS) +SPACE_ID = SYMBOLS.index(" ") +SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)} +ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)} diff --git a/AR/utils/__init__.py b/AR/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c2eaf61adcfee96d6e7ec8fd70a7603e18afb567 --- /dev/null +++ b/AR/utils/__init__.py @@ -0,0 +1,37 @@ +import re + + +def str2bool(str): + return True if str.lower() == 'true' else False + + +def get_newest_ckpt(string_list): + # 定义一个正则表达式模式,用于匹配字符串中的数字 + pattern = r'epoch=(\d+)-step=(\d+)\.ckpt' + + # 使用正则表达式提取每个字符串中的数字信息,并创建一个包含元组的列表 + extracted_info = [] + for string in string_list: + match = re.match(pattern, string) + if match: + epoch = int(match.group(1)) + step = int(match.group(2)) + extracted_info.append((epoch, step, string)) + # 按照 epoch 后面的数字和 step 后面的数字进行排序 + sorted_info = sorted( + extracted_info, key=lambda x: (x[0], x[1]), reverse=True) + # 获取最新的 ckpt 文件名 + newest_ckpt = sorted_info[0][2] + return newest_ckpt + + +# 文本存在且不为空时 return True +def check_txt_file(file_path): + try: + with open(file_path, 'r') as file: + text = file.readline().strip() + assert text.strip() != '' + return text + except Exception: + return False + 
return False diff --git a/AR/utils/initialize.py b/AR/utils/initialize.py new file mode 100644 index 0000000000000000000000000000000000000000..17ff9f92e51c8941973139d6e34d4a1c7cd8daaa --- /dev/null +++ b/AR/utils/initialize.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +"""Initialize modules for espnet2 neural networks.""" +import torch +from typeguard import check_argument_types + + +def initialize(model: torch.nn.Module, init: str): + """Initialize weights of a neural network module. + + Parameters are initialized using the given method or distribution. + + Custom initialization routines can be implemented into submodules + as function `espnet_initialization_fn` within the custom module. + + Args: + model: Target. + init: Method of initialization. + """ + assert check_argument_types() + print("init with", init) + + # weight init + for p in model.parameters(): + if p.dim() > 1: + if init == "xavier_uniform": + torch.nn.init.xavier_uniform_(p.data) + elif init == "xavier_normal": + torch.nn.init.xavier_normal_(p.data) + elif init == "kaiming_uniform": + torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu") + elif init == "kaiming_normal": + torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu") + else: + raise ValueError("Unknown initialization: " + init) + # bias init + for name, p in model.named_parameters(): + if ".bias" in name and p.dim() == 1: + p.data.zero_() diff --git a/AR/utils/io.py b/AR/utils/io.py new file mode 100644 index 0000000000000000000000000000000000000000..52f1f3c991506a9ea5d3fdbc71ded61e914694f1 --- /dev/null +++ b/AR/utils/io.py @@ -0,0 +1,34 @@ +import sys + +import torch +import yaml + + +def load_yaml_config(path): + with open(path) as f: + config = yaml.full_load(f) + return config + + +def save_config_to_yaml(config, path): + assert path.endswith(".yaml") + with open(path, "w") as f: + f.write(yaml.dump(config)) + f.close() + + +def write_args(args, path): + args_dict = dict( + (name, getattr(args, name)) for name in dir(args) if not name.startswith("_") + ) + with open(path, "a") as args_file: + args_file.write("==> torch version: {}\n".format(torch.__version__)) + args_file.write( + "==> cudnn version: {}\n".format(torch.backends.cudnn.version()) + ) + args_file.write("==> Cmd:\n") + args_file.write(str(sys.argv)) + args_file.write("\n==> args:\n") + for k, v in sorted(args_dict.items()): + args_file.write(" %s: %s\n" % (str(k), str(v))) + args_file.close() diff --git a/__pycache__/utils.cpython-39.pyc b/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1bdc72c57efa9e5fbd15fc7da2bf4073477dd0f1 Binary files /dev/null and b/__pycache__/utils.cpython-39.pyc differ diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..c678db2549a3291534808feed417d54fa5131eac --- /dev/null +++ b/app.py @@ -0,0 +1,678 @@ +''' +按中英混合识别 +按日英混合识别 +多语种启动切分识别语种 +全部按中文识别 +全部按英文识别 +全部按日文识别 +''' +import os, re, logging +import LangSegment +logging.getLogger("markdown_it").setLevel(logging.ERROR) +logging.getLogger("urllib3").setLevel(logging.ERROR) +logging.getLogger("httpcore").setLevel(logging.ERROR) +logging.getLogger("httpx").setLevel(logging.ERROR) +logging.getLogger("asyncio").setLevel(logging.ERROR) +logging.getLogger("charset_normalizer").setLevel(logging.ERROR) +logging.getLogger("torchaudio._extension").setLevel(logging.ERROR) +import pdb + +if os.path.exists("./gweight.txt"): + with open("./gweight.txt", 'r', encoding="utf-8") as file: + gweight_data = file.read() + gpt_path = 
os.environ.get( + "gpt_path", gweight_data) +else: + gpt_path = os.environ.get( + "gpt_path", "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt") + +if os.path.exists("./sweight.txt"): + with open("./sweight.txt", 'r', encoding="utf-8") as file: + sweight_data = file.read() + sovits_path = os.environ.get("sovits_path", sweight_data) +else: + sovits_path = os.environ.get("sovits_path", "GPT_SoVITS/pretrained_models/s2G488k.pth") +# gpt_path = os.environ.get( +# "gpt_path", "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" +# ) +# sovits_path = os.environ.get("sovits_path", "pretrained_models/s2G488k.pth") +cnhubert_base_path = os.environ.get( + "cnhubert_base_path", "GPT_SoVITS/pretrained_models/chinese-hubert-base" +) +bert_path = os.environ.get( + "bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" +) +infer_ttswebui = os.environ.get("infer_ttswebui", 9872) +infer_ttswebui = int(infer_ttswebui) +is_share = os.environ.get("is_share", "False") +is_share = eval(is_share) +if "_CUDA_VISIBLE_DEVICES" in os.environ: + os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] +is_half = eval(os.environ.get("is_half", "True")) +import gradio as gr +from transformers import AutoModelForMaskedLM, AutoTokenizer +import numpy as np +import librosa, torch +from feature_extractor import cnhubert + +cnhubert.cnhubert_base_path = cnhubert_base_path + +from module.models import SynthesizerTrn +from AR.models.t2s_lightning_module import Text2SemanticLightningModule +from text import cleaned_text_to_sequence +from text.cleaner import clean_text +from time import time as ttime +from module.mel_processing import spectrogram_torch +from my_utils import load_audio +from tools.i18n.i18n import I18nAuto + +i18n = I18nAuto() + +os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。 + +if torch.cuda.is_available(): + device = "cuda" +elif torch.backends.mps.is_available(): + device = "mps" +else: + device = "cpu" + +tokenizer = AutoTokenizer.from_pretrained(bert_path) +bert_model = AutoModelForMaskedLM.from_pretrained(bert_path) +if is_half == True: + bert_model = bert_model.half().to(device) +else: + bert_model = bert_model.to(device) + + +def get_bert_feature(text, word2ph): + with torch.no_grad(): + inputs = tokenizer(text, return_tensors="pt") + for i in inputs: + inputs[i] = inputs[i].to(device) + res = bert_model(**inputs, output_hidden_states=True) + res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1] + assert len(word2ph) == len(text) + phone_level_feature = [] + for i in range(len(word2ph)): + repeat_feature = res[i].repeat(word2ph[i], 1) + phone_level_feature.append(repeat_feature) + phone_level_feature = torch.cat(phone_level_feature, dim=0) + return phone_level_feature.T + + +class DictToAttrRecursive(dict): + def __init__(self, input_dict): + super().__init__(input_dict) + for key, value in input_dict.items(): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + self[key] = value + setattr(self, key, value) + + def __getattr__(self, item): + try: + return self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + def __setattr__(self, key, value): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + super(DictToAttrRecursive, self).__setitem__(key, value) + super().__setattr__(key, value) + + def __delattr__(self, item): + try: + del self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + +ssl_model = 
cnhubert.get_model() +if is_half == True: + ssl_model = ssl_model.half().to(device) +else: + ssl_model = ssl_model.to(device) + + +def change_sovits_weights(sovits_path): + global vq_model, hps + dict_s2 = torch.load(sovits_path, map_location="cpu") + hps = dict_s2["config"] + hps = DictToAttrRecursive(hps) + hps.model.semantic_frame_rate = "25hz" + vq_model = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model + ) + if ("pretrained" not in sovits_path): + del vq_model.enc_q + if is_half == True: + vq_model = vq_model.half().to(device) + else: + vq_model = vq_model.to(device) + vq_model.eval() + print(vq_model.load_state_dict(dict_s2["weight"], strict=False)) + with open("./sweight.txt", "w", encoding="utf-8") as f: + f.write(sovits_path) + + +change_sovits_weights(sovits_path) + + +def change_gpt_weights(gpt_path): + global hz, max_sec, t2s_model, config + hz = 50 + dict_s1 = torch.load(gpt_path, map_location="cpu") + config = dict_s1["config"] + max_sec = config["data"]["max_sec"] + t2s_model = Text2SemanticLightningModule(config, "****", is_train=False) + t2s_model.load_state_dict(dict_s1["weight"]) + if is_half == True: + t2s_model = t2s_model.half() + t2s_model = t2s_model.to(device) + t2s_model.eval() + total = sum([param.nelement() for param in t2s_model.parameters()]) + print("Number of parameter: %.2fM" % (total / 1e6)) + with open("./gweight.txt", "w", encoding="utf-8") as f: f.write(gpt_path) + + +change_gpt_weights(gpt_path) + + +def get_spepc(hps, filename): + audio = load_audio(filename, int(hps.data.sampling_rate)) + audio = torch.FloatTensor(audio) + audio_norm = audio + audio_norm = audio_norm.unsqueeze(0) + spec = spectrogram_torch( + audio_norm, + hps.data.filter_length, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + center=False, + ) + return spec + + +dict_language = { + i18n("中文"): "all_zh",#全部按中文识别 + i18n("英文"): "en",#全部按英文识别#######不变 + i18n("日文"): "all_ja",#全部按日文识别 + i18n("中英混合"): "zh",#按中英混合识别####不变 + i18n("日英混合"): "ja",#按日英混合识别####不变 + i18n("多语种混合"): "auto",#多语种启动切分识别语种 +} + + +def splite_en_inf(sentence, language): + pattern = re.compile(r'[a-zA-Z ]+') + textlist = [] + langlist = [] + pos = 0 + for match in pattern.finditer(sentence): + start, end = match.span() + if start > pos: + textlist.append(sentence[pos:start]) + langlist.append(language) + textlist.append(sentence[start:end]) + langlist.append("en") + pos = end + if pos < len(sentence): + textlist.append(sentence[pos:]) + langlist.append(language) + # Merge punctuation into previous word + for i in range(len(textlist)-1, 0, -1): + if re.match(r'^[\W_]+$', textlist[i]): + textlist[i-1] += textlist[i] + del textlist[i] + del langlist[i] + # Merge consecutive words with the same language tag + i = 0 + while i < len(langlist) - 1: + if langlist[i] == langlist[i+1]: + textlist[i] += textlist[i+1] + del textlist[i+1] + del langlist[i+1] + else: + i += 1 + + return textlist, langlist + + +def clean_text_inf(text, language): + formattext = "" + language = language.replace("all_","") + for tmp in LangSegment.getTexts(text): + if language == "ja": + if tmp["lang"] == language or tmp["lang"] == "zh": + formattext += tmp["text"] + " " + continue + if tmp["lang"] == language: + formattext += tmp["text"] + " " + while " " in formattext: + formattext = formattext.replace(" ", " ") + phones, word2ph, norm_text = clean_text(formattext, language) + phones = cleaned_text_to_sequence(phones) + return 
phones, word2ph, norm_text + +dtype=torch.float16 if is_half == True else torch.float32 +def get_bert_inf(phones, word2ph, norm_text, language): + language=language.replace("all_","") + if language == "zh": + bert = get_bert_feature(norm_text, word2ph).to(device)#.to(dtype) + else: + bert = torch.zeros( + (1024, len(phones)), + dtype=torch.float16 if is_half == True else torch.float32, + ).to(device) + + return bert + + +def nonen_clean_text_inf(text, language): + if(language!="auto"): + textlist, langlist = splite_en_inf(text, language) + else: + textlist=[] + langlist=[] + for tmp in LangSegment.getTexts(text): + langlist.append(tmp["lang"]) + textlist.append(tmp["text"]) + phones_list = [] + word2ph_list = [] + norm_text_list = [] + for i in range(len(textlist)): + lang = langlist[i] + phones, word2ph, norm_text = clean_text_inf(textlist[i], lang) + phones_list.append(phones) + if lang == "zh": + word2ph_list.append(word2ph) + norm_text_list.append(norm_text) + print(word2ph_list) + phones = sum(phones_list, []) + word2ph = sum(word2ph_list, []) + norm_text = ' '.join(norm_text_list) + + return phones, word2ph, norm_text + + +def nonen_get_bert_inf(text, language): + if(language!="auto"): + textlist, langlist = splite_en_inf(text, language) + else: + textlist=[] + langlist=[] + for tmp in LangSegment.getTexts(text): + langlist.append(tmp["lang"]) + textlist.append(tmp["text"]) + print(textlist) + print(langlist) + bert_list = [] + for i in range(len(textlist)): + lang = langlist[i] + phones, word2ph, norm_text = clean_text_inf(textlist[i], lang) + bert = get_bert_inf(phones, word2ph, norm_text, lang) + bert_list.append(bert) + bert = torch.cat(bert_list, dim=1) + + return bert + + +splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", } + + +def get_first(text): + pattern = "[" + "".join(re.escape(sep) for sep in splits) + "]" + text = re.split(pattern, text)[0].strip() + return text + + +def get_cleaned_text_final(text,language): + if language in {"en","all_zh","all_ja"}: + phones, word2ph, norm_text = clean_text_inf(text, language) + elif language in {"zh", "ja","auto"}: + phones, word2ph, norm_text = nonen_clean_text_inf(text, language) + return phones, word2ph, norm_text + +def get_bert_final(phones, word2ph, text,language,device): + if language == "en": + bert = get_bert_inf(phones, word2ph, text, language) + elif language in {"zh", "ja","auto"}: + bert = nonen_get_bert_inf(text, language) + elif language == "all_zh": + bert = get_bert_feature(text, word2ph).to(device) + else: + bert = torch.zeros((1024, len(phones))).to(device) + return bert + +def merge_short_text_in_array(texts, threshold): + if (len(texts)) < 2: + return texts + result = [] + text = "" + for ele in texts: + text += ele + if len(text) >= threshold: + result.append(text) + text = "" + if (len(text) > 0): + if len(result) == 0: + result.append(text) + else: + result[len(result) - 1] += text + return result + +def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free = False): + if prompt_text is None or len(prompt_text) == 0: + ref_free = True + t0 = ttime() + prompt_language = dict_language[prompt_language] + text_language = dict_language[text_language] + if not ref_free: + prompt_text = prompt_text.strip("\n") + if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "." 
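+ # if the reference text does not already end with a sentence-final mark, append one ("。" for non-English, "." for English) so later splitting treats it as a complete sentence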
+ print(i18n("实际输入的参考文本:"), prompt_text) + text = text.strip("\n") + if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text + + print(i18n("实际输入的目标文本:"), text) + zero_wav = np.zeros( + int(hps.data.sampling_rate * 0.3), + dtype=np.float16 if is_half == True else np.float32, + ) + with torch.no_grad(): + wav16k, sr = librosa.load(ref_wav_path, sr=16000) + if (wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000): + raise OSError(i18n("参考音频在3~10秒范围外,请更换!")) + wav16k = torch.from_numpy(wav16k) + zero_wav_torch = torch.from_numpy(zero_wav) + if is_half == True: + wav16k = wav16k.half().to(device) + zero_wav_torch = zero_wav_torch.half().to(device) + else: + wav16k = wav16k.to(device) + zero_wav_torch = zero_wav_torch.to(device) + wav16k = torch.cat([wav16k, zero_wav_torch]) + ssl_content = ssl_model.model(wav16k.unsqueeze(0))[ + "last_hidden_state" + ].transpose( + 1, 2 + ) # .float() + codes = vq_model.extract_latent(ssl_content) + + prompt_semantic = codes[0, 0] + t1 = ttime() + + if (how_to_cut == i18n("凑四句一切")): + text = cut1(text) + elif (how_to_cut == i18n("凑50字一切")): + text = cut2(text) + elif (how_to_cut == i18n("按中文句号。切")): + text = cut3(text) + elif (how_to_cut == i18n("按英文句号.切")): + text = cut4(text) + elif (how_to_cut == i18n("按标点符号切")): + text = cut5(text) + while "\n\n" in text: + text = text.replace("\n\n", "\n") + print(i18n("实际输入的目标文本(切句后):"), text) + texts = text.split("\n") + texts = merge_short_text_in_array(texts, 5) + audio_opt = [] + if not ref_free: + phones1, word2ph1, norm_text1=get_cleaned_text_final(prompt_text, prompt_language) + print("前端处理后的参考文本:%s"%norm_text1) + bert1=get_bert_final(phones1, word2ph1, norm_text1,prompt_language,device).to(dtype) + + for text in texts: + # 解决输入目标文本的空行导致报错的问题 + if (len(text.strip()) == 0): + continue + if (text[-1] not in splits): text += "。" if text_language != "en" else "." 
+ print(i18n("实际输入的目标文本(每句):"), text) + phones2, word2ph2, norm_text2 = get_cleaned_text_final(text, text_language) + print(i18n("前端处理后的文本(每句):"), norm_text2) + bert2 = get_bert_final(phones2, word2ph2, norm_text2, text_language, device).to(dtype) + if not ref_free: + bert = torch.cat([bert1, bert2], 1) + all_phoneme_ids = torch.LongTensor(phones1+phones2).to(device).unsqueeze(0) + else: + bert = bert2 + all_phoneme_ids = torch.LongTensor(phones2).to(device).unsqueeze(0) + + bert = bert.to(device).unsqueeze(0) + all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device) + prompt = prompt_semantic.unsqueeze(0).to(device) + t2 = ttime() + with torch.no_grad(): + # pred_semantic = t2s_model.model.infer( + pred_semantic, idx = t2s_model.model.infer_panel( + all_phoneme_ids, + all_phoneme_len, + None if ref_free else prompt, + bert, + # prompt_phone_len=ph_offset, + top_k=top_k, + top_p=top_p, + temperature=temperature, + early_stop_num=hz * max_sec, + ) + t3 = ttime() + # print(pred_semantic.shape,idx) + pred_semantic = pred_semantic[:, -idx:].unsqueeze( + 0 + ) # .unsqueeze(0)#mq要多unsqueeze一次 + refer = get_spepc(hps, ref_wav_path) # .to(device) + if is_half == True: + refer = refer.half().to(device) + else: + refer = refer.to(device) + # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0] + audio = ( + vq_model.decode( + pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer + ) + .detach() + .cpu() + .numpy()[0, 0] + ) ###试试重建不带上prompt部分 + max_audio=np.abs(audio).max()#简单防止16bit爆音 + if max_audio>1:audio/=max_audio + audio_opt.append(audio) + audio_opt.append(zero_wav) + t4 = ttime() + print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) + yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype( + np.int16 + ) + + +def split(todo_text): + todo_text = todo_text.replace("……", "。").replace("——", ",") + if todo_text[-1] not in splits: + todo_text += "。" + i_split_head = i_split_tail = 0 + len_text = len(todo_text) + todo_texts = [] + while 1: + if i_split_head >= len_text: + break # 结尾一定有标点,所以直接跳出即可,最后一段在上次已加入 + if todo_text[i_split_head] in splits: + i_split_head += 1 + todo_texts.append(todo_text[i_split_tail:i_split_head]) + i_split_tail = i_split_head + else: + i_split_head += 1 + return todo_texts + + +def cut1(inp): + inp = inp.strip("\n") + inps = split(inp) + split_idx = list(range(0, len(inps), 4)) + split_idx[-1] = None + if len(split_idx) > 1: + opts = [] + for idx in range(len(split_idx) - 1): + opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]])) + else: + opts = [inp] + return "\n".join(opts) + + +def cut2(inp): + inp = inp.strip("\n") + inps = split(inp) + if len(inps) < 2: + return inp + opts = [] + summ = 0 + tmp_str = "" + for i in range(len(inps)): + summ += len(inps[i]) + tmp_str += inps[i] + if summ > 50: + summ = 0 + opts.append(tmp_str) + tmp_str = "" + if tmp_str != "": + opts.append(tmp_str) + # print(opts) + if len(opts) > 1 and len(opts[-1]) < 50: ##如果最后一个太短了,和前一个合一起 + opts[-2] = opts[-2] + opts[-1] + opts = opts[:-1] + return "\n".join(opts) + + +def cut3(inp): + inp = inp.strip("\n") + return "\n".join(["%s" % item for item in inp.strip("。").split("。")]) + + +def cut4(inp): + inp = inp.strip("\n") + return "\n".join(["%s" % item for item in inp.strip(".").split(".")]) + + +# contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py +def cut5(inp): + # if not re.search(r'[^\w\s]', inp[-1]): + # inp += '。' + inp = 
inp.strip("\n") + punds = r'[,.;?!、,。?!;:]' + items = re.split(f'({punds})', inp) + items = ["".join(group) for group in zip(items[::2], items[1::2])] + opt = "\n".join(items) + return opt + + +def custom_sort_key(s): + # 使用正则表达式提取字符串中的数字部分和非数字部分 + parts = re.split('(\d+)', s) + # 将数字部分转换为整数,非数字部分保持不变 + parts = [int(part) if part.isdigit() else part for part in parts] + return parts + + +def change_choices(): + SoVITS_names, GPT_names = get_weights_names() + return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names, key=custom_sort_key), "__type__": "update"} + + +pretrained_sovits_name = "GPT_SoVITS/pretrained_models/s2G488k.pth" +pretrained_gpt_name = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" +SoVITS_weight_root = "SoVITS_weights" +GPT_weight_root = "GPT_weights" +os.makedirs(SoVITS_weight_root, exist_ok=True) +os.makedirs(GPT_weight_root, exist_ok=True) + + +def get_weights_names(): + SoVITS_names = [pretrained_sovits_name] + for name in os.listdir(SoVITS_weight_root): + if name.endswith(".pth"): SoVITS_names.append("%s/%s" % (SoVITS_weight_root, name)) + GPT_names = [pretrained_gpt_name] + for name in os.listdir(GPT_weight_root): + if name.endswith(".ckpt"): GPT_names.append("%s/%s" % (GPT_weight_root, name)) + return SoVITS_names, GPT_names + + +SoVITS_names, GPT_names = get_weights_names() + +with gr.Blocks(title="GPT-SoVITS WebUI") as app: + gr.Markdown( + value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + ) + with gr.Group(): + gr.Markdown(value=i18n("模型切换")) + with gr.Row(): + GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path, interactive=True) + SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True) + refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary") + refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown]) + SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown], []) + GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], []) + gr.Markdown(value=i18n("*请上传并填写参考信息")) + with gr.Row(): + inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath") + with gr.Column(): + ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True) + gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT")) + prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="") + prompt_language = gr.Dropdown( + label=i18n("参考音频的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文") + ) + gr.Markdown(value=i18n("*请填写需要合成的目标文本。中英混合选中文,日英混合选日文,中日混合暂不支持,非目标语言文本自动遗弃。")) + with gr.Row(): + text = gr.Textbox(label=i18n("需要合成的文本"), value="") + text_language = gr.Dropdown( + label=i18n("需要合成的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文") + ) + how_to_cut = gr.Radio( + label=i18n("怎么切"), + choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ], + value=i18n("凑四句一切"), + interactive=True, + ) + with gr.Row(): + gr.Markdown("gpt采样参数(无参考文本时不要太低):") + top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True) + top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True) + temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True) + inference_button = gr.Button(i18n("合成语音"), variant="primary") + output = gr.Audio(label=i18n("输出的语音")) + + inference_button.click( + get_tts_wav, + [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free], + [output], api_name="GetVoice" + ) + + gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。")) + with gr.Row(): + text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="") + button1 = gr.Button(i18n("凑四句一切"), variant="primary") + button2 = gr.Button(i18n("凑50字一切"), variant="primary") + button3 = gr.Button(i18n("按中文句号。切"), variant="primary") + button4 = gr.Button(i18n("按英文句号.切"), variant="primary") + button5 = gr.Button(i18n("按标点符号切"), variant="primary") + text_opt = gr.Textbox(label=i18n("切分后文本"), value="") + button1.click(cut1, [text_inp], [text_opt]) + button2.click(cut2, [text_inp], [text_opt]) + button3.click(cut3, [text_inp], [text_opt]) + button4.click(cut4, [text_inp], [text_opt]) + button5.click(cut5, [text_inp], [text_opt]) + gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。")) + +app.queue(concurrency_count=511, max_size=1022).launch( + server_name="0.0.0.0", + inbrowser=True, + share=is_share, + server_port=infer_ttswebui, + quiet=True, +) diff --git a/configs/s1.yaml b/configs/s1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8ae17d4415cbbacca2d30e1403239ecc554e98b --- /dev/null +++ b/configs/s1.yaml @@ 
-0,0 +1,31 @@ +train: + seed: 1234 + epochs: 300 + batch_size: 8 + gradient_accumulation: 4 + save_every_n_epoch: 1 + precision: 16 + gradient_clip: 1.0 +optimizer: + lr: 0.01 + lr_init: 0.00001 + lr_end: 0.0001 + warmup_steps: 2000 + decay_steps: 40000 +data: + max_eval_sample: 8 + max_sec: 54 + num_workers: 1 + pad_val: 1024 # same with EOS in model +model: + vocab_size: 1025 + phoneme_vocab_size: 512 + embedding_dim: 512 + hidden_dim: 512 + head: 16 + linear_units: 2048 + n_layer: 12 + dropout: 0 + EOS: 1024 +inference: + top_k: 5 diff --git a/configs/s1big.yaml b/configs/s1big.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a811150de4c418be57b3bb932a7fe12cc7b6f679 --- /dev/null +++ b/configs/s1big.yaml @@ -0,0 +1,31 @@ +train: + seed: 1234 + epochs: 300 + batch_size: 8 + gradient_accumulation: 4 + save_every_n_epoch: 1 + precision: 16-mixed + gradient_clip: 1.0 +optimizer: + lr: 0.01 + lr_init: 0.00001 + lr_end: 0.0001 + warmup_steps: 2000 + decay_steps: 40000 +data: + max_eval_sample: 8 + max_sec: 54 + num_workers: 1 + pad_val: 1024 # same with EOS in model +model: + vocab_size: 1025 + phoneme_vocab_size: 512 + embedding_dim: 1024 + hidden_dim: 1024 + head: 16 + linear_units: 2048 + n_layer: 16 + dropout: 0 + EOS: 1024 +inference: + top_k: 5 diff --git a/configs/s1big2.yaml b/configs/s1big2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b8b889babbc43ada8c17cee52db68fd484cc0c97 --- /dev/null +++ b/configs/s1big2.yaml @@ -0,0 +1,31 @@ +train: + seed: 1234 + epochs: 300 + batch_size: 12 + gradient_accumulation: 4 + save_every_n_epoch: 1 + precision: 16-mixed + gradient_clip: 1.0 +optimizer: + lr: 0.01 + lr_init: 0.00001 + lr_end: 0.0001 + warmup_steps: 2000 + decay_steps: 40000 +data: + max_eval_sample: 8 + max_sec: 54 + num_workers: 1 + pad_val: 1024 # same with EOS in model +model: + vocab_size: 1025 + phoneme_vocab_size: 512 + embedding_dim: 1024 + hidden_dim: 1024 + head: 16 + linear_units: 2048 + n_layer: 6 + dropout: 0 + EOS: 1024 +inference: + top_k: 5 diff --git a/configs/s1longer.yaml b/configs/s1longer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3f57abd21a85a36107d247488e988797f0f8d484 --- /dev/null +++ b/configs/s1longer.yaml @@ -0,0 +1,31 @@ +train: + seed: 1234 + epochs: 20 + batch_size: 8 + save_every_n_epoch: 1 + precision: 16-mixed + gradient_clip: 1.0 +optimizer: + lr: 0.01 + lr_init: 0.00001 + lr_end: 0.0001 + warmup_steps: 2000 + decay_steps: 40000 +data: + max_eval_sample: 8 + max_sec: 54 + num_workers: 4 + pad_val: 1024 # same with EOS in model +model: + vocab_size: 1025 + phoneme_vocab_size: 512 + embedding_dim: 512 + hidden_dim: 512 + head: 16 + linear_units: 2048 + n_layer: 24 + dropout: 0 + EOS: 1024 + random_bert: 0 +inference: + top_k: 5 diff --git a/configs/s1mq.yaml b/configs/s1mq.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b554fd34f88faca8bc71eb3eaf6b1bdf301a326b --- /dev/null +++ b/configs/s1mq.yaml @@ -0,0 +1,77 @@ +train: + seed: 1234 + epochs: 100 + batch_size: 6 + gradient_accumulation: 4 + save_every_n_epoch: 1 + precision: 32 + gradient_clip: 1.0 +optimizer: + lr: 0.01 + lr_init: 0.00001 + lr_end: 0.0001 + warmup_steps: 2000 + decay_steps: 40000 +data: + max_eval_sample: 8 + max_sec: 40 + num_workers: 1 + pad_val: 1024 # same with EOS in model +model: + saving_path: "ckpt/" + resume_checkpoint: null + vocoder_config_path: "quantizer/new_ckpt/config.json" + vocoder_ckpt_path: "quantizer/new_ckpt/g_00600000" + datadir: 
"/home/liweiche/GigaSpeech/wavs" + metapath: "/home/liweiche/GigaSpeech/train2.json" + val_metapath: "/home/liweiche/GigaSpeech/dev2.json" + sampledir: "logs/" + pretrained_path: null + lr: 0.0001 + batch_size: 200.0 + train_bucket_size: 8192 + training_step: 800000 + optim_flat_percent: 0.0 + warmup_step: 50 + adam_beta1: 0.9 + adam_beta2: 0.98 + ffd_size: 3072 + hidden_size: 768 + enc_nlayers: 6 + dec_nlayers: 6 + nheads: 12 + ar_layer: 4 + ar_ffd_size: 1024 + ar_hidden_size: 256 + ar_nheads: 4 + aligner_softmax_temp: 1.0 + layer_norm_eps: 0.00001 + speaker_embed_dropout: 0.05 + label_smoothing: 0.0 + val_check_interval: 5000 + check_val_every_n_epoch: 1 + precision: "fp16" + nworkers: 16 + distributed: true + accelerator: "ddp" + version: null + accumulate_grad_batches: 1 + use_repetition_token: true + use_repetition_gating: false + repetition_penalty: 1.0 + sampling_temperature: 1.0 + top_k: -1 + min_top_k: 3 + top_p: 0.8 + sample_num: 4 + length_penalty_max_length: 15000 + length_penalty_max_prob: 0.95 + max_input_length: 2048 + max_output_length: 2000 + sample_rate: 16000 + n_codes: 1024 + n_cluster_groups: 1 + phone_context_window: 4 + phoneset_size: 1000 +inference: + top_k: 5 diff --git a/configs/s2.json b/configs/s2.json new file mode 100644 index 0000000000000000000000000000000000000000..e44e1eb7d9864a20c76d35201f1ce0b9b15f3e95 --- /dev/null +++ b/configs/s2.json @@ -0,0 +1,90 @@ +{ + "train": { + "log_interval": 100, + "eval_interval": 500, + "seed": 1234, + "epochs": 100, + "learning_rate": 0.0001, + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-09, + "batch_size": 32, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 20480, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "text_low_lr_rate": 0.4 + }, + "data": { + "max_wav_value": 32768.0, + "sampling_rate": 32000, + "filter_length": 2048, + "hop_length": 640, + "win_length": 2048, + "n_mel_channels": 128, + "mel_fmin": 0.0, + "mel_fmax": null, + "add_blank": true, + "n_speakers": 300, + "cleaned_text": true + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "upsample_rates": [ + 10, + 8, + 2, + 2, + 2 + ], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 16, + 16, + 8, + 2, + 2 + ], + "n_layers_q": 3, + "use_spectral_norm": false, + "gin_channels": 512, + "semantic_frame_rate": "25hz", + "freeze_quantizer": true + }, + "s2_ckpt_dir": "logs/s2/big2k1", + "content_module": "cnhubert" +} \ No newline at end of file diff --git a/configs/train.yaml b/configs/train.yaml new file mode 100644 index 0000000000000000000000000000000000000000..be5333571980807b7d8f788165035c02cb0d176c --- /dev/null +++ b/configs/train.yaml @@ -0,0 +1,32 @@ +gpu: + n_card: 1 + n_process_per_card: 2 +io: + text_path: D:\RVC1006\GPT-SoVITS\GPT_SoVITS + save_every_n_epoch: 1 + precision: 16-mixed + gradient_clip: 1.0 +optimizer: + lr: 0.01 + lr_init: 0.00001 + lr_end: 0.0001 + warmup_steps: 2000 + decay_steps: 40000 +data: + max_eval_sample: 8 + max_sec: 54 + num_workers: 1 + pad_val: 1024 # same with EOS in model +model: + vocab_size: 1025 + phoneme_vocab_size: 512 + embedding_dim: 512 + hidden_dim: 512 + head: 16 + linear_units: 2048 + n_layer: 24 + dropout: 0 + EOS: 1024 + random_bert: 0 +inference: + top_k: 5 diff 
--git a/feature_extractor/__init__.py b/feature_extractor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..79aa9294fec8fd823afae7d1a18e3279533dd7cf --- /dev/null +++ b/feature_extractor/__init__.py @@ -0,0 +1,6 @@ +from . import cnhubert, whisper_enc + +content_module_map = { + 'cnhubert': cnhubert, + 'whisper': whisper_enc +} \ No newline at end of file diff --git a/feature_extractor/__pycache__/__init__.cpython-39.pyc b/feature_extractor/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe85b7b45544a2853920c91f2491db73e91cc099 Binary files /dev/null and b/feature_extractor/__pycache__/__init__.cpython-39.pyc differ diff --git a/feature_extractor/__pycache__/cnhubert.cpython-39.pyc b/feature_extractor/__pycache__/cnhubert.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..263e29f160dc712e8deb70e48b1760ba19f2b287 Binary files /dev/null and b/feature_extractor/__pycache__/cnhubert.cpython-39.pyc differ diff --git a/feature_extractor/__pycache__/whisper_enc.cpython-39.pyc b/feature_extractor/__pycache__/whisper_enc.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7997bb15790ed7177245ba5e3abbd2324d88788 Binary files /dev/null and b/feature_extractor/__pycache__/whisper_enc.cpython-39.pyc differ diff --git a/feature_extractor/cnhubert.py b/feature_extractor/cnhubert.py new file mode 100644 index 0000000000000000000000000000000000000000..dc155bddb8de0ddc4fec06def2c5a7746f25af4e --- /dev/null +++ b/feature_extractor/cnhubert.py @@ -0,0 +1,104 @@ +import time + +import librosa +import torch +import torch.nn.functional as F +import soundfile as sf +import logging + +logging.getLogger("numba").setLevel(logging.WARNING) + +from transformers import ( + Wav2Vec2FeatureExtractor, + HubertModel, +) + +import utils +import torch.nn as nn + +cnhubert_base_path = None + + +class CNHubert(nn.Module): + def __init__(self): + super().__init__() + self.model = HubertModel.from_pretrained(cnhubert_base_path) + self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( + cnhubert_base_path + ) + + def forward(self, x): + input_values = self.feature_extractor( + x, return_tensors="pt", sampling_rate=16000 + ).input_values.to(x.device) + feats = self.model(input_values)["last_hidden_state"] + return feats + + +# class CNHubertLarge(nn.Module): +# def __init__(self): +# super().__init__() +# self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") +# self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") +# def forward(self, x): +# input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) +# feats = self.model(input_values)["last_hidden_state"] +# return feats +# +# class CVec(nn.Module): +# def __init__(self): +# super().__init__() +# self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") +# self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") +# def forward(self, x): +# input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) +# feats = self.model(input_values)["last_hidden_state"] +# return feats +# +# class cnw2v2base(nn.Module): +# def __init__(self): +# super().__init__() +# self.model = 
Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") +# self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") +# def forward(self, x): +# input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) +# feats = self.model(input_values)["last_hidden_state"] +# return feats + + +def get_model(): + model = CNHubert() + model.eval() + return model + + +# def get_large_model(): +# model = CNHubertLarge() +# model.eval() +# return model +# +# def get_model_cvec(): +# model = CVec() +# model.eval() +# return model +# +# def get_model_cnw2v2base(): +# model = cnw2v2base() +# model.eval() +# return model + + +def get_content(hmodel, wav_16k_tensor): + with torch.no_grad(): + feats = hmodel(wav_16k_tensor) + return feats.transpose(1, 2) + + +if __name__ == "__main__": + model = get_model() + src_path = "/Users/Shared/原音频2.wav" + wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000) + model = model + wav_16k_tensor = wav_16k_tensor + feats = get_content(model, wav_16k_tensor) + print(feats.shape) diff --git a/feature_extractor/whisper_enc.py b/feature_extractor/whisper_enc.py new file mode 100644 index 0000000000000000000000000000000000000000..983c3e4d8a96232d29f41847abfedb70c42ebb02 --- /dev/null +++ b/feature_extractor/whisper_enc.py @@ -0,0 +1,25 @@ +import torch + + +def get_model(): + import whisper + + model = whisper.load_model("small", device="cpu") + + return model.encoder + + +def get_content(model=None, wav_16k_tensor=None): + from whisper import log_mel_spectrogram, pad_or_trim + + dev = next(model.parameters()).device + mel = log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000] + # if torch.cuda.is_available(): + # mel = mel.to(torch.float16) + feature_len = mel.shape[-1] // 2 + assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频" + with torch.no_grad(): + feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[ + :1, :feature_len, : + ].transpose(1, 2) + return feature diff --git a/hutao.wav b/hutao.wav new file mode 100644 index 0000000000000000000000000000000000000000..aaeb3f999682b6c2359fc623b49b947fe653cf49 Binary files /dev/null and b/hutao.wav differ diff --git a/inference_gui.py b/inference_gui.py new file mode 100644 index 0000000000000000000000000000000000000000..f6cfdc5e23471ade2fd56dc16a5f101810219dee --- /dev/null +++ b/inference_gui.py @@ -0,0 +1,340 @@ +import sys +from PyQt5.QtCore import QEvent +from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushButton, QTextEdit +from PyQt5.QtWidgets import QGridLayout, QVBoxLayout, QWidget, QFileDialog, QStatusBar, QComboBox +import soundfile as sf + +from tools.i18n.i18n import I18nAuto +i18n = I18nAuto() + +from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav + + +class GPTSoVITSGUI(QMainWindow): + def __init__(self): + super().__init__() + + self.init_ui() + + def init_ui(self): + self.setWindowTitle('GPT-SoVITS GUI') + self.setGeometry(800, 450, 950, 850) + + self.setStyleSheet(""" + QWidget { + background-color: #a3d3b1; + } + + QTabWidget::pane { + background-color: #a3d3b1; + } + + QTabWidget::tab-bar { + alignment: left; + } + + QTabBar::tab { + background: #8da4bf; + color: #ffffff; + padding: 8px; + } + + QTabBar::tab:selected { + background: #2a3f54; + } + + QLabel { + color: #000000; + } + + QPushButton { + background-color: #4CAF50; + color: white; + padding: 8px; + border: 1px solid #4CAF50; + 
border-radius: 4px; + } + + QPushButton:hover { + background-color: #45a049; + border: 1px solid #45a049; + box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.1); + } + """) + + license_text = ( + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. " + "如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + license_label = QLabel(license_text) + license_label.setWordWrap(True) + + self.GPT_model_label = QLabel("选择GPT模型:") + self.GPT_model_input = QLineEdit() + self.GPT_model_input.setPlaceholderText("拖拽或选择文件") + self.GPT_model_input.setReadOnly(True) + self.GPT_model_button = QPushButton("选择GPT模型文件") + self.GPT_model_button.clicked.connect(self.select_GPT_model) + + self.SoVITS_model_label = QLabel("选择SoVITS模型:") + self.SoVITS_model_input = QLineEdit() + self.SoVITS_model_input.setPlaceholderText("拖拽或选择文件") + self.SoVITS_model_input.setReadOnly(True) + self.SoVITS_model_button = QPushButton("选择SoVITS模型文件") + self.SoVITS_model_button.clicked.connect(self.select_SoVITS_model) + + self.ref_audio_label = QLabel("上传参考音频:") + self.ref_audio_input = QLineEdit() + self.ref_audio_input.setPlaceholderText("拖拽或选择文件") + self.ref_audio_input.setReadOnly(True) + self.ref_audio_button = QPushButton("选择音频文件") + self.ref_audio_button.clicked.connect(self.select_ref_audio) + + self.ref_text_label = QLabel("参考音频文本:") + self.ref_text_input = QLineEdit() + self.ref_text_input.setPlaceholderText("拖拽或选择文件") + self.ref_text_input.setReadOnly(True) + self.ref_text_button = QPushButton("上传文本") + self.ref_text_button.clicked.connect(self.upload_ref_text) + + self.language_label = QLabel("参考音频语言:") + self.language_combobox = QComboBox() + self.language_combobox.addItems(["中文", "英文", "日文"]) + + self.target_text_label = QLabel("合成目标文本:") + self.target_text_input = QLineEdit() + self.target_text_input.setPlaceholderText("拖拽或选择文件") + self.target_text_input.setReadOnly(True) + self.target_text_button = QPushButton("上传文本") + self.target_text_button.clicked.connect(self.upload_target_text) + + self.language_label_02 = QLabel("合成音频语言:") + self.language_combobox_02 = QComboBox() + self.language_combobox_02.addItems(["中文", "英文", "日文"]) + + self.output_label = QLabel("输出音频路径:") + self.output_input = QLineEdit() + self.output_input.setPlaceholderText("拖拽或选择文件") + self.output_input.setReadOnly(True) + self.output_button = QPushButton("选择文件夹") + self.output_button.clicked.connect(self.select_output_path) + + self.output_text = QTextEdit() + self.output_text.setReadOnly(True) + + self.add_drag_drop_events([ + self.GPT_model_input, + self.SoVITS_model_input, + self.ref_audio_input, + self.ref_text_input, + self.target_text_input, + self.output_input, + ]) + + self.synthesize_button = QPushButton("合成") + self.synthesize_button.clicked.connect(self.synthesize) + + self.clear_output_button = QPushButton("清空输出") + self.clear_output_button.clicked.connect(self.clear_output) + + self.status_bar = QStatusBar() + + main_layout = QVBoxLayout() + + input_layout = QGridLayout() + input_layout.setSpacing(10) + + self.setLayout(input_layout) + + input_layout.addWidget(license_label, 0, 0, 1, 3) + + input_layout.addWidget(self.GPT_model_label, 1, 0) + input_layout.addWidget(self.GPT_model_input, 2, 0, 1, 2) + input_layout.addWidget(self.GPT_model_button, 2, 2) + + input_layout.addWidget(self.SoVITS_model_label, 3, 0) + input_layout.addWidget(self.SoVITS_model_input, 4, 0, 1, 2) + input_layout.addWidget(self.SoVITS_model_button, 4, 2) + + input_layout.addWidget(self.ref_audio_label, 5, 0) + input_layout.addWidget(self.ref_audio_input, 6, 0, 1, 2) + 
input_layout.addWidget(self.ref_audio_button, 6, 2) + + input_layout.addWidget(self.language_label, 7, 0) + input_layout.addWidget(self.language_combobox, 8, 0, 1, 1) + input_layout.addWidget(self.ref_text_label, 9, 0) + input_layout.addWidget(self.ref_text_input, 10, 0, 1, 2) + input_layout.addWidget(self.ref_text_button, 10, 2) + + input_layout.addWidget(self.language_label_02, 11, 0) + input_layout.addWidget(self.language_combobox_02, 12, 0, 1, 1) + input_layout.addWidget(self.target_text_label, 13, 0) + input_layout.addWidget(self.target_text_input, 14, 0, 1, 2) + input_layout.addWidget(self.target_text_button, 14, 2) + + input_layout.addWidget(self.output_label, 15, 0) + input_layout.addWidget(self.output_input, 16, 0, 1, 2) + input_layout.addWidget(self.output_button, 16, 2) + + main_layout.addLayout(input_layout) + + output_layout = QVBoxLayout() + output_layout.addWidget(self.output_text) + main_layout.addLayout(output_layout) + + main_layout.addWidget(self.synthesize_button) + + main_layout.addWidget(self.clear_output_button) + + main_layout.addWidget(self.status_bar) + + self.central_widget = QWidget() + self.central_widget.setLayout(main_layout) + self.setCentralWidget(self.central_widget) + + def dragEnterEvent(self, event): + if event.mimeData().hasUrls(): + event.acceptProposedAction() + + def dropEvent(self, event): + if event.mimeData().hasUrls(): + file_paths = [url.toLocalFile() for url in event.mimeData().urls()] + + if len(file_paths) == 1: + self.update_ref_audio(file_paths[0]) + self.update_input_paths(self.ref_audio_input, file_paths[0]) + else: + self.update_ref_audio(", ".join(file_paths)) + + def add_drag_drop_events(self, widgets): + for widget in widgets: + widget.setAcceptDrops(True) + widget.installEventFilter(self) + + def eventFilter(self, obj, event): + if event.type() == QEvent.DragEnter: + mime_data = event.mimeData() + if mime_data.hasUrls(): + event.acceptProposedAction() + + elif event.type() == QEvent.Drop: + mime_data = event.mimeData() + if mime_data.hasUrls(): + file_paths = [url.toLocalFile() for url in mime_data.urls()] + if len(file_paths) == 1: + self.update_input_paths(obj, file_paths[0]) + else: + self.update_input_paths(obj, ", ".join(file_paths)) + event.acceptProposedAction() + + return super().eventFilter(obj, event) + + def select_GPT_model(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择GPT模型文件", "", "GPT Files (*.ckpt)") + if file_path: + self.GPT_model_input.setText(file_path) + + def select_SoVITS_model(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择SoVITS模型文件", "", "SoVITS Files (*.pth)") + if file_path: + self.SoVITS_model_input.setText(file_path) + + def select_ref_audio(self): + options = QFileDialog.Options() + options |= QFileDialog.DontUseNativeDialog + options |= QFileDialog.ShowDirsOnly + + file_dialog = QFileDialog() + file_dialog.setOptions(options) + + file_dialog.setFileMode(QFileDialog.AnyFile) + file_dialog.setNameFilter("Audio Files (*.wav *.mp3)") + + if file_dialog.exec_(): + file_paths = file_dialog.selectedFiles() + + if len(file_paths) == 1: + self.update_ref_audio(file_paths[0]) + self.update_input_paths(self.ref_audio_input, file_paths[0]) + else: + self.update_ref_audio(", ".join(file_paths)) + + def upload_ref_text(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") + if file_path: + with open(file_path, 'r', encoding='utf-8') as file: + content = file.read() + self.ref_text_input.setText(content) + 
self.update_input_paths(self.ref_text_input, file_path) + + def upload_target_text(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") + if file_path: + with open(file_path, 'r', encoding='utf-8') as file: + content = file.read() + self.target_text_input.setText(content) + self.update_input_paths(self.target_text_input, file_path) + + def select_output_path(self): + options = QFileDialog.Options() + options |= QFileDialog.DontUseNativeDialog + options |= QFileDialog.ShowDirsOnly + + folder_dialog = QFileDialog() + folder_dialog.setOptions(options) + folder_dialog.setFileMode(QFileDialog.Directory) + + if folder_dialog.exec_(): + folder_path = folder_dialog.selectedFiles()[0] + self.output_input.setText(folder_path) + + def update_ref_audio(self, file_path): + self.ref_audio_input.setText(file_path) + + def update_input_paths(self, input_box, file_path): + input_box.setText(file_path) + + def clear_output(self): + self.output_text.clear() + + def synthesize(self): + GPT_model_path = self.GPT_model_input.text() + SoVITS_model_path = self.SoVITS_model_input.text() + ref_audio_path = self.ref_audio_input.text() + language_combobox = self.language_combobox.currentText() + language_combobox = i18n(language_combobox) + ref_text = self.ref_text_input.text() + language_combobox_02 = self.language_combobox_02.currentText() + language_combobox_02 = i18n(language_combobox_02) + target_text = self.target_text_input.text() + output_path = self.output_input.text() + + change_gpt_weights(gpt_path=GPT_model_path) + change_sovits_weights(sovits_path=SoVITS_model_path) + + synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path, + prompt_text=ref_text, + prompt_language=language_combobox, + text=target_text, + text_language=language_combobox_02) + + result_list = list(synthesis_result) + + if result_list: + last_sampling_rate, last_audio_data = result_list[-1] + output_wav_path = os.path.join(output_path, "output.wav") + sf.write(output_wav_path, last_audio_data, last_sampling_rate) + + result = "Audio saved to " + output_wav_path + + self.status_bar.showMessage("合成完成!输出路径:" + output_wav_path, 5000) + self.output_text.append("处理结果:\n" + result) + +def main(): + app = QApplication(sys.argv) + mainWin = GPTSoVITSGUI() + mainWin.show() + sys.exit(app.exec_()) + + +if __name__ == '__main__': + main() diff --git a/inference_webui0306.py b/inference_webui0306.py new file mode 100644 index 0000000000000000000000000000000000000000..b63b4ba347196bae0d8d5ec679c47a611c6288d8 --- /dev/null +++ b/inference_webui0306.py @@ -0,0 +1,622 @@ +''' +按中英混合识别 +按日英混合识别 +多语种启动切分识别语种 +全部按中文识别 +全部按英文识别 +全部按日文识别 +''' +import os, re, logging +import LangSegment +logging.getLogger("markdown_it").setLevel(logging.ERROR) +logging.getLogger("urllib3").setLevel(logging.ERROR) +logging.getLogger("httpcore").setLevel(logging.ERROR) +logging.getLogger("httpx").setLevel(logging.ERROR) +logging.getLogger("asyncio").setLevel(logging.ERROR) +logging.getLogger("charset_normalizer").setLevel(logging.ERROR) +logging.getLogger("torchaudio._extension").setLevel(logging.ERROR) +import pdb +import torch + +if os.path.exists("./gweight.txt"): + with open("./gweight.txt", 'r', encoding="utf-8") as file: + gweight_data = file.read() + gpt_path = os.environ.get( + "gpt_path", gweight_data) +else: + gpt_path = os.environ.get( + "gpt_path", "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt") + +if os.path.exists("./sweight.txt"): + with open("./sweight.txt", 'r', encoding="utf-8") as 
file: + sweight_data = file.read() + sovits_path = os.environ.get("sovits_path", sweight_data) +else: + sovits_path = os.environ.get("sovits_path", "GPT_SoVITS/pretrained_models/s2G488k.pth") +# gpt_path = os.environ.get( +# "gpt_path", "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" +# ) +# sovits_path = os.environ.get("sovits_path", "pretrained_models/s2G488k.pth") +cnhubert_base_path = os.environ.get( + "cnhubert_base_path", "GPT_SoVITS/pretrained_models/chinese-hubert-base" +) +bert_path = os.environ.get( + "bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" +) +infer_ttswebui = os.environ.get("infer_ttswebui", 9872) +infer_ttswebui = int(infer_ttswebui) +is_share = os.environ.get("is_share", "False") +is_share = eval(is_share) +if "_CUDA_VISIBLE_DEVICES" in os.environ: + os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] +is_half = eval(os.environ.get("is_half", "True")) and not torch.backends.mps.is_available() +import gradio as gr +from transformers import AutoModelForMaskedLM, AutoTokenizer +import numpy as np +import librosa +from feature_extractor import cnhubert + +cnhubert.cnhubert_base_path = cnhubert_base_path + +from module.models import SynthesizerTrn +from AR.models.t2s_lightning_module import Text2SemanticLightningModule +from text import cleaned_text_to_sequence +from text.cleaner import clean_text +from time import time as ttime +from module.mel_processing import spectrogram_torch +from my_utils import load_audio +from tools.i18n.i18n import I18nAuto + +i18n = I18nAuto() + +os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。 + +if torch.cuda.is_available(): + device = "cuda" +else: + device = "cpu" + +tokenizer = AutoTokenizer.from_pretrained(bert_path) +bert_model = AutoModelForMaskedLM.from_pretrained(bert_path) +if is_half == True: + bert_model = bert_model.half().to(device) +else: + bert_model = bert_model.to(device) + + +def get_bert_feature(text, word2ph): + with torch.no_grad(): + inputs = tokenizer(text, return_tensors="pt") + for i in inputs: + inputs[i] = inputs[i].to(device) + res = bert_model(**inputs, output_hidden_states=True) + res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1] + assert len(word2ph) == len(text) + phone_level_feature = [] + for i in range(len(word2ph)): + repeat_feature = res[i].repeat(word2ph[i], 1) + phone_level_feature.append(repeat_feature) + phone_level_feature = torch.cat(phone_level_feature, dim=0) + return phone_level_feature.T + + +class DictToAttrRecursive(dict): + def __init__(self, input_dict): + super().__init__(input_dict) + for key, value in input_dict.items(): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + self[key] = value + setattr(self, key, value) + + def __getattr__(self, item): + try: + return self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + def __setattr__(self, key, value): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + super(DictToAttrRecursive, self).__setitem__(key, value) + super().__setattr__(key, value) + + def __delattr__(self, item): + try: + del self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + +ssl_model = cnhubert.get_model() +if is_half == True: + ssl_model = ssl_model.half().to(device) +else: + ssl_model = ssl_model.to(device) + + +def change_sovits_weights(sovits_path): + global vq_model, hps + dict_s2 = torch.load(sovits_path, map_location="cpu") + hps = dict_s2["config"] + hps = 
DictToAttrRecursive(hps) + hps.model.semantic_frame_rate = "25hz" + vq_model = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model + ) + if ("pretrained" not in sovits_path): + del vq_model.enc_q + if is_half == True: + vq_model = vq_model.half().to(device) + else: + vq_model = vq_model.to(device) + vq_model.eval() + print(vq_model.load_state_dict(dict_s2["weight"], strict=False)) + with open("./sweight.txt", "w", encoding="utf-8") as f: + f.write(sovits_path) + + +change_sovits_weights(sovits_path) + + +def change_gpt_weights(gpt_path): + global hz, max_sec, t2s_model, config + hz = 50 + dict_s1 = torch.load(gpt_path, map_location="cpu") + config = dict_s1["config"] + max_sec = config["data"]["max_sec"] + t2s_model = Text2SemanticLightningModule(config, "****", is_train=False) + t2s_model.load_state_dict(dict_s1["weight"]) + if is_half == True: + t2s_model = t2s_model.half() + t2s_model = t2s_model.to(device) + t2s_model.eval() + total = sum([param.nelement() for param in t2s_model.parameters()]) + print("Number of parameter: %.2fM" % (total / 1e6)) + with open("./gweight.txt", "w", encoding="utf-8") as f: f.write(gpt_path) + + +change_gpt_weights(gpt_path) + + +def get_spepc(hps, filename): + audio = load_audio(filename, int(hps.data.sampling_rate)) + audio = torch.FloatTensor(audio) + audio_norm = audio + audio_norm = audio_norm.unsqueeze(0) + spec = spectrogram_torch( + audio_norm, + hps.data.filter_length, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + center=False, + ) + return spec + + +dict_language = { + i18n("中文"): "all_zh",#全部按中文识别 + i18n("英文"): "en",#全部按英文识别#######不变 + i18n("日文"): "all_ja",#全部按日文识别 + i18n("中英混合"): "zh",#按中英混合识别####不变 + i18n("日英混合"): "ja",#按日英混合识别####不变 + i18n("多语种混合"): "auto",#多语种启动切分识别语种 +} + + +def clean_text_inf(text, language): + phones, word2ph, norm_text = clean_text(text, language) + phones = cleaned_text_to_sequence(phones) + return phones, word2ph, norm_text + +dtype=torch.float16 if is_half == True else torch.float32 +def get_bert_inf(phones, word2ph, norm_text, language): + language=language.replace("all_","") + if language == "zh": + bert = get_bert_feature(norm_text, word2ph).to(device)#.to(dtype) + else: + bert = torch.zeros( + (1024, len(phones)), + dtype=torch.float16 if is_half == True else torch.float32, + ).to(device) + + return bert + + +splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", } + + +def get_first(text): + pattern = "[" + "".join(re.escape(sep) for sep in splits) + "]" + text = re.split(pattern, text)[0].strip() + return text + + +def get_phones_and_bert(text,language): + if language in {"en","all_zh","all_ja"}: + language = language.replace("all_","") + if language == "en": + LangSegment.setfilters(["en"]) + formattext = " ".join(tmp["text"] for tmp in LangSegment.getTexts(text)) + else: + # 因无法区别中日文汉字,以用户输入为准 + formattext = text + while " " in formattext: + formattext = formattext.replace(" ", " ") + phones, word2ph, norm_text = clean_text_inf(formattext, language) + if language == "zh": + bert = get_bert_feature(norm_text, word2ph).to(device) + else: + bert = torch.zeros( + (1024, len(phones)), + dtype=torch.float16 if is_half == True else torch.float32, + ).to(device) + elif language in {"zh", "ja","auto"}: + textlist=[] + langlist=[] + LangSegment.setfilters(["zh","ja","en","ko"]) + if language == "auto": + for tmp in LangSegment.getTexts(text): + if tmp["lang"] == "ko": + 
langlist.append("zh") + textlist.append(tmp["text"]) + else: + langlist.append(tmp["lang"]) + textlist.append(tmp["text"]) + else: + for tmp in LangSegment.getTexts(text): + if tmp["lang"] == "en": + langlist.append(tmp["lang"]) + else: + # 因无法区别中日文汉字,以用户输入为准 + langlist.append(language) + textlist.append(tmp["text"]) + print(textlist) + print(langlist) + phones_list = [] + bert_list = [] + norm_text_list = [] + for i in range(len(textlist)): + lang = langlist[i] + phones, word2ph, norm_text = clean_text_inf(textlist[i], lang) + bert = get_bert_inf(phones, word2ph, norm_text, lang) + phones_list.append(phones) + norm_text_list.append(norm_text) + bert_list.append(bert) + bert = torch.cat(bert_list, dim=1) + phones = sum(phones_list, []) + norm_text = ''.join(norm_text_list) + + return phones,bert.to(dtype),norm_text + + +def merge_short_text_in_array(texts, threshold): + if (len(texts)) < 2: + return texts + result = [] + text = "" + for ele in texts: + text += ele + if len(text) >= threshold: + result.append(text) + text = "" + if (len(text) > 0): + if len(result) == 0: + result.append(text) + else: + result[len(result) - 1] += text + return result + +def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free = False): + if prompt_text is None or len(prompt_text) == 0: + ref_free = True + t0 = ttime() + prompt_language = dict_language[prompt_language] + text_language = dict_language[text_language] + if not ref_free: + prompt_text = prompt_text.strip("\n") + if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "." + print(i18n("实际输入的参考文本:"), prompt_text) + text = text.strip("\n") + if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text + + print(i18n("实际输入的目标文本:"), text) + zero_wav = np.zeros( + int(hps.data.sampling_rate * 0.3), + dtype=np.float16 if is_half == True else np.float32, + ) + with torch.no_grad(): + wav16k, sr = librosa.load(ref_wav_path, sr=16000) + if (wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000): + raise OSError(i18n("参考音频在3~10秒范围外,请更换!")) + wav16k = torch.from_numpy(wav16k) + zero_wav_torch = torch.from_numpy(zero_wav) + if is_half == True: + wav16k = wav16k.half().to(device) + zero_wav_torch = zero_wav_torch.half().to(device) + else: + wav16k = wav16k.to(device) + zero_wav_torch = zero_wav_torch.to(device) + wav16k = torch.cat([wav16k, zero_wav_torch]) + ssl_content = ssl_model.model(wav16k.unsqueeze(0))[ + "last_hidden_state" + ].transpose( + 1, 2 + ) # .float() + codes = vq_model.extract_latent(ssl_content) + + prompt_semantic = codes[0, 0] + t1 = ttime() + + if (how_to_cut == i18n("凑四句一切")): + text = cut1(text) + elif (how_to_cut == i18n("凑50字一切")): + text = cut2(text) + elif (how_to_cut == i18n("按中文句号。切")): + text = cut3(text) + elif (how_to_cut == i18n("按英文句号.切")): + text = cut4(text) + elif (how_to_cut == i18n("按标点符号切")): + text = cut5(text) + while "\n\n" in text: + text = text.replace("\n\n", "\n") + print(i18n("实际输入的目标文本(切句后):"), text) + texts = text.split("\n") + texts = merge_short_text_in_array(texts, 5) + audio_opt = [] + if not ref_free: + phones1,bert1,norm_text1=get_phones_and_bert(prompt_text, prompt_language) + print("前端处理后的参考文本:%s"%norm_text1) + for text in texts: + # 解决输入目标文本的空行导致报错的问题 + if (len(text.strip()) == 0): + continue + if (text[-1] not in splits): text += "。" if text_language != "en" else "." 
+ print(i18n("实际输入的目标文本(每句):"), text) + phones2,bert2,norm_text2=get_phones_and_bert(text, text_language) + print(i18n("前端处理后的文本(每句):"), norm_text2) + if not ref_free: + bert = torch.cat([bert1, bert2], 1) + all_phoneme_ids = torch.LongTensor(phones1+phones2).to(device).unsqueeze(0) + else: + bert = bert2 + all_phoneme_ids = torch.LongTensor(phones2).to(device).unsqueeze(0) + + bert = bert.to(device).unsqueeze(0) + all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device) + prompt = prompt_semantic.unsqueeze(0).to(device) + t2 = ttime() + with torch.no_grad(): + # pred_semantic = t2s_model.model.infer( + pred_semantic, idx = t2s_model.model.infer_panel( + all_phoneme_ids, + all_phoneme_len, + None if ref_free else prompt, + bert, + # prompt_phone_len=ph_offset, + top_k=top_k, + top_p=top_p, + temperature=temperature, + early_stop_num=hz * max_sec, + ) + t3 = ttime() + # print(pred_semantic.shape,idx) + pred_semantic = pred_semantic[:, -idx:].unsqueeze( + 0 + ) # .unsqueeze(0)#mq要多unsqueeze一次 + refer = get_spepc(hps, ref_wav_path) # .to(device) + if is_half == True: + refer = refer.half().to(device) + else: + refer = refer.to(device) + # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0] + audio = ( + vq_model.decode( + pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer + ) + .detach() + .cpu() + .numpy()[0, 0] + ) ###试试重建不带上prompt部分 + max_audio=np.abs(audio).max()#简单防止16bit爆音 + if max_audio>1:audio/=max_audio + audio_opt.append(audio) + audio_opt.append(zero_wav) + t4 = ttime() + print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) + yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype( + np.int16 + ) + + +def split(todo_text): + todo_text = todo_text.replace("……", "。").replace("——", ",") + if todo_text[-1] not in splits: + todo_text += "。" + i_split_head = i_split_tail = 0 + len_text = len(todo_text) + todo_texts = [] + while 1: + if i_split_head >= len_text: + break # 结尾一定有标点,所以直接跳出即可,最后一段在上次已加入 + if todo_text[i_split_head] in splits: + i_split_head += 1 + todo_texts.append(todo_text[i_split_tail:i_split_head]) + i_split_tail = i_split_head + else: + i_split_head += 1 + return todo_texts + + +def cut1(inp): + inp = inp.strip("\n") + inps = split(inp) + split_idx = list(range(0, len(inps), 4)) + split_idx[-1] = None + if len(split_idx) > 1: + opts = [] + for idx in range(len(split_idx) - 1): + opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]])) + else: + opts = [inp] + return "\n".join(opts) + + +def cut2(inp): + inp = inp.strip("\n") + inps = split(inp) + if len(inps) < 2: + return inp + opts = [] + summ = 0 + tmp_str = "" + for i in range(len(inps)): + summ += len(inps[i]) + tmp_str += inps[i] + if summ > 50: + summ = 0 + opts.append(tmp_str) + tmp_str = "" + if tmp_str != "": + opts.append(tmp_str) + # print(opts) + if len(opts) > 1 and len(opts[-1]) < 50: ##如果最后一个太短了,和前一个合一起 + opts[-2] = opts[-2] + opts[-1] + opts = opts[:-1] + return "\n".join(opts) + + +def cut3(inp): + inp = inp.strip("\n") + return "\n".join(["%s" % item for item in inp.strip("。").split("。")]) + + +def cut4(inp): + inp = inp.strip("\n") + return "\n".join(["%s" % item for item in inp.strip(".").split(".")]) + + +# contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py +def cut5(inp): + # if not re.search(r'[^\w\s]', inp[-1]): + # inp += '。' + inp = inp.strip("\n") + punds = r'[,.;?!、,。?!;:…]' + items = re.split(f'({punds})', inp) + mergeitems = 
["".join(group) for group in zip(items[::2], items[1::2])] + # keep the text intact when the sentence has no punctuation, or no trailing punctuation + if len(items)%2 == 1: + mergeitems.append(items[-1]) + opt = "\n".join(mergeitems) + return opt + + +def custom_sort_key(s): + # use a regular expression to split the string into numeric and non-numeric parts + parts = re.split(r'(\d+)', s) + # convert the numeric parts to integers and leave the non-numeric parts unchanged + parts = [int(part) if part.isdigit() else part for part in parts] + return parts + + +def change_choices(): + SoVITS_names, GPT_names = get_weights_names() + return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names, key=custom_sort_key), "__type__": "update"} + + +pretrained_sovits_name = "GPT_SoVITS/pretrained_models/s2G488k.pth" +pretrained_gpt_name = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" +SoVITS_weight_root = "SoVITS_weights" +GPT_weight_root = "GPT_weights" +os.makedirs(SoVITS_weight_root, exist_ok=True) +os.makedirs(GPT_weight_root, exist_ok=True) + + +def get_weights_names(): + SoVITS_names = [pretrained_sovits_name] + for name in os.listdir(SoVITS_weight_root): + if name.endswith(".pth"): SoVITS_names.append("%s/%s" % (SoVITS_weight_root, name)) + GPT_names = [pretrained_gpt_name] + for name in os.listdir(GPT_weight_root): + if name.endswith(".ckpt"): GPT_names.append("%s/%s" % (GPT_weight_root, name)) + return SoVITS_names, GPT_names + + +SoVITS_names, GPT_names = get_weights_names() + +with gr.Blocks(title="GPT-SoVITS WebUI") as app: + gr.Markdown( + value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + ) + with gr.Group(): + gr.Markdown(value=i18n("模型切换")) + with gr.Row(): + GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path, interactive=True) + SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True) + refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary") + refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown]) + SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown], []) + GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], []) + gr.Markdown(value=i18n("*请上传并填写参考信息")) + with gr.Row(): + inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath") + with gr.Column(): + ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True) + gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。")) + prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="") + prompt_language = gr.Dropdown( + label=i18n("参考音频的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文") + ) + gr.Markdown(value=i18n("*请填写需要合成的目标文本和语种模式")) + with gr.Row(): + text = gr.Textbox(label=i18n("需要合成的文本"), value="") + text_language = gr.Dropdown( + label=i18n("需要合成的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文") + ) + how_to_cut = gr.Radio( + label=i18n("怎么切"), + choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ], + value=i18n("凑四句一切"), + interactive=True, + ) + with gr.Row(): + gr.Markdown(value=i18n("gpt采样参数(无参考文本时不要太低):")) + top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True) + top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True) + temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True) + inference_button = gr.Button(i18n("合成语音"), variant="primary") + output = gr.Audio(label=i18n("输出的语音")) + + inference_button.click( + get_tts_wav, + [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free], + [output], + ) + + gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。")) + with gr.Row(): + text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="") + button1 = gr.Button(i18n("凑四句一切"), variant="primary") + button2 = gr.Button(i18n("凑50字一切"), variant="primary") + button3 = gr.Button(i18n("按中文句号。切"), variant="primary") + button4 = gr.Button(i18n("按英文句号.切"), variant="primary") + button5 = gr.Button(i18n("按标点符号切"), variant="primary") + text_opt = gr.Textbox(label=i18n("切分后文本"), value="") + button1.click(cut1, [text_inp], [text_opt]) + button2.click(cut2, [text_inp], [text_opt]) + button3.click(cut3, [text_inp], [text_opt]) + button4.click(cut4, [text_inp], [text_opt]) + button5.click(cut5, [text_inp], [text_opt]) + gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。")) + +app.queue(concurrency_count=511, max_size=1022).launch( + server_name="0.0.0.0", + inbrowser=True, + share=is_share, + server_port=infer_ttswebui, + quiet=True, +) diff --git a/module/__init__.py b/module/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/module/__pycache__/__init__.cpython-39.pyc b/module/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..78ada0898abd880d82aa689f45ad64b8c3fb89d9 Binary files /dev/null and b/module/__pycache__/__init__.cpython-39.pyc differ diff --git a/module/__pycache__/attentions.cpython-39.pyc b/module/__pycache__/attentions.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d28e1dc0a3d57c65f263d8861b28eab48879dfe Binary files /dev/null and b/module/__pycache__/attentions.cpython-39.pyc differ diff --git a/module/__pycache__/commons.cpython-39.pyc b/module/__pycache__/commons.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4bff0fb7d7bdcc4c30623b98de342f0b625cfcf4 Binary files /dev/null and b/module/__pycache__/commons.cpython-39.pyc differ diff --git a/module/__pycache__/core_vq.cpython-39.pyc b/module/__pycache__/core_vq.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a027d6ddb459b5feab6f1883019da8259699de0c Binary files /dev/null and b/module/__pycache__/core_vq.cpython-39.pyc differ diff --git a/module/__pycache__/mel_processing.cpython-39.pyc b/module/__pycache__/mel_processing.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..807bf9a9516694d6eb7a9132cabdab89524480fd Binary files /dev/null and b/module/__pycache__/mel_processing.cpython-39.pyc differ diff --git a/module/__pycache__/models.cpython-39.pyc b/module/__pycache__/models.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93a6cc6555550b7fcd2f019bee45624d8bf4a0eb Binary files /dev/null and b/module/__pycache__/models.cpython-39.pyc differ diff --git a/module/__pycache__/modules.cpython-39.pyc b/module/__pycache__/modules.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffcee949d5531fb37ffc7e1e330f124ae13709f8 Binary files /dev/null and b/module/__pycache__/modules.cpython-39.pyc differ diff --git a/module/__pycache__/mrte_model.cpython-39.pyc b/module/__pycache__/mrte_model.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17754521ba18dffbcf24f40d8af92d8a4932be26 Binary files /dev/null and b/module/__pycache__/mrte_model.cpython-39.pyc differ diff --git a/module/__pycache__/quantize.cpython-39.pyc b/module/__pycache__/quantize.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e7b9ce38bf001ced499e1351f600e70259d25f1 Binary files /dev/null and b/module/__pycache__/quantize.cpython-39.pyc differ diff --git a/module/__pycache__/transforms.cpython-39.pyc b/module/__pycache__/transforms.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb492d6d56bcf55ef22e4a30ca616e99d6c07045 Binary files /dev/null and b/module/__pycache__/transforms.cpython-39.pyc differ diff --git a/module/attentions.py b/module/attentions.py new file mode 100644 index 0000000000000000000000000000000000000000..a2e9e5155ed10884a61531d65f6a9792ec595486 --- /dev/null +++ b/module/attentions.py @@ -0,0 +1,709 @@ +import math +import torch +from torch import nn +from torch.nn import functional as F + +from module import commons +from module.modules import LayerNorm + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=4, + isflow=False, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + 
self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + if isflow: + cond_layer = torch.nn.Conv1d( + kwargs["gin_channels"], 2 * hidden_channels * n_layers, 1 + ) + self.cond_pre = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, 1) + self.cond_layer = weight_norm_modules(cond_layer, name="weight") + self.gin_channels = kwargs["gin_channels"] + + def forward(self, x, x_mask, g=None): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + if g is not None: + g = self.cond_layer(g) + + for i in range(self.n_layers): + if g is not None: + x = self.cond_pre(x) + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] + x = commons.fused_add_tanh_sigmoid_multiply( + x, g_l, torch.IntTensor([self.hidden_channels]) + ) + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class Decoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + proximal_bias=False, + proximal_init=True, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = nn.ModuleList() + self.norm_layers_0 = nn.ModuleList() + self.encdec_attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.self_attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + proximal_bias=proximal_bias, + proximal_init=proximal_init, + ) + ) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.encdec_attn_layers.append( + MultiHeadAttention( + hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + causal=True, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, h, h_mask): + """ + x: decoder input + h: encoder output + """ + self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( + device=x.device, dtype=x.dtype + ) + encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = 
self.self_attn_layers[i](x, x, self_attn_mask) + y = self.drop(y) + x = self.norm_layers_0[i](x + y) + + y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + channels, + out_channels, + n_heads, + p_dropout=0.0, + window_size=None, + heads_share=True, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super().__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.p_dropout = p_dropout + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + nn.init.xavier_uniform_(self.conv_v.weight) + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + if self.window_size is not None: + assert ( + t_s == t_t + ), "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query / math.sqrt(self.k_channels), key_relative_embeddings + ) + scores_local = self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype + ) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + assert ( + t_s == t_t + ), "Local attention is only available for self-attention." 
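The relative-attention branch above turns relative-position logits into absolute-position logits with a pad-and-reshape trick rather than an explicit gather; _relative_position_to_absolute_position below implements it. A self-contained sketch of the same trick, with illustrative shapes:

import torch
import torch.nn.functional as F

def rel_to_abs(x):
    # x: [b, h, l, 2*l - 1] relative logits -> [b, h, l, l] absolute logits
    b, h, l, _ = x.size()
    x = F.pad(x, (0, 1))              # pad one column -> [b, h, l, 2*l]
    x = x.view(b, h, l * 2 * l)       # flatten the last two dims
    x = F.pad(x, (0, l - 1))          # pad so it reshapes to (l + 1, 2*l - 1)
    x = x.view(b, h, l + 1, 2 * l - 1)
    return x[:, :, :l, l - 1:]        # slice out the absolute positions

x = torch.randn(1, 2, 4, 7)           # l = 4, so 2*l - 1 = 7
print(rel_to_abs(x).shape)            # torch.Size([1, 2, 4, 4])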
+ block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores.masked_fill(block_mask == 0, -1e4) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s + ) + output = output + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings + ) + output = ( + output.transpose(2, 3).contiguous().view(b, d, t_t) + ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + ) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad( + x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) + ) + + # Reshape and slice out the padded elements. + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = F.pad( + x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) + ) + x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. 
+ Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + def __init__( + self, + in_channels, + out_channels, + filter_channels, + kernel_size, + p_dropout=0.0, + activation=None, + causal=False, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + self.causal = causal + + if causal: + self.padding = self._causal_padding + else: + self.padding = self._same_padding + + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) + self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) + self.drop = nn.Dropout(p_dropout) + + def forward(self, x, x_mask): + x = self.conv_1(self.padding(x * x_mask)) + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(self.padding(x * x_mask)) + return x * x_mask + + def _causal_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = self.kernel_size - 1 + pad_r = 0 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x + + def _same_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = (self.kernel_size - 1) // 2 + pad_r = self.kernel_size // 2 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x + + +import torch.nn as nn +from torch.nn.utils import remove_weight_norm, weight_norm + + +class Depthwise_Separable_Conv1D(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + bias=True, + padding_mode="zeros", # TODO: refine this type + device=None, + dtype=None, + ): + super().__init__() + self.depth_conv = nn.Conv1d( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + groups=in_channels, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias, + padding_mode=padding_mode, + device=device, + dtype=dtype, + ) + self.point_conv = nn.Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + bias=bias, + device=device, + dtype=dtype, + ) + + def forward(self, input): + return self.point_conv(self.depth_conv(input)) + + def weight_norm(self): + self.depth_conv = weight_norm(self.depth_conv, name="weight") + self.point_conv = weight_norm(self.point_conv, name="weight") + + def remove_weight_norm(self): + self.depth_conv = remove_weight_norm(self.depth_conv, name="weight") + self.point_conv = remove_weight_norm(self.point_conv, name="weight") + + +class Depthwise_Separable_TransposeConv1D(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + bias=True, + dilation=1, + padding_mode="zeros", # TODO: refine this type + device=None, + dtype=None, + ): + super().__init__() + self.depth_conv = nn.ConvTranspose1d( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + groups=in_channels, + stride=stride, + output_padding=output_padding, + padding=padding, + dilation=dilation, + bias=bias, + padding_mode=padding_mode, + device=device, + dtype=dtype, + ) + self.point_conv = nn.Conv1d( + in_channels=in_channels, + 
out_channels=out_channels, + kernel_size=1, + bias=bias, + device=device, + dtype=dtype, + ) + + def forward(self, input): + return self.point_conv(self.depth_conv(input)) + + def weight_norm(self): + self.depth_conv = weight_norm(self.depth_conv, name="weight") + self.point_conv = weight_norm(self.point_conv, name="weight") + + def remove_weight_norm(self): + remove_weight_norm(self.depth_conv, name="weight") + remove_weight_norm(self.point_conv, name="weight") + + +def weight_norm_modules(module, name="weight", dim=0): + if isinstance(module, Depthwise_Separable_Conv1D) or isinstance( + module, Depthwise_Separable_TransposeConv1D + ): + module.weight_norm() + return module + else: + return weight_norm(module, name, dim) + + +def remove_weight_norm_modules(module, name="weight"): + if isinstance(module, Depthwise_Separable_Conv1D) or isinstance( + module, Depthwise_Separable_TransposeConv1D + ): + module.remove_weight_norm() + else: + remove_weight_norm(module, name) + + +class FFT(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers=1, + kernel_size=1, + p_dropout=0.0, + proximal_bias=False, + proximal_init=True, + isflow=False, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + if isflow: + cond_layer = torch.nn.Conv1d( + kwargs["gin_channels"], 2 * hidden_channels * n_layers, 1 + ) + self.cond_pre = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, 1) + self.cond_layer = weight_norm_modules(cond_layer, name="weight") + self.gin_channels = kwargs["gin_channels"] + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = nn.ModuleList() + self.norm_layers_0 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + for i in range(self.n_layers): + self.self_attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + proximal_bias=proximal_bias, + proximal_init=proximal_init, + ) + ) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + causal=True, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, g=None): + """ + x: decoder input + h: encoder output + """ + if g is not None: + g = self.cond_layer(g) + + self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( + device=x.device, dtype=x.dtype + ) + x = x * x_mask + for i in range(self.n_layers): + if g is not None: + x = self.cond_pre(x) + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] + x = commons.fused_add_tanh_sigmoid_multiply( + x, g_l, torch.IntTensor([self.hidden_channels]) + ) + y = self.self_attn_layers[i](x, x, self_attn_mask) + y = self.drop(y) + x = self.norm_layers_0[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + x = x * x_mask + return x + + +class TransformerCouplingLayer(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + n_layers, + n_heads, + p_dropout=0, + filter_channels=0, + mean_only=False, + wn_sharing_parameter=None, + gin_channels=0, + ): + assert channels % 2 == 0, "channels should be 
divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = ( + Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + isflow=True, + gin_channels=gin_channels, + ) + if wn_sharing_parameter is None + else wn_sharing_parameter + ) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels] * 2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1, 2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x diff --git a/module/attentions_onnx.py b/module/attentions_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..bc63a06ff571429b1b8ca267ac7b4b64deaa21ce --- /dev/null +++ b/module/attentions_onnx.py @@ -0,0 +1,354 @@ +import math +import torch +from torch import nn +from torch.nn import functional as F + +from module import commons +from module.modules import LayerNorm + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=4, + isflow=True, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + # if isflow: + # cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1) + # self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1) + # self.cond_layer = weight_norm(cond_layer, name='weight') + # self.gin_channels = 256 + self.cond_layer_idx = self.n_layers + if "gin_channels" in kwargs: + self.gin_channels = kwargs["gin_channels"] + if self.gin_channels != 0: + self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels) + # vits2 says 3rd block, so idx is 2 by default + self.cond_layer_idx = ( + kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2 + ) + logging.debug(self.gin_channels, self.cond_layer_idx) + assert ( + self.cond_layer_idx < self.n_layers + ), "cond_layer_idx should be less than n_layers" + self.drop 
= nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, g=None): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + if i == self.cond_layer_idx and g is not None: + g = self.spk_emb_linear(g.transpose(1, 2)) + g = g.transpose(1, 2) + x = x + g + x = x * x_mask + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + channels, + out_channels, + n_heads, + p_dropout=0.0, + window_size=None, + heads_share=True, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super().__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.p_dropout = p_dropout + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + nn.init.xavier_uniform_(self.conv_v.weight) + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, _ = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, -1).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, -1).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, -1).transpose(2, 3) + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + + if self.window_size is not None: + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings) + scores_local = 
self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + + p_attn = F.softmax(scores, dim=-1) + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) + output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) + + output = (output.transpose(2, 3).contiguous().view(b, d, -1)) + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_l = torch.zeros((1), dtype = torch.int64) + length - (self.window_size + 1) + pad_s = torch.zeros((1), dtype = torch.int64) + (self.window_size + 1) - length + pad_length = torch.max(pad_l, other=torch.zeros((1), dtype = torch.int64)) + slice_start_position = torch.max(pad_s, other=torch.zeros((1), dtype = torch.int64)) + + slice_end_position = slice_start_position + 2 * length - 1 + padded_relative_embeddings = F.pad( + relative_embeddings, + commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + ) + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad( + x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) + ) + + # Reshape and slice out the padded elements. + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = F.pad( + x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) + ) + x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. 
+ Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + def __init__( + self, + in_channels, + out_channels, + filter_channels, + kernel_size, + p_dropout=0.0, + activation=None, + causal=False, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + self.causal = causal + + if causal: + self.padding = self._causal_padding + else: + self.padding = self._same_padding + + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) + self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) + self.drop = nn.Dropout(p_dropout) + + def forward(self, x, x_mask): + x = self.conv_1(self.padding(x * x_mask)) + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(self.padding(x * x_mask)) + return x * x_mask + + def _causal_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = self.kernel_size - 1 + pad_r = 0 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x + + def _same_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = (self.kernel_size - 1) // 2 + pad_r = self.kernel_size // 2 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x diff --git a/module/commons.py b/module/commons.py new file mode 100644 index 0000000000000000000000000000000000000000..e96cf923515c7f5648af9dfedaa803df3f4f28be --- /dev/null +++ b/module/commons.py @@ -0,0 +1,189 @@ +import math +import torch +from torch.nn import functional as F + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def intersperse(lst, item): + result = [item] * (len(lst) * 2 + 1) + result[1::2] = lst + return result + + +def kl_divergence(m_p, logs_p, m_q, logs_q): + """KL(P||Q)""" + kl = (logs_q - logs_p) - 0.5 + kl += ( + 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) + ) + return kl + + +def rand_gumbel(shape): + """Sample from the Gumbel distribution, protect from overflows.""" + uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 + return -torch.log(-torch.log(uniform_samples)) + + +def rand_gumbel_like(x): + g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) + return g + + +def slice_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, :, idx_str:idx_end] + return ret + + +def rand_slice_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def 
get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( + num_timescales - 1 + ) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment + ) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = F.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return x + signal.to(dtype=x.dtype, device=x.device) + + +def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) + + +def subsequent_mask(length): + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + device = duration.device + + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2, 3) * mask + return path + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1.0 / norm_type) + return total_norm + + +def squeeze(x, x_mask=None, n_sqz=2): + b, c, t = x.size() + + t = (t // n_sqz) * n_sqz + x = x[:, :, :t] + x_sqz = x.view(b, c, t // n_sqz, n_sqz) + x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz) + + if x_mask is not None: + x_mask = x_mask[:, :, n_sqz - 1 :: n_sqz] + else: + x_mask = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype) + return x_sqz * x_mask, x_mask + + +def 
unsqueeze(x, x_mask=None, n_sqz=2): + b, c, t = x.size() + + x_unsqz = x.view(b, n_sqz, c // n_sqz, t) + x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz) + + if x_mask is not None: + x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz) + else: + x_mask = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype) + return x_unsqz * x_mask, x_mask diff --git a/module/core_vq.py b/module/core_vq.py new file mode 100644 index 0000000000000000000000000000000000000000..a5e22d667631d22a561189651e18e821800b57a5 --- /dev/null +++ b/module/core_vq.py @@ -0,0 +1,383 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# +# This implementation is inspired from +# https://github.com/lucidrains/vector-quantize-pytorch +# which is released under MIT License. Hereafter, the original license: +# MIT License +# +# Copyright (c) 2020 Phil Wang +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +"""Core vector quantization implementation.""" +import typing as tp + +from einops import rearrange, repeat +import torch +from torch import nn +import torch.nn.functional as F +from tqdm import tqdm + + +def default(val: tp.Any, d: tp.Any) -> tp.Any: + return val if val is not None else d + + +def ema_inplace(moving_avg, new, decay: float): + moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay)) + + +def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5): + return (x + epsilon) / (x.sum() + n_categories * epsilon) + + +def uniform_init(*shape: int): + t = torch.empty(shape) + nn.init.kaiming_uniform_(t) + return t + + +def sample_vectors(samples, num: int): + num_samples, device = samples.shape[0], samples.device + + if num_samples >= num: + indices = torch.randperm(num_samples, device=device)[:num] + else: + indices = torch.randint(0, num_samples, (num,), device=device) + + return samples[indices] + + +def kmeans(samples, num_clusters: int, num_iters: int = 10): + dim, dtype = samples.shape[-1], samples.dtype + max_kmeans_samples = 500 + samples = samples[:max_kmeans_samples, :] + means = sample_vectors(samples, num_clusters) + + print("kmeans start ... 
") + for _ in tqdm(range(num_iters)): + diffs = rearrange(samples, "n d -> n () d") - rearrange(means, "c d -> () c d") + dists = -(diffs**2).sum(dim=-1) + + buckets = dists.max(dim=-1).indices + bins = torch.bincount(buckets, minlength=num_clusters) + zero_mask = bins == 0 + bins_min_clamped = bins.masked_fill(zero_mask, 1) + + new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype) + new_means.scatter_add_(0, repeat(buckets, "n -> n d", d=dim), samples) + new_means = new_means / bins_min_clamped[..., None] + + means = torch.where(zero_mask[..., None], means, new_means) + + return means, bins + + +class EuclideanCodebook(nn.Module): + """Codebook with Euclidean distance. + Args: + dim (int): Dimension. + codebook_size (int): Codebook size. + kmeans_init (bool): Whether to use k-means to initialize the codebooks. + If set to true, run the k-means algorithm on the first training batch and use + the learned centroids as initialization. + kmeans_iters (int): Number of iterations used for k-means algorithm at initialization. + decay (float): Decay for exponential moving average over the codebooks. + epsilon (float): Epsilon value for numerical stability. + threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes + that have an exponential moving average cluster size less than the specified threshold with + randomly selected vector from the current batch. + """ + + def __init__( + self, + dim: int, + codebook_size: int, + kmeans_init: int = False, + kmeans_iters: int = 10, + decay: float = 0.99, + epsilon: float = 1e-5, + threshold_ema_dead_code: int = 2, + ): + super().__init__() + self.decay = decay + init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = ( + uniform_init if not kmeans_init else torch.zeros + ) + embed = init_fn(codebook_size, dim) + + self.codebook_size = codebook_size + + self.kmeans_iters = kmeans_iters + self.epsilon = epsilon + self.threshold_ema_dead_code = threshold_ema_dead_code + + self.register_buffer("inited", torch.Tensor([not kmeans_init])) + self.register_buffer("cluster_size", torch.zeros(codebook_size)) + self.register_buffer("embed", embed) + self.register_buffer("embed_avg", embed.clone()) + + @torch.jit.ignore + def init_embed_(self, data): + if self.inited: + return + + embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters) + self.embed.data.copy_(embed) + self.embed_avg.data.copy_(embed.clone()) + self.cluster_size.data.copy_(cluster_size) + self.inited.data.copy_(torch.Tensor([True])) + # Make sure all buffers across workers are in sync after initialization + # broadcast_tensors(self.buffers()) + + def replace_(self, samples, mask): + modified_codebook = torch.where( + mask[..., None], sample_vectors(samples, self.codebook_size), self.embed + ) + self.embed.data.copy_(modified_codebook) + + def expire_codes_(self, batch_samples): + if self.threshold_ema_dead_code == 0: + return + + expired_codes = self.cluster_size < self.threshold_ema_dead_code + if not torch.any(expired_codes): + return + + batch_samples = rearrange(batch_samples, "... d -> (...) d") + self.replace_(batch_samples, mask=expired_codes) + # broadcast_tensors(self.buffers()) + + def preprocess(self, x): + x = rearrange(x, "... d -> (...) 
d") + return x + + def quantize(self, x): + embed = self.embed.t() + dist = -( + x.pow(2).sum(1, keepdim=True) + - 2 * x @ embed + + embed.pow(2).sum(0, keepdim=True) + ) + embed_ind = dist.max(dim=-1).indices + return embed_ind + + def postprocess_emb(self, embed_ind, shape): + return embed_ind.view(*shape[:-1]) + + def dequantize(self, embed_ind): + quantize = F.embedding(embed_ind, self.embed) + return quantize + + def encode(self, x): + shape = x.shape + # pre-process + x = self.preprocess(x) + # quantize + embed_ind = self.quantize(x) + # post-process + embed_ind = self.postprocess_emb(embed_ind, shape) + return embed_ind + + def decode(self, embed_ind): + quantize = self.dequantize(embed_ind) + return quantize + + def forward(self, x): + shape, dtype = x.shape, x.dtype + x = self.preprocess(x) + + self.init_embed_(x) + + embed_ind = self.quantize(x) + embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype) + embed_ind = self.postprocess_emb(embed_ind, shape) + quantize = self.dequantize(embed_ind) + + if self.training: + # We do the expiry of code at that point as buffers are in sync + # and all the workers will take the same decision. + self.expire_codes_(x) + ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay) + embed_sum = x.t() @ embed_onehot + ema_inplace(self.embed_avg, embed_sum.t(), self.decay) + cluster_size = ( + laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon) + * self.cluster_size.sum() + ) + embed_normalized = self.embed_avg / cluster_size.unsqueeze(1) + self.embed.data.copy_(embed_normalized) + + return quantize, embed_ind + + +class VectorQuantization(nn.Module): + """Vector quantization implementation. + Currently supports only euclidean distance. + Args: + dim (int): Dimension + codebook_size (int): Codebook size + codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim. + decay (float): Decay for exponential moving average over the codebooks. + epsilon (float): Epsilon value for numerical stability. + kmeans_init (bool): Whether to use kmeans to initialize the codebooks. + kmeans_iters (int): Number of iterations used for kmeans initialization. + threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes + that have an exponential moving average cluster size less than the specified threshold with + randomly selected vector from the current batch. + commitment_weight (float): Weight for commitment loss. 
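The codebook lookup above expands the squared Euclidean distance (||x||^2 - 2 x.e + ||e||^2) so the nearest code is found with a single matrix product, and VectorQuantization further below pairs that lookup with a commitment loss and a straight-through estimator. A condensed, self-contained sketch of those two ideas; tensor names and sizes are illustrative:

import torch
import torch.nn.functional as F

def nearest_code_ste(x, codebook, commitment_weight=1.0):
    # x: [n, d] encoder outputs, codebook: [K, d] code vectors
    dist = (x.pow(2).sum(1, keepdim=True)
            - 2 * x @ codebook.t()
            + codebook.pow(2).sum(1))            # squared distances, [n, K]
    idx = dist.argmin(dim=-1)
    q = codebook[idx]                            # nearest code vectors, [n, d]
    commit = F.mse_loss(q.detach(), x) * commitment_weight
    q = x + (q - x).detach()                     # straight-through: grads flow to x
    return q, idx, commit

x = torch.randn(8, 4, requires_grad=True)
codebook = torch.randn(16, 4)
q, idx, loss = nearest_code_ste(x, codebook)
loss.backward()
print(q.shape, idx.shape, x.grad.shape)          # [8, 4], [8], [8, 4]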
+ """ + + def __init__( + self, + dim: int, + codebook_size: int, + codebook_dim: tp.Optional[int] = None, + decay: float = 0.99, + epsilon: float = 1e-5, + kmeans_init: bool = True, + kmeans_iters: int = 50, + threshold_ema_dead_code: int = 2, + commitment_weight: float = 1.0, + ): + super().__init__() + _codebook_dim: int = default(codebook_dim, dim) + + requires_projection = _codebook_dim != dim + self.project_in = ( + nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity() + ) + self.project_out = ( + nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity() + ) + + self.epsilon = epsilon + self.commitment_weight = commitment_weight + + self._codebook = EuclideanCodebook( + dim=_codebook_dim, + codebook_size=codebook_size, + kmeans_init=kmeans_init, + kmeans_iters=kmeans_iters, + decay=decay, + epsilon=epsilon, + threshold_ema_dead_code=threshold_ema_dead_code, + ) + self.codebook_size = codebook_size + + @property + def codebook(self): + return self._codebook.embed + + def encode(self, x): + x = rearrange(x, "b d n -> b n d") + x = self.project_in(x) + embed_in = self._codebook.encode(x) + return embed_in + + def decode(self, embed_ind): + quantize = self._codebook.decode(embed_ind) + quantize = self.project_out(quantize) + quantize = rearrange(quantize, "b n d -> b d n") + return quantize + + def forward(self, x): + device = x.device + x = rearrange(x, "b d n -> b n d") + x = self.project_in(x) + + quantize, embed_ind = self._codebook(x) + + if self.training: + quantize = x + (quantize - x).detach() + + loss = torch.tensor([0.0], device=device, requires_grad=self.training) + + if self.training: + if self.commitment_weight > 0: + commit_loss = F.mse_loss(quantize.detach(), x) + loss = loss + commit_loss * self.commitment_weight + + quantize = self.project_out(quantize) + quantize = rearrange(quantize, "b n d -> b d n") + return quantize, embed_ind, loss + + +class ResidualVectorQuantization(nn.Module): + """Residual vector quantization implementation. + Follows Algorithm 1. 
in https://arxiv.org/pdf/2107.03312.pdf + """ + + def __init__(self, *, num_quantizers, **kwargs): + super().__init__() + self.layers = nn.ModuleList( + [VectorQuantization(**kwargs) for _ in range(num_quantizers)] + ) + + def forward( + self, x, n_q: tp.Optional[int] = None, layers: tp.Optional[list] = None + ): + quantized_out = 0.0 + residual = x + + all_losses = [] + all_indices = [] + out_quantized = [] + + n_q = n_q or len(self.layers) + + for i, layer in enumerate(self.layers[:n_q]): + quantized, indices, loss = layer(residual) + residual = residual - quantized + quantized_out = quantized_out + quantized + + all_indices.append(indices) + all_losses.append(loss) + if layers and i in layers: + out_quantized.append(quantized) + + out_losses, out_indices = map(torch.stack, (all_losses, all_indices)) + return quantized_out, out_indices, out_losses, out_quantized + + def encode( + self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None + ) -> torch.Tensor: + residual = x + all_indices = [] + n_q = n_q or len(self.layers) + st = st or 0 + for layer in self.layers[st:n_q]: + indices = layer.encode(residual) + quantized = layer.decode(indices) + residual = residual - quantized + all_indices.append(indices) + out_indices = torch.stack(all_indices) + return out_indices + + def decode(self, q_indices: torch.Tensor, st: int = 0) -> torch.Tensor: + quantized_out = torch.tensor(0.0, device=q_indices.device) + for i, indices in enumerate(q_indices): + layer = self.layers[st + i] + quantized = layer.decode(indices) + quantized_out = quantized_out + quantized + return quantized_out diff --git a/module/data_utils.py b/module/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ff4c4f4311db8a3fda0a6052ba00ac0c622a9236 --- /dev/null +++ b/module/data_utils.py @@ -0,0 +1,332 @@ +import time +import logging +import os +import random +import traceback +import numpy as np +import torch +import torch.utils.data +from tqdm import tqdm + +from module import commons +from module.mel_processing import spectrogram_torch +from text import cleaned_text_to_sequence +from utils import load_wav_to_torch, load_filepaths_and_text +import torch.nn.functional as F +from functools import lru_cache +import requests +from scipy.io import wavfile +from io import BytesIO +from my_utils import load_audio + +# ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79) +class TextAudioSpeakerLoader(torch.utils.data.Dataset): + """ + 1) loads audio, speaker_id, text pairs + 2) normalizes text and converts them to sequences of integers + 3) computes spectrograms from audio files. 
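ResidualVectorQuantization above quantizes the residual left by each stage and sums the per-stage reconstructions. A minimal standalone sketch of that loop, using a toy nearest-neighbour quantizer; codebook sizes and shapes are illustrative:

import torch

def toy_quantizer(codebook):
    def quantize(x):
        idx = torch.cdist(x, codebook).argmin(dim=-1)
        return codebook[idx], idx
    return quantize

def residual_vq(x, quantizers):
    quantized_out = torch.zeros_like(x)
    residual = x
    indices = []
    for quantize in quantizers:
        quantized, idx = quantize(residual)    # quantize what is still unexplained
        residual = residual - quantized
        quantized_out = quantized_out + quantized
        indices.append(idx)
    return quantized_out, torch.stack(indices)

x = torch.randn(8, 4)
layers = [toy_quantizer(torch.randn(16, 4)) for _ in range(3)]
out, idx = residual_vq(x, layers)
print(out.shape, idx.shape)                    # torch.Size([8, 4]) torch.Size([3, 8])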
+ """ + + def __init__(self, hparams, val=False): + exp_dir = hparams.exp_dir + self.path2 = "%s/2-name2text.txt" % exp_dir + self.path4 = "%s/4-cnhubert" % exp_dir + self.path5 = "%s/5-wav32k" % exp_dir + assert os.path.exists(self.path2) + assert os.path.exists(self.path4) + assert os.path.exists(self.path5) + names4 = set([name[:-3] for name in list(os.listdir(self.path4))]) # 去除.pt后缀 + names5 = set(os.listdir(self.path5)) + self.phoneme_data = {} + with open(self.path2, "r", encoding="utf8") as f: + lines = f.read().strip("\n").split("\n") + + for line in lines: + tmp = line.split("\t") + if (len(tmp) != 4): + continue + self.phoneme_data[tmp[0]] = [tmp[1]] + + self.audiopaths_sid_text = list(set(self.phoneme_data) & names4 & names5) + tmp = self.audiopaths_sid_text + leng = len(tmp) + min_num = 100 + if (leng < min_num): + self.audiopaths_sid_text = [] + for _ in range(max(2, int(min_num / leng))): + self.audiopaths_sid_text += tmp + self.max_wav_value = hparams.max_wav_value + self.sampling_rate = hparams.sampling_rate + self.filter_length = hparams.filter_length + self.hop_length = hparams.hop_length + self.win_length = hparams.win_length + self.sampling_rate = hparams.sampling_rate + self.val = val + + random.seed(1234) + random.shuffle(self.audiopaths_sid_text) + + print("phoneme_data_len:", len(self.phoneme_data.keys())) + print("wav_data_len:", len(self.audiopaths_sid_text)) + + audiopaths_sid_text_new = [] + lengths = [] + skipped_phone = 0 + skipped_dur = 0 + for audiopath in tqdm(self.audiopaths_sid_text): + try: + phoneme = self.phoneme_data[audiopath][0] + phoneme = phoneme.split(' ') + phoneme_ids = cleaned_text_to_sequence(phoneme) + except Exception: + print(f"{audiopath} not in self.phoneme_data !") + skipped_phone += 1 + continue + + size = os.path.getsize("%s/%s" % (self.path5, audiopath)) + duration = size / self.sampling_rate / 2 + + if duration == 0: + print(f"Zero duration for {audiopath}, skipping...") + skipped_dur += 1 + continue + + if 54 > duration > 0.6 or self.val: + audiopaths_sid_text_new.append([audiopath, phoneme_ids]) + lengths.append(size // (2 * self.hop_length)) + else: + skipped_dur += 1 + continue + + print("skipped_phone: ", skipped_phone, ", skipped_dur: ", skipped_dur) + print("total left: ", len(audiopaths_sid_text_new)) + assert len(audiopaths_sid_text_new) > 1 # 至少能凑够batch size,这里todo + self.audiopaths_sid_text = audiopaths_sid_text_new + self.lengths = lengths + + def get_audio_text_speaker_pair(self, audiopath_sid_text): + audiopath, phoneme_ids = audiopath_sid_text + text = torch.FloatTensor(phoneme_ids) + try: + spec, wav = self.get_audio("%s/%s" % (self.path5, audiopath)) + with torch.no_grad(): + ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu") + if (ssl.shape[-1] != spec.shape[-1]): + typee = ssl.dtype + ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee) + ssl.requires_grad = False + except: + traceback.print_exc() + spec = torch.zeros(1025, 100) + wav = torch.zeros(1, 100 * self.hop_length) + ssl = torch.zeros(1, 768, 100) + text = text[-1:] + print("load audio or ssl error!!!!!!", audiopath) + return (ssl, spec, wav, text) + + def get_audio(self, filename): + audio_array = load_audio(filename, self.sampling_rate) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768 + audio = torch.FloatTensor(audio_array) # /32768 + audio_norm = audio + audio_norm = audio_norm.unsqueeze(0) + spec = spectrogram_torch(audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, + center=False) + spec = 
torch.squeeze(spec, 0) + return spec, audio_norm + + def get_sid(self, sid): + sid = torch.LongTensor([int(sid)]) + return sid + + def __getitem__(self, index): + # with torch.no_grad(): + return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index]) + + def __len__(self): + return len(self.audiopaths_sid_text) + + def random_slice(self, ssl, wav, mel): + assert abs(ssl.shape[-1] - wav.shape[-1] // self.hop_length) < 3, ( + "first", ssl.shape, wav.shape) + + len_mel = mel.shape[1] + if self.val: + reference_mel = mel[:, :len_mel // 3] + return reference_mel, ssl, wav, mel + dir = random.randint(0, 1) + sep_point = random.randint(int(len_mel // 3), int(len_mel // 3 * 2)) + + if dir == 0: + reference_mel = mel[:, :sep_point] + ssl = ssl[:, :, sep_point:] + wav2 = wav[:, sep_point * self.hop_length:] + mel = mel[:, sep_point:] + else: + reference_mel = mel[:, sep_point:] + ssl = ssl[:, :, :sep_point] + wav2 = wav[:, :sep_point * self.hop_length] + mel = mel[:, :sep_point] + + assert abs(ssl.shape[-1] - wav2.shape[-1] // self.hop_length) < 3, ( + ssl.shape, wav.shape, wav2.shape, mel.shape, sep_point, self.hop_length, sep_point * self.hop_length, dir) + return reference_mel, ssl, wav2, mel + + +class TextAudioSpeakerCollate(): + """ Zero-pads model inputs and targets + """ + + def __init__(self, return_ids=False): + self.return_ids = return_ids + + def __call__(self, batch): + """Collate's training batch from normalized text, audio and speaker identities + PARAMS + ------ + batch: [text_normalized, spec_normalized, wav_normalized, sid] + """ + # Right zero-pad all one-hot text sequences to max input length + _, ids_sorted_decreasing = torch.sort( + torch.LongTensor([x[1].size(1) for x in batch]), + dim=0, descending=True) + + max_ssl_len = max([x[0].size(2) for x in batch]) + max_ssl_len = int(2 * ((max_ssl_len // 2) + 1)) + max_spec_len = max([x[1].size(1) for x in batch]) + max_spec_len = int(2 * ((max_spec_len // 2) + 1)) + max_wav_len = max([x[2].size(1) for x in batch]) + max_text_len = max([x[3].size(0) for x in batch]) + + ssl_lengths = torch.LongTensor(len(batch)) + spec_lengths = torch.LongTensor(len(batch)) + wav_lengths = torch.LongTensor(len(batch)) + text_lengths = torch.LongTensor(len(batch)) + + spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len) + wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) + ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len) + text_padded = torch.LongTensor(len(batch), max_text_len) + + spec_padded.zero_() + wav_padded.zero_() + ssl_padded.zero_() + text_padded.zero_() + + for i in range(len(ids_sorted_decreasing)): + row = batch[ids_sorted_decreasing[i]] + + ssl = row[0] + ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :] + ssl_lengths[i] = ssl.size(2) + + spec = row[1] + spec_padded[i, :, :spec.size(1)] = spec + spec_lengths[i] = spec.size(1) + + wav = row[2] + wav_padded[i, :, :wav.size(1)] = wav + wav_lengths[i] = wav.size(1) + + text = row[3] + text_padded[i, :text.size(0)] = text + text_lengths[i] = text.size(0) + + return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths + + +class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): + """ + Maintain similar input lengths in a batch. + Length groups are specified by boundaries. + Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}. 
+ + It removes samples which are not included in the boundaries. + Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded. + """ + + def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True): + super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) + self.lengths = dataset.lengths + self.batch_size = batch_size + self.boundaries = boundaries + + self.buckets, self.num_samples_per_bucket = self._create_buckets() + self.total_size = sum(self.num_samples_per_bucket) + self.num_samples = self.total_size // self.num_replicas + + def _create_buckets(self): + buckets = [[] for _ in range(len(self.boundaries) - 1)] + for i in range(len(self.lengths)): + length = self.lengths[i] + idx_bucket = self._bisect(length) + if idx_bucket != -1: + buckets[idx_bucket].append(i) + + i = len(buckets) - 1 + while i >= 0: + if len(buckets[i]) == 0: + buckets.pop(i) + self.boundaries.pop(i + 1) + i -= 1 + + num_samples_per_bucket = [] + for i in range(len(buckets)): + len_bucket = len(buckets[i]) + total_batch_size = self.num_replicas * self.batch_size + rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size + num_samples_per_bucket.append(len_bucket + rem) + return buckets, num_samples_per_bucket + + def __iter__(self): + g = torch.Generator() + g.manual_seed(self.epoch) + + indices = [] + if self.shuffle: + for bucket in self.buckets: + indices.append(torch.randperm(len(bucket), generator=g).tolist()) + else: + for bucket in self.buckets: + indices.append(list(range(len(bucket)))) + + batches = [] + for i in range(len(self.buckets)): + bucket = self.buckets[i] + len_bucket = len(bucket) + ids_bucket = indices[i] + num_samples_bucket = self.num_samples_per_bucket[i] + + rem = num_samples_bucket - len_bucket + ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)] + + ids_bucket = ids_bucket[self.rank::self.num_replicas] + + for j in range(len(ids_bucket) // self.batch_size): + batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]] + batches.append(batch) + + if self.shuffle: + batch_ids = torch.randperm(len(batches), generator=g).tolist() + batches = [batches[i] for i in batch_ids] + self.batches = batches + + assert len(self.batches) * self.batch_size == self.num_samples + return iter(self.batches) + + def _bisect(self, x, lo=0, hi=None): + if hi is None: + hi = len(self.boundaries) - 1 + + if hi > lo: + mid = (hi + lo) // 2 + if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]: + return mid + elif x <= self.boundaries[mid]: + return self._bisect(x, lo, mid) + else: + return self._bisect(x, mid + 1, hi) + else: + return -1 + + def __len__(self): + return self.num_samples // self.batch_size \ No newline at end of file diff --git a/module/losses.py b/module/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..b23fc8c1d5403d8d1ac19bb57e734029dcd67a19 --- /dev/null +++ b/module/losses.py @@ -0,0 +1,73 @@ +import math + +import torch +from torch.nn import functional as F + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + rl = rl.float().detach() + gl = gl.float() + loss += torch.mean(torch.abs(rl - gl)) + + return loss * 2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + dr = dr.float() + dg 
= dg.float() + r_loss = torch.mean((1 - dr) ** 2) + g_loss = torch.mean(dg**2) + loss += r_loss + g_loss + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + dg = dg.float() + l = torch.mean((1 - dg) ** 2) + gen_losses.append(l) + loss += l + + return loss, gen_losses + + +def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): + """ + z_p, logs_q: [b, h, t_t] + m_p, logs_p: [b, h, t_t] + """ + z_p = z_p.float() + logs_q = logs_q.float() + m_p = m_p.float() + logs_p = logs_p.float() + z_mask = z_mask.float() + + kl = logs_p - logs_q - 0.5 + kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) + kl = torch.sum(kl * z_mask) + l = kl / torch.sum(z_mask) + return l + + +def mle_loss(z, m, logs, logdet, mask): + l = torch.sum(logs) + 0.5 * torch.sum( + torch.exp(-2 * logs) * ((z - m) ** 2) + ) # neg normal likelihood w/o the constant term + l = l - torch.sum(logdet) # log jacobian determinant + l = l / torch.sum( + torch.ones_like(z) * mask + ) # averaging across batch, channel and time axes + l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term + return l diff --git a/module/mel_processing.py b/module/mel_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..503825ec6df0ba395b36739516a5d21842eb72d8 --- /dev/null +++ b/module/mel_processing.py @@ -0,0 +1,153 @@ +import math +import os +import random +import torch +from torch import nn +import torch.nn.functional as F +import torch.utils.data +import numpy as np +import librosa +import librosa.util as librosa_util +from librosa.util import normalize, pad_center, tiny +from scipy.signal import get_window +from scipy.io.wavfile import read +from librosa.filters import mel as librosa_mel_fn + +MAX_WAV_VALUE = 32768.0 + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + output = dynamic_range_compression_torch(magnitudes) + return output + + +def spectral_de_normalize_torch(magnitudes): + output = dynamic_range_decompression_torch(magnitudes) + return output + + +mel_basis = {} +hann_window = {} + + +def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): + if torch.min(y) < -1.0: + print("min value is ", torch.min(y)) + if torch.max(y) > 1.0: + print("max value is ", torch.max(y)) + + global hann_window + dtype_device = str(y.dtype) + "_" + str(y.device) + wnsize_dtype_device = str(win_size) + "_" + dtype_device + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( + dtype=y.dtype, device=y.device + ) + + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), + mode="reflect", + ) + y = y.squeeze(1) + spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=False, + ) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + return spec + + +def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): + global mel_basis + dtype_device = str(spec.dtype) + 
"_" + str(spec.device) + fmax_dtype_device = str(fmax) + "_" + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn( + sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax + ) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( + dtype=spec.dtype, device=spec.device + ) + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = spectral_normalize_torch(spec) + return spec + + +def mel_spectrogram_torch( + y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False +): + if torch.min(y) < -1.0: + print("min value is ", torch.min(y)) + if torch.max(y) > 1.0: + print("max value is ", torch.max(y)) + + global mel_basis, hann_window + dtype_device = str(y.dtype) + "_" + str(y.device) + fmax_dtype_device = str(fmax) + "_" + dtype_device + wnsize_dtype_device = str(win_size) + "_" + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn( + sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax + ) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( + dtype=y.dtype, device=y.device + ) + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( + dtype=y.dtype, device=y.device + ) + + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), + mode="reflect", + ) + y = y.squeeze(1) + + spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=False, + ) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = spectral_normalize_torch(spec) + + return spec diff --git a/module/models.py b/module/models.py new file mode 100644 index 0000000000000000000000000000000000000000..a4d223522723bbe3070fc2bed7556239ced1e526 --- /dev/null +++ b/module/models.py @@ -0,0 +1,992 @@ +import copy +import math +import torch +from torch import nn +from torch.nn import functional as F + +from module import commons +from module import modules +from module import attentions + +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from module.commons import init_weights, get_padding +from module.mrte_model import MRTE +from module.quantize import ResidualVectorQuantizer +from text import symbols +from torch.cuda.amp import autocast + + +class StochasticDurationPredictor(nn.Module): + def __init__( + self, + in_channels, + filter_channels, + kernel_size, + p_dropout, + n_flows=4, + gin_channels=0, + ): + super().__init__() + filter_channels = in_channels # it needs to be removed from future version. 
+ self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.log_flow = modules.Log() + self.flows = nn.ModuleList() + self.flows.append(modules.ElementwiseAffine(2)) + for i in range(n_flows): + self.flows.append( + modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) + ) + self.flows.append(modules.Flip()) + + self.post_pre = nn.Conv1d(1, filter_channels, 1) + self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) + self.post_convs = modules.DDSConv( + filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout + ) + self.post_flows = nn.ModuleList() + self.post_flows.append(modules.ElementwiseAffine(2)) + for i in range(4): + self.post_flows.append( + modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) + ) + self.post_flows.append(modules.Flip()) + + self.pre = nn.Conv1d(in_channels, filter_channels, 1) + self.proj = nn.Conv1d(filter_channels, filter_channels, 1) + self.convs = modules.DDSConv( + filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout + ) + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, filter_channels, 1) + + def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0): + x = torch.detach(x) + x = self.pre(x) + if g is not None: + g = torch.detach(g) + x = x + self.cond(g) + x = self.convs(x, x_mask) + x = self.proj(x) * x_mask + + if not reverse: + flows = self.flows + assert w is not None + + logdet_tot_q = 0 + h_w = self.post_pre(w) + h_w = self.post_convs(h_w, x_mask) + h_w = self.post_proj(h_w) * x_mask + e_q = ( + torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) + * x_mask + ) + z_q = e_q + for flow in self.post_flows: + z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) + logdet_tot_q += logdet_q + z_u, z1 = torch.split(z_q, [1, 1], 1) + u = torch.sigmoid(z_u) * x_mask + z0 = (w - u) * x_mask + logdet_tot_q += torch.sum( + (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2] + ) + logq = ( + torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2]) + - logdet_tot_q + ) + + logdet_tot = 0 + z0, logdet = self.log_flow(z0, x_mask) + logdet_tot += logdet + z = torch.cat([z0, z1], 1) + for flow in flows: + z, logdet = flow(z, x_mask, g=x, reverse=reverse) + logdet_tot = logdet_tot + logdet + nll = ( + torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) + - logdet_tot + ) + return nll + logq # [b] + else: + flows = list(reversed(self.flows)) + flows = flows[:-2] + [flows[-1]] # remove a useless vflow + z = ( + torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) + * noise_scale + ) + for flow in flows: + z = flow(z, x_mask, g=x, reverse=reverse) + z0, z1 = torch.split(z, [1, 1], 1) + logw = z0 + return logw + + +class DurationPredictor(nn.Module): + def __init__( + self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0 + ): + super().__init__() + + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.gin_channels = gin_channels + + self.drop = nn.Dropout(p_dropout) + self.conv_1 = nn.Conv1d( + in_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_1 = modules.LayerNorm(filter_channels) + self.conv_2 = nn.Conv1d( + filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_2 = modules.LayerNorm(filter_channels) + self.proj = 
nn.Conv1d(filter_channels, 1, 1) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, in_channels, 1) + + def forward(self, x, x_mask, g=None): + x = torch.detach(x) + if g is not None: + g = torch.detach(g) + x = x + self.cond(g) + x = self.conv_1(x * x_mask) + x = torch.relu(x) + x = self.norm_1(x) + x = self.drop(x) + x = self.conv_2(x * x_mask) + x = torch.relu(x) + x = self.norm_2(x) + x = self.drop(x) + x = self.proj(x * x_mask) + return x * x_mask + + +class TextEncoder(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + latent_channels=192, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.latent_channels = latent_channels + + self.ssl_proj = nn.Conv1d(768, hidden_channels, 1) + + self.encoder_ssl = attentions.Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers // 2, + kernel_size, + p_dropout, + ) + + self.encoder_text = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.text_embedding = nn.Embedding(len(symbols), hidden_channels) + + self.mrte = MRTE() + + self.encoder2 = attentions.Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers // 2, + kernel_size, + p_dropout, + ) + + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, y, y_lengths, text, text_lengths, ge, test=None): + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to( + y.dtype + ) + + y = self.ssl_proj(y * y_mask) * y_mask + + y = self.encoder_ssl(y * y_mask, y_mask) + + text_mask = torch.unsqueeze( + commons.sequence_mask(text_lengths, text.size(1)), 1 + ).to(y.dtype) + if test == 1: + text[:, :] = 0 + text = self.text_embedding(text).transpose(1, 2) + text = self.encoder_text(text * text_mask, text_mask) + y = self.mrte(y, y_mask, text, text_mask, ge) + + y = self.encoder2(y * y_mask, y_mask) + + stats = self.proj(y) * y_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + return y, m, logs, y_mask + + def extract_latent(self, x): + x = self.ssl_proj(x) + quantized, codes, commit_loss, quantized_list = self.quantizer(x) + return codes.transpose(0, 1) + + def decode_latent(self, codes, y_mask, refer, refer_mask, ge): + quantized = self.quantizer.decode(codes) + + y = self.vq_proj(quantized) * y_mask + y = self.encoder_ssl(y * y_mask, y_mask) + + y = self.mrte(y, y_mask, refer, refer_mask, ge) + + y = self.encoder2(y * y_mask, y_mask) + + stats = self.proj(y) * y_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + return y, m, logs, y_mask, quantized + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) 
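+        # Each mean-only ResidualCouplingLayer is followed by a Flip so that
+        # successive flows transform alternating halves of the channel dimension.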
+ + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + if g != None: + g = g.detach() + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + +class WNEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.norm = modules.LayerNorm(out_channels) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + out = self.proj(x) * x_mask + out = self.norm(out) + return out + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, 
upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print("Removing weight norm...") + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, 
fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class ReferenceEncoder(nn.Module): + """ + inputs --- [N, Ty/r, n_mels*r] mels + outputs --- [N, ref_enc_gru_size] + """ + + def __init__(self, spec_channels, gin_channels=0): + super().__init__() + self.spec_channels = spec_channels + ref_enc_filters = [32, 32, 64, 64, 128, 128] + K = len(ref_enc_filters) + filters = [1] + ref_enc_filters + convs = [ + weight_norm( + nn.Conv2d( + in_channels=filters[i], + out_channels=filters[i + 1], + kernel_size=(3, 3), + stride=(2, 2), + padding=(1, 1), + ) + ) + for i in range(K) + ] + self.convs = nn.ModuleList(convs) + # self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)]) + + out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K) + self.gru = nn.GRU( + input_size=ref_enc_filters[-1] * out_channels, + hidden_size=256 // 2, + batch_first=True, + ) + self.proj = nn.Linear(128, gin_channels) + + def forward(self, inputs): + N = inputs.size(0) + out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs] + for conv in self.convs: + out = conv(out) + # out = wn(out) + out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K] + + out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K] + T = out.size(1) + N = out.size(0) + out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K] + + self.gru.flatten_parameters() + memory, out = self.gru(out) # out --- [1, N, 128] + + return self.proj(out.squeeze(0)).unsqueeze(-1) + + def calculate_channels(self, L, kernel_size, stride, pad, n_convs): + for i in range(n_convs): + L = (L - kernel_size + 2 * pad) // stride + 1 + return L + + +class Quantizer_module(torch.nn.Module): + def __init__(self, n_e, e_dim): + super(Quantizer_module, self).__init__() + self.embedding = nn.Embedding(n_e, e_dim) + self.embedding.weight.data.uniform_(-1.0 / n_e, 1.0 / n_e) + + def forward(self, x): + d = ( + torch.sum(x**2, 1, keepdim=True) + + torch.sum(self.embedding.weight**2, 1) + - 2 * torch.matmul(x, self.embedding.weight.T) + ) + min_indicies = torch.argmin(d, 1) + z_q = self.embedding(min_indicies) + return z_q, min_indicies + + +class Quantizer(torch.nn.Module): + def __init__(self, embed_dim=512, n_code_groups=4, n_codes=160): + super(Quantizer, self).__init__() + assert embed_dim % n_code_groups == 0 + self.quantizer_modules = nn.ModuleList( + [ + Quantizer_module(n_codes, embed_dim // n_code_groups) + for _ in range(n_code_groups) + ] + ) + self.n_code_groups = n_code_groups + self.embed_dim = embed_dim + + def forward(self, xin): + # B, C, T + B, C, T = xin.shape + xin = xin.transpose(1, 2) + x = xin.reshape(-1, self.embed_dim) + x = torch.split(x, self.embed_dim // self.n_code_groups, dim=-1) + min_indicies = [] + z_q = [] + for _x, m in zip(x, self.quantizer_modules): + _z_q, _min_indicies = m(_x) + z_q.append(_z_q) + min_indicies.append(_min_indicies) # B * T, + z_q = torch.cat(z_q, -1).reshape(xin.shape) + loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean( + (z_q - xin.detach()) ** 2 + ) + z_q = xin + (z_q - xin).detach() + z_q = z_q.transpose(1, 2) + codes = torch.stack(min_indicies, -1).reshape(B, T, self.n_code_groups) + return z_q, loss, codes.transpose(1, 2) + + def embed(self, x): + # idx: N, 4, T + x = x.transpose(1, 2) + x = torch.split(x, 1, 2) + ret = [] + for q, embed in zip(x, self.quantizer_modules): + q = embed.embedding(q.squeeze(-1)) + ret.append(q) + ret = 
torch.cat(ret, -1) + return ret.transpose(1, 2) # N, C, T + + +class CodePredictor(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + n_q=8, + dims=1024, + ssl_dim=768, + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + + self.vq_proj = nn.Conv1d(ssl_dim, hidden_channels, 1) + self.ref_enc = modules.MelStyleEncoder( + ssl_dim, style_vector_dim=hidden_channels + ) + + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + + self.out_proj = nn.Conv1d(hidden_channels, (n_q - 1) * dims, 1) + self.n_q = n_q + self.dims = dims + + def forward(self, x, x_mask, refer, codes, infer=False): + x = x.detach() + x = self.vq_proj(x * x_mask) * x_mask + g = self.ref_enc(refer, x_mask) + x = x + g + x = self.encoder(x * x_mask, x_mask) + x = self.out_proj(x * x_mask) * x_mask + logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose( + 2, 3 + ) + target = codes[1:].transpose(0, 1) + if not infer: + logits = logits.reshape(-1, self.dims) + target = target.reshape(-1) + loss = torch.nn.functional.cross_entropy(logits, target) + return loss + else: + _, top10_preds = torch.topk(logits, 10, dim=-1) + correct_top10 = torch.any(top10_preds == target.unsqueeze(-1), dim=-1) + top3_acc = 100 * torch.mean(correct_top10.float()).detach().cpu().item() + + print("Top-10 Accuracy:", top3_acc, "%") + + pred_codes = torch.argmax(logits, dim=-1) + acc = 100 * torch.mean((pred_codes == target).float()).detach().cpu().item() + print("Top-1 Accuracy:", acc, "%") + + return pred_codes.transpose(0, 1) + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + n_speakers=0, + gin_channels=0, + use_sdp=True, + semantic_frame_rate=None, + freeze_quantizer=None, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.n_speakers = n_speakers + self.gin_channels = gin_channels + + self.use_sdp = use_sdp + self.enc_p = TextEncoder( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + 
inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels + ) + + self.ref_enc = modules.MelStyleEncoder( + spec_channels, style_vector_dim=gin_channels + ) + + ssl_dim = 768 + assert semantic_frame_rate in ["25hz", "50hz"] + self.semantic_frame_rate = semantic_frame_rate + if semantic_frame_rate == "25hz": + self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2) + else: + self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1) + + self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024) + if freeze_quantizer: + self.ssl_proj.requires_grad_(False) + self.quantizer.requires_grad_(False) + # self.enc_p.text_embedding.requires_grad_(False) + # self.enc_p.encoder_text.requires_grad_(False) + # self.enc_p.mrte.requires_grad_(False) + + def forward(self, ssl, y, y_lengths, text, text_lengths): + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to( + y.dtype + ) + ge = self.ref_enc(y * y_mask, y_mask) + + with autocast(enabled=False): + ssl = self.ssl_proj(ssl) + quantized, codes, commit_loss, quantized_list = self.quantizer( + ssl, layers=[0] + ) + + if self.semantic_frame_rate == "25hz": + quantized = F.interpolate( + quantized, size=int(quantized.shape[-1] * 2), mode="nearest" + ) + + x, m_p, logs_p, y_mask = self.enc_p( + quantized, y_lengths, text, text_lengths, ge + ) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=ge) + z_p = self.flow(z, y_mask, g=ge) + + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + o = self.dec(z_slice, g=ge) + return ( + o, + commit_loss, + ids_slice, + y_mask, + y_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + quantized, + ) + + def infer(self, ssl, y, y_lengths, text, text_lengths, test=None, noise_scale=0.5): + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to( + y.dtype + ) + ge = self.ref_enc(y * y_mask, y_mask) + + ssl = self.ssl_proj(ssl) + quantized, codes, commit_loss, _ = self.quantizer(ssl, layers=[0]) + if self.semantic_frame_rate == "25hz": + quantized = F.interpolate( + quantized, size=int(quantized.shape[-1] * 2), mode="nearest" + ) + + x, m_p, logs_p, y_mask = self.enc_p( + quantized, y_lengths, text, text_lengths, ge, test=test + ) + z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale + + z = self.flow(z_p, y_mask, g=ge, reverse=True) + + o = self.dec((z * y_mask)[:, :, :], g=ge) + return o, y_mask, (z, z_p, m_p, logs_p) + + @torch.no_grad() + def decode(self, codes, text, refer, noise_scale=0.5): + ge = None + if refer is not None: + refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device) + refer_mask = torch.unsqueeze( + commons.sequence_mask(refer_lengths, refer.size(2)), 1 + ).to(refer.dtype) + ge = self.ref_enc(refer * refer_mask, refer_mask) + + y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device) + text_lengths = torch.LongTensor([text.size(-1)]).to(text.device) + + quantized = self.quantizer.decode(codes) + if self.semantic_frame_rate == "25hz": + quantized = F.interpolate( + quantized, size=int(quantized.shape[-1] * 2), mode="nearest" + ) + + x, m_p, logs_p, y_mask = self.enc_p( + quantized, y_lengths, text, text_lengths, ge + ) + z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale + + z = self.flow(z_p, y_mask, g=ge, reverse=True) + + o = self.dec((z * y_mask)[:, :, :], g=ge) + return o + + def extract_latent(self, x): + ssl = self.ssl_proj(x) + quantized, codes, commit_loss, quantized_list = self.quantizer(ssl) + return codes.transpose(0, 1) diff --git 
a/module/models_onnx.py b/module/models_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..232fd74d3084e86ab77f595010bb1f975573113b --- /dev/null +++ b/module/models_onnx.py @@ -0,0 +1,918 @@ +import copy +import math +import torch +from torch import nn +from torch.nn import functional as F + +from module import commons +from module import modules +from module import attentions_onnx as attentions + +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from module.commons import init_weights, get_padding +from module.mrte_model import MRTE +from module.quantize import ResidualVectorQuantizer +from text import symbols +from torch.cuda.amp import autocast + + +class StochasticDurationPredictor(nn.Module): + def __init__( + self, + in_channels, + filter_channels, + kernel_size, + p_dropout, + n_flows=4, + gin_channels=0, + ): + super().__init__() + filter_channels = in_channels # it needs to be removed from future version. + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.log_flow = modules.Log() + self.flows = nn.ModuleList() + self.flows.append(modules.ElementwiseAffine(2)) + for i in range(n_flows): + self.flows.append( + modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) + ) + self.flows.append(modules.Flip()) + + self.post_pre = nn.Conv1d(1, filter_channels, 1) + self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) + self.post_convs = modules.DDSConv( + filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout + ) + self.post_flows = nn.ModuleList() + self.post_flows.append(modules.ElementwiseAffine(2)) + for i in range(4): + self.post_flows.append( + modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3) + ) + self.post_flows.append(modules.Flip()) + + self.pre = nn.Conv1d(in_channels, filter_channels, 1) + self.proj = nn.Conv1d(filter_channels, filter_channels, 1) + self.convs = modules.DDSConv( + filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout + ) + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, filter_channels, 1) + + def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0): + x = torch.detach(x) + x = self.pre(x) + if g is not None: + g = torch.detach(g) + x = x + self.cond(g) + x = self.convs(x, x_mask) + x = self.proj(x) * x_mask + + if not reverse: + flows = self.flows + assert w is not None + + logdet_tot_q = 0 + h_w = self.post_pre(w) + h_w = self.post_convs(h_w, x_mask) + h_w = self.post_proj(h_w) * x_mask + e_q = ( + torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) + * x_mask + ) + z_q = e_q + for flow in self.post_flows: + z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) + logdet_tot_q += logdet_q + z_u, z1 = torch.split(z_q, [1, 1], 1) + u = torch.sigmoid(z_u) * x_mask + z0 = (w - u) * x_mask + logdet_tot_q += torch.sum( + (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2] + ) + logq = ( + torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2]) + - logdet_tot_q + ) + + logdet_tot = 0 + z0, logdet = self.log_flow(z0, x_mask) + logdet_tot += logdet + z = torch.cat([z0, z1], 1) + for flow in flows: + z, logdet = flow(z, x_mask, g=x, reverse=reverse) + logdet_tot = logdet_tot + logdet + nll = ( + torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) + - logdet_tot + ) + return nll + 
logq # [b] + else: + flows = list(reversed(self.flows)) + flows = flows[:-2] + [flows[-1]] # remove a useless vflow + z = ( + torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) + * noise_scale + ) + for flow in flows: + z = flow(z, x_mask, g=x, reverse=reverse) + z0, z1 = torch.split(z, [1, 1], 1) + logw = z0 + return logw + + +class DurationPredictor(nn.Module): + def __init__( + self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0 + ): + super().__init__() + + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.gin_channels = gin_channels + + self.drop = nn.Dropout(p_dropout) + self.conv_1 = nn.Conv1d( + in_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_1 = modules.LayerNorm(filter_channels) + self.conv_2 = nn.Conv1d( + filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_2 = modules.LayerNorm(filter_channels) + self.proj = nn.Conv1d(filter_channels, 1, 1) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, in_channels, 1) + + def forward(self, x, x_mask, g=None): + x = torch.detach(x) + if g is not None: + g = torch.detach(g) + x = x + self.cond(g) + x = self.conv_1(x * x_mask) + x = torch.relu(x) + x = self.norm_1(x) + x = self.drop(x) + x = self.conv_2(x * x_mask) + x = torch.relu(x) + x = self.norm_2(x) + x = self.drop(x) + x = self.proj(x * x_mask) + return x * x_mask + + +class TextEncoder(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + latent_channels=192, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.latent_channels = latent_channels + + self.ssl_proj = nn.Conv1d(768, hidden_channels, 1) + + self.encoder_ssl = attentions.Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers // 2, + kernel_size, + p_dropout, + ) + + self.encoder_text = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.text_embedding = nn.Embedding(len(symbols), hidden_channels) + + self.mrte = MRTE() + + self.encoder2 = attentions.Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers // 2, + kernel_size, + p_dropout, + ) + + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, y, text, ge): + y_mask = torch.ones_like(y[:1,:1,:]) + + y = self.ssl_proj(y * y_mask) * y_mask + y = self.encoder_ssl(y * y_mask, y_mask) + + text_mask = torch.ones_like(text).to(y.dtype).unsqueeze(0) + + text = self.text_embedding(text).transpose(1, 2) + text = self.encoder_text(text * text_mask, text_mask) + y = self.mrte(y, y_mask, text, text_mask, ge) + + y = self.encoder2(y * y_mask, y_mask) + + stats = self.proj(y) * y_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + return y, m, logs, y_mask + + def extract_latent(self, x): + x = self.ssl_proj(x) + quantized, codes, commit_loss, quantized_list = self.quantizer(x) + return codes.transpose(0, 1) + + def decode_latent(self, codes, y_mask, refer, refer_mask, ge): + quantized = self.quantizer.decode(codes) + + y = self.vq_proj(quantized) * y_mask + y = self.encoder_ssl(y * y_mask, y_mask) + + y = self.mrte(y, y_mask, refer, refer_mask, ge) + 
+ y = self.encoder2(y * y_mask, y_mask) + + stats = self.proj(y) * y_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + return y, m, logs, y_mask, quantized + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + if g != None: + g = g.detach() + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + +class WNEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.norm = modules.LayerNorm(out_channels) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + out = self.proj(x) * x_mask + out = self.norm(out) + return out + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = 
len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print("Removing weight norm...") + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + 
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class ReferenceEncoder(nn.Module): + """ + inputs --- [N, Ty/r, n_mels*r] mels + outputs --- [N, ref_enc_gru_size] + """ + + def __init__(self, spec_channels, gin_channels=0): + super().__init__() + self.spec_channels = spec_channels + ref_enc_filters = [32, 32, 64, 64, 128, 128] + K = len(ref_enc_filters) + filters = [1] + ref_enc_filters + convs = [ + weight_norm( + nn.Conv2d( + in_channels=filters[i], + out_channels=filters[i + 1], + kernel_size=(3, 3), + stride=(2, 2), + padding=(1, 1), + ) + ) + for i in range(K) + ] + self.convs = nn.ModuleList(convs) + # self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)]) + + out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K) + self.gru = nn.GRU( + input_size=ref_enc_filters[-1] * out_channels, + hidden_size=256 // 2, + batch_first=True, + ) + self.proj = nn.Linear(128, gin_channels) + + def forward(self, inputs): + N = inputs.size(0) + out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs] + for conv in self.convs: + out = conv(out) + # out = wn(out) + out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K] + + out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K] + T = out.size(1) + N = out.size(0) + out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K] + + self.gru.flatten_parameters() + memory, out = self.gru(out) # out --- [1, N, 128] + + return self.proj(out.squeeze(0)).unsqueeze(-1) + + def calculate_channels(self, L, kernel_size, stride, pad, n_convs): + for i in range(n_convs): + L = (L - kernel_size + 2 * pad) // stride + 1 + return L + + +class Quantizer_module(torch.nn.Module): + def __init__(self, n_e, e_dim): + super(Quantizer_module, self).__init__() + self.embedding = nn.Embedding(n_e, e_dim) + self.embedding.weight.data.uniform_(-1.0 / n_e, 1.0 / n_e) + + def forward(self, x): + d = ( + torch.sum(x**2, 1, keepdim=True) + + torch.sum(self.embedding.weight**2, 1) + - 2 * torch.matmul(x, self.embedding.weight.T) + ) + min_indicies = torch.argmin(d, 1) + z_q = self.embedding(min_indicies) + return z_q, min_indicies + + +class Quantizer(torch.nn.Module): + def __init__(self, embed_dim=512, n_code_groups=4, n_codes=160): + super(Quantizer, self).__init__() + assert embed_dim % n_code_groups == 0 + self.quantizer_modules = nn.ModuleList( + [ + Quantizer_module(n_codes, embed_dim // n_code_groups) + for _ in range(n_code_groups) + ] + ) + self.n_code_groups = n_code_groups + self.embed_dim = embed_dim + + def 
forward(self, xin): + # B, C, T + B, C, T = xin.shape + xin = xin.transpose(1, 2) + x = xin.reshape(-1, self.embed_dim) + x = torch.split(x, self.embed_dim // self.n_code_groups, dim=-1) + min_indicies = [] + z_q = [] + for _x, m in zip(x, self.quantizer_modules): + _z_q, _min_indicies = m(_x) + z_q.append(_z_q) + min_indicies.append(_min_indicies) # B * T, + z_q = torch.cat(z_q, -1).reshape(xin.shape) + loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean( + (z_q - xin.detach()) ** 2 + ) + z_q = xin + (z_q - xin).detach() + z_q = z_q.transpose(1, 2) + codes = torch.stack(min_indicies, -1).reshape(B, T, self.n_code_groups) + return z_q, loss, codes.transpose(1, 2) + + def embed(self, x): + # idx: N, 4, T + x = x.transpose(1, 2) + x = torch.split(x, 1, 2) + ret = [] + for q, embed in zip(x, self.quantizer_modules): + q = embed.embedding(q.squeeze(-1)) + ret.append(q) + ret = torch.cat(ret, -1) + return ret.transpose(1, 2) # N, C, T + + +class CodePredictor(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + n_q=8, + dims=1024, + ssl_dim=768, + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + + self.vq_proj = nn.Conv1d(ssl_dim, hidden_channels, 1) + self.ref_enc = modules.MelStyleEncoder( + ssl_dim, style_vector_dim=hidden_channels + ) + + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + + self.out_proj = nn.Conv1d(hidden_channels, (n_q - 1) * dims, 1) + self.n_q = n_q + self.dims = dims + + def forward(self, x, x_mask, refer, codes, infer=False): + x = x.detach() + x = self.vq_proj(x * x_mask) * x_mask + g = self.ref_enc(refer, x_mask) + x = x + g + x = self.encoder(x * x_mask, x_mask) + x = self.out_proj(x * x_mask) * x_mask + logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose( + 2, 3 + ) + target = codes[1:].transpose(0, 1) + if not infer: + logits = logits.reshape(-1, self.dims) + target = target.reshape(-1) + loss = torch.nn.functional.cross_entropy(logits, target) + return loss + else: + _, top10_preds = torch.topk(logits, 10, dim=-1) + correct_top10 = torch.any(top10_preds == target.unsqueeze(-1), dim=-1) + top3_acc = 100 * torch.mean(correct_top10.float()).detach().cpu().item() + + print("Top-10 Accuracy:", top3_acc, "%") + + pred_codes = torch.argmax(logits, dim=-1) + acc = 100 * torch.mean((pred_codes == target).float()).detach().cpu().item() + print("Top-1 Accuracy:", acc, "%") + + return pred_codes.transpose(0, 1) + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + n_speakers=0, + gin_channels=0, + use_sdp=True, + semantic_frame_rate=None, + freeze_quantizer=None, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + 
self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.n_speakers = n_speakers + self.gin_channels = gin_channels + + self.use_sdp = use_sdp + self.enc_p = TextEncoder( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels + ) + + self.ref_enc = modules.MelStyleEncoder( + spec_channels, style_vector_dim=gin_channels + ) + + ssl_dim = 768 + self.ssl_dim = ssl_dim + assert semantic_frame_rate in ["25hz", "50hz"] + self.semantic_frame_rate = semantic_frame_rate + if semantic_frame_rate == "25hz": + self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2) + else: + self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1) + + self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024) + if freeze_quantizer: + self.ssl_proj.requires_grad_(False) + self.quantizer.requires_grad_(False) + # self.enc_p.text_embedding.requires_grad_(False) + # self.enc_p.encoder_text.requires_grad_(False) + # self.enc_p.mrte.requires_grad_(False) + + def forward(self, codes, text, refer): + refer_mask = torch.ones_like(refer[:1,:1,:]) + ge = self.ref_enc(refer * refer_mask, refer_mask) + + quantized = self.quantizer.decode(codes) + if self.semantic_frame_rate == "25hz": + dquantized = torch.cat([quantized, quantized]).permute(1, 2, 0) + quantized = dquantized.contiguous().view(1, self.ssl_dim, -1) + + x, m_p, logs_p, y_mask = self.enc_p( + quantized, text, ge + ) + + z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) + + z = self.flow(z_p, y_mask, g=ge, reverse=True) + + o = self.dec((z * y_mask)[:, :, :], g=ge) + return o + + def extract_latent(self, x): + ssl = self.ssl_proj(x) + quantized, codes, commit_loss, quantized_list = self.quantizer(ssl) + return codes.transpose(0, 1) \ No newline at end of file diff --git a/module/modules.py b/module/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..f44474558423ba27b1d2693cf35289b07cc0086f --- /dev/null +++ b/module/modules.py @@ -0,0 +1,923 @@ +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +from torch.nn import Conv1d +from torch.nn.utils import weight_norm, remove_weight_norm + +from module import commons +from module.commons import init_weights, get_padding +from module.transforms import piecewise_rational_quadratic_transform +import torch.distributions as D + + +LRELU_SLOPE = 0.1 + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +class ConvReluNorm(nn.Module): + def 
__init__( + self, + in_channels, + hidden_channels, + out_channels, + kernel_size, + n_layers, + p_dropout, + ): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." + + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append( + nn.Conv1d( + in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) + for _ in range(n_layers - 1): + self.conv_layers.append( + nn.Conv1d( + hidden_channels, + hidden_channels, + kernel_size, + padding=kernel_size // 2, + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class DDSConv(nn.Module): + """ + Dialted and Depth-Separable Convolution + """ + + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + + self.drop = nn.Dropout(p_dropout) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + dilation = kernel_size**i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_sep.append( + nn.Conv1d( + channels, + channels, + kernel_size, + groups=channels, + dilation=dilation, + padding=padding, + ) + ) + self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(LayerNorm(channels)) + self.norms_2.append(LayerNorm(channels)) + + def forward(self, x, x_mask, g=None): + if g is not None: + x = x + g + for i in range(self.n_layers): + y = self.convs_sep[i](x * x_mask) + y = self.norms_1[i](y) + y = F.gelu(y) + y = self.convs_1x1[i](y) + y = self.norms_2[i](y) + y = F.gelu(y) + y = self.drop(y) + x = x + y + return x * x_mask + + +class WN(torch.nn.Module): + def __init__( + self, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + p_dropout=0, + ): + super(WN, self).__init__() + assert kernel_size % 2 == 1 + self.hidden_channels = hidden_channels + self.kernel_size = (kernel_size,) + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d( + gin_channels, 2 * hidden_channels * n_layers, 1 + ) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") + + for i in range(n_layers): + dilation = dilation_rate**i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") + self.in_layers.append(in_layer) + + # last one is not necessary + if i < 
n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, x_mask, g=None, **kwargs): + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] + else: + g_l = torch.zeros_like(x_in) + + acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) + acts = self.drop(acts) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:, : self.hidden_channels, :] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:, self.hidden_channels :, :] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + +class ResBlock1(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.convs1 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]), + ) + ), + ] + ) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + ] + ) + self.convs2.apply(init_weights) + + def forward(self, x, x_mask=None): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c2(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.convs = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + ] + ) + self.convs.apply(init_weights) + + def forward(self, x, 
x_mask=None): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Log(nn.Module): + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + + +class Flip(nn.Module): + def forward(self, x, *args, reverse=False, **kwargs): + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x + + +class ElementwiseAffine(nn.Module): + def __init__(self, channels): + super().__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels, 1)) + self.logs = nn.Parameter(torch.zeros(channels, 1)) + + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1, 2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x + + +class ResidualCouplingLayer(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False, + ): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=p_dropout, + gin_channels=gin_channels, + ) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels] * 2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1, 2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x + + +class ConvFlow(nn.Module): + def __init__( + self, + in_channels, + filter_channels, + kernel_size, + n_layers, + num_bins=10, + tail_bound=5.0, + ): + super().__init__() + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.num_bins = num_bins + self.tail_bound = tail_bound + self.half_channels = in_channels // 2 + + self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) + self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) + self.proj = nn.Conv1d( + filter_channels, self.half_channels * (num_bins * 3 - 1), 1 + ) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) + h = self.convs(h, 
x_mask, g=g) + h = self.proj(h) * x_mask + + b, c, t = x0.shape + h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] + + unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) + unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( + self.filter_channels + ) + unnormalized_derivatives = h[..., 2 * self.num_bins :] + + x1, logabsdet = piecewise_rational_quadratic_transform( + x1, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=reverse, + tails="linear", + tail_bound=self.tail_bound, + ) + + x = torch.cat([x0, x1], 1) * x_mask + logdet = torch.sum(logabsdet * x_mask, [1, 2]) + if not reverse: + return x, logdet + else: + return x + + +class LinearNorm(nn.Module): + def __init__( + self, + in_channels, + out_channels, + bias=True, + spectral_norm=False, + ): + super(LinearNorm, self).__init__() + self.fc = nn.Linear(in_channels, out_channels, bias) + + if spectral_norm: + self.fc = nn.utils.spectral_norm(self.fc) + + def forward(self, input): + out = self.fc(input) + return out + + +class Mish(nn.Module): + def __init__(self): + super(Mish, self).__init__() + + def forward(self, x): + return x * torch.tanh(F.softplus(x)) + + +class Conv1dGLU(nn.Module): + """ + Conv1d + GLU(Gated Linear Unit) with residual connection. + For GLU refer to https://arxiv.org/abs/1612.08083 paper. + """ + + def __init__(self, in_channels, out_channels, kernel_size, dropout): + super(Conv1dGLU, self).__init__() + self.out_channels = out_channels + self.conv1 = ConvNorm(in_channels, 2 * out_channels, kernel_size=kernel_size) + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + residual = x + x = self.conv1(x) + x1, x2 = torch.split(x, split_size_or_sections=self.out_channels, dim=1) + x = x1 * torch.sigmoid(x2) + x = residual + self.dropout(x) + return x + + +class ConvNorm(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=None, + dilation=1, + bias=True, + spectral_norm=False, + ): + super(ConvNorm, self).__init__() + + if padding is None: + assert kernel_size % 2 == 1 + padding = int(dilation * (kernel_size - 1) / 2) + + self.conv = torch.nn.Conv1d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias, + ) + + if spectral_norm: + self.conv = nn.utils.spectral_norm(self.conv) + + def forward(self, input): + out = self.conv(input) + return out + + +class MultiHeadAttention(nn.Module): + """Multi-Head Attention module""" + + def __init__(self, n_head, d_model, d_k, d_v, dropout=0.0, spectral_norm=False): + super().__init__() + + self.n_head = n_head + self.d_k = d_k + self.d_v = d_v + + self.w_qs = nn.Linear(d_model, n_head * d_k) + self.w_ks = nn.Linear(d_model, n_head * d_k) + self.w_vs = nn.Linear(d_model, n_head * d_v) + + self.attention = ScaledDotProductAttention( + temperature=np.power(d_model, 0.5), dropout=dropout + ) + + self.fc = nn.Linear(n_head * d_v, d_model) + self.dropout = nn.Dropout(dropout) + + if spectral_norm: + self.w_qs = nn.utils.spectral_norm(self.w_qs) + self.w_ks = nn.utils.spectral_norm(self.w_ks) + self.w_vs = nn.utils.spectral_norm(self.w_vs) + self.fc = nn.utils.spectral_norm(self.fc) + + def forward(self, x, mask=None): + d_k, d_v, n_head = self.d_k, self.d_v, self.n_head + sz_b, len_x, _ = x.size() + + residual = x + + q = self.w_qs(x).view(sz_b, len_x, n_head, d_k) + k = self.w_ks(x).view(sz_b, len_x, n_head, d_k) + v = 
self.w_vs(x).view(sz_b, len_x, n_head, d_v) + q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_k) # (n*b) x lq x dk + k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_k) # (n*b) x lk x dk + v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_v) # (n*b) x lv x dv + + if mask is not None: + slf_mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x .. + else: + slf_mask = None + output, attn = self.attention(q, k, v, mask=slf_mask) + + output = output.view(n_head, sz_b, len_x, d_v) + output = ( + output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_x, -1) + ) # b x lq x (n*dv) + + output = self.fc(output) + + output = self.dropout(output) + residual + return output, attn + + +class ScaledDotProductAttention(nn.Module): + """Scaled Dot-Product Attention""" + + def __init__(self, temperature, dropout): + super().__init__() + self.temperature = temperature + self.softmax = nn.Softmax(dim=2) + self.dropout = nn.Dropout(dropout) + + def forward(self, q, k, v, mask=None): + attn = torch.bmm(q, k.transpose(1, 2)) + attn = attn / self.temperature + + if mask is not None: + attn = attn.masked_fill(mask, -np.inf) + + attn = self.softmax(attn) + p_attn = self.dropout(attn) + + output = torch.bmm(p_attn, v) + return output, attn + + +class MelStyleEncoder(nn.Module): + """MelStyleEncoder""" + + def __init__( + self, + n_mel_channels=80, + style_hidden=128, + style_vector_dim=256, + style_kernel_size=5, + style_head=2, + dropout=0.1, + ): + super(MelStyleEncoder, self).__init__() + self.in_dim = n_mel_channels + self.hidden_dim = style_hidden + self.out_dim = style_vector_dim + self.kernel_size = style_kernel_size + self.n_head = style_head + self.dropout = dropout + + self.spectral = nn.Sequential( + LinearNorm(self.in_dim, self.hidden_dim), + Mish(), + nn.Dropout(self.dropout), + LinearNorm(self.hidden_dim, self.hidden_dim), + Mish(), + nn.Dropout(self.dropout), + ) + + self.temporal = nn.Sequential( + Conv1dGLU(self.hidden_dim, self.hidden_dim, self.kernel_size, self.dropout), + Conv1dGLU(self.hidden_dim, self.hidden_dim, self.kernel_size, self.dropout), + ) + + self.slf_attn = MultiHeadAttention( + self.n_head, + self.hidden_dim, + self.hidden_dim // self.n_head, + self.hidden_dim // self.n_head, + self.dropout, + ) + + self.fc = LinearNorm(self.hidden_dim, self.out_dim) + + def temporal_avg_pool(self, x, mask=None): + if mask is None: + out = torch.mean(x, dim=1) + else: + len_ = (~mask).sum(dim=1).unsqueeze(1) + x = x.masked_fill(mask.unsqueeze(-1), 0) + x = x.sum(dim=1) + out = torch.div(x, len_) + return out + + def forward(self, x, mask=None): + x = x.transpose(1, 2) + if mask is not None: + mask = (mask.int() == 0).squeeze(1) + max_len = x.shape[1] + slf_attn_mask = ( + mask.unsqueeze(1).expand(-1, max_len, -1) if mask is not None else None + ) + + # spectral + x = self.spectral(x) + # temporal + x = x.transpose(1, 2) + x = self.temporal(x) + x = x.transpose(1, 2) + # self-attention + if mask is not None: + x = x.masked_fill(mask.unsqueeze(-1), 0) + x, _ = self.slf_attn(x, mask=slf_attn_mask) + # fc + x = self.fc(x) + # temoral average pooling + w = self.temporal_avg_pool(x, mask=mask) + + return w.unsqueeze(-1) + + +class MelStyleEncoderVAE(nn.Module): + def __init__(self, spec_channels, z_latent_dim, emb_dim): + super().__init__() + self.ref_encoder = MelStyleEncoder(spec_channels, style_vector_dim=emb_dim) + self.fc1 = nn.Linear(emb_dim, z_latent_dim) + self.fc2 = nn.Linear(emb_dim, z_latent_dim) + self.fc3 = nn.Linear(z_latent_dim, emb_dim) + self.z_latent_dim = z_latent_dim 
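+    # Descriptive note on the method below: it implements the standard VAE reparameterization
+    # trick. During training a latent is drawn as
+    #     z = mu + exp(0.5 * logvar) * eps,   eps ~ N(0, I),
+    # so gradients can flow through mu and logvar; at inference time the mean mu is returned
+    # directly. `forward` builds the posterior as D.Normal(mu, torch.exp(logvar)), computes its
+    # KL divergence against a standard normal prior, and returns the mean of that term as
+    # `loss_kl` alongside the projected style embedding.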
+ + def reparameterize(self, mu, logvar): + if self.training: + std = torch.exp(0.5 * logvar) + eps = torch.randn_like(std) + return eps.mul(std).add_(mu) + else: + return mu + + def forward(self, inputs, mask=None): + enc_out = self.ref_encoder(inputs.squeeze(-1), mask).squeeze(-1) + mu = self.fc1(enc_out) + logvar = self.fc2(enc_out) + posterior = D.Normal(mu, torch.exp(logvar)) + kl_divergence = D.kl_divergence( + posterior, D.Normal(torch.zeros_like(mu), torch.ones_like(logvar)) + ) + loss_kl = kl_divergence.mean() + + z = posterior.rsample() + style_embed = self.fc3(z) + + return style_embed.unsqueeze(-1), loss_kl + + def infer(self, inputs=None, random_sample=False, manual_latent=None): + if manual_latent is None: + if random_sample: + dev = next(self.parameters()).device + posterior = D.Normal( + torch.zeros(1, self.z_latent_dim, device=dev), + torch.ones(1, self.z_latent_dim, device=dev), + ) + z = posterior.rsample() + else: + enc_out = self.ref_encoder(inputs.transpose(1, 2)) + mu = self.fc1(enc_out) + z = mu + else: + z = manual_latent + style_embed = self.fc3(z) + return style_embed.unsqueeze(-1), z + + +class ActNorm(nn.Module): + def __init__(self, channels, ddi=False, **kwargs): + super().__init__() + self.channels = channels + self.initialized = not ddi + + self.logs = nn.Parameter(torch.zeros(1, channels, 1)) + self.bias = nn.Parameter(torch.zeros(1, channels, 1)) + + def forward(self, x, x_mask=None, g=None, reverse=False, **kwargs): + if x_mask is None: + x_mask = torch.ones(x.size(0), 1, x.size(2)).to( + device=x.device, dtype=x.dtype + ) + x_len = torch.sum(x_mask, [1, 2]) + if not self.initialized: + self.initialize(x, x_mask) + self.initialized = True + + if reverse: + z = (x - self.bias) * torch.exp(-self.logs) * x_mask + logdet = None + return z + else: + z = (self.bias + torch.exp(self.logs) * x) * x_mask + logdet = torch.sum(self.logs) * x_len # [b] + return z, logdet + + def store_inverse(self): + pass + + def set_ddi(self, ddi): + self.initialized = not ddi + + def initialize(self, x, x_mask): + with torch.no_grad(): + denom = torch.sum(x_mask, [0, 2]) + m = torch.sum(x * x_mask, [0, 2]) / denom + m_sq = torch.sum(x * x * x_mask, [0, 2]) / denom + v = m_sq - (m**2) + logs = 0.5 * torch.log(torch.clamp_min(v, 1e-6)) + + bias_init = ( + (-m * torch.exp(-logs)).view(*self.bias.shape).to(dtype=self.bias.dtype) + ) + logs_init = (-logs).view(*self.logs.shape).to(dtype=self.logs.dtype) + + self.bias.data.copy_(bias_init) + self.logs.data.copy_(logs_init) + + +class InvConvNear(nn.Module): + def __init__(self, channels, n_split=4, no_jacobian=False, **kwargs): + super().__init__() + assert n_split % 2 == 0 + self.channels = channels + self.n_split = n_split + self.no_jacobian = no_jacobian + + w_init = torch.linalg.qr( + torch.FloatTensor(self.n_split, self.n_split).normal_() + )[0] + if torch.det(w_init) < 0: + w_init[:, 0] = -1 * w_init[:, 0] + self.weight = nn.Parameter(w_init) + + def forward(self, x, x_mask=None, g=None, reverse=False, **kwargs): + b, c, t = x.size() + assert c % self.n_split == 0 + if x_mask is None: + x_mask = 1 + x_len = torch.ones((b,), dtype=x.dtype, device=x.device) * t + else: + x_len = torch.sum(x_mask, [1, 2]) + + x = x.view(b, 2, c // self.n_split, self.n_split // 2, t) + x = ( + x.permute(0, 1, 3, 2, 4) + .contiguous() + .view(b, self.n_split, c // self.n_split, t) + ) + + if reverse: + if hasattr(self, "weight_inv"): + weight = self.weight_inv + else: + weight = torch.inverse(self.weight.float()).to(dtype=self.weight.dtype) + logdet = 
None + else: + weight = self.weight + if self.no_jacobian: + logdet = 0 + else: + logdet = torch.logdet(self.weight) * (c / self.n_split) * x_len # [b] + + weight = weight.view(self.n_split, self.n_split, 1, 1) + z = F.conv2d(x, weight) + + z = z.view(b, 2, self.n_split // 2, c // self.n_split, t) + z = z.permute(0, 1, 3, 2, 4).contiguous().view(b, c, t) * x_mask + if reverse: + return z + else: + return z, logdet + + def store_inverse(self): + self.weight_inv = torch.inverse(self.weight.float()).to(dtype=self.weight.dtype) diff --git a/module/mrte_model.py b/module/mrte_model.py new file mode 100644 index 0000000000000000000000000000000000000000..b0cd242c3faef30c819cbd7cdbedd90f83fcb1d6 --- /dev/null +++ b/module/mrte_model.py @@ -0,0 +1,192 @@ +# This is Multi-reference timbre encoder + +import torch +from torch import nn +from torch.nn.utils import remove_weight_norm, weight_norm +from module.attentions import MultiHeadAttention + + +class MRTE(nn.Module): + def __init__( + self, + content_enc_channels=192, + hidden_size=512, + out_channels=192, + kernel_size=5, + n_heads=4, + ge_layer=2, + ): + super(MRTE, self).__init__() + self.cross_attention = MultiHeadAttention(hidden_size, hidden_size, n_heads) + self.c_pre = nn.Conv1d(content_enc_channels, hidden_size, 1) + self.text_pre = nn.Conv1d(content_enc_channels, hidden_size, 1) + self.c_post = nn.Conv1d(hidden_size, out_channels, 1) + + def forward(self, ssl_enc, ssl_mask, text, text_mask, ge, test=None): + if ge == None: + ge = 0 + attn_mask = text_mask.unsqueeze(2) * ssl_mask.unsqueeze(-1) + + ssl_enc = self.c_pre(ssl_enc * ssl_mask) + text_enc = self.text_pre(text * text_mask) + if test != None: + if test == 0: + x = ( + self.cross_attention( + ssl_enc * ssl_mask, text_enc * text_mask, attn_mask + ) + + ssl_enc + + ge + ) + elif test == 1: + x = ssl_enc + ge + elif test == 2: + x = ( + self.cross_attention( + ssl_enc * 0 * ssl_mask, text_enc * text_mask, attn_mask + ) + + ge + ) + else: + raise ValueError("test should be 0,1,2") + else: + x = ( + self.cross_attention( + ssl_enc * ssl_mask, text_enc * text_mask, attn_mask + ) + + ssl_enc + + ge + ) + x = self.c_post(x * ssl_mask) + return x + + +class SpeakerEncoder(torch.nn.Module): + def __init__( + self, + mel_n_channels=80, + model_num_layers=2, + model_hidden_size=256, + model_embedding_size=256, + ): + super(SpeakerEncoder, self).__init__() + self.lstm = nn.LSTM( + mel_n_channels, model_hidden_size, model_num_layers, batch_first=True + ) + self.linear = nn.Linear(model_hidden_size, model_embedding_size) + self.relu = nn.ReLU() + + def forward(self, mels): + self.lstm.flatten_parameters() + _, (hidden, _) = self.lstm(mels.transpose(-1, -2)) + embeds_raw = self.relu(self.linear(hidden[-1])) + return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + +class MELEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + + def forward(self, x): + # print(x.shape,x_lengths.shape) + x = self.pre(x) + x = self.enc(x) + x = self.proj(x) + return x + + +class WN(torch.nn.Module): + def 
__init__(self, hidden_channels, kernel_size, dilation_rate, n_layers): + super(WN, self).__init__() + assert kernel_size % 2 == 1 + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + + for i in range(n_layers): + dilation = dilation_rate**i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = weight_norm(in_layer) + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x): + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + + acts = fused_add_tanh_sigmoid_multiply(x_in, n_channels_tensor) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:, : self.hidden_channels, :] + x = x + res_acts + output = output + res_skip_acts[:, self.hidden_channels :, :] + else: + output = output + res_skip_acts + return output + + def remove_weight_norm(self): + for l in self.in_layers: + remove_weight_norm(l) + for l in self.res_skip_layers: + remove_weight_norm(l) + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input, n_channels): + n_channels_int = n_channels[0] + t_act = torch.tanh(input[:, :n_channels_int, :]) + s_act = torch.sigmoid(input[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +if __name__ == "__main__": + content_enc = torch.randn(3, 192, 100) + content_mask = torch.ones(3, 1, 100) + ref_mel = torch.randn(3, 128, 30) + ref_mask = torch.ones(3, 1, 30) + model = MRTE() + out = model(content_enc, content_mask, ref_mel, ref_mask) + print(out.shape) diff --git a/module/quantize.py b/module/quantize.py new file mode 100644 index 0000000000000000000000000000000000000000..f9a5c632dadbd82e0e2802daee5c0f1fe51b36a7 --- /dev/null +++ b/module/quantize.py @@ -0,0 +1,119 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Residual vector quantizer implementation.""" + +from dataclasses import dataclass, field +import math +import typing as tp + +import torch +from torch import nn + +from module.core_vq import ResidualVectorQuantization + + +@dataclass +class QuantizedResult: + quantized: torch.Tensor + codes: torch.Tensor + bandwidth: torch.Tensor # bandwidth in kb/s used, per batch item. + penalty: tp.Optional[torch.Tensor] = None + metrics: dict = field(default_factory=dict) + + +class ResidualVectorQuantizer(nn.Module): + """Residual Vector Quantizer. + Args: + dimension (int): Dimension of the codebooks. + n_q (int): Number of residual vector quantizers used. + bins (int): Codebook size. + decay (float): Decay for exponential moving average over the codebooks. + kmeans_init (bool): Whether to use kmeans to initialize the codebooks. + kmeans_iters (int): Number of iterations used for kmeans initialization. 
+        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
+            that have an exponential moving average cluster size less than the specified threshold with
+            a randomly selected vector from the current batch.
+    """
+
+    def __init__(
+        self,
+        dimension: int = 256,
+        n_q: int = 8,
+        bins: int = 1024,
+        decay: float = 0.99,
+        kmeans_init: bool = True,
+        kmeans_iters: int = 50,
+        threshold_ema_dead_code: int = 2,
+    ):
+        super().__init__()
+        self.n_q = n_q
+        self.dimension = dimension
+        self.bins = bins
+        self.decay = decay
+        self.kmeans_init = kmeans_init
+        self.kmeans_iters = kmeans_iters
+        self.threshold_ema_dead_code = threshold_ema_dead_code
+        self.vq = ResidualVectorQuantization(
+            dim=self.dimension,
+            codebook_size=self.bins,
+            num_quantizers=self.n_q,
+            decay=self.decay,
+            kmeans_init=self.kmeans_init,
+            kmeans_iters=self.kmeans_iters,
+            threshold_ema_dead_code=self.threshold_ema_dead_code,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        n_q: tp.Optional[int] = None,
+        layers: tp.Optional[list] = None,
+    ) -> QuantizedResult:
+        """Residual vector quantization on the given input tensor.
+        Args:
+            x (torch.Tensor): Input tensor.
+            n_q (int): Number of quantizers used to quantize. Default: all quantizers.
+            layers (list): Indices of the layers whose quantized output should also be returned. Default: None.
+        Returns:
+            The quantized (or approximately quantized) representation, the codes, the mean
+            commitment loss, and the per-layer quantized outputs (`quantized_list`).
+        """
+        n_q = n_q if n_q else self.n_q
+        if layers and max(layers) >= n_q:
+            raise ValueError(
+                f"Last layer index in layers: A {max(layers)}. Number of quantizers in RVQ: B {self.n_q}. A must be less than B."
+            )
+        quantized, codes, commit_loss, quantized_list = self.vq(
+            x, n_q=n_q, layers=layers
+        )
+        return quantized, codes, torch.mean(commit_loss), quantized_list
+
+    def encode(
+        self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None
+    ) -> torch.Tensor:
+        """Encode the given input tensor with the configured residual quantizers.
+        The RVQ encode method sets the appropriate number of quantizers to use
+        and returns indices for each quantizer.
+        Args:
+            x (torch.Tensor): Input tensor.
+            n_q (int): Number of quantizers used to quantize. Default: all quantizers.
+            st (int): Layer index from which to start encoding. Default: 0.
+        """
+        n_q = n_q if n_q else self.n_q
+        st = st or 0
+        codes = self.vq.encode(x, n_q=n_q, st=st)
+        return codes
+
+    def decode(self, codes: torch.Tensor, st: int = 0) -> torch.Tensor:
+        """Decode the given codes to the quantized representation.
+        Args:
+            codes (torch.Tensor): Input indices for each quantizer.
+            st (int): Layer index from which to start decoding. Default: 0.
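+        Returns:
+            torch.Tensor: The dequantized (continuous) representation reconstructed from the codes.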
+ """ + quantized = self.vq.decode(codes, st=st) + return quantized diff --git a/module/transforms.py b/module/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..a11f799e023864ff7082c1f49c0cc18351a13b47 --- /dev/null +++ b/module/transforms.py @@ -0,0 +1,209 @@ +import torch +from torch.nn import functional as F + +import numpy as np + + +DEFAULT_MIN_BIN_WIDTH = 1e-3 +DEFAULT_MIN_BIN_HEIGHT = 1e-3 +DEFAULT_MIN_DERIVATIVE = 1e-3 + + +def piecewise_rational_quadratic_transform( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails=None, + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + if tails is None: + spline_fn = rational_quadratic_spline + spline_kwargs = {} + else: + spline_fn = unconstrained_rational_quadratic_spline + spline_kwargs = {"tails": tails, "tail_bound": tail_bound} + + outputs, logabsdet = spline_fn( + inputs=inputs, + unnormalized_widths=unnormalized_widths, + unnormalized_heights=unnormalized_heights, + unnormalized_derivatives=unnormalized_derivatives, + inverse=inverse, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + **spline_kwargs + ) + return outputs, logabsdet + + +def searchsorted(bin_locations, inputs, eps=1e-6): + bin_locations[..., -1] += eps + return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 + + +def unconstrained_rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails="linear", + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) + outside_interval_mask = ~inside_interval_mask + + outputs = torch.zeros_like(inputs) + logabsdet = torch.zeros_like(inputs) + + if tails == "linear": + unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) + constant = np.log(np.exp(1 - min_derivative) - 1) + unnormalized_derivatives[..., 0] = constant + unnormalized_derivatives[..., -1] = constant + + outputs[outside_interval_mask] = inputs[outside_interval_mask] + logabsdet[outside_interval_mask] = 0 + else: + raise RuntimeError("{} tails are not implemented.".format(tails)) + + ( + outputs[inside_interval_mask], + logabsdet[inside_interval_mask], + ) = rational_quadratic_spline( + inputs=inputs[inside_interval_mask], + unnormalized_widths=unnormalized_widths[inside_interval_mask, :], + unnormalized_heights=unnormalized_heights[inside_interval_mask, :], + unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], + inverse=inverse, + left=-tail_bound, + right=tail_bound, + bottom=-tail_bound, + top=tail_bound, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + ) + + return outputs, logabsdet + + +def rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + left=0.0, + right=1.0, + bottom=0.0, + top=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + if torch.min(inputs) < left or torch.max(inputs) > right: + raise ValueError("Input to a transform is not within its domain") + + num_bins = unnormalized_widths.shape[-1] + + if min_bin_width * num_bins > 1.0: + raise 
ValueError("Minimal bin width too large for the number of bins") + if min_bin_height * num_bins > 1.0: + raise ValueError("Minimal bin height too large for the number of bins") + + widths = F.softmax(unnormalized_widths, dim=-1) + widths = min_bin_width + (1 - min_bin_width * num_bins) * widths + cumwidths = torch.cumsum(widths, dim=-1) + cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) + cumwidths = (right - left) * cumwidths + left + cumwidths[..., 0] = left + cumwidths[..., -1] = right + widths = cumwidths[..., 1:] - cumwidths[..., :-1] + + derivatives = min_derivative + F.softplus(unnormalized_derivatives) + + heights = F.softmax(unnormalized_heights, dim=-1) + heights = min_bin_height + (1 - min_bin_height * num_bins) * heights + cumheights = torch.cumsum(heights, dim=-1) + cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) + cumheights = (top - bottom) * cumheights + bottom + cumheights[..., 0] = bottom + cumheights[..., -1] = top + heights = cumheights[..., 1:] - cumheights[..., :-1] + + if inverse: + bin_idx = searchsorted(cumheights, inputs)[..., None] + else: + bin_idx = searchsorted(cumwidths, inputs)[..., None] + + input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] + input_bin_widths = widths.gather(-1, bin_idx)[..., 0] + + input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] + delta = heights / widths + input_delta = delta.gather(-1, bin_idx)[..., 0] + + input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] + input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] + + input_heights = heights.gather(-1, bin_idx)[..., 0] + + if inverse: + a = (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + input_heights * (input_delta - input_derivatives) + b = input_heights * input_derivatives - (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + c = -input_delta * (inputs - input_cumheights) + + discriminant = b.pow(2) - 4 * a * c + assert (discriminant >= 0).all() + + root = (2 * c) / (-b - torch.sqrt(discriminant)) + outputs = root * input_bin_widths + input_cumwidths + + theta_one_minus_theta = root * (1 - root) + denominator = input_delta + ( + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta + ) + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * root.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - root).pow(2) + ) + logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) + + return outputs, -logabsdet + else: + theta = (inputs - input_cumwidths) / input_bin_widths + theta_one_minus_theta = theta * (1 - theta) + + numerator = input_heights * ( + input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta + ) + denominator = input_delta + ( + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta + ) + outputs = input_cumheights + numerator / denominator + + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * theta.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - theta).pow(2) + ) + logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) + + return outputs, logabsdet diff --git a/my_utils.py b/my_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..776939dd29bbcd9c45e47665f545a3dddcb16478 --- /dev/null +++ b/my_utils.py @@ -0,0 +1,21 @@ 
+import ffmpeg +import numpy as np + + +def load_audio(file, sr): + try: + # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 + # This launches a subprocess to decode audio while down-mixing and resampling as necessary. + # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. + file = ( + file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) # 防止小白拷路径头尾带了空格和"和回车 + out, _ = ( + ffmpeg.input(file, threads=0) + .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) + .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) + ) + except Exception as e: + raise RuntimeError(f"Failed to load audio: {e}") + + return np.frombuffer(out, np.float32).flatten() diff --git a/onnx_export.py b/onnx_export.py new file mode 100644 index 0000000000000000000000000000000000000000..b82e987f3f42598bd8a43109b778f93a4500f36b --- /dev/null +++ b/onnx_export.py @@ -0,0 +1,334 @@ +from module.models_onnx import SynthesizerTrn, symbols +from AR.models.t2s_lightning_module_onnx import Text2SemanticLightningModule +import torch +import torchaudio +from torch import nn +from feature_extractor import cnhubert +cnhubert_base_path = "pretrained_models/chinese-hubert-base" +cnhubert.cnhubert_base_path=cnhubert_base_path +ssl_model = cnhubert.get_model() +from text import cleaned_text_to_sequence +import soundfile +from my_utils import load_audio +import os +import json + +def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): + hann_window = torch.hann_window(win_size).to( + dtype=y.dtype, device=y.device + ) + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), + mode="reflect", + ) + y = y.squeeze(1) + spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window, + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=False, + ) + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + return spec + + +class DictToAttrRecursive(dict): + def __init__(self, input_dict): + super().__init__(input_dict) + for key, value in input_dict.items(): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + self[key] = value + setattr(self, key, value) + + def __getattr__(self, item): + try: + return self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + def __setattr__(self, key, value): + if isinstance(value, dict): + value = DictToAttrRecursive(value) + super(DictToAttrRecursive, self).__setitem__(key, value) + super().__setattr__(key, value) + + def __delattr__(self, item): + try: + del self[item] + except KeyError: + raise AttributeError(f"Attribute {item} not found") + + +class T2SEncoder(nn.Module): + def __init__(self, t2s, vits): + super().__init__() + self.encoder = t2s.onnx_encoder + self.vits = vits + + def forward(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content): + codes = self.vits.extract_latent(ssl_content) + prompt_semantic = codes[0, 0] + bert = torch.cat([ref_bert.transpose(0, 1), text_bert.transpose(0, 1)], 1) + all_phoneme_ids = torch.cat([ref_seq, text_seq], 1) + bert = bert.unsqueeze(0) + prompt = prompt_semantic.unsqueeze(0) + return self.encoder(all_phoneme_ids, bert), prompt + + +class T2SModel(nn.Module): + def __init__(self, t2s_path, vits_model): + super().__init__() + dict_s1 = torch.load(t2s_path, map_location="cpu") + self.config = dict_s1["config"] + self.t2s_model = Text2SemanticLightningModule(self.config, "ojbk", is_train=False) + 
self.t2s_model.load_state_dict(dict_s1["weight"]) + self.t2s_model.eval() + self.vits_model = vits_model.vq_model + self.hz = 50 + self.max_sec = self.config["data"]["max_sec"] + self.t2s_model.model.top_k = torch.LongTensor([self.config["inference"]["top_k"]]) + self.t2s_model.model.early_stop_num = torch.LongTensor([self.hz * self.max_sec]) + self.t2s_model = self.t2s_model.model + self.t2s_model.init_onnx() + self.onnx_encoder = T2SEncoder(self.t2s_model, self.vits_model) + self.first_stage_decoder = self.t2s_model.first_stage_decoder + self.stage_decoder = self.t2s_model.stage_decoder + #self.t2s_model = torch.jit.script(self.t2s_model) + + def forward(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content): + early_stop_num = self.t2s_model.early_stop_num + + #[1,N] [1,N] [N, 1024] [N, 1024] [1, 768, N] + x, prompts = self.onnx_encoder(ref_seq, text_seq, ref_bert, text_bert, ssl_content) + + prefix_len = prompts.shape[1] + + #[1,N,512] [1,N] + y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts) + + stop = False + for idx in range(1, 1500): + #[1, N] [N_layer, N, 1, 512] [N_layer, N, 1, 512] [1, N, 512] [1] [1, N, 512] [1, N] + enco = self.stage_decoder(y, k, v, y_emb, x_example) + y, k, v, y_emb, logits, samples = enco + if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: + stop = True + if torch.argmax(logits, dim=-1)[0] == self.t2s_model.EOS or samples[0, 0] == self.t2s_model.EOS: + stop = True + if stop: + break + y[0, -1] = 0 + + return y[:, -idx:].unsqueeze(0) + + def export(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content, project_name, dynamo=False): + #self.onnx_encoder = torch.jit.script(self.onnx_encoder) + if dynamo: + export_options = torch.onnx.ExportOptions(dynamic_shapes=True) + onnx_encoder_export_output = torch.onnx.dynamo_export( + self.onnx_encoder, + (ref_seq, text_seq, ref_bert, text_bert, ssl_content), + export_options=export_options + ) + onnx_encoder_export_output.save(f"onnx/{project_name}/{project_name}_t2s_encoder.onnx") + return + + torch.onnx.export( + self.onnx_encoder, + (ref_seq, text_seq, ref_bert, text_bert, ssl_content), + f"onnx/{project_name}/{project_name}_t2s_encoder.onnx", + input_names=["ref_seq", "text_seq", "ref_bert", "text_bert", "ssl_content"], + output_names=["x", "prompts"], + dynamic_axes={ + "ref_seq": {1 : "ref_length"}, + "text_seq": {1 : "text_length"}, + "ref_bert": {0 : "ref_length"}, + "text_bert": {0 : "text_length"}, + "ssl_content": {2 : "ssl_length"}, + }, + opset_version=16 + ) + x, prompts = self.onnx_encoder(ref_seq, text_seq, ref_bert, text_bert, ssl_content) + + torch.onnx.export( + self.first_stage_decoder, + (x, prompts), + f"onnx/{project_name}/{project_name}_t2s_fsdec.onnx", + input_names=["x", "prompts"], + output_names=["y", "k", "v", "y_emb", "x_example"], + dynamic_axes={ + "x": {1 : "x_length"}, + "prompts": {1 : "prompts_length"}, + }, + verbose=False, + opset_version=16 + ) + y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts) + + torch.onnx.export( + self.stage_decoder, + (y, k, v, y_emb, x_example), + f"onnx/{project_name}/{project_name}_t2s_sdec.onnx", + input_names=["iy", "ik", "iv", "iy_emb", "ix_example"], + output_names=["y", "k", "v", "y_emb", "logits", "samples"], + dynamic_axes={ + "iy": {1 : "iy_length"}, + "ik": {1 : "ik_length"}, + "iv": {1 : "iv_length"}, + "iy_emb": {1 : "iy_emb_length"}, + "ix_example": {1 : "ix_example_length"}, + }, + verbose=False, + opset_version=16 + ) + + +class VitsModel(nn.Module): + def __init__(self, 
vits_path): + super().__init__() + dict_s2 = torch.load(vits_path,map_location="cpu") + self.hps = dict_s2["config"] + self.hps = DictToAttrRecursive(self.hps) + self.hps.model.semantic_frame_rate = "25hz" + self.vq_model = SynthesizerTrn( + self.hps.data.filter_length // 2 + 1, + self.hps.train.segment_size // self.hps.data.hop_length, + n_speakers=self.hps.data.n_speakers, + **self.hps.model + ) + self.vq_model.eval() + self.vq_model.load_state_dict(dict_s2["weight"], strict=False) + + def forward(self, text_seq, pred_semantic, ref_audio): + refer = spectrogram_torch( + ref_audio, + self.hps.data.filter_length, + self.hps.data.sampling_rate, + self.hps.data.hop_length, + self.hps.data.win_length, + center=False + ) + return self.vq_model(pred_semantic, text_seq, refer)[0, 0] + + +class GptSoVits(nn.Module): + def __init__(self, vits, t2s): + super().__init__() + self.vits = vits + self.t2s = t2s + + def forward(self, ref_seq, text_seq, ref_bert, text_bert, ref_audio, ssl_content, debug=False): + pred_semantic = self.t2s(ref_seq, text_seq, ref_bert, text_bert, ssl_content) + audio = self.vits(text_seq, pred_semantic, ref_audio) + if debug: + import onnxruntime + sess = onnxruntime.InferenceSession("onnx/koharu/koharu_vits.onnx", providers=["CPU"]) + audio1 = sess.run(None, { + "text_seq" : text_seq.detach().cpu().numpy(), + "pred_semantic" : pred_semantic.detach().cpu().numpy(), + "ref_audio" : ref_audio.detach().cpu().numpy() + }) + return audio, audio1 + return audio + + def export(self, ref_seq, text_seq, ref_bert, text_bert, ref_audio, ssl_content, project_name): + self.t2s.export(ref_seq, text_seq, ref_bert, text_bert, ssl_content, project_name) + pred_semantic = self.t2s(ref_seq, text_seq, ref_bert, text_bert, ssl_content) + torch.onnx.export( + self.vits, + (text_seq, pred_semantic, ref_audio), + f"onnx/{project_name}/{project_name}_vits.onnx", + input_names=["text_seq", "pred_semantic", "ref_audio"], + output_names=["audio"], + dynamic_axes={ + "text_seq": {1 : "text_length"}, + "pred_semantic": {2 : "pred_length"}, + "ref_audio": {1 : "audio_length"}, + }, + opset_version=17, + verbose=False + ) + + +class SSLModel(nn.Module): + def __init__(self): + super().__init__() + self.ssl = ssl_model + + def forward(self, ref_audio_16k): + return self.ssl.model(ref_audio_16k)["last_hidden_state"].transpose(1, 2) + + +def export(vits_path, gpt_path, project_name): + vits = VitsModel(vits_path) + gpt = T2SModel(gpt_path, vits) + gpt_sovits = GptSoVits(vits, gpt) + ssl = SSLModel() + ref_seq = torch.LongTensor([cleaned_text_to_sequence(["n", "i2", "h", "ao3", ",", "w", "o3", "sh", "i4", "b", "ai2", "y", "e4"])]) + text_seq = torch.LongTensor([cleaned_text_to_sequence(["w", "o3", "sh", "i4", "b", "ai2", "y", "e4", "w", "o3", "sh", "i4", "b", "ai2", "y", "e4", "w", "o3", "sh", "i4", "b", "ai2", "y", "e4"])]) + ref_bert = torch.randn((ref_seq.shape[1], 1024)).float() + text_bert = torch.randn((text_seq.shape[1], 1024)).float() + ref_audio = torch.randn((1, 48000 * 5)).float() + # ref_audio = torch.tensor([load_audio("rec.wav", 48000)]).float() + ref_audio_16k = torchaudio.functional.resample(ref_audio,48000,16000).float() + ref_audio_sr = torchaudio.functional.resample(ref_audio,48000,vits.hps.data.sampling_rate).float() + + try: + os.mkdir(f"onnx/{project_name}") + except: + pass + + ssl_content = ssl(ref_audio_16k).float() + + debug = False + + if debug: + a, b = gpt_sovits(ref_seq, text_seq, ref_bert, text_bert, ref_audio_sr, ssl_content, debug=debug) + soundfile.write("out1.wav", 
a.cpu().detach().numpy(), vits.hps.data.sampling_rate) + soundfile.write("out2.wav", b[0], vits.hps.data.sampling_rate) + return + + a = gpt_sovits(ref_seq, text_seq, ref_bert, text_bert, ref_audio_sr, ssl_content).detach().cpu().numpy() + + soundfile.write("out.wav", a, vits.hps.data.sampling_rate) + + gpt_sovits.export(ref_seq, text_seq, ref_bert, text_bert, ref_audio_sr, ssl_content, project_name) + + MoeVSConf = { + "Folder" : f"{project_name}", + "Name" : f"{project_name}", + "Type" : "GPT-SoVits", + "Rate" : vits.hps.data.sampling_rate, + "NumLayers": gpt.t2s_model.num_layers, + "EmbeddingDim": gpt.t2s_model.embedding_dim, + "Dict": "BasicDict", + "BertPath": "chinese-roberta-wwm-ext-large", + "Symbol": symbols, + "AddBlank": False + } + + MoeVSConfJson = json.dumps(MoeVSConf) + with open(f"onnx/{project_name}.json", 'w') as MoeVsConfFile: + json.dump(MoeVSConf, MoeVsConfFile, indent = 4) + + +if __name__ == "__main__": + try: + os.mkdir("onnx") + except: + pass + + gpt_path = "GPT_weights/nahida-e25.ckpt" + vits_path = "SoVITS_weights/nahida_e30_s3930.pth" + exp_path = "nahida" + export(vits_path, gpt_path, exp_path) + + # soundfile.write("out.wav", a, vits.hps.data.sampling_rate) \ No newline at end of file diff --git a/prepare_datasets/1-get-text.py b/prepare_datasets/1-get-text.py new file mode 100644 index 0000000000000000000000000000000000000000..5873164a8bf54ee526afa7869e0c23c6a05c5681 --- /dev/null +++ b/prepare_datasets/1-get-text.py @@ -0,0 +1,131 @@ +# -*- coding: utf-8 -*- + +import os + +inp_text = os.environ.get("inp_text") +inp_wav_dir = os.environ.get("inp_wav_dir") +exp_name = os.environ.get("exp_name") +i_part = os.environ.get("i_part") +all_parts = os.environ.get("all_parts") +os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("_CUDA_VISIBLE_DEVICES") +opt_dir = os.environ.get("opt_dir") +bert_pretrained_dir = os.environ.get("bert_pretrained_dir") +is_half = eval(os.environ.get("is_half", "True")) +import sys, numpy as np, traceback, pdb +import os.path +from glob import glob +from tqdm import tqdm +from text.cleaner import clean_text +import torch +from transformers import AutoModelForMaskedLM, AutoTokenizer +import numpy as np + +# inp_text=sys.argv[1] +# inp_wav_dir=sys.argv[2] +# exp_name=sys.argv[3] +# i_part=sys.argv[4] +# all_parts=sys.argv[5] +# os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6]#i_gpu +# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name +# bert_pretrained_dir="/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large" + +from time import time as ttime +import shutil + + +def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path + dir=os.path.dirname(path) + name=os.path.basename(path) + # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part) + tmp_path="%s%s.pth"%(ttime(),i_part) + torch.save(fea,tmp_path) + shutil.move(tmp_path,"%s/%s"%(dir,name)) + + +txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part) +if os.path.exists(txt_path) == False: + bert_dir = "%s/3-bert" % (opt_dir) + os.makedirs(opt_dir, exist_ok=True) + os.makedirs(bert_dir, exist_ok=True) + if torch.cuda.is_available(): + device = "cuda:0" + elif torch.backends.mps.is_available(): + device = "mps" + else: + device = "cpu" + tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir) + bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir) + if is_half == True: + bert_model = bert_model.half().to(device) + else: + bert_model = bert_model.to(device) + + def get_bert_feature(text, word2ph): 
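+        """Return phone-level BERT features for `text`.
+
+        The normalized text is tokenized and passed through the pretrained masked LM with
+        output_hidden_states=True; the third-to-last hidden layer is taken and the [CLS]/[SEP]
+        positions are dropped ([1:-1]), giving one hidden vector per character (1024-dim for
+        chinese-roberta-wwm-ext-large). Each character vector is then repeated word2ph[i] times
+        so the output aligns one-to-one with the phoneme sequence, and the result is returned
+        transposed as [hidden_dim, n_phones].
+        """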
+ with torch.no_grad(): + inputs = tokenizer(text, return_tensors="pt") + for i in inputs: + inputs[i] = inputs[i].to(device) + res = bert_model(**inputs, output_hidden_states=True) + res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1] + + assert len(word2ph) == len(text) + phone_level_feature = [] + for i in range(len(word2ph)): + repeat_feature = res[i].repeat(word2ph[i], 1) + phone_level_feature.append(repeat_feature) + + phone_level_feature = torch.cat(phone_level_feature, dim=0) + + return phone_level_feature.T + + def process(data, res): + for name, text, lan in data: + try: + name = os.path.basename(name) + phones, word2ph, norm_text = clean_text( + text.replace("%", "-").replace("¥", ","), lan + ) + path_bert = "%s/%s.pt" % (bert_dir, name) + if os.path.exists(path_bert) == False and lan == "zh": + bert_feature = get_bert_feature(norm_text, word2ph) + assert bert_feature.shape[-1] == len(phones) + # torch.save(bert_feature, path_bert) + my_save(bert_feature, path_bert) + phones = " ".join(phones) + # res.append([name,phones]) + res.append([name, phones, word2ph, norm_text]) + except: + print(name, text, traceback.format_exc()) + + todo = [] + res = [] + with open(inp_text, "r", encoding="utf8") as f: + lines = f.read().strip("\n").split("\n") + + language_v1_to_language_v2 = { + "ZH": "zh", + "zh": "zh", + "JP": "ja", + "jp": "ja", + "JA": "ja", + "ja": "ja", + "EN": "en", + "en": "en", + "En": "en", + } + for line in lines[int(i_part) :: int(all_parts)]: + try: + wav_name, spk_name, language, text = line.split("|") + # todo.append([name,text,"zh"]) + todo.append( + [wav_name, text, language_v1_to_language_v2.get(language, language)] + ) + except: + print(line, traceback.format_exc()) + + process(todo, res) + opt = [] + for name, phones, word2ph, norm_text in res: + opt.append("%s\t%s\t%s\t%s" % (name, phones, word2ph, norm_text)) + with open(txt_path, "w", encoding="utf8") as f: + f.write("\n".join(opt) + "\n") diff --git a/prepare_datasets/2-get-hubert-wav32k.py b/prepare_datasets/2-get-hubert-wav32k.py new file mode 100644 index 0000000000000000000000000000000000000000..7607259ec2dce9212643eec652c23a82468b6557 --- /dev/null +++ b/prepare_datasets/2-get-hubert-wav32k.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- + +import sys,os +inp_text= os.environ.get("inp_text") +inp_wav_dir= os.environ.get("inp_wav_dir") +exp_name= os.environ.get("exp_name") +i_part= os.environ.get("i_part") +all_parts= os.environ.get("all_parts") +os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES") +from feature_extractor import cnhubert +opt_dir= os.environ.get("opt_dir") +cnhubert.cnhubert_base_path= os.environ.get("cnhubert_base_dir") +is_half=eval(os.environ.get("is_half","True")) + +import pdb,traceback,numpy as np,logging +from scipy.io import wavfile +import librosa,torch +now_dir = os.getcwd() +sys.path.append(now_dir) +from my_utils import load_audio + +# from config import cnhubert_base_path +# cnhubert.cnhubert_base_path=cnhubert_base_path +# inp_text=sys.argv[1] +# inp_wav_dir=sys.argv[2] +# exp_name=sys.argv[3] +# i_part=sys.argv[4] +# all_parts=sys.argv[5] +# os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6] +# cnhubert.cnhubert_base_path=sys.argv[7] +# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name + +from time import time as ttime +import shutil +def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path + dir=os.path.dirname(path) + name=os.path.basename(path) + # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part) + 
tmp_path="%s%s.pth"%(ttime(),i_part) + torch.save(fea,tmp_path) + shutil.move(tmp_path,"%s/%s"%(dir,name)) + +hubert_dir="%s/4-cnhubert"%(opt_dir) +wav32dir="%s/5-wav32k"%(opt_dir) +os.makedirs(opt_dir,exist_ok=True) +os.makedirs(hubert_dir,exist_ok=True) +os.makedirs(wav32dir,exist_ok=True) + +maxx=0.95 +alpha=0.5 +if torch.cuda.is_available(): + device = "cuda:0" +elif torch.backends.mps.is_available(): + device = "mps" +else: + device = "cpu" +model=cnhubert.get_model() +# is_half=False +if(is_half==True): + model=model.half().to(device) +else: + model = model.to(device) + +nan_fails=[] +def name2go(wav_name,wav_path): + hubert_path="%s/%s.pt"%(hubert_dir,wav_name) + if(os.path.exists(hubert_path)):return + tmp_audio = load_audio(wav_path, 32000) + tmp_max = np.abs(tmp_audio).max() + if tmp_max > 2.2: + print("%s-filtered,%s" % (wav_name, tmp_max)) + return + tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha*32768)) + ((1 - alpha)*32768) * tmp_audio + tmp_audio32b = (tmp_audio / tmp_max * (maxx * alpha*1145.14)) + ((1 - alpha)*1145.14) * tmp_audio + tmp_audio = librosa.resample( + tmp_audio32b, orig_sr=32000, target_sr=16000 + )#不是重采样问题 + tensor_wav16 = torch.from_numpy(tmp_audio) + if (is_half == True): + tensor_wav16=tensor_wav16.half().to(device) + else: + tensor_wav16 = tensor_wav16.to(device) + ssl=model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1,2).cpu()#torch.Size([1, 768, 215]) + if np.isnan(ssl.detach().numpy()).sum()!= 0: + nan_fails.append(wav_name) + print("nan filtered:%s"%wav_name) + return + wavfile.write( + "%s/%s"%(wav32dir,wav_name), + 32000, + tmp_audio32.astype("int16"), + ) + my_save(ssl,hubert_path ) + +with open(inp_text,"r",encoding="utf8")as f: + lines=f.read().strip("\n").split("\n") + +for line in lines[int(i_part)::int(all_parts)]: + try: + # wav_name,text=line.split("\t") + wav_name, spk_name, language, text = line.split("|") + if (inp_wav_dir != "" and inp_wav_dir != None): + wav_name = os.path.basename(wav_name) + wav_path = "%s/%s"%(inp_wav_dir, wav_name) + + else: + wav_path=wav_name + wav_name = os.path.basename(wav_name) + name2go(wav_name,wav_path) + except: + print(line,traceback.format_exc()) + +if(len(nan_fails)>0 and is_half==True): + is_half=False + model=model.float() + for wav_name in nan_fails: + try: + name2go(wav_name) + except: + print(wav_name,traceback.format_exc()) diff --git a/prepare_datasets/3-get-semantic.py b/prepare_datasets/3-get-semantic.py new file mode 100644 index 0000000000000000000000000000000000000000..9ab56a4879409b2e25ddd5f6a76acaaefdd28d4f --- /dev/null +++ b/prepare_datasets/3-get-semantic.py @@ -0,0 +1,95 @@ +import os + +inp_text = os.environ.get("inp_text") +exp_name = os.environ.get("exp_name") +i_part = os.environ.get("i_part") +all_parts = os.environ.get("all_parts") +os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("_CUDA_VISIBLE_DEVICES") +opt_dir = os.environ.get("opt_dir") +pretrained_s2G = os.environ.get("pretrained_s2G") +s2config_path = os.environ.get("s2config_path") +is_half = eval(os.environ.get("is_half", "True")) +import math, traceback +import multiprocessing +import sys, pdb + +now_dir = os.getcwd() +sys.path.append(now_dir) +from random import shuffle +import torch.multiprocessing as mp +from glob import glob +from tqdm import tqdm +import logging, librosa, utils, torch +from module.models import SynthesizerTrn + +logging.getLogger("numba").setLevel(logging.WARNING) +# from config import pretrained_s2G + +# inp_text=sys.argv[1] +# exp_name=sys.argv[2] +# i_part=sys.argv[3] 
+# all_parts=sys.argv[4] +# os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[5] +# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name + + +hubert_dir = "%s/4-cnhubert" % (opt_dir) +semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part) +if os.path.exists(semantic_path) == False: + os.makedirs(opt_dir, exist_ok=True) + + if torch.cuda.is_available(): + device = "cuda" + elif torch.backends.mps.is_available(): + device = "mps" + else: + device = "cpu" + hps = utils.get_hparams_from_file(s2config_path) + vq_model = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model + ) + if is_half == True: + vq_model = vq_model.half().to(device) + else: + vq_model = vq_model.to(device) + vq_model.eval() + # utils.load_checkpoint(utils.latest_checkpoint_path(hps.s2_ckpt_dir, "G_*.pth"), vq_model, None, True) + # utils.load_checkpoint(pretrained_s2G, vq_model, None, True) + print( + vq_model.load_state_dict( + torch.load(pretrained_s2G, map_location="cpu")["weight"], strict=False + ) + ) + + def name2go(wav_name, lines): + hubert_path = "%s/%s.pt" % (hubert_dir, wav_name) + if os.path.exists(hubert_path) == False: + return + ssl_content = torch.load(hubert_path, map_location="cpu") + if is_half == True: + ssl_content = ssl_content.half().to(device) + else: + ssl_content = ssl_content.to(device) + codes = vq_model.extract_latent(ssl_content) + semantic = " ".join([str(i) for i in codes[0, 0, :].tolist()]) + lines.append("%s\t%s" % (wav_name, semantic)) + + with open(inp_text, "r", encoding="utf8") as f: + lines = f.read().strip("\n").split("\n") + + lines1 = [] + for line in lines[int(i_part) :: int(all_parts)]: + # print(line) + try: + # wav_name,text=line.split("\t") + wav_name, spk_name, language, text = line.split("|") + wav_name = os.path.basename(wav_name) + # name2go(name,lines1) + name2go(wav_name, lines1) + except: + print(line, traceback.format_exc()) + with open(semantic_path, "w", encoding="utf8") as f: + f.write("\n".join(lines1)) diff --git a/process_ckpt.py b/process_ckpt.py new file mode 100644 index 0000000000000000000000000000000000000000..3a436f1053b82b1149de2a4eacddcedcda6d4908 --- /dev/null +++ b/process_ckpt.py @@ -0,0 +1,31 @@ +import traceback +from collections import OrderedDict +from time import time as ttime +import shutil,os +import torch +from tools.i18n.i18n import I18nAuto + +i18n = I18nAuto() + +def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path + dir=os.path.dirname(path) + name=os.path.basename(path) + tmp_path="%s.pth"%(ttime()) + torch.save(fea,tmp_path) + shutil.move(tmp_path,"%s/%s"%(dir,name)) + +def savee(ckpt, name, epoch, steps, hps): + try: + opt = OrderedDict() + opt["weight"] = {} + for key in ckpt.keys(): + if "enc_q" in key: + continue + opt["weight"][key] = ckpt[key].half() + opt["config"] = hps + opt["info"] = "%sepoch_%siteration" % (epoch, steps) + # torch.save(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) + my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) + return "Success." 
+ except: + return traceback.format_exc() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f930016c815fc0d262c38bf4c76838f7fe5641d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,29 @@ +numpy +scipy +tensorboard +librosa==0.9.2 +numba==0.56.4 +pytorch-lightning +gradio==3.38.0 +gradio_client==0.8.1 +ffmpeg-python +onnxruntime +tqdm +funasr==1.0.0 +cn2an +pypinyin +g2p_en +modelscope==1.10.0 +sentencepiece +transformers +chardet +PyYAML +psutil +jieba_fast +jieba +Faster_Whisper +LangSegment +feature_extractor +module +py3langid +torch \ No newline at end of file diff --git a/s1_train.py b/s1_train.py new file mode 100644 index 0000000000000000000000000000000000000000..fb2735424cc979d6f0ad64a47f78f9adcb02c615 --- /dev/null +++ b/s1_train.py @@ -0,0 +1,180 @@ +# modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/train_t2s.py +import os +import pdb + +if "_CUDA_VISIBLE_DEVICES" in os.environ: + os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] +import argparse +import logging +from pathlib import Path + +import torch, platform +from pytorch_lightning import seed_everything +from pytorch_lightning import Trainer +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.loggers import TensorBoardLogger # WandbLogger +from pytorch_lightning.strategies import DDPStrategy +from AR.data.data_module import Text2SemanticDataModule +from AR.models.t2s_lightning_module import Text2SemanticLightningModule +from AR.utils.io import load_yaml_config + +logging.getLogger("numba").setLevel(logging.WARNING) +logging.getLogger("matplotlib").setLevel(logging.WARNING) +torch.set_float32_matmul_precision("high") +from AR.utils import get_newest_ckpt + +from collections import OrderedDict +from time import time as ttime +import shutil +def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path + dir=os.path.dirname(path) + name=os.path.basename(path) + tmp_path="%s.pth"%(ttime()) + torch.save(fea,tmp_path) + shutil.move(tmp_path,"%s/%s"%(dir,name)) + + +class my_model_ckpt(ModelCheckpoint): + def __init__( + self, + config, + if_save_latest, + if_save_every_weights, + half_weights_save_dir, + exp_name, + **kwargs + ): + super().__init__(**kwargs) + self.if_save_latest = if_save_latest + self.if_save_every_weights = if_save_every_weights + self.half_weights_save_dir = half_weights_save_dir + self.exp_name = exp_name + self.config = config + + def on_train_epoch_end(self, trainer, pl_module): + # if not self._should_skip_saving_checkpoint(trainer) and self._should_save_on_train_epoch_end(trainer): + if self._should_save_on_train_epoch_end(trainer): + monitor_candidates = self._monitor_candidates(trainer) + if ( + self._every_n_epochs >= 1 + and (trainer.current_epoch + 1) % self._every_n_epochs == 0 + ): + if ( + self.if_save_latest == True + ): ####如果设置只保存最后一个ckpt,在保存下一个ckpt后要清理掉之前的所有ckpt + to_clean = list(os.listdir(self.dirpath)) + self._save_topk_checkpoint(trainer, monitor_candidates) + if self.if_save_latest == True: + for name in to_clean: + try: + os.remove("%s/%s" % (self.dirpath, name)) + except: + pass + if self.if_save_every_weights == True: + to_save_od = OrderedDict() + to_save_od["weight"] = OrderedDict() + dictt = trainer.strategy._lightning_module.state_dict() + for key in dictt: + to_save_od["weight"][key] = dictt[key].half() + to_save_od["config"] = self.config + to_save_od["info"] = "GPT-e%s" % (trainer.current_epoch + 1) + # torch.save( + 
my_save( + to_save_od, + "%s/%s-e%s.ckpt" + % ( + self.half_weights_save_dir, + self.exp_name, + trainer.current_epoch + 1, + ), + ) + self._save_last_checkpoint(trainer, monitor_candidates) + + +def main(args): + config = load_yaml_config(args.config_file) + + output_dir = Path(config["output_dir"]) + output_dir.mkdir(parents=True, exist_ok=True) + + ckpt_dir = output_dir / "ckpt" + ckpt_dir.mkdir(parents=True, exist_ok=True) + + seed_everything(config["train"]["seed"], workers=True) + ckpt_callback: ModelCheckpoint = my_model_ckpt( + config=config, + if_save_latest=config["train"]["if_save_latest"], + if_save_every_weights=config["train"]["if_save_every_weights"], + half_weights_save_dir=config["train"]["half_weights_save_dir"], + exp_name=config["train"]["exp_name"], + save_top_k=-1, + monitor="top_3_acc", + mode="max", + save_on_train_epoch_end=True, + every_n_epochs=config["train"]["save_every_n_epoch"], + dirpath=ckpt_dir, + ) + logger = TensorBoardLogger(name=output_dir.stem, save_dir=output_dir) + os.environ["MASTER_ADDR"]="localhost" + trainer: Trainer = Trainer( + max_epochs=config["train"]["epochs"], + accelerator="gpu", + # val_check_interval=9999999999999999999999,###不要验证 + # check_val_every_n_epoch=None, + limit_val_batches=0, + devices=-1, + benchmark=False, + fast_dev_run=False, + strategy = "auto" if torch.backends.mps.is_available() else DDPStrategy( + process_group_backend="nccl" if platform.system() != "Windows" else "gloo" + ), # mps 不支持多节点训练 + precision=config["train"]["precision"], + logger=logger, + num_sanity_val_steps=0, + callbacks=[ckpt_callback], + ) + + model: Text2SemanticLightningModule = Text2SemanticLightningModule( + config, output_dir + ) + + data_module: Text2SemanticDataModule = Text2SemanticDataModule( + config, + train_semantic_path=config["train_semantic_path"], + train_phoneme_path=config["train_phoneme_path"], + # dev_semantic_path=args.dev_semantic_path, + # dev_phoneme_path=args.dev_phoneme_path + ) + + try: + # 使用正则表达式匹配文件名中的数字部分,并按数字大小进行排序 + newest_ckpt_name = get_newest_ckpt(os.listdir(ckpt_dir)) + ckpt_path = ckpt_dir / newest_ckpt_name + except Exception: + ckpt_path = None + print("ckpt_path:", ckpt_path) + trainer.fit(model, data_module, ckpt_path=ckpt_path) + + +# srun --gpus-per-node=1 --ntasks-per-node=1 python train.py --path-to-configuration configurations/default.yaml +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-c", + "--config_file", + type=str, + default="configs/s1longer.yaml", + help="path of config file", + ) + # args for dataset + # parser.add_argument('--train_semantic_path',type=str,default='/data/docker/liujing04/gpt-vits/fine_tune_dataset/xuangou/6-name2semantic.tsv') + # parser.add_argument('--train_phoneme_path', type=str, default='/data/docker/liujing04/gpt-vits/fine_tune_dataset/xuangou/2-name2text.txt') + + # parser.add_argument('--dev_semantic_path', type=str, default='dump_mix/semantic_dev.tsv') + # parser.add_argument('--dev_phoneme_path', type=str, default='dump_mix/phoneme_dev.npy') + # parser.add_argument('--output_dir',type=str,default='/data/docker/liujing04/gpt-vits/fine_tune_dataset/xuangou/logs_s1',help='directory to save the results') + # parser.add_argument('--output_dir',type=str,default='/liujing04/gpt_logs/s1/xuangou_ft',help='directory to save the results') + + args = parser.parse_args() + logging.info(str(args)) + main(args) diff --git a/s2_train.py b/s2_train.py new file mode 100644 index 
0000000000000000000000000000000000000000..e6b64f6b5cc369f58308ec1f73ad26fc98926008 --- /dev/null +++ b/s2_train.py @@ -0,0 +1,600 @@ +import utils, os + +hps = utils.get_hparams(stage=2) +os.environ["CUDA_VISIBLE_DEVICES"] = hps.train.gpu_numbers.replace("-", ",") +import torch +from torch.nn import functional as F +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +import torch.multiprocessing as mp +import torch.distributed as dist, traceback +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.cuda.amp import autocast, GradScaler +from tqdm import tqdm +import logging, traceback + +logging.getLogger("matplotlib").setLevel(logging.INFO) +logging.getLogger("h5py").setLevel(logging.INFO) +logging.getLogger("numba").setLevel(logging.INFO) +from random import randint +from module import commons + +from module.data_utils import ( + TextAudioSpeakerLoader, + TextAudioSpeakerCollate, + DistributedBucketSampler, +) +from module.models import ( + SynthesizerTrn, + MultiPeriodDiscriminator, +) +from module.losses import generator_loss, discriminator_loss, feature_loss, kl_loss +from module.mel_processing import mel_spectrogram_torch, spec_to_mel_torch +from process_ckpt import savee + +torch.backends.cudnn.benchmark = False +torch.backends.cudnn.deterministic = False +###反正A100fp32更快,那试试tf32吧 +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True +torch.set_float32_matmul_precision("medium") # 最低精度但最快(也就快一丁点),对于结果造成不了影响 +# from config import pretrained_s2G,pretrained_s2D +global_step = 0 + + +def main(): + """Assume Single Node Multi GPUs Training Only""" + assert torch.cuda.is_available() or torch.backends.mps.is_available(), "Only GPU training is allowed." + + if torch.backends.mps.is_available(): + n_gpus = 1 + else: + n_gpus = torch.cuda.device_count() + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(randint(20000, 55555)) + + mp.spawn( + run, + nprocs=n_gpus, + args=( + n_gpus, + hps, + ), + ) + + +def run(rank, n_gpus, hps): + global global_step + if rank == 0: + logger = utils.get_logger(hps.data.exp_dir) + logger.info(hps) + # utils.check_git_hash(hps.s2_ckpt_dir) + writer = SummaryWriter(log_dir=hps.s2_ckpt_dir) + writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval")) + + dist.init_process_group( + backend = "gloo" if os.name == "nt" or torch.backends.mps.is_available() else "nccl", + init_method="env://", + world_size=n_gpus, + rank=rank, + ) + torch.manual_seed(hps.train.seed) + if torch.cuda.is_available(): + torch.cuda.set_device(rank) + + train_dataset = TextAudioSpeakerLoader(hps.data) ######## + train_sampler = DistributedBucketSampler( + train_dataset, + hps.train.batch_size, + [ + 32, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1000, + 1100, + 1200, + 1300, + 1400, + 1500, + 1600, + 1700, + 1800, + 1900, + ], + num_replicas=n_gpus, + rank=rank, + shuffle=True, + ) + collate_fn = TextAudioSpeakerCollate() + train_loader = DataLoader( + train_dataset, + num_workers=6, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + batch_sampler=train_sampler, + persistent_workers=True, + prefetch_factor=16, + ) + # if rank == 0: + # eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data, val=True) + # eval_loader = DataLoader(eval_dataset, num_workers=0, shuffle=False, + # batch_size=1, pin_memory=True, + # drop_last=False, collate_fn=collate_fn) + + net_g = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + 
hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ).cuda(rank) if torch.cuda.is_available() else SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model, + ).to("mps") + + net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) if torch.cuda.is_available() else MultiPeriodDiscriminator(hps.model.use_spectral_norm).to("mps") + for name, param in net_g.named_parameters(): + if not param.requires_grad: + print(name, "not requires_grad") + + te_p = list(map(id, net_g.enc_p.text_embedding.parameters())) + et_p = list(map(id, net_g.enc_p.encoder_text.parameters())) + mrte_p = list(map(id, net_g.enc_p.mrte.parameters())) + base_params = filter( + lambda p: id(p) not in te_p + et_p + mrte_p and p.requires_grad, + net_g.parameters(), + ) + + # te_p=net_g.enc_p.text_embedding.parameters() + # et_p=net_g.enc_p.encoder_text.parameters() + # mrte_p=net_g.enc_p.mrte.parameters() + + optim_g = torch.optim.AdamW( + # filter(lambda p: p.requires_grad, net_g.parameters()),###默认所有层lr一致 + [ + {"params": base_params, "lr": hps.train.learning_rate}, + { + "params": net_g.enc_p.text_embedding.parameters(), + "lr": hps.train.learning_rate * hps.train.text_low_lr_rate, + }, + { + "params": net_g.enc_p.encoder_text.parameters(), + "lr": hps.train.learning_rate * hps.train.text_low_lr_rate, + }, + { + "params": net_g.enc_p.mrte.parameters(), + "lr": hps.train.learning_rate * hps.train.text_low_lr_rate, + }, + ], + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) + optim_d = torch.optim.AdamW( + net_d.parameters(), + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) + if torch.cuda.is_available(): + net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True) + net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True) + else: + net_g = net_g.to("mps") + net_d = net_d.to("mps") + + try: # 如果能加载自动resume + _, _, _, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path("%s/logs_s2" % hps.data.exp_dir, "D_*.pth"), + net_d, + optim_d, + ) # D多半加载没事 + if rank == 0: + logger.info("loaded D") + # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) + _, _, _, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path("%s/logs_s2" % hps.data.exp_dir, "G_*.pth"), + net_g, + optim_g, + ) + global_step = (epoch_str - 1) * len(train_loader) + # epoch_str = 1 + # global_step = 0 + except: # 如果首次不能加载,加载pretrain + # traceback.print_exc() + epoch_str = 1 + global_step = 0 + if hps.train.pretrained_s2G != "": + if rank == 0: + logger.info("loaded pretrained %s" % hps.train.pretrained_s2G) + print( + net_g.module.load_state_dict( + torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], + strict=False, + ) if torch.cuda.is_available() else net_g.load_state_dict( + torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], + strict=False, + ) + ) ##测试不加载优化器 + if hps.train.pretrained_s2D != "": + if rank == 0: + logger.info("loaded pretrained %s" % hps.train.pretrained_s2D) + print( + net_d.module.load_state_dict( + torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"] + ) if torch.cuda.is_available() else net_d.load_state_dict( + torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"] + ) + ) + + # scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, 
last_epoch=epoch_str - 2) + # scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) + + scheduler_g = torch.optim.lr_scheduler.ExponentialLR( + optim_g, gamma=hps.train.lr_decay, last_epoch=-1 + ) + scheduler_d = torch.optim.lr_scheduler.ExponentialLR( + optim_d, gamma=hps.train.lr_decay, last_epoch=-1 + ) + for _ in range(epoch_str): + scheduler_g.step() + scheduler_d.step() + + scaler = GradScaler(enabled=hps.train.fp16_run) + + for epoch in range(epoch_str, hps.train.epochs + 1): + if rank == 0: + train_and_evaluate( + rank, + epoch, + hps, + [net_g, net_d], + [optim_g, optim_d], + [scheduler_g, scheduler_d], + scaler, + # [train_loader, eval_loader], logger, [writer, writer_eval]) + [train_loader, None], + logger, + [writer, writer_eval], + ) + else: + train_and_evaluate( + rank, + epoch, + hps, + [net_g, net_d], + [optim_g, optim_d], + [scheduler_g, scheduler_d], + scaler, + [train_loader, None], + None, + None, + ) + scheduler_g.step() + scheduler_d.step() + + +def train_and_evaluate( + rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers +): + net_g, net_d = nets + optim_g, optim_d = optims + # scheduler_g, scheduler_d = schedulers + train_loader, eval_loader = loaders + if writers is not None: + writer, writer_eval = writers + + train_loader.batch_sampler.set_epoch(epoch) + global global_step + + net_g.train() + net_d.train() + for batch_idx, ( + ssl, + ssl_lengths, + spec, + spec_lengths, + y, + y_lengths, + text, + text_lengths, + ) in tqdm(enumerate(train_loader)): + if torch.cuda.is_available(): + spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda( + rank, non_blocking=True + ) + y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda( + rank, non_blocking=True + ) + ssl = ssl.cuda(rank, non_blocking=True) + ssl.requires_grad = False + # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True) + text, text_lengths = text.cuda(rank, non_blocking=True), text_lengths.cuda( + rank, non_blocking=True + ) + else: + spec, spec_lengths = spec.to("mps"), spec_lengths.to("mps") + y, y_lengths = y.to("mps"), y_lengths.to("mps") + ssl = ssl.to("mps") + ssl.requires_grad = False + # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True) + text, text_lengths = text.to("mps"), text_lengths.to("mps") + + with autocast(enabled=hps.train.fp16_run): + ( + y_hat, + kl_ssl, + ids_slice, + x_mask, + z_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + stats_ssl, + ) = net_g(ssl, spec, spec_lengths, text, text_lengths) + + mel = spec_to_mel_torch( + spec, + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.mel_fmin, + hps.data.mel_fmax, + ) + y_mel = commons.slice_segments( + mel, ids_slice, hps.train.segment_size // hps.data.hop_length + ) + y_hat_mel = mel_spectrogram_torch( + y_hat.squeeze(1), + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + hps.data.mel_fmin, + hps.data.mel_fmax, + ) + + y = commons.slice_segments( + y, ids_slice * hps.data.hop_length, hps.train.segment_size + ) # slice + + # Discriminator + y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) + with autocast(enabled=False): + loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( + y_d_hat_r, y_d_hat_g + ) + loss_disc_all = loss_disc + optim_d.zero_grad() + scaler.scale(loss_disc_all).backward() + scaler.unscale_(optim_d) + grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) + 
scaler.step(optim_d) + + with autocast(enabled=hps.train.fp16_run): + # Generator + y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) + with autocast(enabled=False): + loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel + loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl + + loss_fm = feature_loss(fmap_r, fmap_g) + loss_gen, losses_gen = generator_loss(y_d_hat_g) + loss_gen_all = loss_gen + loss_fm + loss_mel + kl_ssl * 1 + loss_kl + + optim_g.zero_grad() + scaler.scale(loss_gen_all).backward() + scaler.unscale_(optim_g) + grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) + scaler.step(optim_g) + scaler.update() + + if rank == 0: + if global_step % hps.train.log_interval == 0: + lr = optim_g.param_groups[0]["lr"] + losses = [loss_disc, loss_gen, loss_fm, loss_mel, kl_ssl, loss_kl] + logger.info( + "Train Epoch: {} [{:.0f}%]".format( + epoch, 100.0 * batch_idx / len(train_loader) + ) + ) + logger.info([x.item() for x in losses] + [global_step, lr]) + + scalar_dict = { + "loss/g/total": loss_gen_all, + "loss/d/total": loss_disc_all, + "learning_rate": lr, + "grad_norm_d": grad_norm_d, + "grad_norm_g": grad_norm_g, + } + scalar_dict.update( + { + "loss/g/fm": loss_fm, + "loss/g/mel": loss_mel, + "loss/g/kl_ssl": kl_ssl, + "loss/g/kl": loss_kl, + } + ) + + # scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) + # scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) + # scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) + image_dict = { + "slice/mel_org": utils.plot_spectrogram_to_numpy( + y_mel[0].data.cpu().numpy() + ), + "slice/mel_gen": utils.plot_spectrogram_to_numpy( + y_hat_mel[0].data.cpu().numpy() + ), + "all/mel": utils.plot_spectrogram_to_numpy( + mel[0].data.cpu().numpy() + ), + "all/stats_ssl": utils.plot_spectrogram_to_numpy( + stats_ssl[0].data.cpu().numpy() + ), + } + utils.summarize( + writer=writer, + global_step=global_step, + images=image_dict, + scalars=scalar_dict, + ) + global_step += 1 + if epoch % hps.train.save_every_epoch == 0 and rank == 0: + if hps.train.if_save_latest == 0: + utils.save_checkpoint( + net_g, + optim_g, + hps.train.learning_rate, + epoch, + os.path.join( + "%s/logs_s2" % hps.data.exp_dir, "G_{}.pth".format(global_step) + ), + ) + utils.save_checkpoint( + net_d, + optim_d, + hps.train.learning_rate, + epoch, + os.path.join( + "%s/logs_s2" % hps.data.exp_dir, "D_{}.pth".format(global_step) + ), + ) + else: + utils.save_checkpoint( + net_g, + optim_g, + hps.train.learning_rate, + epoch, + os.path.join( + "%s/logs_s2" % hps.data.exp_dir, "G_{}.pth".format(233333333333) + ), + ) + utils.save_checkpoint( + net_d, + optim_d, + hps.train.learning_rate, + epoch, + os.path.join( + "%s/logs_s2" % hps.data.exp_dir, "D_{}.pth".format(233333333333) + ), + ) + if rank == 0 and hps.train.if_save_every_weights == True: + if hasattr(net_g, "module"): + ckpt = net_g.module.state_dict() + else: + ckpt = net_g.state_dict() + logger.info( + "saving ckpt %s_e%s:%s" + % ( + hps.name, + epoch, + savee( + ckpt, + hps.name + "_e%s_s%s" % (epoch, global_step), + epoch, + global_step, + hps, + ), + ) + ) + + if rank == 0: + logger.info("====> Epoch: {}".format(epoch)) + + +def evaluate(hps, generator, eval_loader, writer_eval): + generator.eval() + image_dict = {} + audio_dict = {} + print("Evaluating ...") + with torch.no_grad(): + for batch_idx, ( + ssl, + ssl_lengths, + spec, + spec_lengths, + y, + y_lengths, + text, + text_lengths, + ) 
in enumerate(eval_loader): + print(111) + if torch.cuda.is_available(): + spec, spec_lengths = spec.cuda(), spec_lengths.cuda() + y, y_lengths = y.cuda(), y_lengths.cuda() + ssl = ssl.cuda() + text, text_lengths = text.cuda(), text_lengths.cuda() + else: + spec, spec_lengths = spec.to("mps"), spec_lengths.to("mps") + y, y_lengths = y.to("mps"), y_lengths.to("mps") + ssl = ssl.to("mps") + text, text_lengths = text.to("mps"), text_lengths.to("mps") + for test in [0, 1]: + y_hat, mask, *_ = generator.module.infer( + ssl, spec, spec_lengths, text, text_lengths, test=test + ) if torch.cuda.is_available() else generator.infer( + ssl, spec, spec_lengths, text, text_lengths, test=test + ) + y_hat_lengths = mask.sum([1, 2]).long() * hps.data.hop_length + + mel = spec_to_mel_torch( + spec, + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.mel_fmin, + hps.data.mel_fmax, + ) + y_hat_mel = mel_spectrogram_torch( + y_hat.squeeze(1).float(), + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + hps.data.mel_fmin, + hps.data.mel_fmax, + ) + image_dict.update( + { + f"gen/mel_{batch_idx}_{test}": utils.plot_spectrogram_to_numpy( + y_hat_mel[0].cpu().numpy() + ) + } + ) + audio_dict.update( + {f"gen/audio_{batch_idx}_{test}": y_hat[0, :, : y_hat_lengths[0]]} + ) + image_dict.update( + { + f"gt/mel_{batch_idx}": utils.plot_spectrogram_to_numpy( + mel[0].cpu().numpy() + ) + } + ) + audio_dict.update({f"gt/audio_{batch_idx}": y[0, :, : y_lengths[0]]}) + + # y_hat, mask, *_ = generator.module.infer(ssl, spec_lengths, speakers, y=None) + # audio_dict.update({ + # f"gen/audio_{batch_idx}_style_pred": y_hat[0, :, :] + # }) + + utils.summarize( + writer=writer_eval, + global_step=global_step, + images=image_dict, + audios=audio_dict, + audio_sampling_rate=hps.data.sampling_rate, + ) + generator.train() + + +if __name__ == "__main__": + main() diff --git a/text/__init__.py b/text/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a48b9a49a39bdae1801d94a15db645154d6a68d1 --- /dev/null +++ b/text/__init__.py @@ -0,0 +1,15 @@ +from text.symbols import * + + +_symbol_to_id = {s: i for i, s in enumerate(symbols)} + +def cleaned_text_to_sequence(cleaned_text): + '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
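+ Inputs are expected to be already-cleaned phoneme symbol lists (e.g. the output of text.cleaner.clean_text); any symbol missing from text.symbols raises a KeyError.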
+ Args: + text: string to convert to a sequence + Returns: + List of integers corresponding to the symbols in the text + ''' + phones = [_symbol_to_id[symbol] for symbol in cleaned_text] + return phones + diff --git a/text/__pycache__/__init__.cpython-39.pyc b/text/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cee25cc6c7c0c0576759728cda108d145826e643 Binary files /dev/null and b/text/__pycache__/__init__.cpython-39.pyc differ diff --git a/text/__pycache__/chinese.cpython-39.pyc b/text/__pycache__/chinese.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12ba9ac614779aef5ec27c3674ec06d051ac483c Binary files /dev/null and b/text/__pycache__/chinese.cpython-39.pyc differ diff --git a/text/__pycache__/cleaner.cpython-39.pyc b/text/__pycache__/cleaner.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ebcdc62d687062b5b9ddb5c14b12c5b9b6204728 Binary files /dev/null and b/text/__pycache__/cleaner.cpython-39.pyc differ diff --git a/text/__pycache__/english.cpython-39.pyc b/text/__pycache__/english.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b163731d24929a5c12841e610dced0ce8cc9c32f Binary files /dev/null and b/text/__pycache__/english.cpython-39.pyc differ diff --git a/text/__pycache__/japanese.cpython-39.pyc b/text/__pycache__/japanese.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11ae41467febe7e9a9d53ae83ac7df53835cbbe5 Binary files /dev/null and b/text/__pycache__/japanese.cpython-39.pyc differ diff --git a/text/__pycache__/symbols.cpython-39.pyc b/text/__pycache__/symbols.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4340e6b534c7fc412c476d9d059e4ca3c14ac166 Binary files /dev/null and b/text/__pycache__/symbols.cpython-39.pyc differ diff --git a/text/__pycache__/tone_sandhi.cpython-39.pyc b/text/__pycache__/tone_sandhi.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c463fe267612f5675eaff612127fbfac27b2966f Binary files /dev/null and b/text/__pycache__/tone_sandhi.cpython-39.pyc differ diff --git a/text/chinese.py b/text/chinese.py new file mode 100644 index 0000000000000000000000000000000000000000..f9a4b360e484f5dd0570328aecae9df4d3bf8427 --- /dev/null +++ b/text/chinese.py @@ -0,0 +1,174 @@ +import os +import pdb +import re + +import cn2an +from pypinyin import lazy_pinyin, Style + +from text.symbols import punctuation +from text.tone_sandhi import ToneSandhi +from text.zh_normalization.text_normlization import TextNormalizer + +normalizer = lambda x: cn2an.transform(x, "an2cn") + +current_file_path = os.path.dirname(__file__) +pinyin_to_symbol_map = { + line.split("\t")[0]: line.strip().split("\t")[1] + for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines() +} + +import jieba_fast.posseg as psg + + +rep_map = { + ":": ",", + ";": ",", + ",": ",", + "。": ".", + "!": "!", + "?": "?", + "\n": ".", + "·": ",", + "、": ",", + "...": "…", + "$": ".", + "/": ",", + "—": "-", + "~": "…", + "~":"…", +} + +tone_modifier = ToneSandhi() + + +def replace_punctuation(text): + text = text.replace("嗯", "恩").replace("呣", "母") + pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) + + replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) + + replaced_text = re.sub( + r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text + ) + + return replaced_text + + +def g2p(text): + 
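+ # Split the normalized text into sentences at punctuation, then convert each sentence + # to phonemes via jieba_fast part-of-speech segmentation, pypinyin initials/finals and + # tone-sandhi post-processing; returns (phones, word2ph), where word2ph[i] is the number + # of phonemes produced by the i-th character of the normalized text.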
pattern = r"(?<=[{0}])\s*".format("".join(punctuation)) + sentences = [i for i in re.split(pattern, text) if i.strip() != ""] + phones, word2ph = _g2p(sentences) + return phones, word2ph + + +def _get_initials_finals(word): + initials = [] + finals = [] + orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS) + orig_finals = lazy_pinyin( + word, neutral_tone_with_five=True, style=Style.FINALS_TONE3 + ) + for c, v in zip(orig_initials, orig_finals): + initials.append(c) + finals.append(v) + return initials, finals + + +def _g2p(segments): + phones_list = [] + word2ph = [] + for seg in segments: + pinyins = [] + # Replace all English words in the sentence + seg = re.sub("[a-zA-Z]+", "", seg) + seg_cut = psg.lcut(seg) + initials = [] + finals = [] + seg_cut = tone_modifier.pre_merge_for_modify(seg_cut) + for word, pos in seg_cut: + if pos == "eng": + continue + sub_initials, sub_finals = _get_initials_finals(word) + sub_finals = tone_modifier.modified_tone(word, pos, sub_finals) + initials.append(sub_initials) + finals.append(sub_finals) + + # assert len(sub_initials) == len(sub_finals) == len(word) + initials = sum(initials, []) + finals = sum(finals, []) + # + for c, v in zip(initials, finals): + raw_pinyin = c + v + # NOTE: post process for pypinyin outputs + # we discriminate i, ii and iii + if c == v: + assert c in punctuation + phone = [c] + word2ph.append(1) + else: + v_without_tone = v[:-1] + tone = v[-1] + + pinyin = c + v_without_tone + assert tone in "12345" + + if c: + # 多音节 + v_rep_map = { + "uei": "ui", + "iou": "iu", + "uen": "un", + } + if v_without_tone in v_rep_map.keys(): + pinyin = c + v_rep_map[v_without_tone] + else: + # 单音节 + pinyin_rep_map = { + "ing": "ying", + "i": "yi", + "in": "yin", + "u": "wu", + } + if pinyin in pinyin_rep_map.keys(): + pinyin = pinyin_rep_map[pinyin] + else: + single_rep_map = { + "v": "yu", + "e": "e", + "i": "y", + "u": "w", + } + if pinyin[0] in single_rep_map.keys(): + pinyin = single_rep_map[pinyin[0]] + pinyin[1:] + + assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin) + new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ") + new_v = new_v + tone + phone = [new_c, new_v] + word2ph.append(len(phone)) + + phones_list += phone + return phones_list, word2ph + + +def text_normalize(text): + # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization + tx = TextNormalizer() + sentences = tx.normalize(text) + dest_text = "" + for sentence in sentences: + dest_text += replace_punctuation(sentence) + return dest_text + + +if __name__ == "__main__": + text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏" + text = "呣呣呣~就是…大人的鼹鼠党吧?" + text = "你好" + text = text_normalize(text) + print(g2p(text)) + + +# # 示例用法 +# text = "这是一个示例文本:,你好!这是一个测试..." 
+# print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试 diff --git a/text/cleaner.py b/text/cleaner.py new file mode 100644 index 0000000000000000000000000000000000000000..0d82d8506ab969d3a9a2f0df47a26124b1a5a462 --- /dev/null +++ b/text/cleaner.py @@ -0,0 +1,58 @@ +from text import chinese, japanese, cleaned_text_to_sequence, symbols, english + +language_module_map = {"zh": chinese, "ja": japanese, "en": english} +special = [ + # ("%", "zh", "SP"), + ("¥", "zh", "SP2"), + ("^", "zh", "SP3"), + # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧 +] + + +def clean_text(text, language): + if(language not in language_module_map): + language="en" + text=" " + for special_s, special_l, target_symbol in special: + if special_s in text and language == special_l: + return clean_special(text, language, special_s, target_symbol) + language_module = language_module_map[language] + norm_text = language_module.text_normalize(text) + if language == "zh": + phones, word2ph = language_module.g2p(norm_text) + assert len(phones) == sum(word2ph) + assert len(norm_text) == len(word2ph) + else: + phones = language_module.g2p(norm_text) + word2ph = None + + for ph in phones: + assert ph in symbols + return phones, word2ph, norm_text + + +def clean_special(text, language, special_s, target_symbol): + """ + 特殊静音段sp符号处理 + """ + text = text.replace(special_s, ",") + language_module = language_module_map[language] + norm_text = language_module.text_normalize(text) + phones = language_module.g2p(norm_text) + new_ph = [] + for ph in phones[0]: + assert ph in symbols + if ph == ",": + new_ph.append(target_symbol) + else: + new_ph.append(ph) + return new_ph, phones[1], norm_text + + +def text_to_sequence(text, language): + phones = clean_text(text) + return cleaned_text_to_sequence(phones) + + +if __name__ == "__main__": + print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh")) diff --git a/text/engdict-hot.rep b/text/engdict-hot.rep new file mode 100644 index 0000000000000000000000000000000000000000..22d4cd289d983755cb3e44fb3eab413d25b8b522 --- /dev/null +++ b/text/engdict-hot.rep @@ -0,0 +1 @@ +CHATGPT CH AE1 T JH IY1 P IY1 T IY1 \ No newline at end of file diff --git a/text/english.py b/text/english.py new file mode 100644 index 0000000000000000000000000000000000000000..90f48a55c93f9688de4116759a7151e3acf002e9 --- /dev/null +++ b/text/english.py @@ -0,0 +1,234 @@ +import pickle +import os +import re +from g2p_en import G2p + +from string import punctuation + +from text import symbols + +current_file_path = os.path.dirname(__file__) +CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep") +CMU_DICT_FAST_PATH = os.path.join(current_file_path, "cmudict-fast.rep") +CMU_DICT_HOT_PATH = os.path.join(current_file_path, "engdict-hot.rep") +CACHE_PATH = os.path.join(current_file_path, "engdict_cache.pickle") +_g2p = G2p() + +arpa = { + "AH0", + "S", + "AH1", + "EY2", + "AE2", + "EH0", + "OW2", + "UH0", + "NG", + "B", + "G", + "AY0", + "M", + "AA0", + "F", + "AO0", + "ER2", + "UH1", + "IY1", + "AH2", + "DH", + "IY0", + "EY1", + "IH0", + "K", + "N", + "W", + "IY2", + "T", + "AA1", + "ER1", + "EH2", + "OY0", + "UH2", + "UW1", + "Z", + "AW2", + "AW1", + "V", + "UW2", + "AA2", + "ER", + "AW0", + "UW0", + "R", + "OW1", + "EH1", + "ZH", + "AE0", + "IH2", + "IH", + "Y", + "JH", + "P", + "AY1", + "EY0", + "OY2", + "TH", + "HH", + "D", + "ER0", + "CH", + "AO1", + "AE1", + "AO2", + "OY1", + "AY2", + "IH1", + "OW0", + "L", + "SH", +} + + +def replace_phs(phs): + rep_map = {";": ",", ":": ",", "'": "-", '"': "-"} + phs_new = [] + for ph in phs: + if ph in symbols: 
+ phs_new.append(ph) + elif ph in rep_map.keys(): + phs_new.append(rep_map[ph]) + else: + print("ph not in symbols: ", ph) + return phs_new + + +def read_dict(): + g2p_dict = {} + start_line = 49 + with open(CMU_DICT_PATH) as f: + line = f.readline() + line_index = 1 + while line: + if line_index >= start_line: + line = line.strip() + word_split = line.split(" ") + word = word_split[0] + + syllable_split = word_split[1].split(" - ") + g2p_dict[word] = [] + for syllable in syllable_split: + phone_split = syllable.split(" ") + g2p_dict[word].append(phone_split) + + line_index = line_index + 1 + line = f.readline() + + return g2p_dict + + +def read_dict_new(): + g2p_dict = {} + with open(CMU_DICT_PATH) as f: + line = f.readline() + line_index = 1 + while line: + if line_index >= 49: + line = line.strip() + word_split = line.split(" ") + word = word_split[0] + + syllable_split = word_split[1].split(" - ") + g2p_dict[word] = [] + for syllable in syllable_split: + phone_split = syllable.split(" ") + g2p_dict[word].append(phone_split) + + line_index = line_index + 1 + line = f.readline() + + with open(CMU_DICT_FAST_PATH) as f: + line = f.readline() + line_index = 1 + while line: + if line_index >= 0: + line = line.strip() + word_split = line.split(" ") + word = word_split[0] + if word not in g2p_dict: + g2p_dict[word] = [] + g2p_dict[word].append(word_split[1:]) + + line_index = line_index + 1 + line = f.readline() + + with open(CMU_DICT_HOT_PATH) as f: + line = f.readline() + line_index = 1 + while line: + if line_index >= 0: + line = line.strip() + word_split = line.split(" ") + word = word_split[0] + #if word not in g2p_dict: + g2p_dict[word] = [] + g2p_dict[word].append(word_split[1:]) + + line_index = line_index + 1 + line = f.readline() + + return g2p_dict + + +def cache_dict(g2p_dict, file_path): + with open(file_path, "wb") as pickle_file: + pickle.dump(g2p_dict, pickle_file) + + +def get_dict(): + if os.path.exists(CACHE_PATH): + with open(CACHE_PATH, "rb") as pickle_file: + g2p_dict = pickle.load(pickle_file) + else: + g2p_dict = read_dict_new() + cache_dict(g2p_dict, CACHE_PATH) + + return g2p_dict + + +eng_dict = get_dict() + + +def text_normalize(text): + # todo: eng text normalize + return text.replace(";", ",") + + +def g2p(text): + phones = [] + words = re.split(r"([,;.\-\?\!\s+])", text) + for w in words: + if w.upper() in eng_dict: + phns = eng_dict[w.upper()] + for ph in phns: + phones += ph + else: + phone_list = list(filter(lambda p: p != " ", _g2p(w))) + for ph in phone_list: + if ph in arpa: + phones.append(ph) + else: + phones.append(ph) + + return replace_phs(phones) + + +if __name__ == "__main__": + # print(get_dict()) + print(g2p("hello")) + print(g2p("In this; paper, we propose 1 DSPGAN, a GAN-based universal vocoder.")) + # all_phones = set() + # for k, syllables in eng_dict.items(): + # for group in syllables: + # for ph in group: + # all_phones.add(ph) + # print(all_phones) diff --git a/text/japanese.py b/text/japanese.py new file mode 100644 index 0000000000000000000000000000000000000000..a571467c2295c2815dbb0024726d965d59081ac2 --- /dev/null +++ b/text/japanese.py @@ -0,0 +1,191 @@ +# modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py +import re +import sys + +import pyopenjtalk + + +from text import symbols +# Regular expression matching Japanese without punctuation marks: +_japanese_characters = re.compile( + r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" +) + +# Regular expression 
matching non-Japanese characters or punctuation marks: +_japanese_marks = re.compile( + r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]" +) + +# List of (symbol, Japanese) pairs for marks: +_symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]] + + +# List of (consonant, sokuon) pairs: +_real_sokuon = [ + (re.compile("%s" % x[0]), x[1]) + for x in [ + (r"Q([↑↓]*[kg])", r"k#\1"), + (r"Q([↑↓]*[tdjʧ])", r"t#\1"), + (r"Q([↑↓]*[sʃ])", r"s\1"), + (r"Q([↑↓]*[pb])", r"p#\1"), + ] +] + +# List of (consonant, hatsuon) pairs: +_real_hatsuon = [ + (re.compile("%s" % x[0]), x[1]) + for x in [ + (r"N([↑↓]*[pbm])", r"m\1"), + (r"N([↑↓]*[ʧʥj])", r"n^\1"), + (r"N([↑↓]*[tdn])", r"n\1"), + (r"N([↑↓]*[kg])", r"ŋ\1"), + ] +] + + +def post_replace_ph(ph): + rep_map = { + ":": ",", + ";": ",", + ",": ",", + "。": ".", + "!": "!", + "?": "?", + "\n": ".", + "·": ",", + "、": ",", + "...": "…", + } + if ph in rep_map.keys(): + ph = rep_map[ph] + if ph in symbols: + return ph + if ph not in symbols: + ph = "UNK" + return ph + + +def symbols_to_japanese(text): + for regex, replacement in _symbols_to_japanese: + text = re.sub(regex, replacement, text) + return text + + +def preprocess_jap(text, with_prosody=False): + """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html""" + text = symbols_to_japanese(text) + sentences = re.split(_japanese_marks, text) + marks = re.findall(_japanese_marks, text) + text = [] + for i, sentence in enumerate(sentences): + if re.match(_japanese_characters, sentence): + if with_prosody: + text += pyopenjtalk_g2p_prosody(sentence)[1:-1] + else: + p = pyopenjtalk.g2p(sentence) + text += p.split(" ") + + if i < len(marks): + if marks[i] == " ":# 防止意外的UNK + continue + text += [marks[i].replace(" ", "")] + return text + + +def text_normalize(text): + # todo: jap text normalize + return text + +# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py +def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True): + """Extract phoneme + prosoody symbol sequence from input full-context labels. + + The algorithm is based on `Prosodic features control by symbols as input of + sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks. + + Args: + text (str): Input text. + drop_unvoiced_vowels (bool): whether to drop unvoiced vowels. + + Returns: + List[str]: List of phoneme + prosody symbols. + + Examples: + >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody + >>> pyopenjtalk_g2p_prosody("こんにちは。") + ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$'] + + .. 
_`Prosodic features control by symbols as input of sequence-to-sequence acoustic + modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104 + + """ + labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text)) + N = len(labels) + + phones = [] + for n in range(N): + lab_curr = labels[n] + + # current phoneme + p3 = re.search(r"\-(.*?)\+", lab_curr).group(1) + # deal unvoiced vowels as normal vowels + if drop_unvoiced_vowels and p3 in "AEIOU": + p3 = p3.lower() + + # deal with sil at the beginning and the end of text + if p3 == "sil": + assert n == 0 or n == N - 1 + if n == 0: + phones.append("^") + elif n == N - 1: + # check question form or not + e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr) + if e3 == 0: + phones.append("$") + elif e3 == 1: + phones.append("?") + continue + elif p3 == "pau": + phones.append("_") + continue + else: + phones.append(p3) + + # accent type and position info (forward or backward) + a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr) + a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr) + a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr) + + # number of mora in accent phrase + f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr) + + a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1]) + # accent phrase border + if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl": + phones.append("#") + # pitch falling + elif a1 == 0 and a2_next == a2 + 1 and a2 != f1: + phones.append("]") + # pitch rising + elif a2 == 1 and a2_next == 2: + phones.append("[") + + return phones + +# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py +def _numeric_feature_by_regex(regex, s): + match = re.search(regex, s) + if match is None: + return -50 + return int(match.group(1)) + +def g2p(norm_text, with_prosody=False): + phones = preprocess_jap(norm_text, with_prosody) + phones = [post_replace_ph(i) for i in phones] + # todo: implement tones and word2ph + return phones + + +if __name__ == "__main__": + phones = g2p("こんにちは, hello, AKITOです,よろしくお願いしますね!") + print(phones) \ No newline at end of file diff --git a/text/opencpop-strict.txt b/text/opencpop-strict.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bede89a429e34b1a58781ee570fc313b80c0aee --- /dev/null +++ b/text/opencpop-strict.txt @@ -0,0 +1,429 @@ +a AA a +ai AA ai +an AA an +ang AA ang +ao AA ao +ba b a +bai b ai +ban b an +bang b ang +bao b ao +bei b ei +ben b en +beng b eng +bi b i +bian b ian +biao b iao +bie b ie +bin b in +bing b ing +bo b o +bu b u +ca c a +cai c ai +can c an +cang c ang +cao c ao +ce c e +cei c ei +cen c en +ceng c eng +cha ch a +chai ch ai +chan ch an +chang ch ang +chao ch ao +che ch e +chen ch en +cheng ch eng +chi ch ir +chong ch ong +chou ch ou +chu ch u +chua ch ua +chuai ch uai +chuan ch uan +chuang ch uang +chui ch ui +chun ch un +chuo ch uo +ci c i0 +cong c ong +cou c ou +cu c u +cuan c uan +cui c ui +cun c un +cuo c uo +da d a +dai d ai +dan d an +dang d ang +dao d ao +de d e +dei d ei +den d en +deng d eng +di d i +dia d ia +dian d ian +diao d iao +die d ie +ding d ing +diu d iu +dong d ong +dou d ou +du d u +duan d uan +dui d ui +dun d un +duo d uo +e EE e +ei EE ei +en EE en +eng EE eng +er EE er +fa f a +fan f an +fang f ang +fei f ei +fen f en +feng f eng +fo f o +fou f ou +fu f u +ga g a +gai g ai +gan g an +gang g ang +gao g ao +ge g e +gei g ei +gen g en +geng g eng +gong g ong +gou g ou +gu g u +gua g ua +guai g uai +guan g uan +guang g uang +gui g ui +gun g un +guo 
g uo +ha h a +hai h ai +han h an +hang h ang +hao h ao +he h e +hei h ei +hen h en +heng h eng +hong h ong +hou h ou +hu h u +hua h ua +huai h uai +huan h uan +huang h uang +hui h ui +hun h un +huo h uo +ji j i +jia j ia +jian j ian +jiang j iang +jiao j iao +jie j ie +jin j in +jing j ing +jiong j iong +jiu j iu +ju j v +jv j v +juan j van +jvan j van +jue j ve +jve j ve +jun j vn +jvn j vn +ka k a +kai k ai +kan k an +kang k ang +kao k ao +ke k e +kei k ei +ken k en +keng k eng +kong k ong +kou k ou +ku k u +kua k ua +kuai k uai +kuan k uan +kuang k uang +kui k ui +kun k un +kuo k uo +la l a +lai l ai +lan l an +lang l ang +lao l ao +le l e +lei l ei +leng l eng +li l i +lia l ia +lian l ian +liang l iang +liao l iao +lie l ie +lin l in +ling l ing +liu l iu +lo l o +long l ong +lou l ou +lu l u +luan l uan +lun l un +luo l uo +lv l v +lve l ve +ma m a +mai m ai +man m an +mang m ang +mao m ao +me m e +mei m ei +men m en +meng m eng +mi m i +mian m ian +miao m iao +mie m ie +min m in +ming m ing +miu m iu +mo m o +mou m ou +mu m u +na n a +nai n ai +nan n an +nang n ang +nao n ao +ne n e +nei n ei +nen n en +neng n eng +ni n i +nian n ian +niang n iang +niao n iao +nie n ie +nin n in +ning n ing +niu n iu +nong n ong +nou n ou +nu n u +nuan n uan +nun n un +nuo n uo +nv n v +nve n ve +o OO o +ou OO ou +pa p a +pai p ai +pan p an +pang p ang +pao p ao +pei p ei +pen p en +peng p eng +pi p i +pian p ian +piao p iao +pie p ie +pin p in +ping p ing +po p o +pou p ou +pu p u +qi q i +qia q ia +qian q ian +qiang q iang +qiao q iao +qie q ie +qin q in +qing q ing +qiong q iong +qiu q iu +qu q v +qv q v +quan q van +qvan q van +que q ve +qve q ve +qun q vn +qvn q vn +ran r an +rang r ang +rao r ao +re r e +ren r en +reng r eng +ri r ir +rong r ong +rou r ou +ru r u +rua r ua +ruan r uan +rui r ui +run r un +ruo r uo +sa s a +sai s ai +san s an +sang s ang +sao s ao +se s e +sen s en +seng s eng +sha sh a +shai sh ai +shan sh an +shang sh ang +shao sh ao +she sh e +shei sh ei +shen sh en +sheng sh eng +shi sh ir +shou sh ou +shu sh u +shua sh ua +shuai sh uai +shuan sh uan +shuang sh uang +shui sh ui +shun sh un +shuo sh uo +si s i0 +song s ong +sou s ou +su s u +suan s uan +sui s ui +sun s un +suo s uo +ta t a +tai t ai +tan t an +tang t ang +tao t ao +te t e +tei t ei +teng t eng +ti t i +tian t ian +tiao t iao +tie t ie +ting t ing +tong t ong +tou t ou +tu t u +tuan t uan +tui t ui +tun t un +tuo t uo +wa w a +wai w ai +wan w an +wang w ang +wei w ei +wen w en +weng w eng +wo w o +wu w u +xi x i +xia x ia +xian x ian +xiang x iang +xiao x iao +xie x ie +xin x in +xing x ing +xiong x iong +xiu x iu +xu x v +xv x v +xuan x van +xvan x van +xue x ve +xve x ve +xun x vn +xvn x vn +ya y a +yan y En +yang y ang +yao y ao +ye y E +yi y i +yin y in +ying y ing +yo y o +yong y ong +you y ou +yu y v +yv y v +yuan y van +yvan y van +yue y ve +yve y ve +yun y vn +yvn y vn +za z a +zai z ai +zan z an +zang z ang +zao z ao +ze z e +zei z ei +zen z en +zeng z eng +zha zh a +zhai zh ai +zhan zh an +zhang zh ang +zhao zh ao +zhe zh e +zhei zh ei +zhen zh en +zheng zh eng +zhi zh ir +zhong zh ong +zhou zh ou +zhu zh u +zhua zh ua +zhuai zh uai +zhuan zh uan +zhuang zh uang +zhui zh ui +zhun zh un +zhuo zh uo +zi z i0 +zong z ong +zou z ou +zu z u +zuan z uan +zui z ui +zun z un +zuo z uo diff --git a/text/symbols.py b/text/symbols.py new file mode 100644 index 0000000000000000000000000000000000000000..70499492c3a2fac668847b1317e6e213cb086ece --- /dev/null +++ b/text/symbols.py @@ -0,0 +1,401 @@ +import os + +# 
punctuation = ['!', '?', '…', ",", ".","@"]#@是SP停顿 +punctuation = ["!", "?", "…", ",", "."] # @是SP停顿 +punctuation.append("-") +pu_symbols = punctuation + ["SP", "SP2", "SP3", "UNK"] +# pu_symbols = punctuation + ["SP", 'SP2', 'SP3','SP4', "UNK"] +pad = "_" + +c = [ + "AA", + "EE", + "OO", + "b", + "c", + "ch", + "d", + "f", + "g", + "h", + "j", + "k", + "l", + "m", + "n", + "p", + "q", + "r", + "s", + "sh", + "t", + "w", + "x", + "y", + "z", + "zh", +] +v = [ + "E1", + "En1", + "a1", + "ai1", + "an1", + "ang1", + "ao1", + "e1", + "ei1", + "en1", + "eng1", + "er1", + "i1", + "i01", + "ia1", + "ian1", + "iang1", + "iao1", + "ie1", + "in1", + "ing1", + "iong1", + "ir1", + "iu1", + "o1", + "ong1", + "ou1", + "u1", + "ua1", + "uai1", + "uan1", + "uang1", + "ui1", + "un1", + "uo1", + "v1", + "van1", + "ve1", + "vn1", + "E2", + "En2", + "a2", + "ai2", + "an2", + "ang2", + "ao2", + "e2", + "ei2", + "en2", + "eng2", + "er2", + "i2", + "i02", + "ia2", + "ian2", + "iang2", + "iao2", + "ie2", + "in2", + "ing2", + "iong2", + "ir2", + "iu2", + "o2", + "ong2", + "ou2", + "u2", + "ua2", + "uai2", + "uan2", + "uang2", + "ui2", + "un2", + "uo2", + "v2", + "van2", + "ve2", + "vn2", + "E3", + "En3", + "a3", + "ai3", + "an3", + "ang3", + "ao3", + "e3", + "ei3", + "en3", + "eng3", + "er3", + "i3", + "i03", + "ia3", + "ian3", + "iang3", + "iao3", + "ie3", + "in3", + "ing3", + "iong3", + "ir3", + "iu3", + "o3", + "ong3", + "ou3", + "u3", + "ua3", + "uai3", + "uan3", + "uang3", + "ui3", + "un3", + "uo3", + "v3", + "van3", + "ve3", + "vn3", + "E4", + "En4", + "a4", + "ai4", + "an4", + "ang4", + "ao4", + "e4", + "ei4", + "en4", + "eng4", + "er4", + "i4", + "i04", + "ia4", + "ian4", + "iang4", + "iao4", + "ie4", + "in4", + "ing4", + "iong4", + "ir4", + "iu4", + "o4", + "ong4", + "ou4", + "u4", + "ua4", + "uai4", + "uan4", + "uang4", + "ui4", + "un4", + "uo4", + "v4", + "van4", + "ve4", + "vn4", + "E5", + "En5", + "a5", + "ai5", + "an5", + "ang5", + "ao5", + "e5", + "ei5", + "en5", + "eng5", + "er5", + "i5", + "i05", + "ia5", + "ian5", + "iang5", + "iao5", + "ie5", + "in5", + "ing5", + "iong5", + "ir5", + "iu5", + "o5", + "ong5", + "ou5", + "u5", + "ua5", + "uai5", + "uan5", + "uang5", + "ui5", + "un5", + "uo5", + "v5", + "van5", + "ve5", + "vn5", +] + +v_without_tone = [ + "E", + "En", + "a", + "ai", + "an", + "ang", + "ao", + "e", + "ei", + "en", + "eng", + "er", + "i", + "i0", + "ia", + "ian", + "iang", + "iao", + "ie", + "in", + "ing", + "iong", + "ir", + "iu", + "o", + "ong", + "ou", + "u", + "ua", + "uai", + "uan", + "uang", + "ui", + "un", + "uo", + "v", + "van", + "ve", + "vn", +] + +# japanese +ja_symbols = [ + "I", + "N", + "U", + "a", + "b", + "by", + "ch", + "cl", + "d", + "dy", + "e", + "f", + "g", + "gy", + "h", + "hy", + "i", + "j", + "k", + "ky", + "m", + "my", + "n", + "ny", + "o", + "p", + "py", + "r", + "ry", + "s", + "sh", + "t", + "ts", + "u", + "v", + "w", + "y", + "z", + # "[", #上升调型 + # "]", #下降调型 + # "$", #结束符 + # "^", #开始符 +] + +arpa = { + "AH0", + "S", + "AH1", + "EY2", + "AE2", + "EH0", + "OW2", + "UH0", + "NG", + "B", + "G", + "AY0", + "M", + "AA0", + "F", + "AO0", + "ER2", + "UH1", + "IY1", + "AH2", + "DH", + "IY0", + "EY1", + "IH0", + "K", + "N", + "W", + "IY2", + "T", + "AA1", + "ER1", + "EH2", + "OY0", + "UH2", + "UW1", + "Z", + "AW2", + "AW1", + "V", + "UW2", + "AA2", + "ER", + "AW0", + "UW0", + "R", + "OW1", + "EH1", + "ZH", + "AE0", + "IH2", + "IH", + "Y", + "JH", + "P", + "AY1", + "EY0", + "OY2", + "TH", + "HH", + "D", + "ER0", + "CH", + "AO1", + "AE1", + "AO2", + "OY1", + "AY2", + 
"IH1", + "OW0", + "L", + "SH", +} + +symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa) +symbols = sorted(set(symbols)) +if __name__ == "__main__": + print(len(symbols)) diff --git a/text/tone_sandhi.py b/text/tone_sandhi.py new file mode 100644 index 0000000000000000000000000000000000000000..c557dad118631f2aa9b75d84e8775bea8c34eaf0 --- /dev/null +++ b/text/tone_sandhi.py @@ -0,0 +1,806 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List +from typing import Tuple + +import jieba_fast as jieba +from pypinyin import lazy_pinyin +from pypinyin import Style + + +class ToneSandhi: + def __init__(self): + self.must_neural_tone_words = { + "麻烦", + "麻利", + "鸳鸯", + "高粱", + "骨头", + "骆驼", + "马虎", + "首饰", + "馒头", + "馄饨", + "风筝", + "难为", + "队伍", + "阔气", + "闺女", + "门道", + "锄头", + "铺盖", + "铃铛", + "铁匠", + "钥匙", + "里脊", + "里头", + "部分", + "那么", + "道士", + "造化", + "迷糊", + "连累", + "这么", + "这个", + "运气", + "过去", + "软和", + "转悠", + "踏实", + "跳蚤", + "跟头", + "趔趄", + "财主", + "豆腐", + "讲究", + "记性", + "记号", + "认识", + "规矩", + "见识", + "裁缝", + "补丁", + "衣裳", + "衣服", + "衙门", + "街坊", + "行李", + "行当", + "蛤蟆", + "蘑菇", + "薄荷", + "葫芦", + "葡萄", + "萝卜", + "荸荠", + "苗条", + "苗头", + "苍蝇", + "芝麻", + "舒服", + "舒坦", + "舌头", + "自在", + "膏药", + "脾气", + "脑袋", + "脊梁", + "能耐", + "胳膊", + "胭脂", + "胡萝", + "胡琴", + "胡同", + "聪明", + "耽误", + "耽搁", + "耷拉", + "耳朵", + "老爷", + "老实", + "老婆", + "老头", + "老太", + "翻腾", + "罗嗦", + "罐头", + "编辑", + "结实", + "红火", + "累赘", + "糨糊", + "糊涂", + "精神", + "粮食", + "簸箕", + "篱笆", + "算计", + "算盘", + "答应", + "笤帚", + "笑语", + "笑话", + "窟窿", + "窝囊", + "窗户", + "稳当", + "稀罕", + "称呼", + "秧歌", + "秀气", + "秀才", + "福气", + "祖宗", + "砚台", + "码头", + "石榴", + "石头", + "石匠", + "知识", + "眼睛", + "眯缝", + "眨巴", + "眉毛", + "相声", + "盘算", + "白净", + "痢疾", + "痛快", + "疟疾", + "疙瘩", + "疏忽", + "畜生", + "生意", + "甘蔗", + "琵琶", + "琢磨", + "琉璃", + "玻璃", + "玫瑰", + "玄乎", + "狐狸", + "状元", + "特务", + "牲口", + "牙碜", + "牌楼", + "爽快", + "爱人", + "热闹", + "烧饼", + "烟筒", + "烂糊", + "点心", + "炊帚", + "灯笼", + "火候", + "漂亮", + "滑溜", + "溜达", + "温和", + "清楚", + "消息", + "浪头", + "活泼", + "比方", + "正经", + "欺负", + "模糊", + "槟榔", + "棺材", + "棒槌", + "棉花", + "核桃", + "栅栏", + "柴火", + "架势", + "枕头", + "枇杷", + "机灵", + "本事", + "木头", + "木匠", + "朋友", + "月饼", + "月亮", + "暖和", + "明白", + "时候", + "新鲜", + "故事", + "收拾", + "收成", + "提防", + "挖苦", + "挑剔", + "指甲", + "指头", + "拾掇", + "拳头", + "拨弄", + "招牌", + "招呼", + "抬举", + "护士", + "折腾", + "扫帚", + "打量", + "打算", + "打点", + "打扮", + "打听", + "打发", + "扎实", + "扁担", + "戒指", + "懒得", + "意识", + "意思", + "情形", + "悟性", + "怪物", + "思量", + "怎么", + "念头", + "念叨", + "快活", + "忙活", + "志气", + "心思", + "得罪", + "张罗", + "弟兄", + "开通", + "应酬", + "庄稼", + "干事", + "帮手", + "帐篷", + "希罕", + "师父", + "师傅", + "巴结", + "巴掌", + "差事", + "工夫", + "岁数", + "屁股", + "尾巴", + "少爷", + "小气", + "小伙", + "将就", + "对头", + "对付", + "寡妇", + "家伙", + "客气", + "实在", + "官司", + "学问", + "学生", + "字号", + "嫁妆", + "媳妇", + "媒人", + "婆家", + "娘家", + "委屈", + "姑娘", + "姐夫", + "妯娌", + "妥当", + "妖精", + "奴才", + "女婿", + "头发", + "太阳", + "大爷", + "大方", + "大意", + "大夫", + 
"多少", + "多么", + "外甥", + "壮实", + "地道", + "地方", + "在乎", + "困难", + "嘴巴", + "嘱咐", + "嘟囔", + "嘀咕", + "喜欢", + "喇嘛", + "喇叭", + "商量", + "唾沫", + "哑巴", + "哈欠", + "哆嗦", + "咳嗽", + "和尚", + "告诉", + "告示", + "含糊", + "吓唬", + "后头", + "名字", + "名堂", + "合同", + "吆喝", + "叫唤", + "口袋", + "厚道", + "厉害", + "千斤", + "包袱", + "包涵", + "匀称", + "勤快", + "动静", + "动弹", + "功夫", + "力气", + "前头", + "刺猬", + "刺激", + "别扭", + "利落", + "利索", + "利害", + "分析", + "出息", + "凑合", + "凉快", + "冷战", + "冤枉", + "冒失", + "养活", + "关系", + "先生", + "兄弟", + "便宜", + "使唤", + "佩服", + "作坊", + "体面", + "位置", + "似的", + "伙计", + "休息", + "什么", + "人家", + "亲戚", + "亲家", + "交情", + "云彩", + "事情", + "买卖", + "主意", + "丫头", + "丧气", + "两口", + "东西", + "东家", + "世故", + "不由", + "不在", + "下水", + "下巴", + "上头", + "上司", + "丈夫", + "丈人", + "一辈", + "那个", + "菩萨", + "父亲", + "母亲", + "咕噜", + "邋遢", + "费用", + "冤家", + "甜头", + "介绍", + "荒唐", + "大人", + "泥鳅", + "幸福", + "熟悉", + "计划", + "扑腾", + "蜡烛", + "姥爷", + "照顾", + "喉咙", + "吉他", + "弄堂", + "蚂蚱", + "凤凰", + "拖沓", + "寒碜", + "糟蹋", + "倒腾", + "报复", + "逻辑", + "盘缠", + "喽啰", + "牢骚", + "咖喱", + "扫把", + "惦记", + } + self.must_not_neural_tone_words = { + "男子", + "女子", + "分子", + "原子", + "量子", + "莲子", + "石子", + "瓜子", + "电子", + "人人", + "虎虎", + "幺幺", + "干嘛", + "学子", + "哈哈", + "数数", + "袅袅", + "局地", + "以下", + "娃哈哈", + "花花草草", + "留得", + "耕地", + "想想", + "熙熙", + "攘攘", + "卵子", + "死死", + "冉冉", + "恳恳", + "佼佼", + "吵吵", + "打打", + "考考", + "整整", + "莘莘", + "落地", + "算子", + "家家户户", + "青青", + } + self.punc = ":,;。?!“”‘’':,;.?!" + + # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041 + # e.g. + # word: "家里" + # pos: "s" + # finals: ['ia1', 'i3'] + def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]: + # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺 + for j, item in enumerate(word): + if ( + j - 1 >= 0 + and item == word[j - 1] + and pos[0] in {"n", "v", "a"} + and word not in self.must_not_neural_tone_words + ): + finals[j] = finals[j][:-1] + "5" + ge_idx = word.find("个") + if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶": + finals[-1] = finals[-1][:-1] + "5" + elif len(word) >= 1 and word[-1] in "的地得": + finals[-1] = finals[-1][:-1] + "5" + # e.g. 走了, 看着, 去过 + elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}: + finals[-1] = finals[-1][:-1] + "5" + elif ( + len(word) > 1 + and word[-1] in "们子" + and pos in {"r", "n"} + and word not in self.must_not_neural_tone_words + ): + finals[-1] = finals[-1][:-1] + "5" + # e.g. 桌上, 地下, 家里 + elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}: + finals[-1] = finals[-1][:-1] + "5" + # e.g. 上来, 下去 + elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开": + finals[-1] = finals[-1][:-1] + "5" + # 个做量词 + elif ( + ge_idx >= 1 + and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是") + ) or word == "个": + finals[ge_idx] = finals[ge_idx][:-1] + "5" + else: + if ( + word in self.must_neural_tone_words + or word[-2:] in self.must_neural_tone_words + ): + finals[-1] = finals[-1][:-1] + "5" + + word_list = self._split_word(word) + finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]] + for i, word in enumerate(word_list): + # conventional neural in Chinese + if ( + word in self.must_neural_tone_words + or word[-2:] in self.must_neural_tone_words + ): + finals_list[i][-1] = finals_list[i][-1][:-1] + "5" + finals = sum(finals_list, []) + return finals + + def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]: + # e.g. 
看不懂 + if len(word) == 3 and word[1] == "不": + finals[1] = finals[1][:-1] + "5" + else: + for i, char in enumerate(word): + # "不" before tone4 should be bu2, e.g. 不怕 + if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4": + finals[i] = finals[i][:-1] + "2" + return finals + + def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]: + # "一" in number sequences, e.g. 一零零, 二一零 + if word.find("一") != -1 and all( + [item.isnumeric() for item in word if item != "一"] + ): + return finals + # "一" between reduplication words shold be yi5, e.g. 看一看 + elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]: + finals[1] = finals[1][:-1] + "5" + # when "一" is ordinal word, it should be yi1 + elif word.startswith("第一"): + finals[1] = finals[1][:-1] + "1" + else: + for i, char in enumerate(word): + if char == "一" and i + 1 < len(word): + # "一" before tone4 should be yi2, e.g. 一段 + if finals[i + 1][-1] == "4": + finals[i] = finals[i][:-1] + "2" + # "一" before non-tone4 should be yi4, e.g. 一天 + else: + # "一" 后面如果是标点,还读一声 + if word[i + 1] not in self.punc: + finals[i] = finals[i][:-1] + "4" + return finals + + def _split_word(self, word: str) -> List[str]: + word_list = jieba.cut_for_search(word) + word_list = sorted(word_list, key=lambda i: len(i), reverse=False) + first_subword = word_list[0] + first_begin_idx = word.find(first_subword) + if first_begin_idx == 0: + second_subword = word[len(first_subword) :] + new_word_list = [first_subword, second_subword] + else: + second_subword = word[: -len(first_subword)] + new_word_list = [second_subword, first_subword] + return new_word_list + + def _three_sandhi(self, word: str, finals: List[str]) -> List[str]: + if len(word) == 2 and self._all_tone_three(finals): + finals[0] = finals[0][:-1] + "2" + elif len(word) == 3: + word_list = self._split_word(word) + if self._all_tone_three(finals): + # disyllabic + monosyllabic, e.g. 蒙古/包 + if len(word_list[0]) == 2: + finals[0] = finals[0][:-1] + "2" + finals[1] = finals[1][:-1] + "2" + # monosyllabic + disyllabic, e.g. 纸/老虎 + elif len(word_list[0]) == 1: + finals[1] = finals[1][:-1] + "2" + else: + finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]] + if len(finals_list) == 2: + for i, sub in enumerate(finals_list): + # e.g. 所有/人 + if self._all_tone_three(sub) and len(sub) == 2: + finals_list[i][0] = finals_list[i][0][:-1] + "2" + # e.g. 好/喜欢 + elif ( + i == 1 + and not self._all_tone_three(sub) + and finals_list[i][0][-1] == "3" + and finals_list[0][-1][-1] == "3" + ): + finals_list[0][-1] = finals_list[0][-1][:-1] + "2" + finals = sum(finals_list, []) + # split idiom into two words who's length is 2 + elif len(word) == 4: + finals_list = [finals[:2], finals[2:]] + finals = [] + for sub in finals_list: + if self._all_tone_three(sub): + sub[0] = sub[0][:-1] + "2" + finals += sub + + return finals + + def _all_tone_three(self, finals: List[str]) -> bool: + return all(x[-1] == "3" for x in finals) + + # merge "不" and the word behind it + # if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error + def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + new_seg = [] + last_word = "" + for word, pos in seg: + if last_word == "不": + word = last_word + word + if word != "不": + new_seg.append((word, pos)) + last_word = word[:] + if last_word == "不": + new_seg.append((last_word, "d")) + last_word = "" + return new_seg + + # function 1: merge "一" and reduplication words in it's left and right, e.g. 
"听","一","听" ->"听一听" + # function 2: merge single "一" and the word behind it + # if don't merge, "一" sometimes appears alone according to jieba, which may occur sandhi error + # e.g. + # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')] + # output seg: [['听一听', 'v']] + def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + new_seg = [] + # function 1 + for i, (word, pos) in enumerate(seg): + if ( + i - 1 >= 0 + and word == "一" + and i + 1 < len(seg) + and seg[i - 1][0] == seg[i + 1][0] + and seg[i - 1][1] == "v" + and seg[i + 1][1] == "v" + ): + new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0] + else: + if ( + i - 2 >= 0 + and seg[i - 1][0] == "一" + and seg[i - 2][0] == word + and pos == "v" + ): + continue + else: + new_seg.append([word, pos]) + seg = new_seg + new_seg = [] + # function 2 + for i, (word, pos) in enumerate(seg): + if new_seg and new_seg[-1][0] == "一": + new_seg[-1][0] = new_seg[-1][0] + word + else: + new_seg.append([word, pos]) + return new_seg + + # the first and the second words are all_tone_three + def _merge_continuous_three_tones( + self, seg: List[Tuple[str, str]] + ) -> List[Tuple[str, str]]: + new_seg = [] + sub_finals_list = [ + lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) + for (word, pos) in seg + ] + assert len(sub_finals_list) == len(seg) + merge_last = [False] * len(seg) + for i, (word, pos) in enumerate(seg): + if ( + i - 1 >= 0 + and self._all_tone_three(sub_finals_list[i - 1]) + and self._all_tone_three(sub_finals_list[i]) + and not merge_last[i - 1] + ): + # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi + if ( + not self._is_reduplication(seg[i - 1][0]) + and len(seg[i - 1][0]) + len(seg[i][0]) <= 3 + ): + new_seg[-1][0] = new_seg[-1][0] + seg[i][0] + merge_last[i] = True + else: + new_seg.append([word, pos]) + else: + new_seg.append([word, pos]) + + return new_seg + + def _is_reduplication(self, word: str) -> bool: + return len(word) == 2 and word[0] == word[1] + + # the last char of first word and the first char of second word is tone_three + def _merge_continuous_three_tones_2( + self, seg: List[Tuple[str, str]] + ) -> List[Tuple[str, str]]: + new_seg = [] + sub_finals_list = [ + lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) + for (word, pos) in seg + ] + assert len(sub_finals_list) == len(seg) + merge_last = [False] * len(seg) + for i, (word, pos) in enumerate(seg): + if ( + i - 1 >= 0 + and sub_finals_list[i - 1][-1][-1] == "3" + and sub_finals_list[i][0][-1] == "3" + and not merge_last[i - 1] + ): + # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi + if ( + not self._is_reduplication(seg[i - 1][0]) + and len(seg[i - 1][0]) + len(seg[i][0]) <= 3 + ): + new_seg[-1][0] = new_seg[-1][0] + seg[i][0] + merge_last[i] = True + else: + new_seg.append([word, pos]) + else: + new_seg.append([word, pos]) + return new_seg + + def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + new_seg = [] + for i, (word, pos) in enumerate(seg): + if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#": + new_seg[-1][0] = new_seg[-1][0] + seg[i][0] + else: + new_seg.append([word, pos]) + return new_seg + + def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + new_seg = [] + for i, (word, pos) in enumerate(seg): + if new_seg and word == new_seg[-1][0]: + new_seg[-1][0] = new_seg[-1][0] + seg[i][0] + else: + new_seg.append([word, pos]) + return 
new_seg + + def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + seg = self._merge_bu(seg) + try: + seg = self._merge_yi(seg) + except: + print("_merge_yi failed") + seg = self._merge_reduplication(seg) + try: + seg = self._merge_continuous_three_tones(seg) + except: + print("_merge_continuous_three_tones failed") + try: + seg = self._merge_continuous_three_tones_2(seg) + except: + print("_merge_continuous_three_tones_2 failed") + + seg = self._merge_er(seg) + return seg + + def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]: + finals = self._bu_sandhi(word, finals) + finals = self._yi_sandhi(word, finals) + finals = self._neural_sandhi(word, pos, finals) + finals = self._three_sandhi(word, finals) + return finals diff --git a/text/zh_normalization/README.md b/text/zh_normalization/README.md new file mode 100644 index 0000000000000000000000000000000000000000..92eea9f54630dfd41dfc3ce53bc511cc7595062c --- /dev/null +++ b/text/zh_normalization/README.md @@ -0,0 +1,16 @@ +## Supported NSW (Non-Standard-Word) Normalization + +|NSW type|raw|normalized| +|:--|:-|:-| +|serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| +|cardinal|这块黄金重达324.75克
<br>我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分|
+|numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二|
+|date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日|
+|time|等会请在12:05请通知我|等会请在十二点零五分请通知我|
+|temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度|
+|fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票|
+|percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨|
+|money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万|
+|telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>
这是手机八六一八五四四一三九一二一| +## References +[Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files) diff --git a/text/zh_normalization/__init__.py b/text/zh_normalization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..46b367a68b074cf02da933f1e2433b86eeffe494 --- /dev/null +++ b/text/zh_normalization/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from text.zh_normalization.text_normlization import * diff --git a/text/zh_normalization/__pycache__/__init__.cpython-39.pyc b/text/zh_normalization/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d789b9d16b45d7c05a6bf0a39b8e12e2c7509eaa Binary files /dev/null and b/text/zh_normalization/__pycache__/__init__.cpython-39.pyc differ diff --git a/text/zh_normalization/__pycache__/char_convert.cpython-39.pyc b/text/zh_normalization/__pycache__/char_convert.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..038f982a7a13621916c0bf828d4194dd79d66efe Binary files /dev/null and b/text/zh_normalization/__pycache__/char_convert.cpython-39.pyc differ diff --git a/text/zh_normalization/__pycache__/chronology.cpython-39.pyc b/text/zh_normalization/__pycache__/chronology.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5863a264799585fbebf5e69b2e064d1f062f72a4 Binary files /dev/null and b/text/zh_normalization/__pycache__/chronology.cpython-39.pyc differ diff --git a/text/zh_normalization/__pycache__/constants.cpython-39.pyc b/text/zh_normalization/__pycache__/constants.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0563de3a92f24f3897a3d90b4a92a1c9b60d6ffd Binary files /dev/null and b/text/zh_normalization/__pycache__/constants.cpython-39.pyc differ diff --git a/text/zh_normalization/__pycache__/num.cpython-39.pyc b/text/zh_normalization/__pycache__/num.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e872d5b92e78ef53a4d63433cd05f33c4e969047 Binary files /dev/null and b/text/zh_normalization/__pycache__/num.cpython-39.pyc differ diff --git a/text/zh_normalization/__pycache__/phonecode.cpython-39.pyc b/text/zh_normalization/__pycache__/phonecode.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f5eb17757d060277df29387a822f7f71d7aa3714 Binary files /dev/null and b/text/zh_normalization/__pycache__/phonecode.cpython-39.pyc differ diff --git a/text/zh_normalization/__pycache__/quantifier.cpython-39.pyc b/text/zh_normalization/__pycache__/quantifier.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ee51c901eb4cda536b8d596f56333083b70598a Binary files /dev/null and b/text/zh_normalization/__pycache__/quantifier.cpython-39.pyc differ diff --git a/text/zh_normalization/__pycache__/text_normlization.cpython-39.pyc 
b/text/zh_normalization/__pycache__/text_normlization.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f152b8f4e7d6fce9705c9c9504c4e65b20b13dc Binary files /dev/null and b/text/zh_normalization/__pycache__/text_normlization.cpython-39.pyc differ diff --git a/text/zh_normalization/char_convert.py b/text/zh_normalization/char_convert.py new file mode 100644 index 0000000000000000000000000000000000000000..dcf95d72861348b36e4f21c31350f47e56934fc1 --- /dev/null +++ b/text/zh_normalization/char_convert.py @@ -0,0 +1,46 @@ +# coding=utf-8 +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Traditional and simplified Chinese conversion, a simplified character may correspond to multiple traditional characters. +""" +simplified_charcters = '制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷丂七集河市弦喜嘴张舌堵区工业姊妹星架构巧彩扭歪拼凑余热曜武州爷浮屠美乡老阶树荤素碎落能魄鳃鳗珠丄丅丆万俟丈尚摸母娘量管群亚虎必我堂令申件装伏位博侠义界表女墟台戏臭皮匠胜诸葛亮赛顶倍催请运算包立叉戟离疫苗土史志演围揭瓦晒夷姑婆帝村宝烂尖杉碱屉桌山岔岛由纪峡坝库镇废从德后拗汤治旬食明昧曹朋友框栏极权幂曲归依猫民氟硼氯磷铁江侗自旅法司洋浦梅园温暖湾焦班幸用田略番叠皇炮捶硝苯酸腺苷棱草镜穗跳远索锦纲聚氰胺联店胚膲爱色堇紫罗兰芝茶饭菱云虫藏藩乱叛苏亲债凳学座恐恋柱测肌腹衩锥系貂企乌跪叩军车农题迭都甘油屯奏键短阿姨陪姐只顾茅庐槽驾魂鲜鹿页其菜单乘任供势午齿汉组织吊调泻唇坡城报坟外夸将尉建筑岸岗公床扬新剑升杭林栗校楼标款汽社浣海商馆剧院钢华港机械广媒环球融第医科证券综财乐育游涨犹岭疏瘾睑确兵领导缴肢膛船艾瑟尔苍蔡虞效衫覆访诉课谕议轨述野钩限敌鞋颌颔颚饶首龈站例修凡划垂届属崽颏厨拜挫摆放旋削棋榻槛礼沉注滑营狱画确仪聘花葬诏员跌辖周达酒锚闸陷陆雨雪飞威丌于丹久乏予理评产亢卑亦乎舞己悲矩圆词害志但住佞佳便俗信票案幅翁倦伦假偏倚斜亏鬼敲停备伤脾胃仅此像俭匮免宜穴焉戴兼容许冻伯仲负彼昼皂轩轾实刊划颠卫战哥比省非好黄饰别拘束掩奶睬选择摇扰烦苦枚写协厌及格受欢迎约只估侵犯割状告或缺抗拒挽撤救药喻磨灭端倪少逆逾越避靠适吉誉吝玉含延咎歹听啻渊善谋均匀堪忍够太惹妙妥妨孕症孝术室完纳推冠积宣疑辩栗碴称屈挠屑干涉衡待很忙恶忿怎么怠急耻恭息悦惑惜惟想愉愧怍慌愤启懂懈怀材才紧招认扣抵拉舍也罢插揣冒搭撞南墙扩核支攻敢雷攀敬里吗需景智暇曾罪遇朽枉止况竞争辱求愈渝溶济左右袒困补爽特寂寞示弱找谢畏强疾徐痛痒冤符眠睦瞅董何厚云措活疲羞者轻玻璃祥兆禁移稂莠稳佛换答简结果盟绝缕途给谈否羁翼耐肖胫毋宁兴舒若菲莱痕迹窠臼虚衰脸兔撒鹰棺范该详讳抬泰让须眉象众赀账费灰赖奇虑训辍辨菽麦辛近送透逞徒速续逮捕遂遑违逊斧钺艰醉锈随观弃显饱脂肪使丏丐帮丒且慢末丕替桃宗王尊凉爵各图屋脊粮署录坛吾禄职胄袭君厦丗北壑桐疹损逢陵鹬丙寅戌氨腈唑纶辰酮脱氢酶醚丞丢现掉纱帽弄扯炮碗丠両丣坐存激肩臻蒂莲悖序驱丨丩丫挺杈髻鬟细介俄伊犁京尼布订普渡央委监察检查剂圈设警队斯督剩震境航舶革防托播促质版蝾螈锋研艺历残消频谱精密制造陲邮候埔坚压坜凹汇执府究邦俘摄寮彬狼岳肺肿庸英讯诊埋粒胞括控码韩暑枪枢砥澳哇牟寿甸钻探篇签缀缝继耳肯照妇埃悬璧轴柜台辣搁浅邪跑纤阮阳私囊魔丮丰姿采丱烧丳丵丶丷丸参寨朗桂瑞砂衷霞貌凤仆舰因嫌宰峰干络牌持旨祭祷簿编罚宾办丼丿乀乂乃乄仰慕盛旷留考验阔乆乇么丑麽乊湖燃乑乒乓乕乖僻忤戾离谬迕乗危肥劫除隙浪婿乙炔肠酰吡咯盐乚乛乜嘢卿玄宫尾狐龟塔嶷兄弟泉章霄钉耙乞扎哀怜恕讨乢乣乤乥乧乨乩童乪乫乭乳晕汁液瑶浆牙癌突窦罩腐胶猪酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉哕嚎坤妈尸垒旱枯涸俐渴潮涩煸豆燥爹瘦瘪癣瞪袋脆姜贝隆馏乿亀亁叫咕攘扔搞男砸窜蓬麻亃亄亅却亇迟典今临繁累卵奉婚聪躬巨与迁添裂副宿岁怪恶尕仑愣杆硅硫钛铀锰芑杂异钠砷胂磺琥珀舱棍簧胡茬盗浩盆贩郎腿亍洪亐互欠助勉惠操斥诿系户译亓墓碑刑铃卅渠缤纷斗米旗宪钒灯徽瘟祖拳福谷丰脏腑绑肉腌苓蕴桥铺霸颜闹判喷冈底蛙陉矿亖亘亜罕们娜桑那努哈喀弗烈曼松森杜氏杯奥琛敦戊穆圣裔汇薛孙亟亡佚虏羊牢奋释卷卸契媾感额睫缠谊趾塞挤纽阻还配驰庄亨洛祚亪享津沪畿郊慈菴枇杷膏亭阁锃丽亳亶亹诛初责翻疯偶杰丛稠妖拖寰居吸授慧蜗吞壮魅狗矛盾益渣患忧稀描猿梦暂涯畜祸缘沸搜引擎臣横纭谁混援蒸兽狮税剖亻亼亽亡什献刹邡么仂仃仄仆富怨仈仉毕昔晨壳绍仍仏仒仕宦仗欺恃腰叹叹炬梓讫施仙后琼逝仚仝仞仟悔仡佬偿
填泊拓扑簇羔购顿钦佩发棻阃驭养亿儆尤借帧赈凌叙帖李柔刚沃眦睚戒讹取飨读仨仫仮著泳卧躺韶夏裁仳仵唯贤凭钓诞仿似宋佛讽伀硕盼鹅伄儅伈伉俪柯始娃迈戈坦堡帕茨萨庙玛莉莎藤霍姆伋伍奢胥廷芳豪伎俩侍汛勒希羲雏伐憩整谟闲闲伕伙伴颐伜伝伢叔恒兹恩翰伱伲侣伶俜悧鼬伸懒缩喇叭伹伺伻伽倻辐伾似佃伫布乔妮墨佉卢佌贷劣廉昂档浓矮伞洼缓耗胸谷迷挡率龋宅沫舍疗佐贰佑占优据铧尝呢须鲁晓佗佘余坪寺瓜铳僧蒙芒陀龛哼呕坊奸孽弊揖祟茧缚誓贼佝偻瞀佟你夺赶佡佢佣佤佧贾佪佫佯佰佱洁绩酿肴佴卷佶佷佸佹佺佻佼佽佾具唤窘坏娱怒慨硬习惯聋膨胀蔓骇贵痹侀侁侂侃侄侅鸿燕侇侈糜靡侉侌妾侏儒仓鼠侐侑侔仑侘侚链侜偎傍钴循柳葫芦附価侮骂蔑侯岩截蚀局贴壶嬛宴捷携桶笺酌俣狭膝狄俅俉俊俏俎俑俓俔谚俚俛黎健呈固墒增守康箱湿祐镖镳杠盒靖膜龄俞豹猎噪孚封札筒托衍鸽剪撰稿炼厂禊练缮葺俯瞰撑冲效俳俴俵俶俷俺备俾伥倂倅储卒惶敷猝逃颉蓄崇隐倌倏忽刺蜡烛噍嚼坍扁抽毙葱楣灌灶粪背薮卖赔闭霉腾倓倔幸倘倜傥倝借箸挹浇阅倡狂倢倣値倥偬倨傲倩匡嗣冲柝珍倬倭寇猩倮倶倷倹勤赞偁偃充伪吏嗓寐惺扮拱芫茜藉虢钞偈伟晶偌宕距析滤殿疼瘫注颇偓偕鸭歇滞偝偟偢忘怡旺偨偩逼偫偭偯偰偱偲侦缉蹄偷减惰漏窥窃偸偺迹傀儡傅傈僳骂篱傎奎琳迪叟芭傒傔傕伧悉荒傜傞傢傣芽逼佣婢傮睨寄檄诵谣颂伛担辜弓惨蒿悼疤傺傻屄臆巢泄箧羡盖轧颓傿㑩僄僇佥僊働僎侨僔僖僚僝伪僣僤侥僦猴偾僩僬僭僮僯僰雇僵殖签静僾僿征陇儁侬儃儇侩朴薄儊儋儌儍傧儓俦侪拟尽儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹傩俨儽兀臬臲鹫允勋勋宙宵帅憝彝谐嫂阋畅沛溢盈饥赫凶悍狠猛顽愚妣斩秦遣鞭耀敏荣槃泽爆碟磁秃缆辉霁卤朵娄孜烽酱勃汀箕裘钳耶蒙蕾彻兑软遭黜兎児韵媳爸兕觥兖兙兛兜售鍪肚兝兞兟兡兢兣樽殓涅睡禀籍赘泌啡肽奸幕涵涝熵疚眷稃衬讧赴焕椒歼植跏没试误猜栖窗肋袖颊兪卦撇胡岐廓轿疸枫茴珑厕秩募勺吨寓斤历亩迫筷厘最淫螺韬兮宽匪筛襄赢轭复兲诈刃堰戎痞蚁饷它冀铸冂冃円冇冉册嫁厉砺竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑诬冥冫烘菇蛰冷凝坨橇淇淋炭饼砖碛窖醋雕雹霜冱冶炉艳嘲峻滩淡漠煖飕饮冼冽凃凄怆梗凅凇净凊凋敝蒙凔凛遵汞脢凞几凢処凰凯凵凶焰凸折刷纹预丧喽奔巡榜殡芙蓉租笼辑鞘萃凼锯镬刁蛮刂娩崩批拆摊掰蘖骤歧颗秒袂赃勿嘱忌磋琢肤刈羽刎讼戮舂桨艇刓刖霹雳刜创犊刡恙墅帜筵致劫劫刨昏默攸尿欲熏润薰圭删刮痧铲刱刲刳刴刵踏磅戳柏槐绣芹苋猬舟铭鹄鹜劫剁剃辫刭锉履铅克剌姻咽哨廊掠桅沿召瞻翅赵卜渺茫郭剒剔剕沥剚愎毅讷才剜剥啄采剞剟剡剣剤䌽剐肾驶黏剰袍剀紊铲剸剺剽剿劁劂札劈啪柴扳啦刘奭姥夼昫涓熙禅禹锡翔雁鹗刽刿弩柄蜻蛉劒劓劖劘劙澜篑赏矶釜晋甜薪逐劦熔纣虐赤囚劬劭労劵效劻劼劾峭艮勅勇励勍勐腊脖庞漫饲荡粥辄勖勗勘骄馁碌泮雇捐竹骑殊阱绩朴恳谨剿勧勩勯勰劢勋勷劝惩慰诫谏勹芡践阑匁庇拯粟扎袱裹饺匆遽匈匉匊匋匍匐茎匏匕妆痰脓蛹斋苑烤蹈塘羌熊阀螳螂疆碚竿纬荷茵邙魏匚匜匝匟扶稷匣匦拢匸匹耦匽匾匿卂叮疮禧轸堤棚迢钧炼卄卆遐卉瓷盲瓶当胱腱裸卋卌卍卐怯污贱鄙龌龊陋卓溪唐梯渔陈枣泥漳浔涧梨芬谯赡辕迦郑単驴弈洽鳌卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫袄玺绶钮蚤惧殆笃耸卲帘帙绕恤卼卽厂厎厓厔厖厗奚厘厍厜厝谅厕厤厥厪腻孢厮厰厳厣厹厺粕垢芜菁厼厾叁悟茸薯叄吵笄悌哺讥坫垄弧芯杠潜婴刍袁诘贪谍煽馈驳収岳缔灾贿骗叚叡吻拦蘑蜜诀燧玩砚筝椎蔺铜逗骊另觅叨唠谒杵姓喊嚷嚣咚咛塑寻恼憎擦只泣渗蝠叱吒咄咤喝籀黛舵舷叵叶铎懿昭穰苴辽叻叼吁堑嫖赌瞧爬众抒吅吆夥卺橡涤抱纵摩郡唁坠扇篮膀袜颈吋忾谘酬哭妓媛暗表缰迩妃羿絮蕃浑拐葵暮隅吔吖啶嗪戚吜啬噬咽吟哦咏吠吧唧嗒咐吪隽咀征燐苞茹钙哧吮吰吱嘎吲哚吴栋娇窟孟箫忠晗淞阖闾趼宇呐睛嘘拂捧疵熄竽笛糠吼吽呀吕韦蒙呃呆笨呇贡呉罄呋喃呎呏呔呠呡痴呣呤呦呧瑛眩扒晬淑姬瑜璇鹃呪呫哔嚅嗫呬呯呰呱呲咧噌钝呴呶呷呸呺呻哱咻啸噜吁坎坷逻呿咁咂咆哮咇咈咋蟹煦珅蔼咍咑咒诅咔哒嚓咾哝哩喱咗咠咡咢咣咥咦咨嗟询咩咪咫啮啮咭咮咱咲咳呛嗽咴啕咸咹咺呙喉咿婉恸悯赋矜绿茗蓝哂抢瞒哆嗦啰噻啾滨彗哋哌哎唷哟哏哐哞哢哤哪里哫啼喘哰哲萎蚌哳咩哽哿呗唅唆唈唉唎唏哗尧棣殇璜睿肃唔睇唕吣唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鹦鹉啅埠栈榷祺铺鞅飙啊啍啎啐啓啕啖啗啜哑祈啢衔啤啥啫啱啲啵啺饥啽噶昆沁喁喂喆裙喈咙喋喌喎喑喒喓喔粗喙幛庆滋鹊喟喣喤喥喦喧骚喨喩梆吃葡萄喭驼挑吓碰枞瓣纯疱藻趟铬喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔诟嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨唢嗬嗯嗰嗲嗵叽嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾荫啀嘌嘏嘐嘒啯啧嘚唛嘞嘟囔嘣嘥嘦嘧嘬嘭这谑严敞馋松哓嘶嗥呒虾嘹嘻啴嘿噀噂噅噇噉噎噏噔噗噘噙噚咝噞噢噤蝉皿噩噫噭嗳噱哙噳嚏涌洒欲巫霏噷噼嚃嚄嚆抖哜尝嚔苏嚚嚜嚞嚟呖嚬嚭嚮嚯亸喾饬按竣苛嚵嘤啭冁呓膪谦囍囒囓囗囘萧酚飘溅谛囝溯眸纥銮鹘囟殉囡団囤囥囧囨囱囫囵囬囮囯囲図囶囷囸囹圄圉拟囻囿圀圂圃圊粹蠹赦圌垦圏滚鲱凿枘圕圛圜圞坯埂壤骸炕祠窑豚绅魠鲮鳖圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆垫墩椅坒坓坩埚坭坰坱坳坴坵坻坼杨挣涎帘垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜垭埤埦埧埭埯埰埲埳埴埵埶绋埸培怖桩础辅埼埽堀诃侄庑堃堄摧磐贞韧砌堈堉垩堋堌堍堎垴堙堞堠礁堧堨舆堭堮蜓摘堲堳堽堿塁塄塈煤茔棵塍垲埘塓绸塕鸦沽虱塙冢塝缪塡坞埙塥塩塬塱场螨塼塽塾塿墀墁墈墉墐夯増毁墝墠墦渍钵墫墬堕墰墺墙橱壅壆壊壌壎壒榨蒜壔壕壖圹垆壜壝垅壡壬壭壱売壴壹壻壸寝壿夂夅夆変夊夌漱邑夓腕泄甥御骼夗夘夙衮瑙妊娠醣枭珊莺鹭戗幻魇夤蹀秘擂鸫姚宛闺屿庾挞拇賛蛤裨菠氅漓捞湄蚊霆鲨箐篆篷荆肆舅荔鲆巷惭骰辟邱镕镰阪漂烩鲵鲽鳄鸨胪鹏妒峨谭枰晏玑癸祝秤竺牡籁恢罡蝼蝎赐绒御梭夬夭砣榆怙枕夶夹馅奄崛葩谲奈贺祀赠奌奂奓奕䜣詝奘奜奠奡奣陶奨奁魁奫奬奰娲孩贬隶酥宄狡猾她姹嫣妁毡荼皋膻蝇嫔妄妍嫉媚娆妗趣妚妞妤碍妬娅妯娌妲妳妵妺姁姅姉姗姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀诱慑胁娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥溪孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮妫媲媵媸媺媻媪眯媿嫄嫈袅嫏嫕妪嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰妩嫺娴嫽嫿妫嬃嬅嬉耍婵痴艳嬔嬖嬗嫱袅嫒嬢嬷嬦嬬嬭幼嬲嬴婶嬹嬾嬿孀娘孅娈孏曰癫屏孑孓雀孖斟篓谜摺孛矻鸠崮轲祜鸾孥邈毓棠膑孬孭孰孱孳孵泛罔衔孻孪宀宁冗拙株薇掣抚琪瓿榴谧弥宊濂祁瑕宍宏碁宓邸谳実潢町宥宧宨宬徵崎骏掖阙臊煮禽蚕宸豫寀寁寥寃檐庶寎暄碜寔寖寘寙寛寠苫寤肘洱滥蒗陕核寪弘绰螽宝擅疙瘩晷対檐専尃尅赎绌缭畴衅尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚觑蔻脏躁尔尓锐尗尙尜尟尢尥尨尪尬尭尰擒尲尶尴尸尹潽蠖蛾尻扣梢蚴鳍脬蹲屇屌蚵屐屃挪屖屘屙屛屝屡屣峦嶂岩舄屧屦屩屪屃屮戍驻钾崖嵛巅旮旯楂榄榉芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭巩岒岝岢岚岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峨峰峱岘峹峿崀崁崆祯崋崌崃岖昆崒崔嵬巍萤颢崚崞崟崠峥巆崤崦崧殂岽崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓岁嵙嵞嵡嵩嵫嵯嵴嵼嵾嵝崭崭晴嶋嶌嶒嶓嵚崂嶙嶝嶞峤嶡嶢峄嶨嶭嶮嶰嶲岙嵘巂巃巇巉岿巌巓巘巛滇芎巟巠弋回巣巤炊擘蜥蟒蛊觋巰蜀彦淖杏茂甫楞巻巽帼巿帛斐鲫蕊帑帔帗帚琉汶帟帡帣帨裙帯帰帷帹暆帏幄帮幋幌幏帻幙帮幞幠幡幢幦幨幩幪帱幭幯幰遥蹉跎馀庚鉴幵幷稚邃庀庁広庄庈庉笠庋跋庖牺庠庤庥鲸庬庱庳庴庵馨衢庹庿廃厩廆廋廌廎廏廐廑廒荫廖廛厮搏锣廞弛袤廥廧廨廪廱绵踵髓廸迫瓯邺廻廼廾廿躔弁皱弇弌弍弎弐弑吊诡憾荐弝弢弣弤弨弭弮弰弪霖繇焘斌旭溥骞弶弸弼弾彀彄别累纠强彔彖彘彟彟陌彤贻彧绘虹彪炳雕蔚鸥彰瘅彲彳彴仿彷徉徨彸彽踩敛旆徂徇徊渭畲铉裼従筌徘徙徜徕膳苏萌渐徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸颤扉犀澎湃砰恍惚绞隘忉惮挨饿忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懑怏遏怔怗怚怛怞怼黍讶怫怭懦怱怲恍怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱凄恻德悴怅惘闷悻悾惄愫钟蒐惆惇惌惎惏惓惔惙惛耄惝疟浊恿惦德恽惴蠢惸拈愀愃愆愈愊愍愐愑愒愓愔愕恪氓蠢騃昵惬赧悫愬愮愯恺愼慁恿慅慆慇霭慉慊愠慝慥怄怂慬慱悭慴慵慷戚焚憀灼郁憃惫憋憍眺捏轼愦憔憖憙憧憬憨憪憭怃憯憷憸憹憺懃懅懆邀懊懋怿懔懐懞懠懤懥恹懫懮懰懱毖懵遁梁雍忏懽戁戄戆戉戋戕戛戝戛戠戡戢戣戤戥戦戬戭戯轰戱披菊牖戸戹戺戻卯戽锹扂楔扃扆扈扊杖牵绢铐镯赉扐搂搅烊盹瞌跟趸镲靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄绥鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔缳缢擞抜拗択抨摔歉蹿牾抶抻搐泵菸拃拄拊髀抛拌脯拎拏拑擢秧沓曳挛迂拚拝拠拡拫拭拮踢拴拶拷攒拽掇芥橐簪摹疔挈瓢骥捺蹻挌挍挎挐拣挓挖掘浚挙揍聩挲挶挟挿捂捃捄捅捆捉捋胳膊揎捌捍捎躯蛛捗捘捙捜捥捩扪捭据捱捻捼捽掀掂抡臀膘掊掎掏掐笙掔掗掞棉芍掤搪阐掫掮掯揉掱掲掽掾揃揅揆搓揌诨揕揗揘揜揝揞揠揥揩揪揫橥遒麈揰揲揵揶揸背揺搆搉搊搋搌搎搔搕撼橹捣搘搠搡搢搣搤搥搦搧搨搬楦裢讪赸掏搰搲搳搴
揾搷搽搾搿摀摁摂摃摎掴摒摓跤摙摛掼摞摠摦喉羯摭摮挚摰摲抠摴抟摷掺摽撂撃撅稻撊撋挦锏泼撕撙撚㧑挢撢掸撦撅撩撬撱朔揿蚍蜉挝捡擀掳闯擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭摈拧撷擸撸擽擿攃摅撵攉攥攐攓撄搀撺每攩攫辔澄攮攰攲攴轶攷砭讦攽碘敁敃敇敉叙敎筏敔敕敖闰诲敜煌敧敪敳敹敺敻敿斁衽斄牒绉诌斉斎斓鹑谰驳鳢斒筲斛斝斞斠斡斢斨斫斮晾沂潟颖绛邵斲斸釳於琅斾斿旀旗旃旄涡旌旎旐旒旓旖旛旝旟旡旣浴旰獭魃旴时旻旼旽昀昃昄昇昉晰躲澈熹皎皓矾昑昕昜昝昞昡昤晖笋昦昨是昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘莹顗晿暁暋暌暍暐暔暕煅旸暝暠暡曚暦暨暪朦胧昵暲殄冯暵暸暹暻暾曀晔昙曈曌曏曐暧曘曙曛叠昽曩骆曱甴肱曷牍禺锟曽沧耽朁朅朆杪栓夸竟粘绦朊膺朏朐朓朕朘朙瞄觐溘饔飧朠朢朣栅椆淀虱朩朮朰朱炆璋钰炽鹮朳槿朵朾朿杅杇杌陧欣钊湛漼楷瀍煜玟缨翱肇舜贽适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦颦缅莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翘纾逋枙狸桠枟槁枲枳枴枵枷枸橼枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞栎柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟柏栩栫栭栱栲栳栴檀栵栻桀骜桁镁桄桉桋桎梏椹葚桓桔桕桜桟桫椤桭杯桯桲桴桷桹湘溟梃梊梍梐潼栀枧梜梠梡梣梧梩梱梲梳梴梵梹棁棃樱棐棑棕榈簑绷蓑枨棘棜棨棩棪棫棬棯棰棱棳棸棹椁棼碗椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀匾楅篪楋楍楎楗楘楙楛楝楟楠楢楥桢楩楪楫楬楮楯楰梅楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽搒笞榠榡榤榥榦榧杩榭榰榱梿霰榼榾桤槊闩槎槑槔槖様槜槢槥椠槪槭椮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢狲桦樻罍樾樿橁橄橆桡笥龠橕橚橛辆椭橤橧竖膈跨橾橿檩檃檇柽檍檎檑檖檗桧槚檠樯檨檫檬梼槟檴檵柠棹櫆櫌栉櫜椟櫡槠栌枥榇栊櫹棂茄櫽欀欂欃欐欑栾欙棂溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊莳蝶歓歕歘歙歛歜欤歠蹦诠镶蹒跚升陟歩歮歯歰歳歴璞歺瞑歾殁夭殈殍殑殗殜殙殛殒殢殣殥殪殚僵殰殳荃殷殸殹蛟殻肴谤殴毈毉喂毎毑蕈毗毘毚茛邓毧毬毳毷毹毽毾毵牦氄氆靴氉氊氇氍氐聊氕氖気氘氙氚氛氜氝氡汹焊痉氤氲氥氦铝锌氪烃氩铵痤汪浒漉痘盂碾菖蒲蕹蛭螅氵冰氹氺氽烫氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蓠沼秽蔑汧汨汩汭汲汳汴堤汾沄沅沆瀣沇沈葆浸沦湎溺痼疴沌沍沏沐沔沕沘浜畹砾沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆涌肓泐泑泒泓泔泖泙泚泜泝泠漩馍涛粼泞藓鳅泩泫泭泯铢泱泲洇洊泾琵琶荽蓟箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙赣渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鲤浃浼浽溦涂涊涐涑涒涔滂莅涘涙涪涫涬涮涴涶涷涿淄淅淆淊凄黯淓淙涟淜淝淟淠淢淤渌淦淩猥藿亵淬淮淯淰淳诣涞纺淸淹炖癯绮渇済渉渋渓渕涣渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝浈湟湢湣湩湫湮麟湱湲湴涅満沩溍溎溏溛舐漭溠溤溧驯溮溱溲溳溵溷溻溼溽溾滁滃滉滊荥滏稽滕滘汇滝滫滮羼耷卤滹浐煎漈漊漎绎漕漖漘漙沤漜漪漾漥漦漯漰溆漶漷濞潀颍潎潏潕潗潚潝潞潠潦祉疡潲潵滗潸潺潾涠澁澂澃澉澌澍澐澒澔澙渑澣澦澧澨澫澬浍澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觞浚濮盥潍濲泺瀁滢渎渖瀌浏瀒瀔濒泸瀛潇潆瀡潴泷濑瀬弥潋瀳瀵瀹瀺瀼沣滠灉灋灒漓灖灏灞灠滦灥灨滟灪蜴灮烬獴灴灸灺炁炅鱿炗炘炙炤炫疽烙钎炯炰炱炲炴炷毁炻烀烋瘴鲳烓烔焙烜烝烳饪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐炜煕暖熏硷霾煚煝煟煠茕矸煨琐炀萁煳煺煻熀熅熇熉罴荧穹炝熘熛熜稔谙烁熤熨熯熰眶蚂颎熳熸熿燀烨燂燄盏燊燋燏燔隼燖焖燠燡灿燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰为爻丬爿牀牁牂牄牋窗牏牓窗釉牚腩蒡虻牠虽蛎牣牤牮牯牲牳牴牷牸牼绊牿靬犂犄犆犇犉犍犎犒荦犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狈蜘猁猇猈猊猋猓猖獗猗猘狰狞犸猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒毙獙獚獜獝獞獠獢獣獧鼇蹊狯猃獬豸狝獯鬻獳犷猕猡玁菟玅玆玈珉糁禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦珏瑰玭玳瑁玶玷玹玼珂珇珈瑚珌馐馔珔珖珙珛珞珡珣珥珧珩珪佩珶珷珺珽琀琁陨玡琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲琅琴珐珲瑀瑂瑄瑉玮瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈琏璊璐璘璚璝璟璠璡璥瑷璩璪璫璯璲玙璸璺璿瓀璎瓖瓘瓒瓛脐瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔瓮甖甗饴蔗甙诧钜粱盎锈团甡褥産甪甬甭甮宁铠甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃叠疋疍疎箪疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀痖瘃瘈瘉瘊瘌瘏瘐痪瘕瘖瘙瘚瘛疭瘜瘝瘗瘠瘥瘨瘭瘆瘯瘰疬瘳疠瘵瘸瘺瘘瘼癃痨痫癈癎癐癔癙癜癠疖症癞蟆癪瘿痈発踔绀蔫酵皙砬砒翎翳蔹钨镴皑鹎驹暨粤褶皀皁荚皃镈皈皌皋皒朱皕皖皘皜皝皞皤皦皨皪皫皭糙绽皴皲皻皽盅盋碗盍盚盝踞盦盩秋千盬盭眦睁瞤盯盱眙裰盵盻睐眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎困睒睖睙睟睠睢睥睪睾睯睽睾眯瞈瞋瞍逛瞏瞕瞖眍䁖瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽阇瞿眬矉矍铄矔矗矙瞩矞矟矠矣矧矬矫矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硖砗磲茚钡硭硻硾碃碉碏碣碓碔碞碡碪碫碬砀碯碲砜碻礴磈磉磎硙磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻硗礀硚礅礌礐礚礜礞礤礧礮砻礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼饵脔锢禂禇禋祦禔祎隋禖禘禚禜禝禠祃禢禤禥禨禫祢禴禸秆秈秊闱飒秋秏秕笈蘵赁秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬秸稲稹稼颡稿穂穄穇穈穉穋稣贮穏穜穟秾穑穣穤穧穨穭穮穵穸窿阒窀窂窅窆窈窕窊窋窌窒窗窔窞窣窬黩蹙窑窳窴窵窭窸窗竁竃竈竑竜并竦竖篦篾笆鲛竾笉笊笎笏笐靥笓笤箓笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦笕筒筭箸筰筱筳筴宴筸箂个箊箎箑箒箘箙箛箜篌箝箠箬镞箯箴箾篁筼筜篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲筚篴篶篹篼箦簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊藤籒籓籔签籚篯箨籣籥籧笾簖籫籯芾麴籵籸籹籼粁秕粋粑粔粝粛粞粢粧粨粲粳稗粻粽辟粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬粽糯糱籴粜糸糺紃蹼鲣霉纡纨绔纫闽襻紑纰纮锭鸢鹞纴紞紟扎紩紬绂绁纻紽紾绐絁絃絅経絍绗絏缡褵絓絖絘絜绚絣螯絪絫聒絰絵绝絺絻絿綀绡綅绠绨绣綌綍綎捆綖綘継続缎绻綦綪线綮綯绾罟蝽綷縩绺绫緁绲緅緆缁绯緌緎総緑绱緖缃缄缂绵缗緤褓缌纂緪緰缑缈缏缇縁縃縄萦缙缒縏缣縕缞縚缜缟缛縠縡縢縦绦縯縰骋缧縳纤缦絷缥縻衙縿繄缫繈繊繋繐缯繖繘繙繠缋繣繨缰缲繸繻缱纁纆纇缬缵纩纑纕缵纙纚纛缾罃罆坛罋罂罎罏罖罘罛罝罠罣罥罦罨罫罭锾罳罶罹罻罽罿羂羃羇芈蕉51鸵羑羖羌羜羝羢羣羟羧羭羮羰羱羵羶羸藜鲐翀翃翅翊翌翏翕翛翟翡翣翥翦跹翪翫翚翮翯翱翽翾翿板饕鸹锨耋耇耎耏专耒耜耔耞耡耤耨耩耪耧耰鬓耵聍聃聆聎聝聡聦聱聴聂聼阈聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠铨胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臜腍腒腓胨腜腠脶腥腧腬腯踝蹬镣腴腶蠕诽膂腽嗉膇膋膔腘膗膙膟黐膣膦膫膰膴膵膷脍臃臄臇臈臌臐臑臓膘臖臙臛臝臞臧蓐诩臽臾臿舀舁鳑鲏舋舎舔舗馆舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣舣艨艩舻艬艭荏艴艳艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鲢芴芷芸荛豢芼芿苄苒苘苙苜蓿苠苡苣荬苤苎苪镑苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鸮荍荑荘豆荵荸荠莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔芲菘菝菡菢菣菥蓂菧菫毂蓥菶菷菹醢菺菻菼菾萅萆苌萋萏萐萑萜萩萱萴莴扁萻葇葍葎葑荭葖葙葠葥苇葧葭药葳葴葶葸葹葽蒄蒎莼茏薹莅蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽荪蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫荜跣藕苁蓰蓱莼蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蒌蔟锷蒋雯茑蔯蔳麻蔵蔸蔾荨蒇蕋蕍荞蕐蕑芸莸蕖蕗蕝蕞蕠蕡蒉蕣蕤蕨蕳蓣蕸蕺蕻薀薁薃薅薆荟薉芗薏薐蔷薖薘剃谔钗薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋荩藐藙藚藟藦藳藴苈藷藾蘀蘁蕲苹蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虬虰蛵蛇虷鳟虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛱蜕螫蜅蚬蜈蝣蜋蜍蜎蜑蠊蜛饯蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鲼蝡蝤蝥猿蝰虻蝲蝴蝻螃蠏蛳螉螋螒螓螗螘螙螚蟥螟螣螥螬螭䗖螾螀蟀蟅蝈蟊蟋蟑蟓蟛蟜蟟蟢虮蟨蟪蟭蛲蟳蛏蟷蟺蟿蠁蠂蠃虿蠋蛴蠓蚝蠗蠙蠚蠛蠜蠧蟏蠩蜂蠮蠰蠲蠵蠸蠼蠽衁衄衄衇衈衉衋衎衒同衖胡衞裳钩衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉袅裋夹裍裎裒裛裯裱裲裴裾褀褂褉褊裈褎褐褒褓褔褕袆褚褡褢褦褧褪褫袅褯褰褱裆褛褽褾襁褒襆裥襉襋襌襏襚襛襜裣襞襡襢褴襦襫襬襭襮襕襶襼襽襾覂覃覅霸覉覊覌覗觇覚覜觍觎覧覩觊觏覰観觌觔觕觖觜觽觝觡酲觩觫觭觱觳觯觷觼觾觿言赅讣訇訏訑訒诂讬訧訬訳訹证訾詀詅诋毁詈詊讵詑诒诐詗诎察詨诜詶詸詹詻诙诖誂誃诔锄诓誋诳诶悖誙诮诰誧説読誯谇訚谄谆諆諌诤诹诼諕谂谀諝谝諟喧谥諴諵谌谖誊謆謇歌謍謏謑谡谥謡謦謪谪讴謷謼谩哗譅譆譈
譊讹譒撰谮鑫譞噪譩谵譬譱譲谴譸譹谫讅讆詟䜩雠讐谗谶讙谠讟谽豁豉豇岂豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆狸猊貔貘䝙貜貤餍贳餸贶贲赂賏赊赇赒賝赓赕賨赍斗賮賵賸赚赙赜赟贉赆赑贕赝赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趱趴趵趷趹趺趿跁跂跅跆踬跄跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇蹰踠踡踣踤踥踦踧跷踫踮逾踱踊踶踹踺踼踽躞蹁蹂躏蹎蹐蹓蹔跸蹚蹜蹝迹蹠蹡蹢跶蹧蹩蹪蹯鞠蹽躃躄躅踌跻躐踯跞躘躙躗躝躠蹑躜躧躩躭躰躬躶軃軆辊軏轫軘軜軝腭転軥軨軭軱轱辘軷轵轺軽軿輀輂辇辂辁輈挽輗辄辎辋輠輤輬輭輮辏輴輵輶輹輼辗辒轇轏轑轒辚轕轖轗轘轙轝轞轹轳罪辣辞辵辶辺込辿迅迋迍麿迓迣迤逦迥迨迮迸迺迻迿逄逅逌逍逑逓迳逖逡逭逯逴逶逹遄遅侦遘遛遝遢遨遫遯遰遴绕遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯郸邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郏郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄郓鄇鄈鄋鄍鄎鄏鄐鄑邹邬鄕郧鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱郐鄷鄹邝鄻鄾鄿酃酅酆酇郦酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝酝醡醤醨醪醭醯醰酦醲醴醵醸醹醼醽醾釂酾酽釆釈鲈镏阊钆钇钌钯钋鼢鼹钐钏釪釬釭釱钍釸钕钫鈃钭鈆鈇钚鈊鈌钤钣鈒鈤钬钪鈬铌铈钶铛钹铍钸钿鉄鉆铊铇鉌铋鉏铂钷铆钵鉥钲鉨钼钽鉱鉲鉶铰铒鉼铪銍銎铣銕镂铫铦铑铷銤铱铟銧铥铕铯銭銰焊銶锑锉汞鋂锒鋆鋈鋊铤鋍铗鋐鋑鋕鋘鋙锊锓锔锇铓鋭铖锆锂铽鋳鋹鋺鉴镚钎錀锞锖锫锩錍铔锕錔锱铮锛錞锬锜錤錩錬録铼錼锝钔锴鍉镀鍏鍐铡鍚锻锽锸锲锘鍫鍭鍱鍴锶鍹锗针锺锿镅鎉鎋鎌鎍鎏鎒鎓鎗镉鎚鎞镃鎤铩锼鎭鎯镒镍鎴镓鎸鎹镎镟鏊镆镠镝鏖铿锵鏚镗镘镛鏠鏦錾镤鏸镪鏻鏽鏾铙鐄鐇鐏铹镦镡鐗馗镫镢镨鐡锎镄鐩镌鐬鐱镭鐶鐻鐽镱鑀鑅镔鑐鑕鑚鑛鑢鑤镥鑪镧鑯鑱鑴鑵镊镢钃镻闫闬闶闳閒闵閗閟阂関合閤哄阆閲阉閺阎阏阍阌暗闉阕阗闑闒闿闘闚阚闟闠闤闼阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬骘陴険陼陾阴隃隈隒隗隞隠隣隤隩隮隰颧隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驿霂霅霈霊沾霒霓霙霝霢霣霤霨霩霪霫霮靁叇叆靑靓靣腼靪靮靰靳靷靸靺靼靿鞀鞃鞄鞍鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾鞑韅鞯驮韍韎韔韖韘韝韫韡韣韭韭韱韹韺頀刮頄顸顼頍颀颃颁頖頞頠頫頬颅頯頲颕頼悴顋顑颙颛颜顕顚顜颟顣颥颞飐飑台飓颸飏飖颽颾颿飀飂飚飌翻飡飣饲飥饨饫飮飧飶餀餂饸饹餇餈饽哺馂餖餗餚馄馃餟餠餤餧餩餪餫糊餮糇餲饧馎糕饩馈馊馌馒饇馑馓膳饎饐饘饟馕馘馥馝馡馣骝骡馵馹駃駄駅駆駉駋驽駓驵駗骀驸駜骂骈駪駬骃駴骎駹駽駾騂騄骓騆騉騋骒骐麟騑騒験騕骛騠騢騣騤騧骧騵驺骟騺蓦骖骠骢驆驈骅驌骁驎骣驒驔驖驙驦驩驫骺鲠骫骭肮骱骴骶骷髅骾髁髂髄髆膀髇髑髌髋髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣斗鬫鬬阄鬯鬰鬲鬵鬷魆魈魊魋魍魉魑魖鳔魛魟魣魦魨魬鲂魵魸鮀鲅鮆鲧鲇鲍鲋鮓鲒鲕鮟鱇鮠鮦鮨鲔鲑鮶鮸鮿鲧鯄鯆鲩鯈鲻鯕鲭鲞鯙鯠鲲鯥鲰鲶鳀鯸鳊鲗䲠鹣鳇鰋鳄鳆鰕鰛鰜鲥鰤鳏鰦鳎鳐鳁鳓鰶鲦鲡鰼鰽鱀鱄鳙鱆鳕鱎鱐鳝鳝鳜鲟鲎鱠鳣鱨鲚鱮鱲鱵鱻鲅鳦凫鳯鳲鳷鳻鴂鴃鴄鸩鴈鴎鸰鴔鴗鸳鸯鸲鹆鸱鴠鴢鸪鴥鸸鹋鴳鸻鴷鴽鵀鵁鸺鹁鵖鵙鹈鹕鹅鵟鵩鹌鵫鵵鵷鵻鹍鶂鶊鶏鶒鹙鶗鶡鶤鶦鶬鶱鹟鶵鶸鶹鹡鶿鹚鷁鷃鷄鷇䴘䴘鷊鷏鹧鷕鹥鸷鷞鷟鸶鹪鹩鷩鷫鷭鹇鹇鸴鷾䴙鸂鸇䴙鸏鸑鸒鸓鸬鹳鸜鹂鹸咸鹾麀麂麃麄麇麋麌麐麑麒麚麛麝麤麸面麫麮麯麰麺麾黁黈黉黢黒黓黕黙黝黟黥黦黧黮黰黱黪黶黹黻黼黾鼋鼂鼃鼅鼈鼍鼏鼐鼒冬鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌赍齑龀齕齗龅齚龇齞龃龉龆齢出齧齩齮齯齰齱齵齾厐龑龒龚龖龘龝龡龢龤' + +traditional_characters = '制咖片型超聲盤鑒定仔點他命書歌粉巾字帳恤手指記憶棒形轉彎溝光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞㠯㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮㵎㵪㶸㷖㷭㹢㹴犬㺢狓㺵㼝㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓䠶䥯䦉䯝䰾魚䲔䳗䳘䵹鼄䶑一對應映射丁不識下兒子做二休世丘之貉並中台原則串為甚謂乾淨了百事無成八變五十些人得道雞升天代如併來去個國政策勁幽靈在歐洲遊蕩接樣蘿蔔坑側化傳價元論醇共再准刀兩斷切分耕耘收穫錢貨物向看舊就緒險刻千金動勞永逸匙零夜半卡通回復返影蹤反常態口咬氣句話同吐快吹周味呼諾嗚品紅鍋哄而散起唱和問三知生熟團漆黑火糟堆場空塊麵塌糊塗塵染壁廂夔已足多情露水大早到晚夫妻當關萬莫開失古恨套所料既往孔見提師要家主審寸陰難買鬥牛小撮部陣局展身層巴掌帆風順席地帶過年計於春頭載四季期被蛇怕井繩度願式份彈頃深前律徑心意念差愁孤行俱全房廳交遮打技長把抓死拿眼淚鼻涕鑰鎖折段抿拍即合掃排掬揮撥擁上入擊洞擲攬改故轍敗文值名斑方面旁族日秋餐隔雅里終父旦時晌會霎間晃暴寒曝更月望垠際朝夕本正經利杯羹東西板枝獨秀根筋桿進條龍服務概模次函數又性程總付步腳印趨登毛拔呵氧氮碳決雌雄波未平派謊言流清楚白準溜煙潭有獲聞是處降琴鶴甲病發可拾沙目然瞭直以相眨穿睹瞥瞬矢的解石鳥神教秉虔誠秘種窩蜂窮竅笑置筆苟勾銷抹殺煞等獎箍節吃箭仇雙鵰詩籌籮筐系列紙級士官統絲毫掛維網盡線微吭響股腦胎脈承腔臂力致效資源址器舉功投般說講規貿易葉障著慎滿皆輸號木電池衣傾鐘高低視仁覺醒覽遺角銀幣觸潰九鼎蔽抄出駟馬追重語破貧洗貫走路安蹴至幾蹶振躍役膽汗較輩輪辭贊退六連遍遞邊針血錘音錯門思閃真倒項栽霧類保護川先驚乍體鬨鱗爪鳴滴泡鄰域黨專鼓作齊炒丑烯亥克內酯冬加奴卯肝炎基尺梁街褲鎬客寵庭巳汝昌烷玲磊糖肇酉醛啷青縣韙良香骨鯛丂七集河市弦喜嘴張舌堵區工業姊妹星架構巧彩扭歪拼湊餘熱曜武州爺浮屠美鄉老階樹葷素碎落能魄鰓鰻珠丄丅丆万俟丈尚摸母娘量管群亞虎必我堂令申件裝伏位博俠義界表女墟臺戲臭皮匠勝諸葛亮賽頂倍催請運算包立叉戟離疫苗土史志演圍揭瓦曬夷姑婆帝村寶爛尖杉鹼屜桌山岔島由紀峽壩庫鎮廢從德後拗湯治旬食明昧曹朋友框欄極權冪曲歸依貓民氟硼氯磷鐵江侗自旅法司洋浦梅園溫暖灣焦班幸用田略番疊皇炮捶硝苯酸腺苷稜草鏡穗跳遠索錦綱聚氰胺聯店胚膲愛色堇紫羅蘭芝茶飯菱雲蟲藏藩亂叛蘇親債凳學座恐戀柱測肌腹衩錐係貂企烏跪叩軍車農題迭都甘油屯奏鍵短阿姨陪姐隻顧茅廬槽駕魂鮮鹿頁其菜單乘任供勢午齒漢組織吊調瀉唇坡城報墳外夸將尉建築岸崗公床揚新劍昇杭林栗校樓標款汽社浣海商館劇院鋼華港機械廣媒環球融第醫科證券綜財樂育游漲猶嶺疏癮瞼確兵領導繳肢膛船艾瑟爾蒼蔡虞傚衫覆訪訴課諭議軌述野鉤限敵鞋頜頷顎饒首齦站例修凡劃垂屆屬崽頦廚拜挫擺放旋削棋榻檻禮沉注滑營獄畫确儀聘花葬詔員跌轄週達酒錨閘陷陸雨雪飛威丌于丹久乏予理評產亢卑亦乎舞己悲矩圓詞害誌但住佞佳便俗信票案幅翁倦倫假偏倚斜虧鬼敲停備傷脾胃僅此像儉匱免宜穴焉戴兼容許凍伯仲負彼晝皂軒輊實刊划顛衛戰哥比省非好黃飾別拘束掩奶睬選擇搖擾煩苦枚寫協厭及格受歡迎約只估侵犯割狀告或缺抗拒挽撤救藥喻磨滅端倪少逆逾越避靠適吉譽吝玉含延咎歹聽啻淵善謀均勻堪忍夠太惹妙妥妨孕症孝術室完納推冠積宣疑辯慄碴稱屈撓屑干涉衡待很忙惡忿怎麼怠急恥恭息悅惑惜惟想愉愧怍慌憤啟懂懈懷材才緊招認扣抵拉捨也罷插揣冒搭撞南牆擴核支攻敢雷攀敬裡嗎需景智暇曾罪遇朽枉止況競爭辱求癒渝溶濟左右袒困補爽特寂寞示弱找謝畏強疾徐痛癢冤符眠睦瞅董何厚云措活疲羞者輕玻璃祥兆禁移稂莠穩佛換答簡結果盟絕縷途給談否羈翼耐肖脛毋寧興舒若菲萊痕跡窠臼虛衰臉兔撒鷹棺範該詳諱抬泰讓鬚眉象眾貲賬費灰賴奇慮訓輟辨菽麥辛近送透逞徒速續逮捕遂遑違遜斧鉞艱醉鏽隨觀棄顯飽脂肪使丏丐幫丒且慢末丕替桃宗王尊涼爵各圖屋脊糧署錄壇吾祿職胄襲君廈丗北壑桐疹損逢陵鷸丙寅戌氨腈唑綸辰酮脫氫酶醚丞丟現掉紗帽弄扯砲碗丠両丣坐存激肩臻蒂蓮悖序驅丨丩丫挺杈髻鬟細介俄伊犁京尼布訂普渡央委監察檢查劑圈設警隊斯督剩震境航舶革防托播促質版蠑螈鋒研藝歷殘消頻譜精密製造陲郵候埔堅壓壢凹匯執府究邦俘攝寮彬狼嶽肺腫庸英訊診埋粒胞括控碼韓暑槍樞砥澳哇牟壽甸鑽探篇簽綴縫繼耳肯照婦埃懸璧軸櫃檯辣擱淺邪跑纖阮陽私囊魔丮丰姿采丱燒丳丵丶丷丸參寨朗桂瑞砂衷霞貌鳳僕艦因嫌宰峰幹絡牌持旨祭禱簿編罰賓辦丼丿乀乂乃乄仰慕盛曠留考驗闊乆乇么醜麼乊湖燃乑乒乓乕乖僻忤戾离謬迕乗危肥劫除隙浪婿乙炔腸酰吡咯鹽乚乛乜嘢卿玄宮尾狐龜塔嶷兄弟泉章霄釘耙乞扎哀憐恕討乢乣乤乥乧乨乩童乪乫乭乳暈汁液瑤漿牙癌突竇罩腐膠豬酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉噦嚎坤媽屍壘旱枯涸俐渴潮澀煸豆燥爹瘦癟癬瞪袋脆薑貝隆餾乿亀亁叫咕攘扔搞男砸竄蓬麻亃亄亅卻亇遲典今臨繁累卵奉婚聰躬巨與遷添裂副宿歲怪噁尕崙愣杆硅硫鈦鈾錳芑雜異鈉砷胂磺琥珀艙棍簧胡茬盜浩盆販郎腿亍洪亐互欠助勉惠操斥諉繫戶譯亓墓碑刑鈴卅渠繽紛斗米旗憲釩燈徽瘟祖拳福穀豐臟腑綁肉醃苓蘊橋鋪霸顏鬧判噴岡底蛙陘礦亖亙亜罕們娜桑那努哈喀弗烈曼松森杜氏盃奧琛敦戊穆聖裔彙薛孫亟亡佚虜羊牢奮釋卷卸契媾感額睫纏誼趾塞擠紐阻還配馳莊亨洛祚亪享津滬畿郊慈菴枇杷膏亭閣鋥麗
亳亶亹誅初責翻瘋偶傑叢稠妖拖寰居吸授慧蝸吞壯魅狗矛盾益渣患憂稀描猿夢暫涯畜禍緣沸搜引擎臣橫紜誰混援蒸獸獅稅剖亻亼亽亾什獻剎邡麽仂仃仄仆富怨仈仉畢昔晨殼紹仍仏仒仕宦仗欺恃腰嘆歎炬梓訖施仙后瓊逝仚仝仞仟悔仡佬償填泊拓撲簇羔購頓欽佩髮棻閫馭養億儆尤藉幀賑凌敘帖李柔剛沃眥睚戒訛取饗讀仨仫仮著泳臥躺韶夏裁仳仵唯賢憑釣誕仿似宋彿諷伀碩盼鵝伄儅伈伉儷柯始娃邁戈坦堡帕茨薩廟瑪莉莎藤霍姆伋伍奢胥廷芳豪伎倆侍汛勒希羲雛伐憩整謨閑閒伕伙伴頤伜伝伢叔恆茲恩翰伱伲侶伶俜悧鼬伸懶縮喇叭伹伺伻伽倻輻伾佀佃佇佈喬妮墨佉盧佌貸劣廉昂檔濃矮傘窪緩耗胸谷迷擋率齲宅沫舍療佐貳佑佔優據鏵嘗呢須魯曉佗佘余坪寺瓜銃僧蒙芒陀龕哼嘔坊姦孽弊揖祟繭縛誓賊佝僂瞀佟你奪趕佡佢佣佤佧賈佪佫佯佰佱潔績釀餚佴捲佶佷佸佹佺佻佼佽佾具喚窘壞娛怒慨硬習慣聾膨脹蔓駭貴痺侀侁侂侃侄侅鴻燕侇侈糜靡侉侌妾侏儒倉鼠侐侑侔侖侘侚鏈侜偎傍鈷循柳葫蘆附価侮罵蔑侯岩截蝕侷貼壺嬛宴捷攜桶箋酌俁狹膝狄俅俉俊俏俎俑俓俔諺俚俛黎健呈固墒增守康箱濕祐鏢鑣槓盒靖膜齡俞豹獵噪孚封札筒託衍鴿剪撰稿煉廠禊練繕葺俯瞰撐衝俲俳俴俵俶俷俺俻俾倀倂倅儲卒惶敷猝逃頡蓄崇隱倌倏忽刺蠟燭噍嚼坍扁抽斃蔥楣灌灶糞背藪賣賠閉霉騰倓倔倖倘倜儻倝借箸挹澆閱倡狂倢倣値倥傯倨傲倩匡嗣沖柝珍倬倭寇猩倮倶倷倹勤讚偁偃充偽吏嗓寐惺扮拱芫茜藉虢鈔偈偉晶偌宕距析濾殿疼癱註頗偓偕鴨歇滯偝偟偢忘怡旺偨偩偪偫偭偯偰偱偲偵緝蹄偷減惰漏窺竊偸偺迹傀儡傅傈僳傌籬傎奎琳迪叟芭傒傔傕傖悉荒傜傞傢傣芽逼傭婢傮睨寄檄誦謠頌傴擔辜弓慘蒿悼疤傺傻屄臆巢洩篋羨蓋軋頹傿儸僄僇僉僊働僎僑僔僖僚僝僞僣僤僥僦猴僨僩僬僭僮僯僰僱僵殖籤靜僾僿征隴儁儂儃儇儈朴薄儊儋儌儍儐儓儔儕儗儘儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹儺儼儽兀臬臲鷲允勛勳宙宵帥憝彞諧嫂鬩暢沛溢盈飢赫兇悍狠猛頑愚妣斬秦遣鞭耀敏榮槃澤爆碟磁禿纜輝霽鹵朵婁孜烽醬勃汀箕裘鉗耶懞蕾徹兌軟遭黜兎児韻媳爸兕觥兗兙兛兜售鍪肚兝兞兟兡兢兣樽殮涅睡稟籍贅泌啡肽奸幕涵澇熵疚眷稃襯訌赴煥椒殲植跏沒試誤猜棲窗肋袖頰兪卦撇鬍岐廓轎疸楓茴瓏廁秩募勺噸寓斤曆畝迫筷釐最淫螺韜兮寬匪篩襄贏軛複兲詐刃堰戎痞蟻餉它冀鑄冂冃円冇冉冊嫁厲礪竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑誣冥冫烘菇蟄冷凝坨橇淇淋炭餅磚磧窖醋雕雹霜冱冶爐艷嘲峻灘淡漠煖颼飲冼冽凃凄愴梗凅凇凈凊凋敝濛凔凜遵汞脢凞几凢処凰凱凵凶焰凸摺刷紋預喪嘍奔巡榜殯芙蓉租籠輯鞘萃凼鋸鑊刁蠻刂娩崩批拆攤掰櫱驟歧顆秒袂贓勿囑忌磋琢膚刈羽刎訟戮舂槳艇刓刖霹靂刜創犢刡恙墅幟筵緻刦刧刨昏默攸尿慾薰潤薰圭刪刮痧鏟刱刲刳刴刵踏磅戳柏槐繡芹莧蝟舟銘鵠鶩刼剁剃辮剄剉履鉛剋剌姻咽哨廊掠桅沿召瞻翅趙卜渺茫郭剒剔剕瀝剚愎毅訥纔剜剝啄採剞剟剡剣剤綵剮腎駛黏剰袍剴紊剷剸剺剽剿劁劂劄劈啪柴扳啦劉奭姥夼昫涓熙禪禹錫翔雁鶚劊劌弩柄蜻蛉劒劓劖劘劙瀾簣賞磯釜晉甜薪逐劦熔紂虐赤囚劬劭労劵効劻劼劾峭艮勅勇勵勍勐臘脖龐漫飼盪粥輒勖勗勘驕餒碌泮雇捐竹騎殊阱勣樸懇謹勦勧勩勯勰勱勲勷勸懲慰誡諫勹芡踐闌匁庇拯粟紮袱裹餃匆遽匈匉匊匋匍匐莖匏匕妝痰膿蛹齋苑烤蹈塘羌熊閥螳螂疆碚竿緯荷茵邙魏匚匜匝匟扶稷匣匭攏匸匹耦匽匾匿卂叮瘡禧軫堤棚迢鈞鍊卄卆遐卉瓷盲瓶噹胱腱裸卋卌卍卐怯污賤鄙齷齪陋卓溪唐梯漁陳棗泥漳潯澗梨芬譙贍轅迦鄭単驢弈洽鰲卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫襖璽綬鈕蚤懼殆篤聳卲帘帙繞卹卼卽厂厎厓厔厖厗奚厘厙厜厝諒厠厤厥厪膩孢厮厰厳厴厹厺粕垢蕪菁厼厾叁悟茸薯叄吵笄悌哺譏坫壟弧芯杠潛嬰芻袁詰貪諜煽饋駁収岳締災賄騙叚叡吻攔蘑蜜訣燧玩硯箏椎藺銅逗驪另覓叨嘮謁杵姓喊嚷囂咚嚀塑尋惱憎擦祇泣滲蝠叱吒咄咤喝籀黛舵舷叵叶鐸懿昭穰苴遼叻叼吁塹嫖賭瞧爬衆抒吅吆夥巹橡滌抱縱摩郡唁墜扇籃膀襪頸吋愾諮酬哭妓媛暗錶韁邇妃羿絮蕃渾拐葵暮隅吔吖啶嗪戚吜嗇噬嚥吟哦詠吠吧唧嗒咐吪雋咀徵燐苞茹鈣哧吮吰吱嘎吲哚吳棟嬌窟孟簫忠晗淞闔閭趼宇吶睛噓拂捧疵熄竽笛糠吼吽呀呂韋矇呃呆笨呇貢呉罄呋喃呎呏呔呠呡癡呣呤呦呧瑛眩扒晬淑姬瑜璇鵑呪呫嗶嚅囁呬呯呰呱呲咧噌鈍呴呶呷呸呺呻哱咻嘯嚕籲坎坷邏呿咁咂咆哮咇咈咋蟹煦珅藹咍咑咒詛咔噠嚓咾噥哩喱咗咠咡咢咣咥咦咨嗟詢咩咪咫嚙齧咭咮咱咲咳嗆嗽咴咷咸咹咺咼喉咿婉慟憫賦矜綠茗藍哂搶瞞哆嗦囉噻啾濱彗哋哌哎唷喲哏哐哞哢哤哪裏哫啼喘哰哲萎蚌哳哶哽哿唄唅唆唈唉唎唏嘩堯棣殤璜睿肅唔睇唕唚唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鸚鵡啅埠棧榷祺舖鞅飆啊啍啎啐啓啕啖啗啜啞祈啢啣啤啥啫啱啲啵啺饑啽噶崑沁喁喂喆裙喈嚨喋喌喎喑喒喓喔粗喙幛慶滋鵲喟喣喤喥喦喧騷喨喩梆喫葡萄喭駝挑嚇碰樅瓣純皰藻趟鉻喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔詬嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨嗩嗬嗯嗰嗲嗵嘰嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾蔭嘊嘌嘏嘐嘒嘓嘖嘚嘜嘞嘟囔嘣嘥嘦嘧嘬嘭這謔嚴敞饞鬆嘵嘶嘷嘸蝦嘹嘻嘽嘿噀噂噅噇噉噎噏噔噗噘噙噚噝噞噢噤蟬皿噩噫噭噯噱噲噳嚏涌灑欲巫霏噷噼嚃嚄嚆抖嚌嚐嚔囌嚚嚜嚞嚟嚦嚬嚭嚮嚯嚲嚳飭按竣苛嚵嚶囀囅囈膪謙囍囒囓囗囘蕭酚飄濺諦囝溯眸紇鑾鶻囟殉囡団囤囥囧囨囪囫圇囬囮囯囲図囶囷囸囹圄圉擬囻囿圀圂圃圊粹蠹赦圌墾圏滾鯡鑿枘圕圛圜圞坯埂壤骸炕祠窯豚紳魠鯪鱉圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆墊墩椅坒坓坩堝坭坰坱坳坴坵坻坼楊掙涎簾垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜埡埤埦埧埭埯埰埲埳埴埵埶紼埸培怖樁礎輔埼埽堀訶姪廡堃堄摧磐貞韌砌堈堉堊堋堌堍堎堖堙堞堠礁堧堨輿堭堮蜓摘堲堳堽堿塁塄塈煤塋棵塍塏塒塓綢塕鴉沽虱塙塚塝繆塡塢塤塥塩塬塱塲蟎塼塽塾塿墀墁墈墉墐夯増毀墝墠墦漬缽墫墬墮墰墺墻櫥壅壆壊壌壎壒榨蒜壔壕壖壙壚壜壝壠壡壬壭壱売壴壹壻壼寢壿夂夅夆変夊夌漱邑夓腕泄甥禦骼夗夘夙袞瑙妊娠醣梟珊鶯鷺戧幻魘夤蹀祕擂鶇姚宛閨嶼庾撻拇賛蛤裨菠氅漓撈湄蚊霆鯊箐篆篷荊肆舅荔鮃巷慚骰辟邱鎔鐮阪漂燴鯢鰈鱷鴇臚鵬妒峨譚枰晏璣癸祝秤竺牡籟恢罡螻蠍賜絨御梭夬夭砣榆怙枕夶夾餡奄崛葩譎奈賀祀贈奌奐奓奕訢詝奘奜奠奡奣陶奨奩魁奫奬奰媧孩貶隸酥宄狡猾她奼嫣妁氈荼皋膻蠅嬪妄妍嫉媚嬈妗趣妚妞妤礙妬婭妯娌妲妳妵妺姁姅姉姍姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀誘懾脅娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥谿孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮媯媲媵媸媺媻媼眯媿嫄嫈嫋嫏嫕嫗嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰嫵嫺嫻嫽嫿嬀嬃嬅嬉耍嬋痴豔嬔嬖嬗嬙嬝嬡嬢嬤嬦嬬嬭幼嬲嬴嬸嬹嬾嬿孀孃孅孌孏曰癲屏孑孓雀孖斟簍謎摺孛矻鳩崮軻祜鸞孥邈毓棠臏孬孭孰孱孳孵泛罔銜孻孿宀宁宂拙株薇掣撫琪瓿榴謐彌宊濂祁瑕宍宏碁宓邸讞実潢町宥宧宨宬徵崎駿掖闕臊煮禽蠶宸豫寀寁寥寃簷庶寎暄磣寔寖寘寙寛寠苫寤肘洱濫蒗陝覈寪弘綽螽寳擅疙瘩晷対檐専尃尅贖絀繚疇釁尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚覷蔻髒躁尒尓銳尗尙尜尟尢尥尨尪尬尭尰擒尲尶尷尸尹潽蠖蛾尻釦梢蚴鰭脬蹲屇屌蚵屐屓挪屖屘屙屛屝屢屣巒嶂巖舄屧屨屩屪屭屮戍駐鉀崖嵛巔旮旯楂欖櫸芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭鞏岒岝岢嵐岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峩峯峱峴峹峿崀崁崆禎崋崌崍嶇崐崒崔嵬巍螢顥崚崞崟崠崢巆崤崦崧殂崬崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓嵗嵙嵞嵡嵩嵫嵯嵴嵼嵾嶁嶃嶄晴嶋嶌嶒嶓嶔嶗嶙嶝嶞嶠嶡嶢嶧嶨嶭嶮嶰嶲嶴嶸巂巃巇巉巋巌巓巘巛滇芎巟巠弋迴巣巤炊擘蜥蟒蠱覡巰蜀彥淖杏茂甫楞巻巽幗巿帛斐鯽蕊帑帔帗帚琉汶帟帡帣帨帬帯帰帷帹暆幃幄幇幋幌幏幘幙幚幞幠幡幢幦幨幩幪幬幭幯幰遙蹉跎餘庚鑑幵幷稚邃庀庁広庄庈庉笠庋跋庖犧庠庤庥鯨庬庱庳庴庵馨衢庹庿廃廄廆廋廌廎廏廐廑廒廕廖廛廝搏鑼廞弛袤廥廧廨廩廱綿踵髓廸廹甌鄴廻廼廾廿躔弁皺弇弌弍弎弐弒弔詭憾薦弝弢弣弤弨弭弮弰弳霖繇燾斌旭溥騫弶弸弼弾彀彄彆纍糾彊彔彖彘彟彠陌彤貽彧繪虹彪炳彫蔚鷗彰癉彲彳彴彷彷徉徨彸彽踩斂旆徂徇徊渭畬鉉裼従筌徘徙徜徠膳甦萌漸徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸顫扉犀澎湃砰恍惚絞隘忉憚挨餓忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懣怏遏怔怗怚怛怞懟黍訝怫怭懦怱怲怳怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱悽惻悳悴悵惘悶悻悾惄愫鍾蒐惆惇惌惎惏惓惔惙惛耄惝瘧濁惥惦惪惲惴惷惸拈愀愃愆愈愊愍愐愑愒愓愔愕愙氓蠢騃昵愜赧愨愬愮愯愷愼慁慂慅慆慇靄慉慊慍慝慥慪慫慬慱慳慴慵慷慼焚憀灼鬱憃憊憋憍眺捏軾憒憔憖憙憧憬憨憪憭憮憯憷憸憹憺懃懅懆邀懊懋懌懍懐懞懠懤懥懨懫懮懰懱毖懵遁樑雍懺懽戁戄戇戉戔戕戛戝戞戠戡戢戣戤戥戦戩戭戯轟戱披菊牖戸戹戺戻戼戽鍬扂楔扃扆扈扊杖牽絹銬鐲賚扐摟攪烊盹瞌跟躉鑔靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄綏鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔繯縊擻抜抝択抨摔歉躥牾抶抻搐泵菸拃拄拊髀拋拌脯拎拏拑擢秧沓曳攣迂拚拝拠拡拫拭拮踢拴拶拷攢拽掇芥橐簪摹疔挈瓢驥捺蹻挌挍挎挐揀挓挖掘浚挙揍聵挲挶挾挿捂捃捄捅捆捉捋胳膊
揎捌捍捎軀蛛捗捘捙捜捥捩捫捭据捱捻捼捽掀掂掄臀膘掊掎掏掐笙掔掗掞棉芍掤搪闡掫掮掯揉掱掲掽掾揃揅揆搓揌諢揕揗揘揜揝揞揠揥揩揪揫櫫遒麈揰揲揵揶揸揹揺搆搉搊搋搌搎搔搕撼櫓搗搘搠搡搢搣搤搥搦搧搨搬楦褳訕赸搯搰搲搳搴搵搷搽搾搿摀摁摂摃摎摑摒摓跤摙摛摜摞摠摦睺羯摭摮摯摰摲摳摴摶摷摻摽撂撃撅稻撊撋撏鐧潑撕撙撚撝撟撢撣撦撧撩撬撱朔撳蚍蜉撾撿擀擄闖擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭擯擰擷擸擼擽擿攃攄攆攉攥攐攓攖攙攛每攩攫轡澄攮攰攲攴軼攷砭訐攽碘敁敃敇敉敍敎筏敔敕敖閏誨敜煌敧敪敱敹敺敻敿斁衽斄牒縐謅斉斎斕鶉讕駮鱧斒筲斛斝斞斠斡斢斨斫斮晾沂潟穎絳邵斲斸釳於琅斾斿旀旂旃旄渦旌旎旐旒旓旖旛旝旟旡旣浴旰獺魃旴旹旻旼旽昀昃昄昇昉晰躲澈熹皎皓礬昑昕昜昝昞昡昤暉筍昦昨昰昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘瑩顗晿暁暋暌暍暐暔暕煅暘暝暠暡曚暦暨暪朦朧暱暲殄馮暵暸暹暻暾曀曄曇曈曌曏曐曖曘曙曛曡曨曩駱曱甴肱曷牘禺錕曽滄耽朁朅朆杪栓誇竟粘絛朊膺朏朐朓朕朘朙瞄覲溘饔飧朠朢朣柵椆澱蝨朩朮朰朱炆璋鈺熾鹮朳槿朶朾朿杅杇杌隉欣釗湛漼楷瀍煜玟纓翱肈舜贄适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦顰緬莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翹紓逋枙狸椏枟槁枲枳枴枵枷枸櫞枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞櫟柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟栢栩栫栭栱栲栳栴檀栵栻桀驁桁鎂桄桉桋桎梏椹葚桓桔桕桜桟桫欏桭桮桯桲桴桷桹湘溟梃梊梍梐潼梔梘梜梠梡梣梧梩梱梲梳梴梵梹棁棃櫻棐棑棕櫚簑繃蓑棖棘棜棨棩棪棫棬棯棰棱棳棸棹槨棼椀椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀楄楅篪楋楍楎楗楘楙楛楝楟楠楢楥楨楩楪楫楬楮楯楰楳楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽榜笞榠榡榤榥榦榧榪榭榰榱槤霰榼榾榿槊閂槎槑槔槖様槜槢槥槧槪槭槮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢猻樺樻罍樾樿橁橄橆橈笥龠橕橚橛輛橢橤橧豎膈跨橾橿檁檃檇檉檍檎檑檖檗檜檟檠檣檨檫檬檮檳檴檵檸櫂櫆櫌櫛櫜櫝櫡櫧櫨櫪櫬櫳櫹櫺茄櫽欀欂欃欐欑欒欙欞溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊蒔蝶歓歕歘歙歛歜歟歠蹦詮鑲蹣跚陞陟歩歮歯歰歳歴璞歺瞑歾歿殀殈殍殑殗殜殙殛殞殢殣殥殪殫殭殰殳荃殷殸殹蛟殻殽謗毆毈毉餵毎毑蕈毗毘毚茛鄧毧毬毳毷毹毽毾毿氂氄氆靴氉氊氌氍氐聊氕氖気氘氙氚氛氜氝氡洶焊痙氤氳氥氦鋁鋅氪烴氬銨痤汪滸漉痘盂碾菖蒲蕹蛭螅氵氷氹氺氽燙氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蘺沼穢衊汧汨汩汭汲汳汴隄汾沄沅沆瀣沇沈葆浸淪湎溺痼痾沌沍沏沐沔沕沘浜畹礫沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆湧肓泐泑泒泓泔泖泙泚泜泝泠漩饃濤粼濘蘚鰍泩泫泭泯銖泱泲洇洊涇琵琶荽薊箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙贛渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鯉浹浼浽溦涂涊涐涑涒涔滂涖涘涙涪涫涬涮涴涶涷涿淄淅淆淊淒黯淓淙漣淜淝淟淠淢淤淥淦淩猥藿褻淬淮淯淰淳詣淶紡淸淹燉癯綺渇済渉渋渓渕渙渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝湞湟湢湣湩湫湮麟湱湲湴湼満溈溍溎溏溛舐漭溠溤溧馴溮溱溲溳溵溷溻溼溽溾滁滃滉滊滎滏稽滕滘滙滝滫滮羼耷滷滹滻煎漈漊漎繹漕漖漘漙漚漜漪漾漥漦漯漰漵漶漷濞潀潁潎潏潕潗潚潝潞潠潦祉瘍潲潵潷潸潺潾潿澁澂澃澉澌澍澐澒澔澙澠澣澦澧澨澫澬澮澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觴濬濮盥濰濲濼瀁瀅瀆瀋瀌瀏瀒瀔瀕瀘瀛瀟瀠瀡瀦瀧瀨瀬瀰瀲瀳瀵瀹瀺瀼灃灄灉灋灒灕灖灝灞灠灤灥灨灩灪蜴灮燼獴灴灸灺炁炅魷炗炘炙炤炫疽烙釺炯炰炱炲炴炷燬炻烀烋瘴鯧烓烔焙烜烝烳飪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐煒煕煗燻礆霾煚煝煟煠煢矸煨瑣煬萁煳煺煻熀熅熇熉羆熒穹熗熘熛熜稔諳爍熤熨熯熰眶螞熲熳熸熿燀燁燂燄盞燊燋燏燔隼燖燜燠燡燦燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰爲爻爿爿牀牁牂牄牋牎牏牓牕釉牚腩蒡虻牠雖蠣牣牤牮牯牲牳牴牷牸牼絆牿靬犂犄犆犇犉犍犎犒犖犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狽蜘猁猇猈猊猋猓猖獗猗猘猙獰獁猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒獘獙獚獜獝獞獠獢獣獧鼇蹊獪獫獬豸獮獯鬻獳獷獼玀玁菟玅玆玈珉糝禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦玨瑰玭玳瑁玶玷玹玼珂珇珈瑚珌饈饌珔珖珙珛珞珡珣珥珧珩珪珮珶珷珺珽琀琁隕琊琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲瑯琹琺琿瑀瑂瑄瑉瑋瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈璉璊璐璘璚璝璟璠璡璥璦璩璪璫璯璲璵璸璺璿瓀瓔瓖瓘瓚瓛臍瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔甕甖甗飴蔗甙詫鉅粱盎銹糰甡褥産甪甬甭甮甯鎧甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃疉疋疍疎簞疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀瘂瘃瘈瘉瘊瘌瘏瘐瘓瘕瘖瘙瘚瘛瘲瘜瘝瘞瘠瘥瘨瘭瘮瘯瘰癧瘳癘瘵瘸瘺瘻瘼癃癆癇癈癎癐癔癙癜癠癤癥癩蟆癪癭癰発踔紺蔫酵皙砬砒翎翳蘞鎢鑞皚鵯駒鱀粵褶皀皁莢皃鎛皈皌皐皒硃皕皖皘皜皝皞皤皦皨皪皫皭糙綻皴皸皻皽盅盋盌盍盚盝踞盦盩鞦韆盬盭眦睜瞤盯盱眙裰盵盻睞眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎睏睒睖睙睟睠睢睥睪睪睯睽睾瞇瞈瞋瞍逛瞏瞕瞖瞘瞜瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽闍瞿矓矉矍鑠矔矗矙矚矞矟矠矣矧矬矯矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硤硨磲茚鋇硭硻硾碃碉碏碣碓碔碞碡碪碫碬碭碯碲碸碻礡磈磉磎磑磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻磽礀礄礅礌礐礚礜礞礤礧礮礱礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼餌臠錮禂禇禋禑禔禕隋禖禘禚禜禝禠禡禢禤禥禨禫禰禴禸稈秈秊闈颯秌秏秕笈蘵賃秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬稭稲稹稼顙稾穂穄穇穈穉穋穌貯穏穜穟穠穡穣穤穧穨穭穮穵穸窿闃窀窂窅窆窈窕窊窋窌窒窓窔窞窣窬黷蹙窰窳窴窵窶窸窻竁竃竈竑竜竝竦竪篦篾笆鮫竾笉笊笎笏笐靨笓笤籙笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦筧筩筭筯筰筱筳筴讌筸箂箇箊箎箑箒箘箙箛箜篌箝箠箬鏃箯箴箾篁篔簹篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲篳篴篶篹篼簀簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊籐籒籓籔籖籚籛籜籣籥籧籩籪籫籯芾麴籵籸籹籼粁粃粋粑粔糲粛粞粢粧粨粲粳粺粻粽闢粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬糭糯糱糴糶糸糺紃蹼鰹黴紆紈絝紉閩襻紑紕紘錠鳶鷂紝紞紟紥紩紬紱紲紵紽紾紿絁絃絅経絍絎絏縭褵絓絖絘絜絢絣螯絪絫聒絰絵絶絺絻絿綀綃綅綆綈綉綌綍綎綑綖綘継続緞綣綦綪綫綮綯綰罟蝽綷縩綹綾緁緄緅緆緇緋緌緎総緑緔緖緗緘緙緜緡緤緥緦纂緪緰緱緲緶緹縁縃縄縈縉縋縏縑縕縗縚縝縞縟縠縡縢縦縧縯縰騁縲縳縴縵縶縹縻衙縿繄繅繈繊繋繐繒繖繘繙繠繢繣繨繮繰繸繻繾纁纆纇纈纉纊纑纕纘纙纚纛缾罃罆罈罋罌罎罏罖罘罛罝罠罣罥罦罨罫罭鍰罳罶罹罻罽罿羂羃羇羋蕉51鴕羑羖羗羜羝羢羣羥羧羭羮羰羱羵羶羸藜鮐翀翃翄翊翌翏翕翛翟翡翣翥翦躚翪翫翬翮翯翺翽翾翿闆饕鴰鍁耋耇耎耏耑耒耜耔耞耡耤耨耩耪耬耰鬢耵聹聃聆聎聝聡聦聱聴聶聼閾聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠銓胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臢腍腒腓腖腜腠腡腥腧腬腯踝蹬鐐腴腶蠕誹膂膃膆膇膋膔膕膗膙膟黐膣膦膫膰膴膵膷膾臃臄臇臈臌臐臑臓臕臖臙臛臝臞臧蓐詡臽臾臿舀舁鰟鮍舋舎舔舗舘舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣艤艨艩艫艬艭荏艴艶艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鰱芴芷芸蕘豢芼芿苄苒苘苙苜蓿苠苡苣蕒苤苧苪鎊苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鴞荍荑荘荳荵荸薺莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔菕菘菝菡菢菣菥蓂菧菫轂鎣菶菷菹醢菺菻菼菾萅萆萇萋萏萐萑萜萩萱萴萵萹萻葇葍葎葑葒葖葙葠葥葦葧葭葯葳葴葶葸葹葽蒄蒎蒓蘢薹蒞蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽蓀蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫蓽跣藕蓯蓰蓱蓴蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蔞蔟鍔蔣雯蔦蔯蔳蔴蔵蔸蔾蕁蕆蕋蕍蕎蕐蕑蕓蕕蕖蕗蕝蕞蕠蕡蕢蕣蕤蕨蕳蕷蕸蕺蕻薀薁薃薅薆薈薉薌薏薐薔薖薘薙諤釵薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋藎藐藙藚藟藦藳藴藶藷藾蘀蘁蘄蘋蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虯虰蛵虵虷鱒虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛺蛻螫蜅蜆蜈蝣蜋蜍蜎蜑蠊蜛餞蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鱝蝡蝤蝥蝯蝰蝱蝲蝴蝻螃蠏螄螉螋螒螓螗螘螙螚蟥螟螣螥螬螭螮螾螿蟀蟅蟈蟊蟋蟑蟓蟛蟜蟟蟢蟣蟨蟪蟭蟯蟳蟶蟷蟺蟿蠁蠂蠃蠆蠋蠐蠓蠔蠗蠙蠚蠛蠜蠧蠨蠩蠭蠮蠰蠲蠵蠸蠼蠽衁衂衄衇衈衉衋衎衒衕衖衚衞裳鈎衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉裊裋裌裍裎裒裛裯裱裲裴裾褀褂褉褊褌褎褐褒褓褔褕褘褚褡褢褦褧褪褫褭褯褰褱襠褸褽褾襁襃襆襇襉襋襌襏襚襛襜襝襞襡襢襤襦襫襬襭襮襴襶襼襽襾覂覃覅覇覉覊覌覗覘覚覜覥覦覧覩覬覯覰観覿觔觕觖觜觽觝觡酲觩
觫觭觱觳觶觷觼觾觿言賅訃訇訏訑訒詁託訧訬訳訹証訾詀詅詆譭詈詊詎詑詒詖詗詘詧詨詵詶詸詹詻詼詿誂誃誄鋤誆誋誑誒誖誙誚誥誧説読誯誶誾諂諄諆諌諍諏諑諕諗諛諝諞諟諠諡諴諵諶諼謄謆謇謌謍謏謑謖謚謡謦謪謫謳謷謼謾譁譅譆譈譊譌譒譔譖鑫譞譟譩譫譬譱譲譴譸譹譾讅讆讋讌讎讐讒讖讙讜讟谽豁豉豇豈豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆貍貎貔貘貙貜貤饜貰餸貺賁賂賏賒賕賙賝賡賧賨賫鬭賮賵賸賺賻賾贇贉贐贔贕贗赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趲趴趵趷趹趺趿跁跂跅跆躓蹌跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇躕踠踡踣踤踥踦踧蹺踫踮踰踱踴踶踹踺踼踽躞蹁蹂躪蹎蹐蹓蹔蹕蹚蹜蹝蹟蹠蹡蹢躂蹧蹩蹪蹯鞠蹽躃躄躅躊躋躐躑躒躘躙躛躝躠躡躦躧躩躭躰躳躶軃軆輥軏軔軘軜軝齶転軥軨軭軱軲轆軷軹軺軽軿輀輂輦輅輇輈輓輗輙輜輞輠輤輬輭輮輳輴輵輶輹輼輾轀轇轏轑轒轔轕轖轗轘轙轝轞轢轤辠辢辤辵辶辺込辿迅迋迍麿迓迣迤邐迥迨迮迸迺迻迿逄逅逌逍逑逓逕逖逡逭逯逴逶逹遄遅遉遘遛遝遢遨遫遯遰遴遶遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯鄲邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郟郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄鄆鄇鄈鄋鄍鄎鄏鄐鄑鄒鄔鄕鄖鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱鄶鄷鄹鄺鄻鄾鄿酃酅酆酇酈酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝醞醡醤醨醪醭醯醰醱醲醴醵醸醹醼醽醾釂釃釅釆釈鱸鎦閶釓釔釕鈀釙鼢鼴釤釧釪釬釭釱釷釸釹鈁鈃鈄鈆鈇鈈鈊鈌鈐鈑鈒鈤鈥鈧鈬鈮鈰鈳鐺鈸鈹鈽鈿鉄鉆鉈鉋鉌鉍鉏鉑鉕鉚鉢鉥鉦鉨鉬鉭鉱鉲鉶鉸鉺鉼鉿銍銎銑銕鏤銚銛銠銣銤銥銦銧銩銪銫銭銰銲銶銻銼銾鋂鋃鋆鋈鋊鋌鋍鋏鋐鋑鋕鋘鋙鋝鋟鋦鋨鋩鋭鋮鋯鋰鋱鋳鋹鋺鋻鏰鐱錀錁錆錇錈錍錏錒錔錙錚錛錞錟錡錤錩錬録錸錼鍀鍆鍇鍉鍍鍏鍐鍘鍚鍛鍠鍤鍥鍩鍫鍭鍱鍴鍶鍹鍺鍼鍾鎄鎇鎉鎋鎌鎍鎏鎒鎓鎗鎘鎚鎞鎡鎤鎩鎪鎭鎯鎰鎳鎴鎵鎸鎹鎿鏇鏊鏌鏐鏑鏖鏗鏘鏚鏜鏝鏞鏠鏦鏨鏷鏸鏹鏻鏽鏾鐃鐄鐇鐏鐒鐓鐔鐗馗鐙鐝鐠鐡鐦鐨鐩鐫鐬鐱鐳鐶鐻鐽鐿鑀鑅鑌鑐鑕鑚鑛鑢鑤鑥鑪鑭鑯鑱鑴鑵鑷钁钃镻閆閈閌閎閒閔閗閟閡関閤閤閧閬閲閹閺閻閼閽閿闇闉闋闐闑闒闓闘闚闞闟闠闤闥阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬騭陴険陼陾隂隃隈隒隗隞隠隣隤隩隮隰顴隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驛霂霅霈霊霑霒霓霙霝霢霣霤霨霩霪霫霮靁靆靉靑靚靣靦靪靮靰靳靷靸靺靼靿鞀鞃鞄鞌鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾韃韅韉馱韍韎韔韖韘韝韞韡韣韭韮韱韹韺頀颳頄頇頊頍頎頏頒頖頞頠頫頬顱頯頲頴頼顇顋顑顒顓顔顕顚顜顢顣顬顳颭颮颱颶颸颺颻颽颾颿飀飂飈飌飜飡飣飤飥飩飫飮飱飶餀餂餄餎餇餈餑餔餕餖餗餚餛餜餟餠餤餧餩餪餫餬餮餱餲餳餺餻餼餽餿饁饅饇饉饊饍饎饐饘饟饢馘馥馝馡馣騮騾馵馹駃駄駅駆駉駋駑駓駔駗駘駙駜駡駢駪駬駰駴駸駹駽駾騂騄騅騆騉騋騍騏驎騑騒験騕騖騠騢騣騤騧驤騵騶騸騺驀驂驃驄驆驈驊驌驍驎驏驒驔驖驙驦驩驫骺鯁骫骭骯骱骴骶骷髏骾髁髂髄髆髈髐髑髕髖髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣鬪鬫鬬鬮鬯鬰鬲鬵鬷魆魈魊魋魍魎魑魖鰾魛魟魣魦魨魬魴魵魸鮀鮁鮆鮌鮎鮑鮒鮓鮚鮞鮟鱇鮠鮦鮨鮪鮭鮶鮸鮿鯀鯄鯆鯇鯈鯔鯕鯖鯗鯙鯠鯤鯥鯫鯰鯷鯸鯿鰂鰆鶼鰉鰋鰐鰒鰕鰛鰜鰣鰤鰥鰦鰨鰩鰮鰳鰶鰷鱺鰼鰽鱀鱄鱅鱆鱈鱎鱐鱓鱔鱖鱘鱟鱠鱣鱨鱭鱮鱲鱵鱻鲅鳦鳧鳯鳲鳷鳻鴂鴃鴄鴆鴈鴎鴒鴔鴗鴛鴦鴝鵒鴟鴠鴢鴣鴥鴯鶓鴳鴴鴷鴽鵀鵁鵂鵓鵖鵙鵜鶘鵞鵟鵩鵪鵫鵵鵷鵻鵾鶂鶊鶏鶒鶖鶗鶡鶤鶦鶬鶱鶲鶵鶸鶹鶺鶿鷀鷁鷃鷄鷇鷈鷉鷊鷏鷓鷕鷖鷙鷞鷟鷥鷦鷯鷩鷫鷭鷳鷴鷽鷾鷿鸂鸇鸊鸏鸑鸒鸓鸕鸛鸜鸝鹸鹹鹺麀麂麃麄麇麋麌麐麑麒麚麛麝麤麩麪麫麮麯麰麺麾黁黈黌黢黒黓黕黙黝黟黥黦黧黮黰黱黲黶黹黻黼黽黿鼂鼃鼅鼈鼉鼏鼐鼒鼕鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌齎齏齔齕齗齙齚齜齞齟齬齠齢齣齧齩齮齯齰齱齵齾龎龑龒龔龖龘龝龡龢龤' + +assert len(simplified_charcters) == len(simplified_charcters) + +s2t_dict = {} +t2s_dict = {} +for i, item in enumerate(simplified_charcters): + s2t_dict[item] = traditional_characters[i] + t2s_dict[traditional_characters[i]] = item + + +def tranditional_to_simplified(text: str) -> str: + return "".join( + [t2s_dict[item] if item in t2s_dict else item for item in text]) + + +def simplified_to_traditional(text: str) -> str: + return "".join( + [s2t_dict[item] if item in s2t_dict else item for item in text]) + + +if __name__ == "__main__": + text = "一般是指存取一個應用程式啟動時始終顯示在網站或網頁瀏覽器中的一個或多個初始網頁等畫面存在的站點" + print(text) + text_simple = tranditional_to_simplified(text) + print(text_simple) + text_traditional = simplified_to_traditional(text_simple) + print(text_traditional) diff --git a/text/zh_normalization/chronology.py b/text/zh_normalization/chronology.py new file mode 100644 index 0000000000000000000000000000000000000000..ea4558e2a7abba4ff454656b82e67b3f5c483bf2 --- /dev/null +++ b/text/zh_normalization/chronology.py @@ -0,0 +1,134 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
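A minimal round-trip sketch for the conversion helpers defined in text/zh_normalization/char_convert.py above, placed here for orientation before the chronology rules continue. It is only an illustration: the sample string is arbitrary, the claimed outputs hold only if the two character tables stay index-aligned, and the import assumes the repository root is on sys.path (matching the absolute import style used in text/zh_normalization/__init__.py).

# Illustrative round-trip over the index-aligned character tables in char_convert.py.
# (The upstream helper keeps the "tranditional_to_simplified" spelling.)
from text.zh_normalization.char_convert import (
    simplified_to_traditional,
    tranditional_to_simplified,
)

trad = "漢語轉換"                      # arbitrary sample, not from the diff
simp = tranditional_to_simplified(trad)   # e.g. "汉语转换" if the tables are aligned
back = simplified_to_traditional(simp)    # maps back through the reverse table
print(trad, simp, back)                   # characters missing from the tables pass through unchanged
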
+import re + +from .num import DIGITS +from .num import num2str +from .num import verbalize_cardinal +from .num import verbalize_digit + + +def _time_num2str(num_string: str) -> str: + """A special case for verbalizing number in time.""" + result = num2str(num_string.lstrip('0')) + if num_string.startswith('0'): + result = DIGITS['0'] + result + return result + + +# 时刻表达式 +RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])' + r':([0-5][0-9])' + r'(:([0-5][0-9]))?') + +# 时间范围,如8:30-12:30 +RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' + r':([0-5][0-9])' + r'(:([0-5][0-9]))?' + r'(~|-)' + r'([0-1]?[0-9]|2[0-3])' + r':([0-5][0-9])' + r'(:([0-5][0-9]))?') + + +def replace_time(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + + is_range = len(match.groups()) > 5 + + hour = match.group(1) + minute = match.group(2) + second = match.group(4) + + if is_range: + hour_2 = match.group(6) + minute_2 = match.group(7) + second_2 = match.group(9) + + result = f"{num2str(hour)}点" + if minute.lstrip('0'): + if int(minute) == 30: + result += "半" + else: + result += f"{_time_num2str(minute)}分" + if second and second.lstrip('0'): + result += f"{_time_num2str(second)}秒" + + if is_range: + result += "至" + result += f"{num2str(hour_2)}点" + if minute_2.lstrip('0'): + if int(minute) == 30: + result += "半" + else: + result += f"{_time_num2str(minute_2)}分" + if second_2 and second_2.lstrip('0'): + result += f"{_time_num2str(second_2)}秒" + + return result + + +RE_DATE = re.compile(r'(\d{4}|\d{2})年' + r'((0?[1-9]|1[0-2])月)?' + r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?') + + +def replace_date(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + year = match.group(1) + month = match.group(3) + day = match.group(5) + result = "" + if year: + result += f"{verbalize_digit(year)}年" + if month: + result += f"{verbalize_cardinal(month)}月" + if day: + result += f"{verbalize_cardinal(day)}{match.group(9)}" + return result + + +# 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 +RE_DATE2 = re.compile( + r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])') + + +def replace_date2(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + year = match.group(1) + month = match.group(3) + day = match.group(4) + result = "" + if year: + result += f"{verbalize_digit(year)}年" + if month: + result += f"{verbalize_cardinal(month)}月" + if day: + result += f"{verbalize_cardinal(day)}日" + return result diff --git a/text/zh_normalization/constants.py b/text/zh_normalization/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..6423ad74a5c9a92df35aebf04bebdf4050a292de --- /dev/null +++ b/text/zh_normalization/constants.py @@ -0,0 +1,62 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
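For orientation, a minimal sketch of how the date/time patterns from chronology.py above can be applied on their own; in this PR they are actually wired together by TextNormalizer (text_normlization.py, further down in the diff). The sample sentence is made up, and the import again assumes the repository root is importable as a package.

# Stand-alone use of the chronology substitutions defined above.
from text.zh_normalization.chronology import RE_DATE, RE_TIME, replace_date, replace_time

text = "她出生于1995年3月1日,请在12:05通知我"
text = RE_DATE.sub(replace_date, text)   # 1995年3月1日 -> 一九九五年三月一日
text = RE_TIME.sub(replace_time, text)   # 12:05 -> 十二点零五分
print(text)
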
+import re +import string + +from pypinyin.constants import SUPPORT_UCS4 + +# 全角半角转换 +# 英文字符全角 -> 半角映射表 (num: 52) +F2H_ASCII_LETTERS = { + ord(char) + 65248: ord(char) + for char in string.ascii_letters +} + +# 英文字符半角 -> 全角映射表 +H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} + +# 数字字符全角 -> 半角映射表 (num: 10) +F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits} +# 数字字符半角 -> 全角映射表 +H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} + +# 标点符号全角 -> 半角映射表 (num: 32) +F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation} +# 标点符号半角 -> 全角映射表 +H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} + +# 空格 (num: 1) +F2H_SPACE = {'\u3000': ' '} +H2F_SPACE = {' ': '\u3000'} + +# 非"有拼音的汉字"的字符串,可用于NSW提取 +if SUPPORT_UCS4: + RE_NSW = re.compile(r'(?:[^' + r'\u3007' # 〇 + r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] + r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] + r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] + r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF] + r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F] + r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D] + r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F] + r'])+') +else: + RE_NSW = re.compile( # pragma: no cover + r'(?:[^' + r'\u3007' # 〇 + r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] + r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] + r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] + r'])+') diff --git a/text/zh_normalization/num.py b/text/zh_normalization/num.py new file mode 100644 index 0000000000000000000000000000000000000000..8ef7f48f765ea294ea11b82339746f2e370a0484 --- /dev/null +++ b/text/zh_normalization/num.py @@ -0,0 +1,253 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Rules to verbalize numbers into Chinese characters. 
+https://zh.wikipedia.org/wiki/中文数字#現代中文 +""" +import re +from collections import OrderedDict +from typing import List + +DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')} +UNITS = OrderedDict({ + 1: '十', + 2: '百', + 3: '千', + 4: '万', + 8: '亿', +}) + +COM_QUANTIFIERS = '(封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)' + +# 分数表达式 +RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') + + +def replace_frac(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + sign = match.group(1) + nominator = match.group(2) + denominator = match.group(3) + sign: str = "负" if sign else "" + nominator: str = num2str(nominator) + denominator: str = num2str(denominator) + result = f"{sign}{denominator}分之{nominator}" + return result + + +# 百分数表达式 +RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%') + + +def replace_percentage(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + sign = match.group(1) + percent = match.group(2) + sign: str = "负" if sign else "" + percent: str = num2str(percent) + result = f"{sign}百分之{percent}" + return result + + +# 整数表达式 +# 带负号的整数 -10 +RE_INTEGER = re.compile(r'(-)' r'(\d+)') + + +def replace_negative_num(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + sign = match.group(1) + number = match.group(2) + sign: str = "负" if sign else "" + number: str = num2str(number) + result = f"{sign}{number}" + return result + + +# 编号-无符号整形 +# 00078 +RE_DEFAULT_NUM = re.compile(r'\d{3}\d*') + + +def replace_default_num(match): + """ + Args: + match (re.Match) + Returns: + str + """ + number = match.group(0) + return verbalize_digit(number, alt_one=True) + + +# 数字表达式 +# 纯小数 +RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))') +# 正整数 + 量词 +RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" 
+ COM_QUANTIFIERS) +RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))') + + +def replace_positive_quantifier(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + number = match.group(1) + match_2 = match.group(2) + if match_2 == "+": + match_2 = "多" + match_2: str = match_2 if match_2 else "" + quantifiers: str = match.group(3) + number: str = num2str(number) + result = f"{number}{match_2}{quantifiers}" + return result + + +def replace_number(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + sign = match.group(1) + number = match.group(2) + pure_decimal = match.group(5) + if pure_decimal: + result = num2str(pure_decimal) + else: + sign: str = "负" if sign else "" + number: str = num2str(number) + result = f"{sign}{number}" + return result + + +# 范围表达式 +# match.group(1) and match.group(8) are copy from RE_NUMBER + +RE_RANGE = re.compile( + r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))') + + +def replace_range(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + first, second = match.group(1), match.group(8) + first = RE_NUMBER.sub(replace_number, first) + second = RE_NUMBER.sub(replace_number, second) + result = f"{first}到{second}" + return result + + +# ~至表达式 +RE_TO_RANGE = re.compile( + r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)') + +def replace_to_range(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + result = match.group(0).replace('~', '至') + return result + + +def _get_value(value_string: str, use_zero: bool=True) -> List[str]: + stripped = value_string.lstrip('0') + if len(stripped) == 0: + return [] + elif len(stripped) == 1: + if use_zero and len(stripped) < len(value_string): + return [DIGITS['0'], DIGITS[stripped]] + else: + return [DIGITS[stripped]] + else: + largest_unit = next( + power for power in reversed(UNITS.keys()) if power < len(stripped)) + first_part = value_string[:-largest_unit] + second_part = value_string[-largest_unit:] + return _get_value(first_part) + [UNITS[largest_unit]] + _get_value( + second_part) + + +def verbalize_cardinal(value_string: str) -> str: + if not value_string: + return '' + + # 000 -> '零' , 0 -> '零' + value_string = value_string.lstrip('0') + if len(value_string) == 0: + return DIGITS['0'] + + result_symbols = _get_value(value_string) + # verbalized number starting with '一十*' is abbreviated as `十*` + if len(result_symbols) >= 2 and result_symbols[0] == DIGITS[ + '1'] and result_symbols[1] == UNITS[1]: + result_symbols = result_symbols[1:] + return ''.join(result_symbols) + + +def verbalize_digit(value_string: str, alt_one=False) -> str: + result_symbols = [DIGITS[digit] for digit in value_string] + result = ''.join(result_symbols) + if alt_one: + result = result.replace("一", "幺") + return result + + +def num2str(value_string: str) -> str: + integer_decimal = value_string.split('.') + if len(integer_decimal) == 1: + integer = integer_decimal[0] + decimal = '' + elif len(integer_decimal) == 2: + integer, decimal = integer_decimal + else: + raise ValueError( + f"The value string: '${value_string}' has more than one point in it." 
+    )
+
+    result = verbalize_cardinal(integer)
+
+    decimal = decimal.rstrip('0')
+    if decimal:
+        # '.22' is verbalized as '零点二二'
+        # '3.20' is verbalized as '三点二'
+        result = result if result else "零"
+        result += '点' + verbalize_digit(decimal)
+    return result
diff --git a/text/zh_normalization/phonecode.py b/text/zh_normalization/phonecode.py
new file mode 100644
index 0000000000000000000000000000000000000000..51835112603c1a8c31052285276f138b8c5a74d6
--- /dev/null
+++ b/text/zh_normalization/phonecode.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+
+from .num import verbalize_digit
+
+# 规范化固话/手机号码
+# 手机
+# http://www.jihaoba.com/news/show/13680
+# 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
+# 联通:130、131、132、156、155、186、185、176
+# 电信:133、153、189、180、181、177
+RE_MOBILE_PHONE = re.compile(
+    r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)")
+RE_TELEPHONE = re.compile(
+    r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")
+# 全国统一的号码400开头
+RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"400(-)?\d{3}(-)?\d{4}")
+
+
+def phone2str(phone_string: str, mobile=True) -> str:
+    if mobile:
+        sp_parts = phone_string.strip('+').split()
+        result = ','.join(
+            [verbalize_digit(part, alt_one=True) for part in sp_parts])
+        return result
+    else:
+        sil_parts = phone_string.split('-')
+        result = ','.join(
+            [verbalize_digit(part, alt_one=True) for part in sil_parts])
+        return result
+
+
+def replace_phone(match) -> str:
+    """
+    Args:
+        match (re.Match)
+    Returns:
+        str
+    """
+    return phone2str(match.group(0), mobile=False)
+
+
+def replace_mobile(match) -> str:
+    """
+    Args:
+        match (re.Match)
+    Returns:
+        str
+    """
+    return phone2str(match.group(0))
diff --git a/text/zh_normalization/quantifier.py b/text/zh_normalization/quantifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..598030e439bd2dd419900892547a550a5df96482
--- /dev/null
+++ b/text/zh_normalization/quantifier.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re + +from .num import num2str + +# 温度表达式,温度会影响负号的读法 +# -3°C 零下三度 +RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') +measure_dict = { + "cm2": "平方厘米", + "cm²": "平方厘米", + "cm3": "立方厘米", + "cm³": "立方厘米", + "cm": "厘米", + "db": "分贝", + "ds": "毫秒", + "kg": "千克", + "km": "千米", + "m2": "平方米", + "m²": "平方米", + "m³": "立方米", + "m3": "立方米", + "ml": "毫升", + "m": "米", + "mm": "毫米", + "s": "秒" +} + + +def replace_temperature(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + sign = match.group(1) + temperature = match.group(2) + unit = match.group(3) + sign: str = "零下" if sign else "" + temperature: str = num2str(temperature) + unit: str = "摄氏度" if unit == "摄氏度" else "度" + result = f"{sign}{temperature}{unit}" + return result + + +def replace_measure(sentence) -> str: + for q_notation in measure_dict: + if q_notation in sentence: + sentence = sentence.replace(q_notation, measure_dict[q_notation]) + return sentence diff --git a/text/zh_normalization/text_normlization.py b/text/zh_normalization/text_normlization.py new file mode 100644 index 0000000000000000000000000000000000000000..b4c14949c3e6d01f931d8ecdd4aa3d5e57b8b89a --- /dev/null +++ b/text/zh_normalization/text_normlization.py @@ -0,0 +1,158 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re +from typing import List + +from .char_convert import tranditional_to_simplified +from .chronology import RE_DATE +from .chronology import RE_DATE2 +from .chronology import RE_TIME +from .chronology import RE_TIME_RANGE +from .chronology import replace_date +from .chronology import replace_date2 +from .chronology import replace_time +from .constants import F2H_ASCII_LETTERS +from .constants import F2H_DIGITS +from .constants import F2H_SPACE +from .num import RE_DECIMAL_NUM +from .num import RE_DEFAULT_NUM +from .num import RE_FRAC +from .num import RE_INTEGER +from .num import RE_NUMBER +from .num import RE_PERCENTAGE +from .num import RE_POSITIVE_QUANTIFIERS +from .num import RE_RANGE +from .num import RE_TO_RANGE +from .num import replace_default_num +from .num import replace_frac +from .num import replace_negative_num +from .num import replace_number +from .num import replace_percentage +from .num import replace_positive_quantifier +from .num import replace_range +from .num import replace_to_range +from .phonecode import RE_MOBILE_PHONE +from .phonecode import RE_NATIONAL_UNIFORM_NUMBER +from .phonecode import RE_TELEPHONE +from .phonecode import replace_mobile +from .phonecode import replace_phone +from .quantifier import RE_TEMPERATURE +from .quantifier import replace_measure +from .quantifier import replace_temperature + + +class TextNormalizer(): + def __init__(self): + self.SENTENCE_SPLITOR = re.compile(r'([:、,;。?!,;?!][”’]?)') + + def _split(self, text: str, lang="zh") -> List[str]: + """Split long text into sentences with sentence-splitting punctuations. + Args: + text (str): The input text. + Returns: + List[str]: Sentences. 
+ """ + # Only for pure Chinese here + if lang == "zh": + text = text.replace(" ", "") + # 过滤掉特殊字符 + text = re.sub(r'[——《》【】<=>{}()()#&@“”^_|\\]', '', text) + text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) + text = text.strip() + sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] + return sentences + + def _post_replace(self, sentence: str) -> str: + sentence = sentence.replace('/', '每') + # sentence = sentence.replace('~', '至') + # sentence = sentence.replace('~', '至') + sentence = sentence.replace('①', '一') + sentence = sentence.replace('②', '二') + sentence = sentence.replace('③', '三') + sentence = sentence.replace('④', '四') + sentence = sentence.replace('⑤', '五') + sentence = sentence.replace('⑥', '六') + sentence = sentence.replace('⑦', '七') + sentence = sentence.replace('⑧', '八') + sentence = sentence.replace('⑨', '九') + sentence = sentence.replace('⑩', '十') + sentence = sentence.replace('α', '阿尔法') + sentence = sentence.replace('β', '贝塔') + sentence = sentence.replace('γ', '伽玛').replace('Γ', '伽玛') + sentence = sentence.replace('δ', '德尔塔').replace('Δ', '德尔塔') + sentence = sentence.replace('ε', '艾普西龙') + sentence = sentence.replace('ζ', '捷塔') + sentence = sentence.replace('η', '依塔') + sentence = sentence.replace('θ', '西塔').replace('Θ', '西塔') + sentence = sentence.replace('ι', '艾欧塔') + sentence = sentence.replace('κ', '喀帕') + sentence = sentence.replace('λ', '拉姆达').replace('Λ', '拉姆达') + sentence = sentence.replace('μ', '缪') + sentence = sentence.replace('ν', '拗') + sentence = sentence.replace('ξ', '克西').replace('Ξ', '克西') + sentence = sentence.replace('ο', '欧米克伦') + sentence = sentence.replace('π', '派').replace('Π', '派') + sentence = sentence.replace('ρ', '肉') + sentence = sentence.replace('ς', '西格玛').replace('Σ', '西格玛').replace( + 'σ', '西格玛') + sentence = sentence.replace('τ', '套') + sentence = sentence.replace('υ', '宇普西龙') + sentence = sentence.replace('φ', '服艾').replace('Φ', '服艾') + sentence = sentence.replace('χ', '器') + sentence = sentence.replace('ψ', '普赛').replace('Ψ', '普赛') + sentence = sentence.replace('ω', '欧米伽').replace('Ω', '欧米伽') + # re filter special characters, have one more character "-" than line 68 + sentence = re.sub(r'[-——《》【】<=>{}()()#&@“”^_|\\]', '', sentence) + return sentence + + def normalize_sentence(self, sentence: str) -> str: + # basic character conversions + sentence = tranditional_to_simplified(sentence) + sentence = sentence.translate(F2H_ASCII_LETTERS).translate( + F2H_DIGITS).translate(F2H_SPACE) + + # number related NSW verbalization + sentence = RE_DATE.sub(replace_date, sentence) + sentence = RE_DATE2.sub(replace_date2, sentence) + + # range first + sentence = RE_TIME_RANGE.sub(replace_time, sentence) + sentence = RE_TIME.sub(replace_time, sentence) + + # 处理~波浪号作为至的替换 + sentence = RE_TO_RANGE.sub(replace_to_range, sentence) + sentence = RE_TEMPERATURE.sub(replace_temperature, sentence) + sentence = replace_measure(sentence) + sentence = RE_FRAC.sub(replace_frac, sentence) + sentence = RE_PERCENTAGE.sub(replace_percentage, sentence) + sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence) + + sentence = RE_TELEPHONE.sub(replace_phone, sentence) + sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence) + + sentence = RE_RANGE.sub(replace_range, sentence) + sentence = RE_INTEGER.sub(replace_negative_num, sentence) + sentence = RE_DECIMAL_NUM.sub(replace_number, sentence) + sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, + sentence) + sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence) + 
sentence = RE_NUMBER.sub(replace_number, sentence) + sentence = self._post_replace(sentence) + + return sentence + + def normalize(self, text: str) -> List[str]: + sentences = self._split(text) + sentences = [self.normalize_sentence(sent) for sent in sentences] + return sentences diff --git a/utils.py b/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7984b5a843b738b46e5770c79392ec293fe83e09 --- /dev/null +++ b/utils.py @@ -0,0 +1,371 @@ +import os +import glob +import sys +import argparse +import logging +import json +import subprocess +import traceback + +import librosa +import numpy as np +from scipy.io.wavfile import read +import torch +import logging + +logging.getLogger("numba").setLevel(logging.ERROR) +logging.getLogger("matplotlib").setLevel(logging.ERROR) + +MATPLOTLIB_FLAG = False + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +logger = logging + + +def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + iteration = checkpoint_dict["iteration"] + learning_rate = checkpoint_dict["learning_rate"] + if ( + optimizer is not None + and not skip_optimizer + and checkpoint_dict["optimizer"] is not None + ): + optimizer.load_state_dict(checkpoint_dict["optimizer"]) + saved_state_dict = checkpoint_dict["model"] + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): + try: + # assert "quantizer" not in k + # print("load", k) + new_state_dict[k] = saved_state_dict[k] + assert saved_state_dict[k].shape == v.shape, ( + saved_state_dict[k].shape, + v.shape, + ) + except: + traceback.print_exc() + print( + "error, %s is not in the checkpoint" % k + ) # shape不对也会,比如text_embedding当cleaner修改时 + new_state_dict[k] = v + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict) + else: + model.load_state_dict(new_state_dict) + print("load ") + logger.info( + "Loaded checkpoint '{}' (iteration {})".format(checkpoint_path, iteration) + ) + return model, optimizer, learning_rate, iteration + +from time import time as ttime +import shutil +def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path + dir=os.path.dirname(path) + name=os.path.basename(path) + tmp_path="%s.pth"%(ttime()) + torch.save(fea,tmp_path) + shutil.move(tmp_path,"%s/%s"%(dir,name)) + +def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): + logger.info( + "Saving model and optimizer state at iteration {} to {}".format( + iteration, checkpoint_path + ) + ) + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + # torch.save( + my_save( + { + "model": state_dict, + "iteration": iteration, + "optimizer": optimizer.state_dict(), + "learning_rate": learning_rate, + }, + checkpoint_path, + ) + + +def summarize( + writer, + global_step, + scalars={}, + histograms={}, + images={}, + audios={}, + audio_sampling_rate=22050, +): + for k, v in scalars.items(): + writer.add_scalar(k, v, global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats="HWC") + for k, v in audios.items(): + writer.add_audio(k, v, global_step, audio_sampling_rate) + + +def latest_checkpoint_path(dir_path, regex="G_*.pth"): + f_list = 
glob.glob(os.path.join(dir_path, regex)) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + x = f_list[-1] + print(x) + return x + + +def plot_spectrogram_to_numpy(spectrogram): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger("matplotlib") + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def plot_alignment_to_numpy(alignment, info=None): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger("matplotlib") + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow( + alignment.transpose(), aspect="auto", origin="lower", interpolation="none" + ) + fig.colorbar(im, ax=ax) + xlabel = "Decoder timestep" + if info is not None: + xlabel += "\n\n" + info + plt.xlabel(xlabel) + plt.ylabel("Encoder timestep") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_wav_to_torch(full_path): + data, sampling_rate = librosa.load(full_path, sr=None) + return torch.FloatTensor(data), sampling_rate + + +def load_filepaths_and_text(filename, split="|"): + with open(filename, encoding="utf-8") as f: + filepaths_and_text = [line.strip().split(split) for line in f] + return filepaths_and_text + + +def get_hparams(init=True, stage=1): + parser = argparse.ArgumentParser() + parser.add_argument( + "-c", + "--config", + type=str, + default="./configs/s2.json", + help="JSON file for configuration", + ) + parser.add_argument( + "-p", "--pretrain", type=str, required=False, default=None, help="pretrain dir" + ) + parser.add_argument( + "-rs", + "--resume_step", + type=int, + required=False, + default=None, + help="resume step", + ) + # parser.add_argument('-e', '--exp_dir', type=str, required=False,default=None,help='experiment directory') + # parser.add_argument('-g', '--pretrained_s2G', type=str, required=False,default=None,help='pretrained sovits gererator weights') + # parser.add_argument('-d', '--pretrained_s2D', type=str, required=False,default=None,help='pretrained sovits discriminator weights') + + args = parser.parse_args() + + config_path = args.config + with open(config_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.pretrain = args.pretrain + hparams.resume_step = args.resume_step + # hparams.data.exp_dir = args.exp_dir + if stage == 1: + model_dir = hparams.s1_ckpt_dir + else: + model_dir = hparams.s2_ckpt_dir + config_save_path = os.path.join(model_dir, "config.json") + + if not os.path.exists(model_dir): + os.makedirs(model_dir) + + with open(config_save_path, "w") as f: + f.write(data) + return hparams + + +def clean_checkpoints(path_to_models="logs/44k/", n_ckpts_to_keep=2, sort_by_time=True): + """Freeing up space 
by deleting saved ckpts + + Arguments: + path_to_models -- Path to the model directory + n_ckpts_to_keep -- Number of ckpts to keep, excluding G_0.pth and D_0.pth + sort_by_time -- True -> chronologically delete ckpts + False -> lexicographically delete ckpts + """ + import re + + ckpts_files = [ + f + for f in os.listdir(path_to_models) + if os.path.isfile(os.path.join(path_to_models, f)) + ] + name_key = lambda _f: int(re.compile("._(\d+)\.pth").match(_f).group(1)) + time_key = lambda _f: os.path.getmtime(os.path.join(path_to_models, _f)) + sort_key = time_key if sort_by_time else name_key + x_sorted = lambda _x: sorted( + [f for f in ckpts_files if f.startswith(_x) and not f.endswith("_0.pth")], + key=sort_key, + ) + to_del = [ + os.path.join(path_to_models, fn) + for fn in (x_sorted("G")[:-n_ckpts_to_keep] + x_sorted("D")[:-n_ckpts_to_keep]) + ] + del_info = lambda fn: logger.info(f".. Free up space by deleting ckpt {fn}") + del_routine = lambda x: [os.remove(x), del_info(x)] + rs = [del_routine(fn) for fn in to_del] + + +def get_hparams_from_dir(model_dir): + config_save_path = os.path.join(model_dir, "config.json") + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_file(config_path): + with open(config_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + return hparams + + +def check_git_hash(model_dir): + source_dir = os.path.dirname(os.path.realpath(__file__)) + if not os.path.exists(os.path.join(source_dir, ".git")): + logger.warn( + "{} is not a git repository, therefore hash value comparison will be ignored.".format( + source_dir + ) + ) + return + + cur_hash = subprocess.getoutput("git rev-parse HEAD") + + path = os.path.join(model_dir, "githash") + if os.path.exists(path): + saved_hash = open(path).read() + if saved_hash != cur_hash: + logger.warn( + "git hash values are different. {}(saved) != {}(current)".format( + saved_hash[:8], cur_hash[:8] + ) + ) + else: + open(path, "w").write(cur_hash) + + +def get_logger(model_dir, filename="train.log"): + global logger + logger = logging.getLogger(os.path.basename(model_dir)) + logger.setLevel(logging.DEBUG) + + formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + h = logging.FileHandler(os.path.join(model_dir, filename)) + h.setLevel(logging.DEBUG) + h.setFormatter(formatter) + logger.addHandler(h) + return logger + + +class HParams: + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() + + +if __name__ == "__main__": + print( + load_wav_to_torch( + "/home/fish/wenetspeech/dataset_vq/Y0000022499_wHFSeHEx9CM/S00261.flac" + ) + )
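
A minimal usage sketch for the zh_normalization pipeline added in this patch (not part of the diff itself). It assumes the repository root is on PYTHONPATH and that the sibling modules imported by text_normlization.py (char_convert, chronology, constants) are present in text/zh_normalization/, as in the full package; outputs are only described, not asserted.

# Sanity-check sketch for the text normalization files added above.
# Assumptions: repo root on PYTHONPATH; text/zh_normalization/ also contains
# char_convert.py, chronology.py and constants.py (imported by TextNormalizer).
from text.zh_normalization.text_normlization import TextNormalizer
from text.zh_normalization.num import num2str

tn = TextNormalizer()

# Telephone numbers (RE_TELEPHONE), temperatures and ~ ranges (RE_TEMPERATURE,
# RE_TO_RANGE), fractions (RE_FRAC) and percentages (RE_PERCENTAGE) are all
# rewritten as Chinese words; normalize() returns one string per sentence.
for sent in tn.normalize("固话0595-23865596,气温-3°C~5°C,完成了3/4,涨幅2.5%。"):
    print(sent)

# num2str alone verbalizes plain integers and decimals, e.g. '3.20' -> '三点二'
# (trailing zeros in the decimal part are stripped before verbalization).
print(num2str("10086"), num2str("3.20"))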