diff --git a/__pycache__/constants.cpython-310.pyc b/__pycache__/constants.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c12328be90805b2fd881569dfeccb69387bf8bb Binary files /dev/null and b/__pycache__/constants.cpython-310.pyc differ diff --git a/__pycache__/constants.cpython-38.pyc b/__pycache__/constants.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c46119fd7aabaf6ff4fed338e858326bd1b91657 Binary files /dev/null and b/__pycache__/constants.cpython-38.pyc differ diff --git a/__pycache__/handler.cpython-38.pyc b/__pycache__/handler.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9efa1bee704f7433f7eb69bf846528f5a82d5bf Binary files /dev/null and b/__pycache__/handler.cpython-38.pyc differ diff --git a/__pycache__/serve.cpython-310.pyc b/__pycache__/serve.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bdb2da8532c7b9b19bb8e5b83b0b6fc11ea743e7 Binary files /dev/null and b/__pycache__/serve.cpython-310.pyc differ diff --git a/__pycache__/serve.cpython-38.pyc b/__pycache__/serve.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..428052441d1370912516e4e96281f91f0946e555 Binary files /dev/null and b/__pycache__/serve.cpython-38.pyc differ diff --git a/__pycache__/server.cpython-38.pyc b/__pycache__/server.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38c18b9b566f075e50202a75a39583e320488d61 Binary files /dev/null and b/__pycache__/server.cpython-38.pyc differ diff --git a/__pycache__/try.cpython-310.pyc b/__pycache__/try.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..533890621d2a9de4339d90608e45ded905d2b12f Binary files /dev/null and b/__pycache__/try.cpython-310.pyc differ diff --git a/__pycache__/utils.cpython-310.pyc b/__pycache__/utils.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..2f35790f2bb306ffa130589fcb22b3aeadb85ab5 Binary files /dev/null and b/__pycache__/utils.cpython-310.pyc differ diff --git a/__pycache__/utils.cpython-38.pyc b/__pycache__/utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b1bc4255ebad0734065ae98124c4a621af58aec8 Binary files /dev/null and b/__pycache__/utils.cpython-38.pyc differ diff --git a/config-model.yaml b/config-model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..687f664d6696df42fc0900a0b2aeca2569e941eb --- /dev/null +++ b/config-model.yaml @@ -0,0 +1,12 @@ +gpt: + num_autoregressive_samples: 16 + top_p: 0.8 + temperature: 0.8 + length_penalty: 1 + max_mel_tokens: 500 + repetition_penalty: 2.0 + autoregressive_batch_size: 16 +clvp: + k: 1 +diffusion: + diffusion_temperature: 1.0 diff --git a/constants.py b/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..c3166a0b18f2d5e64bec0ff4eace0f9413c69cfb --- /dev/null +++ b/constants.py @@ -0,0 +1,12 @@ +NUM_AUTOREGRESSIVE_SAMPLES = "num_autoregressive_samples" +TOP_P = "top_p" +TEMPERATURE = "temperature" +LENGTH_PENALTY = "length_penalty" +REPETITION_PENALTY = "repetition_penalty" +MAX_MEL_TOKENS = "max_mel_tokens" +AUTO_REGRESSIVE_BATCH_SIZE = "autoregressive_batch_size" +DIFFUSION_TEMPERATURE = "diffusion_temperature" +# MODELS +GPT = "gpt" +CLVP_const = "clvp" +DIFFUSION = "diffusion" diff --git a/handler.py b/handler.py new file mode 100644 index 0000000000000000000000000000000000000000..05d2c82b5ae831af4997755090d714e7a0b8db8b --- /dev/null +++ b/handler.py @@ -0,0 +1,479 @@ +import base64 +import hashlib +from io import BytesIO +import random +import torch +import torch.nn.functional as F +import torchaudio +from copy import copy +from datetime import datetime +from fastapi import FastAPI +from fastapi.responses import FileResponse +from pathlib import Path +from pydantic import BaseModel + +from time import time +from 
from typing import Any, Dict, List, Text, Tuple

from constants import (
    AUTO_REGRESSIVE_BATCH_SIZE,
    DIFFUSION,
    DIFFUSION_TEMPERATURE,
    GPT,
    LENGTH_PENALTY,
    MAX_MEL_TOKENS,
    NUM_AUTOREGRESSIVE_SAMPLES,
    REPETITION_PENALTY,
    TEMPERATURE,
    TOP_P,
    CLVP_const,
)
from ruth_tts_transformer.models.autoregressive import UnifiedVoice
from ruth_tts_transformer.models.clvp import CLVP
from ruth_tts_transformer.models.diffusion_decoder import DiffusionTts
from ruth_tts_transformer.models.vocoder import UnivNetGenerator
from ruth_tts_transformer.utils.audio import load_voice
from ruth_tts_transformer.utils.tokenizer import VoiceBpeTokenizer
from ruth_tts_transformer.utils.wav2vec_alignment import Wav2VecAlignment
from utils import (
    MODELS_DIR,
    get_config_file,
    get_model_path,
    load_discrete_vocoder_diffuser,
)

app = FastAPI()

# Seed used when a request does not carry one (same value as ``Item.seed``).
DEFAULT_SEED = 3
# Sample rate passed to ``torchaudio.save`` for every generated clip.
OUTPUT_SAMPLE_RATE = 24000


def _seed_everything(seed=None):
    """Seed ``torch`` and Python's ``random`` and return the seed used.

    When *seed* is ``None`` the current Unix time, truncated to whole
    seconds, is used instead.
    """
    seed = int(time()) if seed is None else seed
    torch.manual_seed(seed)
    random.seed(seed)
    return seed


class Item(BaseModel):
    """Request schema: text to speak, voice name and RNG seed."""

    text: str
    voice: str
    seed: int = DEFAULT_SEED


class Gpt:
    """Wrapper around the autoregressive ``UnifiedVoice`` model.

    The constructor loads ``autoregressive.pth`` from ``MODELS_DIR`` and moves
    the model to CUDA; the sampling hyper-parameters are stored on the
    instance and forwarded to ``inference_speech``.
    """

    def __init__(
        self,
        num_autoregressive_samples: int,
        top_p: float,
        temperature: float,
        length_penalty: int,
        repetition_penalty: float,
        max_mel_tokens: int,
        autoregressive_batch_size: int,
    ):
        self.num_autoregressive_samples = num_autoregressive_samples
        self.top_p = top_p
        self.temperature = temperature
        self.length_penalty = length_penalty
        self.repetition_penalty = repetition_penalty
        self.max_mel_tokens = max_mel_tokens
        self.autoregressive_batch_size = autoregressive_batch_size
        self.gpt = (
            UnifiedVoice(
                max_mel_tokens=604,
                max_text_tokens=402,
                max_conditioning_inputs=2,
                layers=30,
                model_dim=1024,
                heads=16,
                number_text_tokens=255,
                start_text_token=255,
                checkpointing=False,
                train_solo_embeddings=False,
            )
            .cpu()
            .eval()
        )
        self.gpt.load_state_dict(
            torch.load(get_model_path("autoregressive.pth", MODELS_DIR))
        )
        self.gpt = self.gpt.to("cuda")

    def __num_batches(self):
        """Return how many sampling batches ``parse_inference`` runs.

        NOTE(review): this is a floor division, so a configuration with
        ``num_autoregressive_samples < autoregressive_batch_size`` yields zero
        batches and an empty sample list.
        """
        return self.num_autoregressive_samples // self.autoregressive_batch_size

    @staticmethod
    def deterministic_state(seed=None):
        """Seed the RNGs; see ``_seed_everything``."""
        return _seed_everything(seed)

    def parse(self, auto_conditioning, text_tokens, best_results, seed, k=1):
        """Run the model with ``return_latent=True`` on the selected codes.

        Inputs are shallow-copied and moved to CUDA. The forward pass runs
        under ``torch.no_grad()`` because this is inference only and the
        autograd graph would otherwise be kept alive with the result.

        Returns whatever ``UnifiedVoice.__call__`` returns for
        ``return_latent=True`` (presumably the latents consumed by the
        diffusion stage — confirm against ``UnifiedVoice``).
        """
        self.deterministic_state(seed=seed)
        auto_conditioning = copy(auto_conditioning).to("cuda")
        text_tokens = copy(text_tokens).to("cuda")
        best_results = copy(best_results).to("cuda")
        with torch.no_grad():
            best_latents = self.gpt(
                auto_conditioning.repeat(k, 1),
                text_tokens.repeat(k, 1),
                torch.tensor([text_tokens.shape[-1]], device=text_tokens.device),
                best_results,
                torch.tensor(
                    [best_results.shape[-1] * self.gpt.mel_length_compression],
                    device=text_tokens.device,
                ),
                return_latent=True,
                clip_inputs=False,
            )
        return best_latents

    def parse_inference(
        self, auto_conditioning: torch.Tensor, text_tokens: torch.Tensor, seed
    ) -> Tuple[List[torch.Tensor], int]:
        """Sample candidate code sequences from the autoregressive model.

        Runs ``inference_speech`` once per batch and right-pads every batch
        along dimension 1 to ``max_mel_tokens`` with the model's
        ``stop_mel_token``.

        Returns:
            ``(samples, stop_mel_token)`` where ``samples`` holds one padded
            tensor per batch.
        """
        self.deterministic_state(seed=seed)
        auto_conditioning = copy(auto_conditioning).to("cuda")
        text_tokens = copy(text_tokens).to("cuda")
        samples = []
        with torch.no_grad():
            for _ in range(self.__num_batches()):
                codes = self.gpt.inference_speech(
                    auto_conditioning,
                    text_tokens,
                    do_sample=True,
                    top_p=self.top_p,
                    temperature=self.temperature,
                    num_return_sequences=self.autoregressive_batch_size,
                    length_penalty=self.length_penalty,
                    repetition_penalty=self.repetition_penalty,
                    max_generate_length=self.max_mel_tokens,
                )
                padding_needed = self.max_mel_tokens - codes.shape[1]
                codes = F.pad(codes, (0, padding_needed), value=self.gpt.stop_mel_token)
                samples.append(codes)

        return samples, self.gpt.stop_mel_token


class clvp:
    """Wrapper around the ``CLVP`` model used to rank sampled candidates.

    Loads ``clvp2.pth`` from ``MODELS_DIR`` and moves the model to CUDA.
    ``K`` is the number of top-scoring candidates ``parse`` keeps.
    """

    def __init__(self, K):
        self.clvp = (
            CLVP(
                dim_text=768,
                dim_speech=768,
                dim_latent=768,
                num_text_tokens=256,
                text_enc_depth=20,
                text_seq_len=350,
                text_heads=12,
                num_speech_tokens=8192,
                speech_enc_depth=20,
                speech_heads=12,
                speech_seq_len=430,
                use_xformers=True,
            )
            .cpu()
            .eval()
        )
        self.clvp.load_state_dict(torch.load(get_model_path("clvp2.pth", MODELS_DIR)))
        self.clvp.to("cuda")
        self.K = K

    @staticmethod
    def fix_gpt_output(codes, stop_token, complain=True):
        """Overwrite the stop token and everything after it, in place.

        If *codes* contains no *stop_token* it is returned untouched (with a
        printed warning when *complain* is true). Otherwise every position
        from the first stop token onwards is set to 83 and the last three
        positions are set to 45, 45, 248.

        NOTE(review): 83 equals ``EndpointHandler.calm_token``; the meaning of
        45 and 248 is not visible in this file.
        """
        stop_token_indices = (codes == stop_token).nonzero()
        if len(stop_token_indices) == 0:
            if complain:
                print(
                    "No stop tokens found in one of the generated voice clips. This typically means the spoken audio "
                    "is "
                    "too long. In some cases, the output will still be good, though. Listen to it and if it is "
                    "missing words, "
                    "try breaking up your input text."
                )
            return codes

        codes[stop_token_indices] = 83
        stm = stop_token_indices.min().item()
        codes[stm:] = 83
        if stm - 3 < codes.shape[0]:
            codes[-3] = 45
            codes[-2] = 45
            codes[-1] = 248

        return codes

    def parse(
        self,
        text_tokens: torch.Tensor,
        samples: List[torch.Tensor],
        stop_mel_token: int,
        seed: int,
    ) -> torch.Tensor:
        """Score every sampled candidate and return the ``K`` best.

        Each row of each batch is first cleaned with ``fix_gpt_output`` (in
        place), then the batch is scored by the CLVP model against the
        repeated text tokens. Runs under ``torch.no_grad()`` since only the
        scores' ordering is used.

        Raises:
            RuntimeError: from ``torch.cat`` when *samples* is empty.
        """
        self.deterministic_state(seed=seed)
        clip_results = []
        text_tokens = copy(text_tokens).to("cuda")
        samples = [copy(batch).to("cuda") for batch in samples]
        with torch.no_grad():
            for batch in samples:
                for i in range(batch.shape[0]):
                    batch[i] = self.fix_gpt_output(batch[i], stop_mel_token)

                # Local name deliberately differs from the class name ``clvp``.
                scores = self.clvp(
                    text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False
                )
                clip_results.append(scores)

            clip_results = torch.cat(clip_results, dim=0)
            samples = torch.cat(samples, dim=0)
            return samples[torch.topk(clip_results, self.K).indices]

    @staticmethod
    def deterministic_state(seed=None):
        """Seed the RNGs; see ``_seed_everything``."""
        return _seed_everything(seed)


class Diffusion:
    """Diffusion decoder plus vocoder stage.

    Holds the ``DiffusionTts`` decoder, the diffuser returned by
    ``load_discrete_vocoder_diffuser``, the ``UnivNetGenerator`` vocoder and a
    ``Wav2VecAlignment`` aligner. Decoder and vocoder are moved to CUDA.
    """

    def __init__(
        self,
        diffusion_temperature,
        diffusion_iterations=30,
        cond_free=True,
        cond_free_k=2,
    ):
        self.diffusion_temperature = diffusion_temperature
        self.diffusion = (
            DiffusionTts(
                model_channels=1024,
                num_layers=10,
                in_channels=100,
                out_channels=200,
                in_latent_channels=1024,
                in_tokens=8193,
                dropout=0,
                use_fp16=False,
                num_heads=16,
                layer_drop=0,
                unconditioned_percentage=0,
            )
            .cpu()
            .eval()
        )
        self.diffusion.load_state_dict(
            torch.load(get_model_path("diffusion_decoder.pth", MODELS_DIR))
        )
        self.diffuser = load_discrete_vocoder_diffuser(
            desired_diffusion_steps=diffusion_iterations,
            cond_free=cond_free,
            cond_free_k=cond_free_k,
        )

        self.vocoder = UnivNetGenerator().cpu()
        self.vocoder.load_state_dict(
            torch.load(
                get_model_path("vocoder.pth", MODELS_DIR),
                map_location=torch.device("cpu"),
            )["model_g"]
        )
        self.vocoder.eval(inference=True)
        self.diffusion.to("cuda")
        self.vocoder.to("cuda")
        self.aligner = Wav2VecAlignment()
        # Bounds used by ``denormalize_tacotron_mel`` to map [-1, 1] back to
        # the un-normalised mel range.
        self.TACOTRON_MEL_MAX = 2.3143386840820312
        self.TACOTRON_MEL_MIN = -11.512925148010254

    def denormalize_tacotron_mel(self, norm_mel):
        """Linearly map *norm_mel* from [-1, 1] to [MEL_MIN, MEL_MAX]."""
        return ((norm_mel + 1) / 2) * (
            self.TACOTRON_MEL_MAX - self.TACOTRON_MEL_MIN
        ) + self.TACOTRON_MEL_MIN

    def potentially_redact(self, clip, text):
        """Pass *clip* through ``Wav2VecAlignment.redact`` for *text*.

        Dimension 1 of *clip* is squeezed before the call and restored after.
        """
        return self.aligner.redact(clip.squeeze(1), text).unsqueeze(1)

    @staticmethod
    def deterministic_state(seed=None):
        """Seed the RNGs; see ``_seed_everything``."""
        return _seed_everything(seed)

    def do_spectrogram_diffusion(
        self,
        diffusion_model,
        diffuser,
        latents,
        conditioning_latents,
        seed,
        temperature=1,
        verbose=False,
    ):
        """Run the diffusion sampling loop and return a de-normalised mel.

        The output length is ``latents.shape[1] * 4 * 24000 // 22050``; the
        result is cropped to that length on the last dimension.
        """
        self.deterministic_state(seed=seed)
        with torch.no_grad():
            # This diffusion model converts from 22kHz spectrogram codes to a
            # 24kHz spectrogram signal.
            output_seq_len = latents.shape[1] * 4 * 24000 // 22050
            output_shape = (latents.shape[0], 100, output_seq_len)
            precomputed_embeddings = diffusion_model.timestep_independent(
                latents, conditioning_latents, output_seq_len, False
            )

            noise = torch.randn(output_shape, device=latents.device) * temperature
            mel = diffuser.p_sample_loop(
                diffusion_model,
                output_shape,
                noise=noise,
                model_kwargs={"precomputed_aligned_embeddings": precomputed_embeddings},
                progress=verbose,
            )
            return self.denormalize_tacotron_mel(mel)[:, :, :output_seq_len]

    def parse(
        self, best_results, best_latents, calm_token, diffusion_conditioning, text, seed
    ):
        """Turn the selected codes/latents into waveform candidates.

        For every candidate the latents are truncated at the position where a
        run of more than 8 consecutive *calm_token* codes is detected, then
        decoded by the diffusion model and the vocoder, and finally passed
        through ``potentially_redact``.

        Returns:
            A list with one waveform tensor per candidate.
        """
        self.deterministic_state(seed=seed)
        best_results = copy(best_results).to("cuda")
        best_latents = copy(best_latents).to("cuda")
        diffusion_conditioning = copy(diffusion_conditioning).to("cuda")
        wav_candidates = []
        # Inference only: avoid recording autograd history in the vocoder
        # and aligner as well.
        with torch.no_grad():
            for b in range(best_results.shape[0]):
                codes = best_results[b].unsqueeze(0)
                latents = best_latents[b].unsqueeze(0)

                ctokens = 0
                for k in range(codes.shape[-1]):
                    if codes[0, k] == calm_token:
                        ctokens += 1
                    else:
                        ctokens = 0
                    if ctokens > 8:
                        latents = latents[:, :k]
                        break

                mel = self.do_spectrogram_diffusion(
                    self.diffusion,
                    self.diffuser,
                    latents,
                    diffusion_conditioning,
                    seed,
                    temperature=self.diffusion_temperature,
                    verbose=False,
                )
                wav = self.vocoder.inference(mel)
                wav_candidates.append(wav)
            # TODO: Check whether wav candidates should be in numpy
            wav_candidates = [
                self.potentially_redact(wav_candidate, text)
                for wav_candidate in wav_candidates
            ]
        return wav_candidates


class EndpointHandler():
    """Text-to-speech pipeline: tokenise -> GPT -> CLVP -> diffusion -> WAV.

    Two voices are pre-loaded at construction time, ``"gabby_reading"`` and
    ``"gabby_conversation"``; a request selects one of them by name.
    """

    def __init__(self, path="config-model.yaml"):
        config = get_config_file(Path(path))
        self.calm_token = 83
        self.tokenizer = VoiceBpeTokenizer()
        _, conditioning_latent_1 = load_voice("gabby_reading", map_location="cpu")
        _, conditioning_latent_2 = load_voice("gabby_conversation", map_location="cpu")

        # Each voice provides exactly two latents: (autoregressive, diffusion).
        self.conditioning_latents1 = tuple(conditioning_latent_1)
        self.conditioning_latents2 = tuple(conditioning_latent_2)
        (
            self.auto_conditioning1,
            self.diffusion_conditioning1,
        ) = self.conditioning_latents1
        (
            self.auto_conditioning2,
            self.diffusion_conditioning2,
        ) = self.conditioning_latents2

        self.auto_conditioning = None
        self.diffusion_conditioning = None
        self.gpt = Gpt(
            config[GPT][NUM_AUTOREGRESSIVE_SAMPLES],
            config[GPT][TOP_P],
            config[GPT][TEMPERATURE],
            config[GPT][LENGTH_PENALTY],
            config[GPT][REPETITION_PENALTY],
            config[GPT][MAX_MEL_TOKENS],
            config[GPT][AUTO_REGRESSIVE_BATCH_SIZE],
        )
        self.clvp = clvp(config[CLVP_const]["k"])
        self.diffusion = Diffusion(config[DIFFUSION][DIFFUSION_TEMPERATURE])
        print("orchestrator setup completed")

    @staticmethod
    def __check_for_long_sentence(text_tokens):
        """Reject inputs of 400 or more tokens.

        Raises:
            AssertionError: kept as the exception type for compatibility, but
                raised explicitly so the check survives ``python -O``.
        """
        if not text_tokens.shape[-1] < 400:
            raise AssertionError(
                "Too much text provided. Break the text up into separate segments and re-try inference."
            )
        # TODO: split the text into several pieces and do the generation and combine them last

    @staticmethod
    def deterministic_state(seed=None):
        """Seed the RNGs; see ``_seed_everything``."""
        return _seed_everything(seed)

    def preprocess_text(self, text: Text):
        """Tokenise *text* and return an ``IntTensor`` with a leading batch dim."""
        torch_tensor = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0)
        return torch_tensor

    @staticmethod
    def _merge_audio(res) -> torch.Tensor:
        """Concatenate waveform pieces into one ``(1, n_samples)`` CPU tensor.

        *res* may be a list of tensors or a single tensor (iterated along its
        first dimension). ``torch.as_tensor`` is used instead of the legacy
        ``torch.Tensor(...)`` constructor so that CUDA inputs are accepted.
        """
        pieces = [
            torch.as_tensor(split)
            .detach()
            .to(device="cpu", dtype=torch.float32)
            .flatten()
            for split in res
        ]
        return torch.cat(pieces).reshape(1, -1)

    def parse(self, res):
        """Write the merged audio to ``./<sha1-of-timestamp>.wav``.

        Returns:
            The hex digest used as the file name (without extension).
        """
        print("parsing")
        file_name = hashlib.sha1(str(datetime.now()).encode("UTF-8"))
        merged_audio_tensor = self._merge_audio(res)
        torchaudio.save(
            f"./{file_name.hexdigest()}.wav", merged_audio_tensor, OUTPUT_SAMPLE_RATE
        )
        return file_name.hexdigest()

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Synthesise speech for one request.

        Args:
            data: mapping with ``"text"``, ``"voice"`` and optionally
                ``"seed"`` (defaults to ``DEFAULT_SEED``).

        Returns:
            ``{"audio": <base64-encoded WAV file as str>}``.

        Raises:
            KeyError: if ``"text"`` or ``"voice"`` is missing.
            ValueError: if ``"voice"`` is not one of the pre-loaded voices.
            AssertionError: if the text tokenises to 400 or more tokens.
        """
        voice = data["voice"]
        text = data["text"]
        seed = data.get("seed", DEFAULT_SEED)
        if voice == "gabby_reading":
            self.auto_conditioning = self.auto_conditioning1
            self.diffusion_conditioning = self.diffusion_conditioning1
        elif voice == "gabby_conversation":
            self.auto_conditioning = self.auto_conditioning2
            self.diffusion_conditioning = self.diffusion_conditioning2
        else:
            # Previously an unknown voice silently reused whatever the last
            # request selected (or None on the first request).
            raise ValueError(
                f"Unknown voice {voice!r}; expected 'gabby_reading' or "
                "'gabby_conversation'."
            )

        self.deterministic_state(seed=seed)
        # preprocess the in-coming text into tokens
        text_tokens = self.preprocess_text(text)
        self.__check_for_long_sentence(text_tokens)
        samples, stop_mel_token = self.gpt.parse_inference(
            self.auto_conditioning, text_tokens, seed
        )
        best_sample = self.clvp.parse(text_tokens, samples, stop_mel_token, seed)
        best_latent = self.gpt.parse(
            self.auto_conditioning, text_tokens, best_sample, seed
        )
        wav_candidates = self.diffusion.parse(
            best_sample,
            best_latent,
            self.calm_token,
            self.diffusion_conditioning,
            text,
            seed,
        )

        # Encode the WAV into memory. The previous code base64-encoded a
        # buffer that was never written (so "audio" was always empty) and
        # left an untracked .wav file in the working directory per request.
        merged_audio_tensor = self._merge_audio(wav_candidates)
        buffered = BytesIO()
        torchaudio.save(
            buffered, merged_audio_tensor, OUTPUT_SAMPLE_RATE, format="wav"
        )
        audio_b64 = base64.b64encode(buffered.getvalue())

        # postprocess the prediction
        return {"audio": audio_b64.decode()}
a/ruth_tts_transformer/__pycache__/__init__.cpython-38.pyc b/ruth_tts_transformer/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7c4d236f9a945beb8ba993897b3778e9ac72464 Binary files /dev/null and b/ruth_tts_transformer/__pycache__/__init__.cpython-38.pyc differ diff --git a/ruth_tts_transformer/__pycache__/__init__.cpython-39.pyc b/ruth_tts_transformer/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..473d7f4f0096d7be909079d44ebe8eee45dc7702 Binary files /dev/null and b/ruth_tts_transformer/__pycache__/__init__.cpython-39.pyc differ diff --git a/ruth_tts_transformer/data/latents.pkl b/ruth_tts_transformer/data/latents.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2ed4ca93b2652ebc8badd9b62e5328cbfa80a244 Binary files /dev/null and b/ruth_tts_transformer/data/latents.pkl differ diff --git a/ruth_tts_transformer/data/layman.txt b/ruth_tts_transformer/data/layman.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ruth_tts_transformer/data/mel_norms.pth b/ruth_tts_transformer/data/mel_norms.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8c73216492c6e3a58ea9ca74beb3c263dfc452e Binary files /dev/null and b/ruth_tts_transformer/data/mel_norms.pth differ diff --git a/ruth_tts_transformer/data/riding_hood.txt b/ruth_tts_transformer/data/riding_hood.txt new file mode 100644 index 0000000000000000000000000000000000000000..2987bef78f92ecb327fc0f754b7ab1211a18542b --- /dev/null +++ b/ruth_tts_transformer/data/riding_hood.txt @@ -0,0 +1,54 @@ +Once upon a time there lived in a certain village a little country girl, the prettiest creature who was ever seen. Her mother was excessively fond of her; and her grandmother doted on her still more. This good woman had a little red riding hood made for her. 
It suited the girl so extremely well that everybody called her Little Red Riding Hood. +One day her mother, having made some cakes, said to her, "Go, my dear, and see how your grandmother is doing, for I hear she has been very ill. Take her a cake, and this little pot of butter." + +Little Red Riding Hood set out immediately to go to her grandmother, who lived in another village. + +As she was going through the wood, she met with a wolf, who had a very great mind to eat her up, but he dared not, because of some woodcutters working nearby in the forest. He asked her where she was going. The poor child, who did not know that it was dangerous to stay and talk to a wolf, said to him, "I am going to see my grandmother and carry her a cake and a little pot of butter from my mother." + +"Does she live far off?" said the wolf. + +"Oh I say," answered Little Red Riding Hood; "it is beyond that mill you see there, at the first house in the village." + +"Well," said the wolf, "and I'll go and see her too. I'll go this way and go you that, and we shall see who will be there first." + +The wolf ran as fast as he could, taking the shortest path, and the little girl took a roundabout way, entertaining herself by gathering nuts, running after butterflies, and gathering bouquets of little flowers. It was not long before the wolf arrived at the old woman's house. He knocked at the door: tap, tap. + +"Who's there?" + +"Your grandchild, Little Red Riding Hood," replied the wolf, counterfeiting her voice; "who has brought you a cake and a little pot of butter sent you by mother." + +The good grandmother, who was in bed, because she was somewhat ill, cried out, "Pull the bobbin, and the latch will go up." + +The wolf pulled the bobbin, and the door opened, and then he immediately fell upon the good woman and ate her up in a moment, for it had been more than three days since he had eaten. 
He then shut the door and got into the grandmother's bed, expecting Little Red Riding Hood, who came some time afterwards and knocked at the door: tap, tap. + +"Who's there?" + +Little Red Riding Hood, hearing the big voice of the wolf, was at first afraid; but believing her grandmother had a cold and was hoarse, answered, "It is your grandchild Little Red Riding Hood, who has brought you a cake and a little pot of butter mother sends you." + +The wolf cried out to her, softening his voice as much as he could, "Pull the bobbin, and the latch will go up." + +Little Red Riding Hood pulled the bobbin, and the door opened. + +The wolf, seeing her come in, said to her, hiding himself under the bedclothes, "Put the cake and the little pot of butter upon the stool, and come get into bed with me." + +Little Red Riding Hood took off her clothes and got into bed. She was greatly amazed to see how her grandmother looked in her nightclothes, and said to her, "Grandmother, what big arms you have!" + +"All the better to hug you with, my dear." + +"Grandmother, what big legs you have!" + +"All the better to run with, my child." + +"Grandmother, what big ears you have!" + +"All the better to hear with, my child." + +"Grandmother, what big eyes you have!" + +"All the better to see with, my child." + +"Grandmother, what big teeth you have got!" + +"All the better to eat you up with." + +And, saying these words, this wicked wolf fell upon Little Red Riding Hood, and ate her all up. \ No newline at end of file diff --git a/ruth_tts_transformer/data/seal_copypasta.txt b/ruth_tts_transformer/data/seal_copypasta.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce59a386070125650d3c6d8e8a13801d3666aa5f --- /dev/null +++ b/ruth_tts_transformer/data/seal_copypasta.txt @@ -0,0 +1 @@ +What the fuck did you just fucking say about me, you little bitch? 
I'll have you know I graduated top of my class in the Navy Seals, and I've been involved in numerous secret raids on Al kayda, and I have over 300 confirmed kills. I am trained in gorilla warfare and I'm the top sniper in the entire U S armed forces. You are nothing to me but just another target. I will wipe you the fuck out with precision the likes of which has never been seen before on this Earth, mark my fucking words. You think you can get away with saying that shit to me over the Internet? Think again, fucker. As we speak I am contacting my secret network of spies across the U S A and your IP is being traced right now so you better prepare for the storm, maggot. The storm that wipes out the pathetic little thing you call your life. You're fucking dead, kid. I can be anywhere, anytime, and I can kill you in over seven hundred ways, and that's just with my bare hands. Not only am I extensively trained in unarmed combat, but I have access to the entire arsenal of the United States Marine Corps and I will use it to its full extent to wipe your miserable ass off the face of the continent, you little shit. If only you could have known what unholy retribution your little "clever" comment was about to bring down upon you, maybe you would have held your fucking tongue. But you couldn't, you didn't, and now you're paying the price, you goddamn idiot. I will shit fury all over you and you will drown in it. You're fucking dead, kiddo. 
\ No newline at end of file diff --git a/ruth_tts_transformer/data/tokenizer.json b/ruth_tts_transformer/data/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a128f273053e465a15c488e48d8106e0c8b0898e --- /dev/null +++ b/ruth_tts_transformer/data/tokenizer.json @@ -0,0 +1 @@ +{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128
,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th 
ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}} \ No newline at end of file diff --git a/ruth_tts_transformer/models/__init__.py b/ruth_tts_transformer/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ruth_tts_transformer/models/__pycache__/__init__.cpython-310.pyc b/ruth_tts_transformer/models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..004b44570333bd471ecceb62b02ccf2aa05dce82 Binary files /dev/null and b/ruth_tts_transformer/models/__pycache__/__init__.cpython-310.pyc differ diff --git a/ruth_tts_transformer/models/__pycache__/__init__.cpython-38.pyc b/ruth_tts_transformer/models/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f72639a3af5f9fba8b9c9ad8b53bcf235eec1b4 Binary files /dev/null and b/ruth_tts_transformer/models/__pycache__/__init__.cpython-38.pyc differ diff --git a/ruth_tts_transformer/models/__pycache__/arch_util.cpython-310.pyc b/ruth_tts_transformer/models/__pycache__/arch_util.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a5770509b1c05ce7a36a71fb1415084a843d479 Binary files /dev/null and 
b/ruth_tts_transformer/models/__pycache__/arch_util.cpython-310.pyc differ diff --git a/ruth_tts_transformer/models/__pycache__/arch_util.cpython-38.pyc b/ruth_tts_transformer/models/__pycache__/arch_util.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd187ce347f27541133a894e2a7cb3cb8c19aa71 Binary files /dev/null and b/ruth_tts_transformer/models/__pycache__/arch_util.cpython-38.pyc differ diff --git a/ruth_tts_transformer/models/__pycache__/autoregressive.cpython-310.pyc b/ruth_tts_transformer/models/__pycache__/autoregressive.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f82726723ccdbfdc3cff1f5893b578949e8581f Binary files /dev/null and b/ruth_tts_transformer/models/__pycache__/autoregressive.cpython-310.pyc differ diff --git a/ruth_tts_transformer/models/__pycache__/autoregressive.cpython-38.pyc b/ruth_tts_transformer/models/__pycache__/autoregressive.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d27f50ff52cf6add031503ec3ccefaaacee328a Binary files /dev/null and b/ruth_tts_transformer/models/__pycache__/autoregressive.cpython-38.pyc differ diff --git a/ruth_tts_transformer/models/__pycache__/clvp.cpython-310.pyc b/ruth_tts_transformer/models/__pycache__/clvp.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..acd0ab09c540e808d1cfabf4f09ac83713d03dcf Binary files /dev/null and b/ruth_tts_transformer/models/__pycache__/clvp.cpython-310.pyc differ diff --git a/ruth_tts_transformer/models/__pycache__/clvp.cpython-38.pyc b/ruth_tts_transformer/models/__pycache__/clvp.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c03dc4ff7c1ec2af19e13546713dfd18d027bbc8 Binary files /dev/null and b/ruth_tts_transformer/models/__pycache__/clvp.cpython-38.pyc differ diff --git a/ruth_tts_transformer/models/__pycache__/diffusion_decoder.cpython-310.pyc 
b/ruth_tts_transformer/models/__pycache__/diffusion_decoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08c2f605f5cc46de8f14aac044ce42cac9658e31 Binary files /dev/null and b/ruth_tts_transformer/models/__pycache__/diffusion_decoder.cpython-310.pyc differ diff --git a/ruth_tts_transformer/models/__pycache__/diffusion_decoder.cpython-38.pyc b/ruth_tts_transformer/models/__pycache__/diffusion_decoder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d1cc5b70ad73c74f58cead40a588456f3928e76 Binary files /dev/null and b/ruth_tts_transformer/models/__pycache__/diffusion_decoder.cpython-38.pyc differ diff --git a/ruth_tts_transformer/models/__pycache__/transformer.cpython-310.pyc b/ruth_tts_transformer/models/__pycache__/transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66554323b27a40127e531c539633a543ad06ab51 Binary files /dev/null and b/ruth_tts_transformer/models/__pycache__/transformer.cpython-310.pyc differ diff --git a/ruth_tts_transformer/models/__pycache__/transformer.cpython-38.pyc b/ruth_tts_transformer/models/__pycache__/transformer.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d33b28d0a9ffcd699270155abcec088f154aa406 Binary files /dev/null and b/ruth_tts_transformer/models/__pycache__/transformer.cpython-38.pyc differ diff --git a/ruth_tts_transformer/models/__pycache__/vocoder.cpython-310.pyc b/ruth_tts_transformer/models/__pycache__/vocoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c39dacb214dd07a0b058e8de0a79997d31515654 Binary files /dev/null and b/ruth_tts_transformer/models/__pycache__/vocoder.cpython-310.pyc differ diff --git a/ruth_tts_transformer/models/__pycache__/vocoder.cpython-38.pyc b/ruth_tts_transformer/models/__pycache__/vocoder.cpython-38.pyc new file mode 100644 index 
def zero_module(module):
    """
    Set every parameter of a module to zero, in place, and hand the module back.

    The zeroing is done with autograd disabled, so no gradient history is
    recorded for it.

    :param module: the nn.Module whose parameters are overwritten with zeros.
    :return: the very same module instance, so the call can be used inline.
    """
    with torch.no_grad():
        for weight in module.parameters():
            weight.zero_()
    return module
class AttentionBlock(nn.Module):
    """
    An attention block that allows spatial positions to attend to each other.

    Originally ported from here, but adapted to the N-d case.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.

    :param channels: number of channels of the input (and of the output).
    :param num_heads: number of attention heads; only used when
                      num_head_channels is -1.
    :param num_head_channels: when not -1, the width of one head; the head count
                              then becomes channels // num_head_channels and
                              channels must be divisible by it.
    :param do_checkpoint: stored on the instance; this block itself does not
                          read it.
    :param relative_pos_embeddings: when True, a RelativePositionBias is created
                                    and handed to the attention op on every call.
    """

    def __init__(
        self,
        channels,
        num_heads=1,
        num_head_channels=-1,
        do_checkpoint=True,
        relative_pos_embeddings=False,
    ):
        super().__init__()
        self.channels = channels
        self.do_checkpoint = do_checkpoint
        if num_head_channels == -1:
            self.num_heads = num_heads
        else:
            assert (
                channels % num_head_channels == 0
            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
            self.num_heads = channels // num_head_channels
        self.norm = normalization(channels)
        self.qkv = nn.Conv1d(channels, channels * 3, 1)
        # split heads before split qkv
        self.attention = QKVAttentionLegacy(self.num_heads)

        # The output projection starts at zero, so a fresh block is an identity
        # mapping through the residual connection in forward().
        self.proj_out = zero_module(nn.Conv1d(channels, channels, 1))
        if relative_pos_embeddings:
            # Use the effective head count (self.num_heads), not the raw
            # num_heads argument: the two differ whenever num_head_channels
            # overrides the head count, and the attention op is built with
            # self.num_heads.
            self.relative_pos_embeddings = RelativePositionBias(
                scale=(channels // self.num_heads) ** .5,
                causal=False,
                heads=self.num_heads,
                num_buckets=32,
                max_distance=64,
            )
        else:
            self.relative_pos_embeddings = None

    def forward(self, x, mask=None):
        """
        Apply normalised self-attention with a residual connection.

        :param x: tensor whose first two dims are (batch, channels); all
                  remaining dims are flattened into one for the attention op.
        :param mask: optional mask forwarded unchanged to the attention op.
        :return: a tensor with the same shape as x.
        """
        b, c, *spatial = x.shape
        x = x.reshape(b, c, -1)
        qkv = self.qkv(self.norm(x))
        h = self.attention(qkv, mask, self.relative_pos_embeddings)
        h = self.proj_out(h)
        return (x + h).reshape(b, c, *spatial)
class Downsample(nn.Module):
    """
    A downsampling layer that is either a strided convolution or average pooling.

    :param channels: channels expected in the input.
    :param use_conv: when truthy a strided Conv1d is used, otherwise AvgPool1d.
    :param out_channels: channels produced; falls back to channels when falsy.
                         Must equal channels on the pooling path.
    :param factor: stride of the op (and the pooling window size).
    :param ksize: convolution kernel size (convolution path only).
    :param pad: convolution padding (convolution path only).
    """

    def __init__(self, channels, use_conv, out_channels=None, factor=4, ksize=5, pad=2):
        super().__init__()
        self.channels = channels
        self.out_channels = channels if not out_channels else out_channels
        self.use_conv = use_conv

        if not use_conv:
            # Pooling cannot change the channel count.
            assert self.channels == self.out_channels
            self.op = nn.AvgPool1d(kernel_size=factor, stride=factor)
            return

        self.op = nn.Conv1d(
            self.channels, self.out_channels, ksize, stride=factor, padding=pad
        )

    def forward(self, x):
        """
        Downsample x along its last dimension.

        :param x: tensor whose dim 1 must have exactly self.channels entries.
        :return: the result of the configured convolution / pooling op.
        """
        incoming_channels = x.shape[1]
        assert incoming_channels == self.channels
        return self.op(x)
class AudioMiniEncoder(nn.Module):
    """
    Small convolution + attention encoder that reduces a sequence to one vector.

    The stack is: an input Conv1d, `depth` stages of `resnet_blocks` ResBlocks
    each followed by a convolutional Downsample that doubles the channel count,
    a norm/SiLU/1x1-conv projection to `embedding_dim`, and `attn_blocks`
    AttentionBlocks. forward() returns the first position of the result.

    :param spec_dim: channels of the input.
    :param embedding_dim: channels of the returned embedding.
    :param base_channels: channels after the input convolution.
    :param depth: number of residual + downsample stages.
    :param resnet_blocks: ResBlocks per stage.
    :param attn_blocks: number of AttentionBlocks at the end.
    :param num_attn_heads: heads used by each AttentionBlock.
    :param dropout: dropout handed to every ResBlock.
    :param downsample_factor: factor handed to every Downsample.
    :param kernel_size: kernel size handed to every ResBlock.
    """

    def __init__(self,
                 spec_dim,
                 embedding_dim,
                 base_channels=128,
                 depth=2,
                 resnet_blocks=2,
                 attn_blocks=4,
                 num_attn_heads=4,
                 dropout=0,
                 downsample_factor=2,
                 kernel_size=3):
        super().__init__()
        self.init = nn.Sequential(
            nn.Conv1d(spec_dim, base_channels, 3, padding=1)
        )

        width = base_channels
        stages = []
        for _ in range(depth):
            stages.extend(
                ResBlock(width, dropout, kernel_size=kernel_size) for _ in range(resnet_blocks)
            )
            stages.append(
                Downsample(width, use_conv=True, out_channels=width * 2, factor=downsample_factor)
            )
            width *= 2
        self.res = nn.Sequential(*stages)

        self.final = nn.Sequential(
            normalization(width),
            nn.SiLU(),
            nn.Conv1d(width, embedding_dim, 1)
        )

        attention_layers = [AttentionBlock(embedding_dim, num_attn_heads) for _ in range(attn_blocks)]
        self.attn = nn.Sequential(*attention_layers)
        self.dim = embedding_dim

    def forward(self, x):
        """
        Encode x and return index 0 of the last dimension of the result.

        :param x: input passed straight to the initial Conv1d.
        :return: the encoded tensor sliced as [:, :, 0].
        """
        hidden = x
        for stage in (self.init, self.res, self.final, self.attn):
            hidden = stage(hidden)
        return hidden[:, :, 0]
class TorchMelSpectrogram(nn.Module):
    """
    Log-MEL spectrogram front end built on torchaudio's MelSpectrogram transform.

    :param filter_length: passed to the transform as n_fft.
    :param hop_length: hop length of the transform.
    :param win_length: window length of the transform.
    :param n_mel_channels: number of MEL bins (n_mels).
    :param mel_fmin: lowest frequency of the MEL filter bank (f_min).
    :param mel_fmax: highest frequency of the MEL filter bank (f_max).
    :param sampling_rate: sample rate of the audio fed to forward().
    :param normalize: passed to the transform as `normalized`.
    :param mel_norm_file: path loaded with torch.load() and used to divide the
                          log-MEL output; None disables the division.
    """

    def __init__(self, filter_length=1024, hop_length=256, win_length=1024, n_mel_channels=80, mel_fmin=0, mel_fmax=8000,
                 sampling_rate=22050, normalize=False, mel_norm_file=DEFAULT_MEL_NORM_FILE):
        super().__init__()
        # These are the default tacotron values for the MEL spectrogram.
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.n_mel_channels = n_mel_channels
        self.mel_fmin = mel_fmin
        self.mel_fmax = mel_fmax
        self.sampling_rate = sampling_rate
        self.mel_stft = torchaudio.transforms.MelSpectrogram(
            n_fft=filter_length,
            hop_length=hop_length,
            win_length=win_length,
            power=2,
            normalized=normalize,
            sample_rate=sampling_rate,
            f_min=mel_fmin,
            f_max=mel_fmax,
            n_mels=n_mel_channels,
            norm="slaney",
        )
        self.mel_norm_file = mel_norm_file
        self.mel_norms = torch.load(mel_norm_file) if mel_norm_file is not None else None

    def forward(self, inp):
        """
        Convert audio into a (optionally normalised) log-MEL spectrogram.

        :param inp: a 2-D tensor, or a 3-D tensor whose dim 1 is squeezed away
                    (mono audio is assumed).
        :return: log of the MEL spectrogram clamped at 1e-5, divided by the
                 loaded norms when they are present.
        """
        # Automatically squeeze out the channels dimension if it is present.
        if inp.dim() == 3:
            inp = inp.squeeze(1)
        assert inp.dim() == 2

        # The transform and the norms follow the input onto its device.
        self.mel_stft = self.mel_stft.to(inp.device)
        spectrogram = self.mel_stft(inp)

        # Perform dynamic range compression
        mel = torch.log(torch.clamp(spectrogram, min=1e-5))
        if self.mel_norms is None:
            return mel

        self.mel_norms = self.mel_norms.to(mel.device)
        return mel / self.mel_norms.unsqueeze(0).unsqueeze(-1)
class ResidualConvolutionBlock(nn.Module):
    """
    Two width-preserving Conv1d + GroupNorm layers wrapped in a residual connection.

    The inner stack is conv -> group norm -> ReLU -> conv -> group norm; its
    output is added to the input and passed through a final ReLU.

    :param chan: channels of both input and output; the group norms use
                 chan // 8 groups.
    """

    def __init__(self, chan):
        super().__init__()
        groups = chan // 8
        layers = [
            nn.Conv1d(chan, chan, kernel_size=3, padding=1),
            nn.GroupNorm(groups, chan),
            nn.ReLU(),
            nn.Conv1d(chan, chan, kernel_size=3, padding=1),
            nn.GroupNorm(groups, chan),
        ]
        self.neural_network = nn.Sequential(*layers)

    def forward(self, x):
        """
        :param x: input tensor fed to the convolution stack.
        :return: relu(stack(x) + x), same shape as x.
        """
        transformed = self.neural_network(x)
        return F.relu(transformed + x)
not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past: + position_ids = position_ids[:, -1].unsqueeze(-1) + else: + position_ids = None + return { + "input_ids": input_ids, + "past_key_values": past, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + assert self.cached_mel_emb is not None + assert inputs_embeds is None # Not supported by this inference model. + assert labels is None # Training not supported by this inference model. 
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # Create embedding + mel_len = self.cached_mel_emb.shape[1] + if input_ids.shape[1] != 1: + text_inputs = input_ids[:, mel_len:] + text_emb = self.embeddings(text_inputs) + text_emb = text_emb + self.text_pos_embedding(text_emb) + if self.cached_mel_emb.shape[0] != text_emb.shape[0]: + mel_emb = self.cached_mel_emb.repeat_interleave(text_emb.shape[0] // self.cached_mel_emb.shape[0], 0) + else: + mel_emb = self.cached_mel_emb + emb = torch.cat([mel_emb, text_emb], dim=1) + else: + emb = self.embeddings(input_ids) + emb = emb + self.text_pos_embedding.get_fixed_embedding(attention_mask.shape[1] - mel_len, + attention_mask.device) + + transformer_outputs = self.transformer( + inputs_embeds=emb, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.transformer.first_device) + hidden_states = hidden_states.to(self.lm_head.weight.device) + + lm_logits = self.lm_head(hidden_states) + + if not return_dict: + return (lm_logits,) + transformer_outputs[1:] + + return CausalLMOutputWithCrossAttentions( + loss=None, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + cross_attentions=transformer_outputs.cross_attentions, + ) + + @staticmethod + def _reorder_cache(past, beam_idx): + """ + This function is used to re-order the :obj:`past_key_values` cache if + :meth:`~transformers.PreTrainedModel.beam_search` or 
class LearnedPositionEmbeddings(nn.Module):
    """
    A learned embedding table indexed by sequence position.

    :param seq_len: number of positions in the table.
    :param model_dim: width of each position embedding.
    :param init: standard deviation of the zero-mean normal initialisation.
    """

    def __init__(self, seq_len, model_dim, init=.02):
        super().__init__()
        self.emb = nn.Embedding(seq_len, model_dim)
        # Initializing this way is standard for GPT-2
        nn.init.normal_(self.emb.weight, mean=0.0, std=init)

    def forward(self, x):
        """
        Return the embeddings for positions 0 .. x.shape[1] - 1.

        Only the size of dim 1 and the device of x are used; its values are not.

        :param x: tensor whose dim 1 is the sequence length.
        :return: tensor of shape (x.shape[1], model_dim).
        """
        positions = torch.arange(x.shape[1], device=x.device)
        return self.emb(positions)

    def get_fixed_embedding(self, ind, dev):
        """
        Return the embedding of a single position.

        :param ind: the position index to look up.
        :param dev: device on which the index tensor is created.
        :return: tensor of shape (1, 1, model_dim).
        """
        index = torch.tensor([ind], device=dev)
        return self.emb(index).unsqueeze(0)
class MelEncoder(nn.Module):
    """
    Convolutional encoder for MEL input that shortens the sequence with two stride-2 convolutions.

    The channel width grows channels // 4 -> channels // 2 -> channels, with a
    stack of ResidualConvolutionBlocks after every width change.

    :param channels: channels of the encoder output.
    :param mel_channels: channels of the MEL input.
    :param resblocks_per_reduction: residual blocks in each of the three stacks.
    """

    def __init__(self, channels, mel_channels=80, resblocks_per_reduction=2):
        super().__init__()
        self.channels = channels
        quarter, half = channels // 4, channels // 2

        def residual_stack(width):
            # One stack of residual blocks operating at a fixed channel width.
            return nn.Sequential(*[ResidualConvolutionBlock(width) for _ in range(resblocks_per_reduction)])

        stages = [
            nn.Conv1d(mel_channels, quarter, kernel_size=3, padding=1),
            residual_stack(quarter),
            nn.Conv1d(quarter, half, kernel_size=3, stride=2, padding=1),
            nn.GroupNorm(channels // 16, half),
            nn.ReLU(),
            residual_stack(half),
            nn.Conv1d(half, channels, kernel_size=3, stride=2, padding=1),
            nn.GroupNorm(channels // 8, channels),
            nn.ReLU(),
            residual_stack(channels),
        ]
        self.encoder = nn.Sequential(*stages)
        self.reduction = 4

    def forward(self, x):
        """
        :param x: MEL tensor fed to the first Conv1d.
        :return: the encoder output with its last two dims swapped via
                 permute(0, 2, 1).
        """
        hidden = x
        for stage in self.encoder:
            hidden = stage(hidden)
        return hidden.permute(0, 2, 1)
+ model_dim: Operating dimensions of the transformer + heads: Number of transformer heads. Must be divisible by model_dim. Recommend model_dim//64 + max_text_tokens: Maximum number of text tokens that will be encountered by model. + max_mel_tokens: Maximum number of MEL tokens that will be encountered by model. + max_conditioning_inputs: Maximum number of conditioning inputs provided to the model. If (1), conditioning input can be of format (b,80,s), otherwise (b,n,80,s). + mel_length_compression: The factor between and . Used to compute MEL code padding given wav input length. + number_text_tokens: + start_text_token: + stop_text_token: + number_mel_codes: + start_mel_token: + stop_mel_token: + train_solo_embeddings: + use_mel_codes_as_input: + checkpointing: + """ + super().__init__() + + self.number_text_tokens = number_text_tokens + self.start_text_token = number_text_tokens * types if start_text_token is None else start_text_token + self.stop_text_token = 0 + self.number_mel_codes = number_mel_codes + self.start_mel_token = start_mel_token + self.stop_mel_token = stop_mel_token + self.layers = layers + self.heads = heads + self.max_mel_tokens = max_mel_tokens + self.max_text_tokens = max_text_tokens + self.model_dim = model_dim + self.max_conditioning_inputs = max_conditioning_inputs + self.mel_length_compression = mel_length_compression + self.conditioning_encoder = ConditioningEncoder(80, model_dim, num_attn_heads=heads) + self.text_embedding = nn.Embedding(self.number_text_tokens * types + 1, model_dim) + if use_mel_codes_as_input: + self.mel_embedding = nn.Embedding(self.number_mel_codes, model_dim) + else: + self.mel_embedding = MelEncoder(model_dim, resblocks_per_reduction=1) + self.gpt, self.mel_pos_embedding, self.text_pos_embedding, self.mel_layer_pos_embedding, self.text_layer_pos_embedding = \ + build_hf_gpt_transformer(layers, model_dim, heads, self.max_mel_tokens + 2 + self.max_conditioning_inputs, + self.max_text_tokens + 2, checkpointing) + if 
train_solo_embeddings: + self.mel_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * .02, requires_grad=True) + self.text_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * .02, requires_grad=True) + else: + self.mel_solo_embedding = 0 + self.text_solo_embedding = 0 + + self.final_norm = nn.LayerNorm(model_dim) + self.text_head = nn.Linear(model_dim, self.number_text_tokens * types + 1) + self.mel_head = nn.Linear(model_dim, self.number_mel_codes) + + # Initialize the embeddings per the GPT-2 scheme + embeddings = [self.text_embedding] + if use_mel_codes_as_input: + embeddings.append(self.mel_embedding) + for module in embeddings: + module.weight.data.normal_(mean=0.0, std=.02) + + def build_aligned_inputs_and_targets(self, input, start_token, stop_token): + inp = F.pad(input, (1, 0), value=start_token) + tar = F.pad(input, (0, 1), value=stop_token) + return inp, tar + + def set_mel_padding(self, mel_input_tokens, wav_lengths): + """ + Given mel tokens that are derived from a padded audio clip and the actual lengths of each batch element in + that audio clip, reformats the tokens with STOP_MEL_TOKEN in place of the zero padding. This is required + preformatting to create a working TTS model. + """ + # Set padding areas within MEL (currently it is coded with the MEL code for ). + mel_lengths = torch.div(wav_lengths, self.mel_length_compression, rounding_mode='trunc') + for b in range(len(mel_lengths)): + actual_end = mel_lengths[ + b] + 1 # Due to the convolutional nature of how these tokens are generated, it would be best if the model predicts a token past the actual last token. 
+ if actual_end < mel_input_tokens.shape[-1]: + mel_input_tokens[b, actual_end:] = self.stop_mel_token + return mel_input_tokens + + def get_logits(self, speech_conditioning_inputs, first_inputs, first_head, second_inputs=None, second_head=None, + get_attns=False, return_latent=False): + if second_inputs is not None: + emb = torch.cat([speech_conditioning_inputs, first_inputs, second_inputs], dim=1) + else: + emb = torch.cat([speech_conditioning_inputs, first_inputs], dim=1) + + gpt_out = self.gpt(inputs_embeds=emb, return_dict=True, output_attentions=get_attns) + if get_attns: + return gpt_out.attentions + + enc = gpt_out.last_hidden_state[:, 1:] # The first logit is tied to the speech_conditioning_input + enc = self.final_norm(enc) + + if return_latent: + return enc[:, speech_conditioning_inputs.shape[1]:speech_conditioning_inputs.shape[1] + first_inputs.shape[ + 1]], enc[:, -second_inputs.shape[1]:] + + first_logits = enc[:, :first_inputs.shape[1]] + first_logits = first_head(first_logits) + first_logits = first_logits.permute(0, 2, 1) + if second_inputs is not None: + second_logits = enc[:, -second_inputs.shape[1]:] + second_logits = second_head(second_logits) + second_logits = second_logits.permute(0, 2, 1) + return first_logits, second_logits + else: + return first_logits + + def get_conditioning(self, speech_conditioning_input): + speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len( + speech_conditioning_input.shape) == 3 else speech_conditioning_input + conds = [] + for j in range(speech_conditioning_input.shape[1]): + conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])) + conds = torch.stack(conds, dim=1) + conds = conds.mean(dim=1) + return conds + + def forward(self, speech_conditioning_latent, text_inputs, text_lengths, mel_codes, wav_lengths, types=None, + text_first=True, raw_mels=None, return_attentions=False, + return_latent=False, clip_inputs=True): + """ + Forward pass that uses both text and voice in 
either text conditioning mode or voice conditioning mode + (actuated by `text_first`). + + speech_conditioning_input: MEL float tensor, (b,1024) + text_inputs: long tensor, (b,t) + text_lengths: long tensor, (b,) + mel_inputs: long tensor, (b,m) + wav_lengths: long tensor, (b,) + raw_mels: MEL float tensor (b,80,s) + + If return_attentions is specified, only logits are returned. + If return_latent is specified, loss & logits are not computed or returned. Only the predicted latents are returned. + If clip_inputs is True, the inputs will be clipped to the smallest input size across each input modality. + """ + # Types are expressed by expanding the text embedding space. + if types is not None: + text_inputs = text_inputs * (1 + types).unsqueeze(-1) + + if clip_inputs: + # This model will receive micro-batches with a ton of padding for both the text and MELs. Ameliorate this by + # chopping the inputs by the maximum actual length. + max_text_len = text_lengths.max() + text_inputs = text_inputs[:, :max_text_len] + max_mel_len = wav_lengths.max() // self.mel_length_compression + mel_codes = mel_codes[:, :max_mel_len] + if raw_mels is not None: + raw_mels = raw_mels[:, :, :max_mel_len * 4] + mel_codes = self.set_mel_padding(mel_codes, wav_lengths) + text_inputs = F.pad(text_inputs, (0, 1), value=self.stop_text_token) + mel_codes = F.pad(mel_codes, (0, 1), value=self.stop_mel_token) + + conds = speech_conditioning_latent.unsqueeze(1) + text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, + self.stop_text_token) + text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs) + mel_codes, mel_targets = self.build_aligned_inputs_and_targets(mel_codes, self.start_mel_token, + self.stop_mel_token) + if raw_mels is not None: + mel_inp = F.pad(raw_mels, (0, 8)) + else: + mel_inp = mel_codes + mel_emb = self.mel_embedding(mel_inp) + mel_emb = mel_emb + self.mel_pos_embedding(mel_codes) + + if text_first: + 
text_logits, mel_logits = self.get_logits(conds, text_emb, self.text_head, mel_emb, self.mel_head, + get_attns=return_attentions, return_latent=return_latent) + if return_latent: + return mel_logits[:, + :-2] # Despite the name, these are not logits. Strip off the two tokens added by this forward pass. + else: + mel_logits, text_logits = self.get_logits(conds, mel_emb, self.mel_head, text_emb, self.text_head, + get_attns=return_attentions, return_latent=return_latent) + if return_latent: + return text_logits[:, + :-2] # Despite the name, these are not logits. Strip off the two tokens added by this forward pass. + + if return_attentions: + return mel_logits + loss_text = F.cross_entropy(text_logits, text_targets.long()) + loss_mel = F.cross_entropy(mel_logits, mel_targets.long()) + return loss_text.mean(), loss_mel.mean(), mel_logits + + def inference_speech(self, speech_conditioning_latent, text_inputs, input_tokens=None, num_return_sequences=1, + max_generate_length=None, typical_sampling=False, typical_mass=.9, **hf_generate_kwargs): + seq_length = self.max_mel_tokens + self.max_text_tokens + 2 + if not hasattr(self, 'inference_model'): + # TODO: Decouple gpt_config from this inference model. 
+ gpt_config = GPT2Config(vocab_size=self.max_mel_tokens, + n_positions=seq_length, + n_ctx=seq_length, + n_embd=self.model_dim, + n_layer=self.layers, + n_head=self.heads, + gradient_checkpointing=False, + use_cache=True) + self.inference_model = GPT2InferenceModel(gpt_config, self.gpt, self.mel_pos_embedding, self.mel_embedding, + self.final_norm, self.mel_head) + self.gpt.wte = self.mel_embedding + + text_inputs = F.pad(text_inputs, (0, 1), value=self.stop_text_token) + text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, + self.stop_text_token) + text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs) + + conds = speech_conditioning_latent.unsqueeze(1) + emb = torch.cat([conds, text_emb], dim=1) + self.inference_model.store_mel_emb(emb) + + fake_inputs = torch.full((emb.shape[0], conds.shape[1] + emb.shape[1],), fill_value=1, dtype=torch.long, + device=text_inputs.device) + fake_inputs[:, -1] = self.start_mel_token + trunc_index = fake_inputs.shape[1] + if input_tokens is None: + inputs = fake_inputs + else: + assert num_return_sequences % input_tokens.shape[ + 0] == 0, "The number of return sequences must be divisible by the number of input sequences" + fake_inputs = fake_inputs.repeat(num_return_sequences, 1) + input_tokens = input_tokens.repeat(num_return_sequences // input_tokens.shape[0], 1) + inputs = torch.cat([fake_inputs, input_tokens], dim=1) + + logits_processor = LogitsProcessorList( + [TypicalLogitsWarper(mass=typical_mass)]) if typical_sampling else LogitsProcessorList() + max_length = trunc_index + self.max_mel_tokens - 1 if max_generate_length is None else trunc_index + max_generate_length + gen = self.inference_model.generate(inputs, bos_token_id=self.start_mel_token, pad_token_id=self.stop_mel_token, + eos_token_id=self.stop_mel_token, + max_length=max_length, logits_processor=logits_processor, + num_return_sequences=num_return_sequences, **hf_generate_kwargs) + return 
def masked_mean(t, mask, dim=1):
    """
    Average `t` along `dim`, counting only the positions where `mask` is True.

    t:    float tensor whose last axis is the feature axis; the code indexes the mask as
          `mask[:, :, None]`, so a 2-D mask against a 3-D `t` is what the visible callers use.
    mask: bool tensor; True marks positions that contribute to the mean.
    dim:  axis of `t` to reduce. Negative values are resolved against `t`. The default (1) reproduces
          the previous behaviour exactly; before this change the argument was accepted but ignored.

    Rows with no valid position divide 0 by 0 and therefore yield NaN, as before.
    """
    axis = dim if dim >= 0 else dim + t.dim()
    t = t.masked_fill(~mask[:, :, None], 0.)
    return t.sum(dim=axis) / mask.sum(dim=axis)[..., None]
+ + Originally from https://github.com/lucidrains/DALLE-pytorch/blob/main/dalle_pytorch/dalle_pytorch.py + """ + + def __init__( + self, + *, + dim_text=512, + dim_speech=512, + dim_latent=512, + num_text_tokens=256, + text_enc_depth=6, + text_seq_len=120, + text_heads=8, + num_speech_tokens=8192, + speech_enc_depth=6, + speech_heads=8, + speech_seq_len=250, + text_mask_percentage=0, + voice_mask_percentage=0, + wav_token_compression=1024, + use_xformers=False, + ): + super().__init__() + self.text_emb = nn.Embedding(num_text_tokens, dim_text) + self.to_text_latent = nn.Linear(dim_text, dim_latent, bias=False) + + self.speech_emb = nn.Embedding(num_speech_tokens, dim_speech) + self.to_speech_latent = nn.Linear(dim_speech, dim_latent, bias=False) + + if use_xformers: + self.text_transformer = CheckpointedXTransformerEncoder( + needs_permute=False, + exit_permute=False, + max_seq_len=-1, + attn_layers=Encoder( + dim=dim_text, + depth=text_enc_depth, + heads=text_heads, + ff_dropout=.1, + ff_mult=2, + attn_dropout=.1, + use_rmsnorm=True, + ff_glu=True, + rotary_pos_emb=True, + )) + self.speech_transformer = CheckpointedXTransformerEncoder( + needs_permute=False, + exit_permute=False, + max_seq_len=-1, + attn_layers=Encoder( + dim=dim_speech, + depth=speech_enc_depth, + heads=speech_heads, + ff_dropout=.1, + ff_mult=2, + attn_dropout=.1, + use_rmsnorm=True, + ff_glu=True, + rotary_pos_emb=True, + )) + else: + self.text_transformer = Transformer(causal=False, seq_len=text_seq_len, dim=dim_text, depth=text_enc_depth, + heads=text_heads) + self.speech_transformer = Transformer(causal=False, seq_len=speech_seq_len, dim=dim_speech, + depth=speech_enc_depth, heads=speech_heads) + + self.temperature = nn.Parameter(torch.tensor(1.)) + self.text_mask_percentage = text_mask_percentage + self.voice_mask_percentage = voice_mask_percentage + self.wav_token_compression = wav_token_compression + self.xformers = use_xformers + if not use_xformers: + self.text_pos_emb = 
def forward(
        self,
        text,
        speech_tokens,
        return_loss=False
):
    """
    Score how well each text sequence matches each speech-token sequence.

    text and speech_tokens are index tensors fed to the text/speech embeddings. In training mode random
    boolean masks are drawn from `text_mask_percentage` / `voice_mask_percentage`; in eval mode every
    position is kept.

    Returns the per-pair similarity (one value per batch element) when `return_loss` is False, otherwise
    the symmetric cross-entropy contrastive loss over the full batch similarity matrix.
    """
    batch_size = text.shape[0]
    device = text.device

    if not self.training:
        text_mask = torch.ones_like(text.float()).bool()
        voice_mask = torch.ones_like(speech_tokens.float()).bool()
    else:
        # Text mask is drawn before the voice mask; keep this order so RNG consumption is unchanged.
        text_mask = torch.rand_like(text.float()) > self.text_mask_percentage
        voice_mask = torch.rand_like(speech_tokens.float()) > self.voice_mask_percentage

    text_emb = self.text_emb(text)
    speech_emb = self.speech_emb(speech_tokens)

    if not self.xformers:
        # Learned absolute position embeddings are only added on the non-xformers path.
        text_positions = torch.arange(text.shape[1], device=device)
        text_emb += self.text_pos_emb(text_positions)
        speech_positions = torch.arange(speech_emb.shape[1], device=device)
        speech_emb += self.speech_pos_emb(speech_positions)

    enc_text = self.text_transformer(text_emb, mask=text_mask)
    enc_speech = self.speech_transformer(speech_emb, mask=voice_mask)

    # Pool over the sequence axis, project to the shared latent space, then L2-normalize.
    pooled_text = masked_mean(enc_text, text_mask, dim=1)
    pooled_speech = masked_mean(enc_speech, voice_mask, dim=1)
    text_latents = F.normalize(self.to_text_latent(pooled_text), p=2, dim=-1)
    speech_latents = F.normalize(self.to_speech_latent(pooled_speech), p=2, dim=-1)

    temp = self.temperature.exp()

    if return_loss:
        sim = einsum('i d, j d -> i j', text_latents, speech_latents) * temp
        labels = torch.arange(batch_size, device=device)
        text_to_speech = F.cross_entropy(sim, labels)
        speech_to_text = F.cross_entropy(sim.t(), labels)
        return (text_to_speech + speech_to_text) / 2

    return einsum('n d, n d -> n', text_latents, speech_latents) * temp
def timestep_embedding(timesteps, dim, max_period=10000):
    """
    Build sinusoidal embeddings for a batch of timesteps.

    :param timesteps: a 1-D Tensor of N indices, one per batch element (may be fractional).
    :param dim: width of the returned embedding.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an [N x dim] Tensor laid out as [cos | sin], with one trailing zero column when dim is odd.
    """
    n_freqs = dim // 2
    exponents = -math.log(max_period) * torch.arange(start=0, end=n_freqs, dtype=torch.float32) / n_freqs
    freqs = torch.exp(exponents).to(device=timesteps.device)
    phases = timesteps[:, None].float() * freqs[None]

    columns = [torch.cos(phases), torch.sin(phases)]
    if dim % 2:
        # Odd widths are padded with a single zero column so the output is exactly `dim` wide.
        columns.append(torch.zeros_like(phases[:, :1]))
    return torch.cat(columns, dim=-1)
class ResBlock(TimestepBlock):
    """
    Residual 1-D convolution block whose hidden activations are modulated by a timestep embedding.

    With `use_scale_shift_norm` the embedding is split into a scale and a shift applied after the output
    normalization; otherwise it is simply added to the hidden activations. `dims` is accepted but not used
    by this block. `kernel_size` must be 1, 3 or 5 (any other value raises KeyError).
    """

    def __init__(
            self,
            channels,
            emb_channels,
            dropout,
            out_channels=None,
            dims=2,
            kernel_size=3,
            efficient_config=True,
            use_scale_shift_norm=False,
    ):
        super().__init__()
        self.channels = channels
        self.emb_channels = emb_channels
        self.dropout = dropout
        self.out_channels = out_channels or channels
        self.use_scale_shift_norm = use_scale_shift_norm

        # Padding that keeps the sequence length unchanged for the supported kernel sizes.
        out_padding = {1: 0, 3: 1, 5: 2}[kernel_size]
        # The "efficient" configuration uses pointwise convs for the input and skip paths.
        if efficient_config:
            in_kernel, in_padding = 1, 0
        else:
            in_kernel, in_padding = 3, 1
        # Scale-shift conditioning needs two values per output channel.
        emb_width = 2 * self.out_channels if use_scale_shift_norm else self.out_channels

        self.in_layers = nn.Sequential(
            normalization(channels),
            nn.SiLU(),
            nn.Conv1d(channels, self.out_channels, in_kernel, padding=in_padding),
        )
        self.emb_layers = nn.Sequential(
            nn.SiLU(),
            nn.Linear(emb_channels, emb_width),
        )
        self.out_layers = nn.Sequential(
            normalization(self.out_channels),
            nn.SiLU(),
            nn.Dropout(p=dropout),
            nn.Conv1d(self.out_channels, self.out_channels, kernel_size, padding=out_padding),
        )

        if self.out_channels != channels:
            self.skip_connection = nn.Conv1d(channels, self.out_channels, in_kernel, padding=in_padding)
        else:
            self.skip_connection = nn.Identity()

    def forward(self, x, emb):
        """Apply the block to `x` conditioned on the timestep embedding `emb`."""
        hidden = self.in_layers(x)
        cond = self.emb_layers(emb).type(hidden.dtype)

        # Append singleton axes until the embedding broadcasts against the hidden activations.
        missing_axes = len(hidden.shape) - len(cond.shape)
        if missing_axes > 0:
            cond = cond[(...,) + (None,) * missing_axes]

        if not self.use_scale_shift_norm:
            hidden = self.out_layers(hidden + cond)
        else:
            norm, remainder = self.out_layers[0], self.out_layers[1:]
            scale, shift = torch.chunk(cond, 2, dim=1)
            hidden = remainder(norm(hidden) * (1 + scale) + shift)
        return self.skip_connection(x) + hidden
dropout, num_heads): + super().__init__() + self.resblk = ResBlock(model_channels, model_channels, dropout, model_channels, dims=1, + use_scale_shift_norm=True) + self.attn = AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True) + + def forward(self, x, time_emb): + y = self.resblk(x, time_emb) + return self.attn(y) + + +class DiffusionTts(nn.Module): + def __init__( + self, + model_channels=512, + num_layers=8, + in_channels=100, + in_latent_channels=512, + in_tokens=8193, + out_channels=200, # mean and variance + dropout=0, + use_fp16=False, + num_heads=16, + # Parameters for regularization. + layer_drop=.1, + unconditioned_percentage=.1, + # This implements a mechanism similar to what is used in classifier-free training. + ): + super().__init__() + + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + self.dropout = dropout + self.num_heads = num_heads + self.unconditioned_percentage = unconditioned_percentage + self.enable_fp16 = use_fp16 + self.layer_drop = layer_drop + + self.inp_block = nn.Conv1d(in_channels, model_channels, 3, 1, 1) + self.time_embed = nn.Sequential( + nn.Linear(model_channels, model_channels), + nn.SiLU(), + nn.Linear(model_channels, model_channels), + ) + + # Either code_converter or latent_converter is used, depending on what type of conditioning data is fed. + # This model is meant to be able to be trained on both for efficiency purposes - it is far less computationally + # complex to generate tokens, while generating latents will normally mean propagating through a deep autoregressive + # transformer network. 
+ self.code_embedding = nn.Embedding(in_tokens, model_channels) + self.code_converter = nn.Sequential( + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + ) + self.code_norm = normalization(model_channels) + self.latent_conditioner = nn.Sequential( + nn.Conv1d(in_latent_channels, model_channels, 3, padding=1), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + ) + self.contextual_embedder = nn.Sequential(nn.Conv1d(in_channels, model_channels, 3, padding=1, stride=2), + nn.Conv1d(model_channels, model_channels * 2, 3, padding=1, stride=2), + AttentionBlock(model_channels * 2, num_heads, + relative_pos_embeddings=True, do_checkpoint=False), + AttentionBlock(model_channels * 2, num_heads, + relative_pos_embeddings=True, do_checkpoint=False), + AttentionBlock(model_channels * 2, num_heads, + relative_pos_embeddings=True, do_checkpoint=False), + AttentionBlock(model_channels * 2, num_heads, + relative_pos_embeddings=True, do_checkpoint=False), + AttentionBlock(model_channels * 2, num_heads, + relative_pos_embeddings=True, do_checkpoint=False)) + self.unconditioned_embedding = nn.Parameter(torch.randn(1, model_channels, 1)) + self.conditioning_timestep_integrator = TimestepEmbedSequential( + DiffusionLayer(model_channels, dropout, num_heads), + DiffusionLayer(model_channels, dropout, num_heads), + DiffusionLayer(model_channels, dropout, num_heads), + ) + + self.integrating_conv = nn.Conv1d(model_channels * 2, model_channels, kernel_size=1) + self.mel_head = nn.Conv1d(model_channels, in_channels, kernel_size=3, padding=1) + + self.layers = 
def get_grad_norm_parameter_groups(self):
    """
    Return the model's parameters grouped by sub-network, keyed by a short group name.

    Each value is a list of parameters taken from the corresponding sub-modules. Every parameter appears
    at most once per group; the `latent_conditioner` parameters used to be appended twice to
    'code_converters', which double-counted them in anything computed per group.
    NOTE(review): the duplicated entry may have been meant to be another module (e.g. `code_norm`) -
    confirm with the training code before adding one.
    """
    groups = {
        'minicoder': list(self.contextual_embedder.parameters()),
        'layers': list(self.layers.parameters()),
        'code_converters': (
            list(self.code_embedding.parameters())
            + list(self.code_converter.parameters())
            + list(self.latent_conditioner.parameters())
        ),
        'timestep_integrator': (
            list(self.conditioning_timestep_integrator.parameters())
            + list(self.integrating_conv.parameters())
        ),
        'time_embed': list(self.time_embed.parameters()),
    }
    return groups
def forward(self, x, timesteps, aligned_conditioning=None, conditioning_latent=None,
            precomputed_aligned_embeddings=None, conditioning_free=False, return_code_pred=False):
    """
    Apply the model to an input batch.

    :param x: an [N x C x ...] Tensor of inputs.
    :param timesteps: a 1-D batch of timesteps.
    :param aligned_conditioning: an aligned latent or sequence of tokens providing useful data about the sample to be produced.
    :param conditioning_latent: a pre-computed conditioning latent; see get_conditioning().
    :param precomputed_aligned_embeddings: Embeddings returned from self.timestep_independent()
    :param conditioning_free: When set, all conditioning inputs (including tokens and conditioning_input) will not be considered.
    :param return_code_pred: When set, also return the MEL prediction produced by self.timestep_independent().
                             Cannot be combined with precomputed_aligned_embeddings or conditioning_free,
                             because no prediction is computed on those paths.
    :return: an [N x C x ...] Tensor of outputs, or a tuple (outputs, mel_pred) when return_code_pred is set.
    :raises ValueError: if return_code_pred and conditioning_free are both set.
    """
    assert precomputed_aligned_embeddings is not None or (
            aligned_conditioning is not None and conditioning_latent is not None)
    assert not (
            return_code_pred and precomputed_aligned_embeddings is not None)  # These two are mutually exclusive.
    if return_code_pred and conditioning_free:
        # Without this check the function ran to completion and then failed on an unbound `mel_pred`.
        raise ValueError("return_code_pred cannot be used together with conditioning_free")

    unused_params = []
    if conditioning_free:
        code_emb = self.unconditioned_embedding.repeat(x.shape[0], 1, x.shape[-1])
        unused_params.extend(list(self.code_converter.parameters()) + list(self.code_embedding.parameters()))
        unused_params.extend(list(self.latent_conditioner.parameters()))
    else:
        if precomputed_aligned_embeddings is not None:
            code_emb = precomputed_aligned_embeddings
        else:
            # The code prediction is always requested here (not only when return_code_pred is set).
            code_emb, mel_pred = self.timestep_independent(aligned_conditioning, conditioning_latent, x.shape[-1],
                                                           True)
            # Only one of the two conditioning converters runs, depending on the conditioning dtype.
            if is_latent(aligned_conditioning):
                unused_params.extend(
                    list(self.code_converter.parameters()) + list(self.code_embedding.parameters()))
            else:
                unused_params.extend(list(self.latent_conditioner.parameters()))

        unused_params.append(self.unconditioned_embedding)

    time_emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
    code_emb = self.conditioning_timestep_integrator(code_emb, time_emb)
    x = self.inp_block(x)
    x = torch.cat([x, code_emb], dim=1)
    x = self.integrating_conv(x)
    last_index = len(self.layers) - 1
    for i, lyr in enumerate(self.layers):
        # Do layer drop where applicable. Do not drop first and last layers.
        if self.training and self.layer_drop > 0 and i != 0 and i != last_index \
                and random.random() < self.layer_drop:
            unused_params.extend(list(lyr.parameters()))
        else:
            # Autocast is disabled for the first layer only (i == 0); every other layer follows enable_fp16.
            with autocast(x.device.type, enabled=self.enable_fp16 and i != 0):
                x = lyr(x, time_emb)

    x = x.float()
    out = self.out(x)

    # Involve probabilistic or possibly unused parameters in loss so we don't get DDP errors.
    extraneous_addition = 0
    for p in unused_params:
        extraneous_addition = extraneous_addition + p.mean()
    out = out + extraneous_addition * 0

    if return_code_pred:
        return out, mel_pred
    return out
class SequentialSequence(nn.Module):
    """
    Run a stack of (f, g) layer pairs with residual connections around each of them.

    layers:        an iterable of (f, g) module pairs, applied in order as x = x + f(x); x = x + g(x).
    args_route:    optional mapping from a keyword-argument name to a per-layer sequence of
                   (route_to_f, route_to_g) booleans, consumed by route_args() in forward().
                   Defaults to an empty mapping (no keyword argument is routed anywhere).
    layer_dropout: stored on the instance; not used by forward() in this implementation.
    """

    def __init__(self, layers, args_route=None, layer_dropout=0.):
        super().__init__()
        # A None sentinel replaces the former `{}` default, which was one dict shared by every instance.
        if args_route is None:
            args_route = {}
        assert all(len(route) == len(layers) for route in
                   args_route.values()), 'each argument route map must have the same depth as the number of sequential layers'
        self.layers = layers
        self.args_route = args_route
        self.layer_dropout = layer_dropout

    def forward(self, x, **kwargs):
        """Apply every (f, g) pair to `x`, handing each one only the keyword arguments routed to it."""
        args = route_args(self.args_route, kwargs, len(self.layers))

        for (f, g), (f_args, g_args) in zip(self.layers, args):
            x = x + f(x, **f_args)
            x = x + g(x, **g_args)
        return x
class Attention(nn.Module):
    """
    Multi-head scaled dot-product self-attention with optional key masking and causal masking.

    `seq_len` is stored on the instance but is not read by forward().
    """

    def __init__(self, dim, seq_len, causal=True, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        self.heads = heads
        self.seq_len = seq_len
        self.scale = dim_head ** -0.5
        self.causal = causal

        width = dim_head * heads
        # One bias-free projection produces queries, keys and values together.
        self.to_qkv = nn.Linear(dim, width * 3, bias=False)
        self.to_out = nn.Sequential(
            nn.Linear(width, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, mask=None):
        """
        x:    input tensor with exactly three axes (unpacking fails otherwise); presumably
              (batch, sequence, dim) - TODO confirm against callers.
        mask: optional bool tensor with axes 'b j'; False marks key positions that must not be attended to.
        """
        _batch, _length, _width = x.shape  # also enforces that x has exactly three axes
        n_heads = self.heads

        queries, keys, values = (
            rearrange(part, 'b n (h d) -> b h n d', h=n_heads)
            for part in self.to_qkv(x).chunk(3, dim=-1)
        )
        queries = queries * self.scale

        scores = torch.einsum('b h i d, b h j d -> b h i j', queries, keys)
        # Most negative finite value of the score dtype: masked positions vanish after softmax.
        fill = -torch.finfo(scores.dtype).max

        if mask is not None:
            key_mask = rearrange(mask, 'b j -> b () () j')
            scores.masked_fill_(~key_mask, fill)

        if self.causal:
            rows, cols = scores.shape[-2:]
            future = torch.ones(rows, cols, device=x.device).triu_(cols - rows + 1).bool()
            scores.masked_fill_(future, fill)

        weights = torch.softmax(scores, dim=-1)

        mixed = torch.einsum('b h i j, b h j d -> b h i d', weights, values)
        merged = rearrange(mixed, 'b h n d -> b n (h d)')
        return self.to_out(merged)
class KernelPredictor(torch.nn.Module):
    ''' Kernel predictor for the location-variable convolutions'''

    def __init__(
            self,
            cond_channels,
            conv_in_channels,
            conv_out_channels,
            conv_layers,
            conv_kernel_size=3,
            kpnet_hidden_channels=64,
            kpnet_conv_size=3,
            kpnet_dropout=0.0,
            kpnet_nonlinear_activation="LeakyReLU",
            kpnet_nonlinear_activation_params=None,
    ):
        '''
        Args:
            cond_channels (int): number of channel for the conditioning sequence,
            conv_in_channels (int): number of channel for the input sequence,
            conv_out_channels (int): number of channel for the output sequence,
            conv_layers (int): number of layers
            conv_kernel_size (int): kernel size of each predicted convolution
            kpnet_hidden_channels (int): width of the predictor's hidden convolutions
            kpnet_conv_size (int): kernel size of the predictor's hidden and output convolutions
            kpnet_dropout (float): dropout probability at the start of each residual block
            kpnet_nonlinear_activation (str): name of the activation class looked up on torch.nn
            kpnet_nonlinear_activation_params (dict): keyword arguments for that activation;
                defaults to {"negative_slope": 0.1}
        '''
        super().__init__()

        # Build the default per call: a dict literal in the signature would be shared by every instance.
        if kpnet_nonlinear_activation_params is None:
            kpnet_nonlinear_activation_params = {"negative_slope": 0.1}

        self.conv_in_channels = conv_in_channels
        self.conv_out_channels = conv_out_channels
        self.conv_kernel_size = conv_kernel_size
        self.conv_layers = conv_layers

        kpnet_kernel_channels = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers  # l_w
        kpnet_bias_channels = conv_out_channels * conv_layers  # l_b

        self.input_conv = nn.Sequential(
            nn.utils.weight_norm(nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)),
            getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
        )

        self.residual_convs = nn.ModuleList()
        padding = (kpnet_conv_size - 1) // 2
        for _ in range(3):
            # Layout matters: remove_weight_norm() addresses the two convolutions as block[1] and block[3].
            self.residual_convs.append(
                nn.Sequential(
                    nn.Dropout(kpnet_dropout),
                    nn.utils.weight_norm(
                        nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding,
                                  bias=True)),
                    getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
                    nn.utils.weight_norm(
                        nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding,
                                  bias=True)),
                    getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
                )
            )
        self.kernel_conv = nn.utils.weight_norm(
            nn.Conv1d(kpnet_hidden_channels, kpnet_kernel_channels, kpnet_conv_size, padding=padding, bias=True))
        self.bias_conv = nn.utils.weight_norm(
            nn.Conv1d(kpnet_hidden_channels, kpnet_bias_channels, kpnet_conv_size, padding=padding, bias=True))

    def forward(self, c):
        '''
        Args:
            c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)

        Returns:
            kernels (Tensor): (batch, conv_layers, conv_in_channels, conv_out_channels,
                conv_kernel_size, cond_length)
            bias (Tensor): (batch, conv_layers, conv_out_channels, cond_length)
        '''
        batch, _, cond_length = c.shape
        c = self.input_conv(c)
        for residual_conv in self.residual_convs:
            # Kept from the original: moves the block to the input's device before use.
            residual_conv.to(c.device)
            c = c + residual_conv(c)
        k = self.kernel_conv(c)
        b = self.bias_conv(c)
        kernels = k.contiguous().view(
            batch,
            self.conv_layers,
            self.conv_in_channels,
            self.conv_out_channels,
            self.conv_kernel_size,
            cond_length,
        )
        bias = b.contiguous().view(
            batch,
            self.conv_layers,
            self.conv_out_channels,
            cond_length,
        )

        return kernels, bias

    def remove_weight_norm(self):
        '''Strip the weight-norm reparameterization from every convolution of the predictor.'''
        nn.utils.remove_weight_norm(self.input_conv[0])
        nn.utils.remove_weight_norm(self.kernel_conv)
        nn.utils.remove_weight_norm(self.bias_conv)
        for block in self.residual_convs:
            nn.utils.remove_weight_norm(block[1])
            nn.utils.remove_weight_norm(block[3])
cond_hop_length=256, + kpnet_hidden_channels=64, + kpnet_conv_size=3, + kpnet_dropout=0.0, + ): + super().__init__() + + self.cond_hop_length = cond_hop_length + self.conv_layers = len(dilations) + self.conv_kernel_size = conv_kernel_size + + self.kernel_predictor = KernelPredictor( + cond_channels=cond_channels, + conv_in_channels=in_channels, + conv_out_channels=2 * in_channels, + conv_layers=len(dilations), + conv_kernel_size=conv_kernel_size, + kpnet_hidden_channels=kpnet_hidden_channels, + kpnet_conv_size=kpnet_conv_size, + kpnet_dropout=kpnet_dropout, + kpnet_nonlinear_activation_params={"negative_slope": lReLU_slope} + ) + + self.convt_pre = nn.Sequential( + nn.LeakyReLU(lReLU_slope), + nn.utils.weight_norm(nn.ConvTranspose1d(in_channels, in_channels, 2 * stride, stride=stride, + padding=stride // 2 + stride % 2, output_padding=stride % 2)), + ) + + self.conv_blocks = nn.ModuleList() + for dilation in dilations: + self.conv_blocks.append( + nn.Sequential( + nn.LeakyReLU(lReLU_slope), + nn.utils.weight_norm(nn.Conv1d(in_channels, in_channels, conv_kernel_size, + padding=dilation * (conv_kernel_size - 1) // 2, dilation=dilation)), + nn.LeakyReLU(lReLU_slope), + ) + ) + + def forward(self, x, c): + ''' forward propagation of the location-variable convolutions. 
+ Args: + x (Tensor): the input sequence (batch, in_channels, in_length) + c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) + + Returns: + Tensor: the output sequence (batch, in_channels, in_length) + ''' + _, in_channels, _ = x.shape # (B, c_g, L') + + x = self.convt_pre(x) # (B, c_g, stride * L') + kernels, bias = self.kernel_predictor(c) + + for i, conv in enumerate(self.conv_blocks): + output = conv(x) # (B, c_g, stride * L') + + k = kernels[:, i, :, :, :, :] # (B, 2 * c_g, c_g, kernel_size, cond_length) + b = bias[:, i, :, :] # (B, 2 * c_g, cond_length) + + output = self.location_variable_convolution(output, k, b, + hop_size=self.cond_hop_length) # (B, 2 * c_g, stride * L'): LVC + x = x + torch.sigmoid(output[:, :in_channels, :]) * torch.tanh( + output[:, in_channels:, :]) # (B, c_g, stride * L'): GAU + + return x + + def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=256): + ''' perform location-variable convolution operation on the input sequence (x) using the local convolution kernl. + Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100. + Args: + x (Tensor): the input sequence (batch, in_channels, in_length). + kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length) + bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length) + dilation (int): the dilation of convolution. + hop_size (int): the hop_size of the conditioning sequence. + Returns: + (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length). 
+ ''' + batch, _, in_length = x.shape + batch, _, out_channels, kernel_size, kernel_length = kernel.shape + assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched" + + padding = dilation * int((kernel_size - 1) / 2) + x = F.pad(x, (padding, padding), 'constant', 0) # (batch, in_channels, in_length + 2*padding) + x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding) + + if hop_size < dilation: + x = F.pad(x, (0, dilation), 'constant', 0) + x = x.unfold(3, dilation, + dilation) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation) + x = x[:, :, :, :, :hop_size] + x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation) + x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size) + + o = torch.einsum('bildsk,biokl->bolsd', x, kernel) + o = o.to(memory_format=torch.channels_last_3d) + bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d) + o = o + bias + o = o.contiguous().view(batch, out_channels, -1) + + return o + + def remove_weight_norm(self): + self.kernel_predictor.remove_weight_norm() + nn.utils.remove_weight_norm(self.convt_pre[1]) + for block in self.conv_blocks: + nn.utils.remove_weight_norm(block[1]) + + +class UnivNetGenerator(nn.Module): + """UnivNet Generator""" + + def __init__(self, noise_dim=64, channel_size=16, dilations=[1,3,9,27], strides=[8,8,4], lReLU_slope=.2, kpnet_conv_size=3, + # Below are MEL configurations options that this generator requires. 
+ hop_length=256, n_mel_channels=100): + super(UnivNetGenerator, self).__init__() + self.mel_channel = n_mel_channels + self.noise_dim = noise_dim + self.hop_length = hop_length + channel_size = channel_size + kpnet_conv_size = kpnet_conv_size + + self.res_stack = nn.ModuleList() + hop_length = 1 + for stride in strides: + hop_length = stride * hop_length + self.res_stack.append( + LVCBlock( + channel_size, + n_mel_channels, + stride=stride, + dilations=dilations, + lReLU_slope=lReLU_slope, + cond_hop_length=hop_length, + kpnet_conv_size=kpnet_conv_size + ) + ) + + self.conv_pre = \ + nn.utils.weight_norm(nn.Conv1d(noise_dim, channel_size, 7, padding=3, padding_mode='reflect')) + + self.conv_post = nn.Sequential( + nn.LeakyReLU(lReLU_slope), + nn.utils.weight_norm(nn.Conv1d(channel_size, 1, 7, padding=3, padding_mode='reflect')), + nn.Tanh(), + ) + + def forward(self, c, z): + ''' + Args: + c (Tensor): the conditioning sequence of mel-spectrogram (batch, mel_channels, in_length) + z (Tensor): the noise sequence (batch, noise_dim, in_length) + + ''' + z = self.conv_pre(z) # (B, c_g, L) + + for res_block in self.res_stack: + res_block.to(z.device) + z = res_block(z, c) # (B, c_g, L * s_0 * ... 
* s_i) + + z = self.conv_post(z) # (B, 1, L * 256) + + return z + + def eval(self, inference=False): + super(UnivNetGenerator, self).eval() + # don't remove weight norm while validation in training loop + if inference: + self.remove_weight_norm() + + def remove_weight_norm(self): + nn.utils.remove_weight_norm(self.conv_pre) + + for layer in self.conv_post: + if len(layer.state_dict()) != 0: + nn.utils.remove_weight_norm(layer) + + for res_block in self.res_stack: + res_block.remove_weight_norm() + + def inference(self, c, z=None): + # pad input mel with zeros to cut artifact + # see https://github.com/seungwonpark/melgan/issues/8 + zero = torch.full((c.shape[0], self.mel_channel, 10), -11.5129).to(c.device) + mel = torch.cat((c, zero), dim=2) + + if z is None: + z = torch.randn(c.shape[0], self.noise_dim, mel.size(2)).to(mel.device) + + audio = self.forward(mel, z) + audio = audio[:, :, :-(self.hop_length * 10)] + audio = audio.clamp(min=-1, max=1) + return audio + + +if __name__ == '__main__': + model = UnivNetGenerator() + + c = torch.randn(3, 100, 10) + z = torch.randn(3, 64, 10) + print(c.shape) + + y = model(c, z) + print(y.shape) + assert y.shape == torch.Size([3, 1, 2560]) + + pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print(pytorch_total_params) diff --git a/ruth_tts_transformer/models/xtransformers.py b/ruth_tts_transformer/models/xtransformers.py new file mode 100644 index 0000000000000000000000000000000000000000..8be2df455c46bf8c89efb0d5fdbb704a9fb622f6 --- /dev/null +++ b/ruth_tts_transformer/models/xtransformers.py @@ -0,0 +1,1248 @@ +import math +from collections import namedtuple +from functools import partial +from inspect import isfunction + +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from torch import nn, einsum + +DEFAULT_DIM_HEAD = 64 + +Intermediates = namedtuple('Intermediates', [ + 'pre_softmax_attn', + 'post_softmax_attn' +]) + +LayerIntermediates = 
namedtuple('Intermediates', [ + 'hiddens', + 'attn_intermediates', + 'past_key_values', +]) + + +# helpers + +def exists(val): + return val is not None + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def cast_tuple(val, depth): + return val if isinstance(val, tuple) else (val,) * depth + + +class always(): + def __init__(self, val): + self.val = val + + def __call__(self, *args, **kwargs): + return self.val + + +class not_equals(): + def __init__(self, val): + self.val = val + + def __call__(self, x, *args, **kwargs): + return x != self.val + + +class equals(): + def __init__(self, val): + self.val = val + + def __call__(self, x, *args, **kwargs): + return x == self.val + + +def max_neg_value(tensor): + return -torch.finfo(tensor.dtype).max + + +def l2norm(t): + return F.normalize(t, p=2, dim=-1) + + +# init helpers + +def init_zero_(layer): + nn.init.constant_(layer.weight, 0.) + if exists(layer.bias): + nn.init.constant_(layer.bias, 0.) 
# keyword argument helpers

def pick_and_pop(keys, d):
    """Remove every key in ``keys`` from ``d`` and return them as a new dict.

    ``d`` is mutated in place. A missing key raises ``KeyError`` (from
    ``dict.pop``). ``keys`` is iterated exactly once, so a one-shot iterator
    works as well as a list.
    """
    return {key: d.pop(key) for key in keys}


def group_dict_by_key(cond, d):
    """Partition ``d`` into two dicts according to ``cond(key)``.

    Returns:
        tuple: ``(matching, non_matching)`` where ``matching`` holds the
        entries whose key makes ``cond`` truthy.
    """
    matching, non_matching = {}, {}
    for key, value in d.items():
        target = matching if cond(key) else non_matching
        target[key] = value
    return matching, non_matching


def string_begins_with(prefix, str):
    """Return True when ``str`` starts with ``prefix``.

    The second parameter shadows the builtin ``str``; the name is kept so
    existing keyword callers keep working.
    """
    return str.startswith(prefix)


def group_by_key_prefix(prefix, d):
    """Split ``d`` into (keys starting with ``prefix``, all other keys)."""
    return group_dict_by_key(partial(string_begins_with, prefix), d)


def groupby_prefix_and_trim(prefix, d):
    """Like ``group_by_key_prefix`` but strips ``prefix`` from the matched keys.

    Returns:
        tuple: ``(prefixed_entries_with_prefix_removed, remaining_entries)``.
    """
    kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d)
    kwargs_without_prefix = {key[len(prefix):]: value for key, value in kwargs_with_prefix.items()}
    return kwargs_without_prefix, kwargs


# activations

class ReluSquared(nn.Module):
    """Activation computing ``relu(x) ** 2``."""

    def forward(self, x):
        return F.relu(x) ** 2


# positional embeddings

class AbsolutePositionalEmbedding(nn.Module):
    """Learned absolute positional embedding, scaled by ``dim ** -0.5``."""

    def __init__(self, dim, max_seq_len):
        super().__init__()
        self.scale = dim ** -0.5
        self.emb = nn.Embedding(max_seq_len, dim)

    def forward(self, x):
        """Return embeddings for positions ``0..x.shape[1]-1`` with a leading
        singleton batch axis. Only the shape and device of ``x`` are used."""
        n = torch.arange(x.shape[1], device=x.device)
        pos_emb = self.emb(n)
        pos_emb = rearrange(pos_emb, 'n d -> () n d')
        return pos_emb * self.scale


class FixedPositionalEmbedding(nn.Module):
    """Non-learned sinusoidal positional embedding."""

    def __init__(self, dim):
        super().__init__()
        inv_freq = 1. / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer('inv_freq', inv_freq)

    def forward(self, x, seq_dim=1, offset=0):
        """Return ``cat(sin, cos)`` embeddings for ``x.shape[seq_dim]`` positions.

        ``offset`` is added to every position index before the sinusoids are
        evaluated. The result has a leading singleton batch axis.
        """
        t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) + offset
        sinusoid_inp = torch.einsum('i , j -> i j', t, self.inv_freq)
        emb = torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1)
        return rearrange(emb, 'n d -> () n d')


class RelativePositionBias(nn.Module):
    """Learned, bucketed relative position bias added to attention logits."""

    def __init__(self, scale, causal=False, num_buckets=32, max_distance=128, heads=8):
        super().__init__()
        self.scale = scale
        self.causal = causal
        self.num_buckets = num_buckets
        self.max_distance = max_distance
        self.relative_attention_bias = nn.Embedding(num_buckets, heads)

    @staticmethod
    def _relative_position_bucket(relative_position, causal=True, num_buckets=32, max_distance=128):
        """Map signed relative positions (key index - query index) to bucket ids.

        Distances below ``max_exact`` each get their own bucket; larger ones
        are binned logarithmically and clamped to the last bucket. In the
        non-causal case half of the buckets are reserved for each direction.
        """
        ret = 0
        n = -relative_position
        if not causal:
            num_buckets //= 2
            ret += (n < 0).long() * num_buckets
            n = torch.abs(n)
        else:
            # causal: keys in the future collapse onto distance 0
            n = torch.max(n, torch.zeros_like(n))

        max_exact = num_buckets // 2
        is_small = n < max_exact

        # evaluated for every entry, but only selected where ``is_small`` is False
        val_if_large = max_exact + (
                torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
        ).long()
        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))

        ret += torch.where(is_small, n, val_if_large)
        return ret

    def forward(self, qk_dots):
        """Return ``qk_dots`` plus the per-head bias for its last two axes (i, j)."""
        i, j, device = *qk_dots.shape[-2:], qk_dots.device
        q_pos = torch.arange(i, dtype=torch.long, device=device)
        k_pos = torch.arange(j, dtype=torch.long, device=device)
        rel_pos = k_pos[None, :] - q_pos[:, None]
        rp_bucket = self._relative_position_bucket(rel_pos, causal=self.causal, num_buckets=self.num_buckets,
                                                   max_distance=self.max_distance)
        values = self.relative_attention_bias(rp_bucket)
        bias = rearrange(values, 'i j h -> () h i j')
        return qk_dots + (bias * self.scale)


class AlibiPositionalBias(nn.Module):
    """Fixed ALiBi bias: key index times a per-head slope, added to the logits."""

    def __init__(self, heads, **kwargs):
        # extra keyword arguments are accepted and ignored
        super().__init__()
        self.heads = heads
        # slopes shaped (1, heads, 1, 1) so they broadcast over (b, h, i, j)
        slopes = torch.Tensor(self._get_slopes(heads)).view(1, -1, 1, 1)
        self.register_buffer('slopes', slopes, persistent=False)
        self.register_buffer('bias', None, persistent=False)

    @staticmethod
    def _get_slopes(heads):
        """Return the list of ``heads`` geometric slopes.

        For a non power-of-two head count, the slopes of the closest lower
        power of two are extended with every other slope of the next power
        of two.
        """
        def get_slopes_power_of_2(n):
            start = (2 ** (-2 ** -(math.log2(n) - 3)))
            ratio = start
            return [start * ratio ** i for i in range(n)]

        if math.log2(heads).is_integer():
            return get_slopes_power_of_2(heads)

        closest_power_of_2 = 2 ** math.floor(math.log2(heads))
        return get_slopes_power_of_2(closest_power_of_2) + get_slopes_power_of_2(2 * closest_power_of_2)[0::2][
                                                           :heads - closest_power_of_2]

    def forward(self, qk_dots):
        """Return ``qk_dots`` (..., h, i, j) plus the bias.

        The bias is cached in the non-persistent ``bias`` buffer and reused
        while it is at least ``j`` wide. Heads beyond the number of slopes
        receive a zero bias.
        """
        h, j = qk_dots.shape[-3], qk_dots.shape[-1]
        device = qk_dots.device

        if self.bias is not None and self.bias.shape[-1] >= j:
            return qk_dots + self.bias[..., :j]

        bias = torch.arange(j, device=device).view(1, 1, 1, -1)
        bias = bias * self.slopes

        # zero-pad the head axis for heads that do not use ALiBi
        num_heads_unalibied = h - bias.shape[1]
        bias = F.pad(bias, (0, 0, 0, 0, 0, num_heads_unalibied))

        self.register_buffer('bias', bias, persistent=False)
        return qk_dots + self.bias


class LearnedAlibiPositionalBias(AlibiPositionalBias):
    """ALiBi variant whose per-head slopes are learned in log space.

    With ``bidirectional=True`` a second, independent set of slopes is
    applied to the upper triangle of the relative-position matrix.
    """

    def __init__(self, heads, bidirectional=False):
        super().__init__(heads)
        log_slopes = torch.log(self.slopes)
        self.learned_logslopes = nn.Parameter(log_slopes)

        self.bidirectional = bidirectional
        if self.bidirectional:
            # nn.Parameter wraps the storage of the tensor it is given; clone so
            # the two parameters do not alias each other under in-place updates.
            self.learned_logslopes_future = nn.Parameter(log_slopes.clone())

    def forward(self, qk_dots):
        """Return ``qk_dots`` (..., h, i, j) plus ``(key_idx - query_idx) * slope``."""
        h, i, j = qk_dots.shape[-3:]
        device = qk_dots.device

        def get_slopes(param):
            # exponentiate, then zero-pad the head axis up to h heads
            return F.pad(param.exp(), (0, 0, 0, 0, 0, h - param.shape[1]))

        cached = self.bias
        # the cache must cover BOTH axes; checking only j would hand back a
        # slice with too few query rows when i grows between calls
        if cached is not None and cached.shape[-2] >= i and cached.shape[-1] >= j:
            bias = cached[..., :i, :j]
        else:
            i_arange = torch.arange(i, device=device)
            j_arange = torch.arange(j, device=device)
            bias = j_arange.view(1, 1, 1, -1) - i_arange.view(1, 1, -1, 1)
            self.register_buffer('bias', bias, persistent=False)

        if self.bidirectional:
            past_slopes = get_slopes(self.learned_logslopes)
            future_slopes = get_slopes(self.learned_logslopes_future)
            bias = torch.tril(bias * past_slopes) + torch.triu(bias * future_slopes)
        else:
            slopes = get_slopes(self.learned_logslopes)
            bias = bias * slopes

        return qk_dots + bias


class RotaryEmbedding(nn.Module):
    """Produces the angle table consumed by ``apply_rotary_pos_emb``."""

    def __init__(self, dim):
        super().__init__()
        inv_freq = 1. / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer('inv_freq', inv_freq)

    def forward(self, max_seq_len, device):
        """Return angles for positions ``0..max_seq_len-1`` with two leading
        singleton axes; the frequency vector is repeated twice on the last axis."""
        t = torch.arange(max_seq_len, device=device).type_as(self.inv_freq)
        freqs = torch.einsum('i , j -> i j', t, self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        return rearrange(emb, 'n d -> () () n d')


def rotate_half(x):
    """Split the last axis into halves ``(x1, x2)`` and return ``cat(-x2, x1)``."""
    x = rearrange(x, '... (j d) -> ... j d', j=2)
    x1, x2 = x.unbind(dim=-2)
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(t, freqs):
    """Rotate ``t`` by the last ``t.shape[-2]`` positions of ``freqs``."""
    seq_len = t.shape[-2]
    freqs = freqs[:, :, -seq_len:]
    return (t * freqs.cos()) + (rotate_half(t) * freqs.sin())


# norms

class Scale(nn.Module):
    """Wrap ``fn`` and multiply its (first) output by a constant ``value``."""

    def __init__(self, value, fn):
        super().__init__()
        self.value = value
        self.fn = fn

    def forward(self, x, **kwargs):
        out = self.fn(x, **kwargs)

        if not isinstance(out, tuple):
            return out * self.value

        # only the first element of a tuple result is scaled
        return (out[0] * self.value, *out[1:])


class Rezero(nn.Module):
    """Wrap ``fn`` and multiply its (first) output by a learned scalar
    that is initialised to zero."""

    def __init__(self, fn):
        super().__init__()
        self.fn = fn
        self.g = nn.Parameter(torch.zeros(1))

    def forward(self, x, **kwargs):
        out = self.fn(x, **kwargs)

        if not isinstance(out, tuple):
            return out * self.g

        # only the first element of a tuple result is gated
        return (out[0] * self.g, *out[1:])


class ScaleNorm(nn.Module):
    """Divide by the scaled L2 norm of the last axis, times one learned scalar gain."""

    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.scale = dim ** -0.5
        self.eps = eps
        self.g = nn.Parameter(torch.ones(1))

    def forward(self, x):
        norm = torch.norm(x, dim=-1, keepdim=True) * self.scale
        return x / norm.clamp(min=self.eps) * self.g


class RMSNorm(nn.Module):
    """Divide by the scaled L2 norm of the last axis, times a learned per-feature gain."""

    def __init__(self, dim, eps=1e-8):
        super().__init__()
        self.scale = dim ** -0.5
        self.eps = eps
        self.g = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        norm = torch.norm(x, dim=-1, keepdim=True) * self.scale
        return x / norm.clamp(min=self.eps) * self.g


class RMSScaleShiftNorm(nn.Module):
    """RMS-style norm followed by a conditioning-dependent scale and shift."""

    def __init__(self, dim, eps=1e-8):
        super().__init__()
        self.scale = dim ** -0.5
        self.eps = eps
        self.g = nn.Parameter(torch.ones(dim))
        self.scale_shift_process = nn.Linear(dim * 2, dim * 2)

    def forward(self, x, norm_scale_shift_inp):
        """Normalise ``x``, then apply ``norm * (1 + scale) + shift``.

        ``scale`` and ``shift`` are the two halves (split on axis 1) of a
        linear projection of ``norm_scale_shift_inp``.
        """
        norm = torch.norm(x, dim=-1, keepdim=True) * self.scale
        norm = x / norm.clamp(min=self.eps) * self.g

        # NOTE(review): chunking on dim=1 presumes norm_scale_shift_inp is
        # (batch, 2 * dim) -- confirm against callers.
        ss_emb = self.scale_shift_process(norm_scale_shift_inp)
        scale, shift = torch.chunk(ss_emb, 2, dim=1)
        h = norm * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
        return h


# residual and residual gates

class Residual(nn.Module):
    """Plain residual add, optionally with a learned per-feature scale on the residual."""

    def __init__(self, dim, scale_residual=False):
        super().__init__()
        self.residual_scale = nn.Parameter(torch.ones(dim)) if scale_residual else None

    def forward(self, x, residual):
        if exists(self.residual_scale):
            residual = residual * self.residual_scale

        return x + residual


class GRUGating(nn.Module):
    """Combine branch output and residual through a GRU cell instead of an add."""

    def __init__(self, dim, scale_residual=False):
        super().__init__()
        self.gru = nn.GRUCell(dim, dim)
        self.residual_scale = nn.Parameter(torch.ones(dim)) if scale_residual else None

    def forward(self, x, residual):
        if exists(self.residual_scale):
            residual = residual * self.residual_scale

        # GRUCell takes 2-D input: fold batch and sequence together; x is the
        # cell input and the residual is the hidden state
        gated_output = self.gru(
            rearrange(x, 'b n d -> (b n) d'),
            rearrange(residual, 'b n d -> (b n) d')
        )

        return gated_output.reshape_as(x)


# token shifting

def shift(t, amount, mask=None):
    """Shift ``t`` by ``amount`` steps along its second-to-last axis.

    Vacated positions are zero filled and the same number of steps is cropped
    from the opposite end, so the shape is preserved. When ``mask`` is given,
    positions where it is False are zeroed before shifting.
    """
    if amount == 0:
        return t

    if exists(mask):
        t = t.masked_fill(~mask[..., None], 0.)

    return F.pad(t, (0, 0, amount, -amount), value=0.)


class ShiftTokens(nn.Module):
    """Shift equal-width feature segments of the input by different amounts
    before calling ``fn``."""

    def __init__(self, shifts, fn):
        super().__init__()
        self.fn = fn
        self.shifts = tuple(shifts)

    def forward(self, x, **kwargs):
        mask = kwargs.get('mask', None)
        shifts = self.shifts
        segments = len(shifts)
        feats_per_shift = x.shape[-1] // segments
        splitted = x.split(feats_per_shift, dim=-1)
        # when the feature size is not divisible, trailing chunks pass through unshifted
        segments_to_shift, rest = splitted[:segments], splitted[segments:]
        shifted = [shift(segment, amount, mask=mask) for segment, amount in zip(segments_to_shift, shifts)]
        x = torch.cat((*shifted, *rest), dim=-1)
        return self.fn(x, **kwargs)


# feedforward

class GLU(nn.Module):
    """Gated linear unit: project to ``2 * dim_out``, gate one half with ``activation``."""

    def __init__(self, dim_in, dim_out, activation):
        super().__init__()
        self.act = activation
        self.proj = nn.Linear(dim_in, dim_out * 2)

    def forward(self, x):
        x, gate = self.proj(x).chunk(2, dim=-1)
        return x * self.act(gate)


class FeedForward(nn.Module):
    """Position-wise feed-forward block.

    Args:
        dim: input feature size.
        dim_out: output feature size; defaults to ``dim``.
        mult: hidden size multiplier (``inner_dim = int(dim * mult)``).
        glu: use a ``GLU`` input projection instead of Linear + activation.
        relu_squared: use ``ReluSquared`` instead of GELU.
        post_act_ln: apply LayerNorm after the activation.
        dropout: dropout probability before the output projection.
        zero_init_output: initialise the output projection to zero.
    """

    def __init__(
            self,
            dim,
            dim_out=None,
            mult=4,
            glu=False,
            relu_squared=False,
            post_act_ln=False,
            dropout=0.,
            zero_init_output=False
    ):
        super().__init__()
        inner_dim = int(dim * mult)
        dim_out = default(dim_out, dim)
        activation = ReluSquared() if relu_squared else nn.GELU()

        project_in = nn.Sequential(
            nn.Linear(dim, inner_dim),
            activation
        ) if not glu else GLU(dim, inner_dim, activation)

        self.net = nn.Sequential(
            project_in,
            nn.LayerNorm(inner_dim) if post_act_ln else nn.Identity(),
            nn.Dropout(dropout),
            nn.Linear(inner_dim, dim_out)
        )

        # init last linear layer to 0
        if zero_init_output:
            init_zero_(self.net[-1])

    def forward(self, x):
        return self.net(x)


# attention.
+ +class Attention(nn.Module): + def __init__( + self, + dim, + dim_head=DEFAULT_DIM_HEAD, + heads=8, + causal=False, + talking_heads=False, + head_scale=False, + collab_heads=False, + collab_compression=.3, + sparse_topk=None, + use_entmax15=False, + num_mem_kv=0, + dropout=0., + on_attn=False, + gate_values=False, + zero_init_output=False, + max_attend_past=None, + qk_norm=False, + scale_init_value=None, + rel_pos_bias=False, + rel_pos_num_buckets=32, + rel_pos_max_distance=128, + ): + super().__init__() + self.scale = dim_head ** -0.5 + + self.heads = heads + self.causal = causal + self.max_attend_past = max_attend_past + + qk_dim = v_dim = dim_head * heads + + # collaborative heads + self.collab_heads = collab_heads + if self.collab_heads: + qk_dim = int(collab_compression * qk_dim) + self.collab_mixing = nn.Parameter(torch.randn(heads, qk_dim)) + + self.to_q = nn.Linear(dim, qk_dim, bias=False) + self.to_k = nn.Linear(dim, qk_dim, bias=False) + self.to_v = nn.Linear(dim, v_dim, bias=False) + + self.dropout = nn.Dropout(dropout) + + # add GLU gating for aggregated values, from alphafold2 + self.to_v_gate = None + if gate_values: + self.to_v_gate = nn.Linear(dim, v_dim) + nn.init.constant_(self.to_v_gate.weight, 0) + nn.init.constant_(self.to_v_gate.bias, 1) + + # cosine sim attention + self.qk_norm = qk_norm + if qk_norm: + scale_init_value = default(scale_init_value, + -3) # if not provided, initialize as though it were sequence length of 1024 + self.scale = nn.Parameter(torch.ones(1, heads, 1, 1) * scale_init_value) + + # talking heads + self.talking_heads = talking_heads + if talking_heads: + self.pre_softmax_proj = nn.Parameter(torch.randn(heads, heads)) + self.post_softmax_proj = nn.Parameter(torch.randn(heads, heads)) + + # head scaling + self.head_scale = head_scale + if head_scale: + self.head_scale_params = nn.Parameter(torch.ones(1, heads, 1, 1)) + + # explicit topk sparse attention + self.sparse_topk = sparse_topk + + # entmax + self.attn_fn = 
F.softmax + + # add memory key / values + self.num_mem_kv = num_mem_kv + if num_mem_kv > 0: + self.mem_k = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head)) + self.mem_v = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head)) + + # attention on attention + self.attn_on_attn = on_attn + self.to_out = nn.Sequential(nn.Linear(v_dim, dim * 2), nn.GLU()) if on_attn else nn.Linear(v_dim, dim) + + self.rel_pos_bias = rel_pos_bias + if rel_pos_bias: + assert rel_pos_num_buckets <= rel_pos_max_distance, 'number of relative position buckets must be less than the relative position max distance' + self.rel_pos = RelativePositionBias(scale=dim_head ** 0.5, causal=causal, heads=heads, + num_buckets=rel_pos_num_buckets, max_distance=rel_pos_max_distance) + + # init output projection 0 + if zero_init_output: + init_zero_(self.to_out) + + def forward( + self, + x, + context=None, + mask=None, + context_mask=None, + attn_mask=None, + sinusoidal_emb=None, + rotary_pos_emb=None, + prev_attn=None, + mem=None, + layer_past=None, + ): + b, n, _, h, talking_heads, collab_heads, head_scale, scale, device, has_context = *x.shape, self.heads, self.talking_heads, self.collab_heads, self.head_scale, self.scale, x.device, exists( + context) + kv_input = default(context, x) + + q_input = x + k_input = kv_input + v_input = kv_input + + if exists(mem): + k_input = torch.cat((mem, k_input), dim=-2) + v_input = torch.cat((mem, v_input), dim=-2) + + if exists(sinusoidal_emb): + # in shortformer, the query would start at a position offset depending on the past cached memory + offset = k_input.shape[-2] - q_input.shape[-2] + q_input = q_input + sinusoidal_emb(q_input, offset=offset) + k_input = k_input + sinusoidal_emb(k_input) + + q = self.to_q(q_input) + k = self.to_k(k_input) + v = self.to_v(v_input) + + if not collab_heads: + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v)) + else: + q = einsum('b i d, h d -> b h i d', q, self.collab_mixing) + k = rearrange(k, 'b 
n d -> b () n d') + v = rearrange(v, 'b n (h d) -> b h n d', h=h) + + if layer_past is not None: + past_key, past_value = layer_past + k = torch.cat([past_key, k], dim=-2) + v = torch.cat([past_value, v], dim=-2) + k_cache = k + v_cache = v + + if exists(rotary_pos_emb) and not has_context: + l = rotary_pos_emb.shape[-1] + (ql, qr), (kl, kr), (vl, vr) = map(lambda t: (t[..., :l], t[..., l:]), (q, k, v)) + ql, kl, vl = map(lambda t: apply_rotary_pos_emb(t, rotary_pos_emb), (ql, kl, vl)) + q, k, v = map(lambda t: torch.cat(t, dim=-1), ((ql, qr), (kl, kr), (vl, vr))) + + input_mask = None + if any(map(exists, (mask, context_mask))): + q_mask = default(mask, lambda: torch.ones((b, n), device=device).bool()) + k_mask = q_mask if not exists(context) else context_mask + k_mask = default(k_mask, lambda: torch.ones((b, k.shape[-2]), device=device).bool()) + q_mask = rearrange(q_mask, 'b i -> b () i ()') + k_mask = rearrange(k_mask, 'b j -> b () () j') + input_mask = q_mask * k_mask + + if self.num_mem_kv > 0: + mem_k, mem_v = map(lambda t: repeat(t, 'h n d -> b h n d', b=b), (self.mem_k, self.mem_v)) + k = torch.cat((mem_k, k), dim=-2) + v = torch.cat((mem_v, v), dim=-2) + if exists(input_mask): + input_mask = F.pad(input_mask, (self.num_mem_kv, 0), value=True) + + if collab_heads: + k = k.expand(-1, h, -1, -1) + + if self.qk_norm: + q, k = map(l2norm, (q, k)) + scale = 1 / (self.scale.exp().clamp(min=1e-2)) + + dots = einsum('b h i d, b h j d -> b h i j', q, k) * scale + mask_value = max_neg_value(dots) + + if exists(prev_attn): + dots = dots + prev_attn + + pre_softmax_attn = dots.clone() + + if talking_heads: + dots = einsum('b h i j, h k -> b k i j', dots, self.pre_softmax_proj).contiguous() + + if self.rel_pos_bias: + dots = self.rel_pos(dots) + + if exists(input_mask): + dots.masked_fill_(~input_mask, mask_value) + del input_mask + + if exists(attn_mask): + assert 2 <= attn_mask.ndim <= 4, 'attention mask must have greater than 2 dimensions but less than or equal to 
4' + if attn_mask.ndim == 2: + attn_mask = rearrange(attn_mask, 'i j -> () () i j') + elif attn_mask.ndim == 3: + attn_mask = rearrange(attn_mask, 'h i j -> () h i j') + dots.masked_fill_(~attn_mask, mask_value) + + if exists(self.max_attend_past): + i, j = dots.shape[-2:] + range_q = torch.arange(j - i, j, device=device) + range_k = torch.arange(j, device=device) + dist = rearrange(range_q, 'i -> () () i ()') - rearrange(range_k, 'j -> () () () j') + mask = dist > self.max_attend_past + dots.masked_fill_(mask, mask_value) + del mask + + if self.causal: + i, j = dots.shape[-2:] + r = torch.arange(i, device=device) + mask = rearrange(r, 'i -> () () i ()') < rearrange(r, 'j -> () () () j') + mask = F.pad(mask, (j - i, 0), value=False) + dots.masked_fill_(mask, mask_value) + del mask + + if exists(self.sparse_topk) and self.sparse_topk < dots.shape[-1]: + top, _ = dots.topk(self.sparse_topk, dim=-1) + vk = top[..., -1].unsqueeze(-1).expand_as(dots) + mask = dots < vk + dots.masked_fill_(mask, mask_value) + del mask + + attn = self.attn_fn(dots, dim=-1) + post_softmax_attn = attn.clone() + + attn = self.dropout(attn) + + if talking_heads: + attn = einsum('b h i j, h k -> b k i j', attn, self.post_softmax_proj).contiguous() + + out = einsum('b h i j, b h j d -> b h i d', attn, v) + + if head_scale: + out = out * self.head_scale_params + + out = rearrange(out, 'b h n d -> b n (h d)') + + if exists(self.to_v_gate): + gates = self.to_v_gate(x) + out = out * gates.sigmoid() + + intermediates = Intermediates( + pre_softmax_attn=pre_softmax_attn, + post_softmax_attn=post_softmax_attn + ) + + return self.to_out(out), intermediates, k_cache, v_cache + + +class AttentionLayers(nn.Module): + def __init__( + self, + dim, + depth, + heads=8, + causal=False, + cross_attend=False, + only_cross=False, + use_scalenorm=False, + use_rms_scaleshift_norm=False, + use_rmsnorm=False, + use_rezero=False, + alibi_pos_bias=False, + alibi_num_heads=None, + alibi_learned=False, + 
position_infused_attn=False, + rotary_pos_emb=False, + rotary_emb_dim=None, + custom_layers=None, + sandwich_coef=None, + par_ratio=None, + residual_attn=False, + cross_residual_attn=False, + macaron=False, + pre_norm=True, + gate_residual=False, + scale_residual=False, + shift_tokens=0, + sandwich_norm=False, + use_qk_norm_attn=False, + qk_norm_attn_seq_len=None, + zero_init_branch_output=False, + **kwargs + ): + super().__init__() + ff_kwargs, kwargs = groupby_prefix_and_trim('ff_', kwargs) + attn_kwargs, _ = groupby_prefix_and_trim('attn_', kwargs) + + dim_head = attn_kwargs.get('dim_head', DEFAULT_DIM_HEAD) + + self.dim = dim + self.depth = depth + self.layers = nn.ModuleList([]) + self.causal = causal + + rel_pos_bias = 'rel_pos_bias' in attn_kwargs + self.has_pos_emb = position_infused_attn or rel_pos_bias or rotary_pos_emb + self.pia_pos_emb = FixedPositionalEmbedding(dim) if position_infused_attn else None + + rotary_emb_dim = max(default(rotary_emb_dim, dim_head // 2), 32) + self.rotary_pos_emb = RotaryEmbedding(rotary_emb_dim) if rotary_pos_emb else None + + assert not ( + alibi_pos_bias and rel_pos_bias), 'you can only choose Alibi positional bias or T5 relative positional bias, not both' + + if alibi_pos_bias: + alibi_num_heads = default(alibi_num_heads, heads) + assert alibi_num_heads <= heads, 'number of ALiBi heads must be less than the total number of heads' + alibi_pos_klass = LearnedAlibiPositionalBias if alibi_learned or not causal else AlibiPositionalBias + self.rel_pos = alibi_pos_klass(heads=alibi_num_heads, bidirectional=not causal) + else: + self.rel_pos = None + + assert not (not pre_norm and sandwich_norm), 'sandwich norm cannot be used when not using prenorm' + self.pre_norm = pre_norm + self.sandwich_norm = sandwich_norm + + self.residual_attn = residual_attn + self.cross_residual_attn = cross_residual_attn + self.cross_attend = cross_attend + + norm_class = ScaleNorm if use_scalenorm else nn.LayerNorm + norm_class = RMSNorm if 
use_rmsnorm else norm_class + norm_class = RMSScaleShiftNorm if use_rms_scaleshift_norm else norm_class + norm_fn = partial(norm_class, dim) + + norm_fn = nn.Identity if use_rezero else norm_fn + branch_fn = Rezero if use_rezero else None + + if cross_attend and not only_cross: + default_block = ('a', 'c', 'f') + elif cross_attend and only_cross: + default_block = ('c', 'f') + else: + default_block = ('a', 'f') + + if macaron: + default_block = ('f',) + default_block + + # qk normalization + + if use_qk_norm_attn: + attn_scale_init_value = -math.log(math.log2(qk_norm_attn_seq_len ** 2 - qk_norm_attn_seq_len)) if exists( + qk_norm_attn_seq_len) else None + attn_kwargs = {**attn_kwargs, 'qk_norm': True, 'scale_init_value': attn_scale_init_value} + + # zero init + + if zero_init_branch_output: + attn_kwargs = {**attn_kwargs, 'zero_init_output': True} + ff_kwargs = {**ff_kwargs, 'zero_init_output': True} + + # calculate layer block order + + if exists(custom_layers): + layer_types = custom_layers + elif exists(par_ratio): + par_depth = depth * len(default_block) + assert 1 < par_ratio <= par_depth, 'par ratio out of range' + default_block = tuple(filter(not_equals('f'), default_block)) + par_attn = par_depth // par_ratio + depth_cut = par_depth * 2 // 3 # 2 / 3 attention layer cutoff suggested by PAR paper + par_width = (depth_cut + depth_cut // par_attn) // par_attn + assert len(default_block) <= par_width, 'default block is too large for par_ratio' + par_block = default_block + ('f',) * (par_width - len(default_block)) + par_head = par_block * par_attn + layer_types = par_head + ('f',) * (par_depth - len(par_head)) + elif exists(sandwich_coef): + assert sandwich_coef > 0 and sandwich_coef <= depth, 'sandwich coefficient should be less than the depth' + layer_types = ('a',) * sandwich_coef + default_block * (depth - sandwich_coef) + ('f',) * sandwich_coef + else: + layer_types = default_block * depth + + self.layer_types = layer_types + self.num_attn_layers = 
len(list(filter(equals('a'), layer_types))) + + # calculate token shifting + + shift_tokens = cast_tuple(shift_tokens, len(layer_types)) + + # iterate and construct layers + + for ind, (layer_type, layer_shift_tokens) in enumerate(zip(self.layer_types, shift_tokens)): + is_last_layer = ind == (len(self.layer_types) - 1) + + if layer_type == 'a': + layer = Attention(dim, heads=heads, causal=causal, **attn_kwargs) + elif layer_type == 'c': + layer = Attention(dim, heads=heads, **attn_kwargs) + elif layer_type == 'f': + layer = FeedForward(dim, **ff_kwargs) + layer = layer if not macaron else Scale(0.5, layer) + else: + raise Exception(f'invalid layer type {layer_type}') + + if layer_shift_tokens > 0: + shift_range_upper = layer_shift_tokens + 1 + shift_range_lower = -layer_shift_tokens if not causal else 0 + layer = ShiftTokens(range(shift_range_lower, shift_range_upper), layer) + + if exists(branch_fn): + layer = branch_fn(layer) + + residual_fn = GRUGating if gate_residual else Residual + residual = residual_fn(dim, scale_residual=scale_residual) + + layer_uses_qk_norm = use_qk_norm_attn and layer_type in ('a', 'c') + + pre_branch_norm = norm_fn() if pre_norm and not layer_uses_qk_norm else None + post_branch_norm = norm_fn() if sandwich_norm or layer_uses_qk_norm else None + post_main_norm = norm_fn() if not pre_norm and not is_last_layer else None + + norms = nn.ModuleList([ + pre_branch_norm, + post_branch_norm, + post_main_norm + ]) + + self.layers.append(nn.ModuleList([ + norms, + layer, + residual + ])) + + def forward( + self, + x, + context=None, + full_context=None, # for passing a list of hidden states from an encoder + mask=None, + context_mask=None, + attn_mask=None, + mems=None, + return_hiddens=False, + norm_scale_shift_inp=None, + past_key_values=None, + expected_seq_len=None, + ): + + assert not (self.cross_attend ^ (exists(context) or exists( + full_context))), 'context must be passed in if cross_attend is set to True' + assert context is None or 
full_context is None, 'only one of full_context or context can be provided' + + hiddens = [] + intermediates = [] + prev_attn = None + prev_cross_attn = None + + mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers + norm_args = {} + if exists(norm_scale_shift_inp): + norm_args['norm_scale_shift_inp'] = norm_scale_shift_inp + + rotary_pos_emb = None + if exists(self.rotary_pos_emb): + if not self.training and self.causal: + assert expected_seq_len is not None, "To decode a transformer with rotary embeddings, you must specify an `expected_seq_len`" + elif expected_seq_len is None: + expected_seq_len = 0 + seq_len = x.shape[1] + if past_key_values is not None: + seq_len += past_key_values[0][0].shape[-2] + max_rotary_emb_length = max(list(map(lambda m: (m.shape[1] if exists(m) else 0) + seq_len, mems)) + [expected_seq_len]) + rotary_pos_emb = self.rotary_pos_emb(max_rotary_emb_length, x.device) + + present_key_values = [] + cross_attn_count = 0 + for ind, (layer_type, (norm, block, residual_fn)) in enumerate(zip(self.layer_types, self.layers)): + if layer_type == 'a': + layer_mem = mems.pop(0) if mems else None + + residual = x + + pre_branch_norm, post_branch_norm, post_main_norm = norm + + if exists(pre_branch_norm): + x = pre_branch_norm(x, **norm_args) + + if layer_type == 'a' or layer_type == 'c': + if past_key_values is not None: + layer_kv = past_key_values.pop(0) + layer_past = tuple(s.to(x.device) for s in layer_kv) + else: + layer_past = None + + if layer_type == 'a': + out, inter, k, v = block(x, None, mask, None, attn_mask, self.pia_pos_emb, rotary_pos_emb, + prev_attn, layer_mem, layer_past) + elif layer_type == 'c': + if exists(full_context): + out, inter, k, v = block(x, full_context[cross_attn_count], mask, context_mask, None, None, + None, prev_attn, None, layer_past) + else: + out, inter, k, v = block(x, context, mask, context_mask, None, None, None, prev_attn, None, layer_past) + elif layer_type == 'f': + out = block(x) + + if 
layer_type == 'a' or layer_type == 'c' and present_key_values is not None: + present_key_values.append((k.detach(), v.detach())) + + if exists(post_branch_norm): + out = post_branch_norm(out, **norm_args) + + x = residual_fn(out, residual) + + if layer_type in ('a', 'c'): + intermediates.append(inter) + + if layer_type == 'a' and self.residual_attn: + prev_attn = inter.pre_softmax_attn + elif layer_type == 'c' and self.cross_residual_attn: + prev_cross_attn = inter.pre_softmax_attn + + if exists(post_main_norm): + x = post_main_norm(x, **norm_args) + + if layer_type == 'c': + cross_attn_count += 1 + + if layer_type == 'f': + hiddens.append(x) + + if return_hiddens: + intermediates = LayerIntermediates( + hiddens=hiddens, + attn_intermediates=intermediates, + past_key_values=present_key_values + ) + + return x, intermediates + + return x + + +class Encoder(AttentionLayers): + def __init__(self, **kwargs): + assert 'causal' not in kwargs, 'cannot set causality on encoder' + super().__init__(causal=False, **kwargs) + + +class Decoder(AttentionLayers): + def __init__(self, **kwargs): + assert 'causal' not in kwargs, 'cannot set causality on decoder' + super().__init__(causal=True, **kwargs) + + +class CrossAttender(AttentionLayers): + def __init__(self, **kwargs): + super().__init__(cross_attend=True, only_cross=True, **kwargs) + + +class ViTransformerWrapper(nn.Module): + def __init__( + self, + *, + image_size, + patch_size, + attn_layers, + num_classes=None, + dropout=0., + emb_dropout=0. 
class TransformerWrapper(nn.Module):
    """Token-level wrapper around an ``AttentionLayers`` stack.

    Embeds integer tokens, adds positional embeddings, optionally prepends
    learned memory tokens, runs the attention stack, applies a final
    ``LayerNorm`` and projects to logits over ``num_tokens``.
    """

    def __init__(
            self,
            *,
            num_tokens,
            max_seq_len,
            attn_layers,
            emb_dim=None,
            max_mem_len=0.,
            shift_mem_down=0,
            emb_dropout=0.,
            num_memory_tokens=None,
            tie_embedding=False,
            use_pos_emb=True
    ):
        """Build the embedding, projection and output layers.

        :param num_tokens: vocabulary size for the embedding and logits.
        :param max_seq_len: stored, and passed to the positional embedding.
        :param attn_layers: an ``AttentionLayers`` instance; its ``dim`` sets
            the model width.
        :param emb_dim: embedding width; defaults to ``attn_layers.dim``. A
            linear projection is inserted when it differs from the model width.
        :param max_mem_len: stored on the instance; not otherwise used in this class.
        :param shift_mem_down: number of leading ``mems`` entries rotated to
            the end in ``forward``.
        :param emb_dropout: dropout probability applied after embedding.
        :param num_memory_tokens: number of learned memory tokens to prepend
            (``None`` is treated as 0).
        :param tie_embedding: if True, logits are computed against the
            transposed token-embedding weight instead of a separate ``Linear``.
        :param use_pos_emb: if False, or if ``attn_layers.has_pos_emb`` is
            set, ``always(0)`` is used in place of the absolute positional embedding.
        """
        super().__init__()
        assert isinstance(attn_layers, AttentionLayers), 'attention layers must be one of Encoder or Decoder'

        dim = attn_layers.dim
        emb_dim = default(emb_dim, dim)

        self.max_seq_len = max_seq_len
        self.max_mem_len = max_mem_len
        self.shift_mem_down = shift_mem_down

        self.token_emb = nn.Embedding(num_tokens, emb_dim)
        # presumably always(0) returns a callable yielding 0, making the add in forward a no-op; verify
        self.pos_emb = AbsolutePositionalEmbedding(emb_dim, max_seq_len) if (
                use_pos_emb and not attn_layers.has_pos_emb) else always(0)
        self.emb_dropout = nn.Dropout(emb_dropout)

        self.project_emb = nn.Linear(emb_dim, dim) if emb_dim != dim else nn.Identity()
        self.attn_layers = attn_layers
        self.norm = nn.LayerNorm(dim)

        self.init_()

        # With tied embeddings the output "layer" is a plain lambda, not a registered module.
        self.to_logits = nn.Linear(dim, num_tokens) if not tie_embedding else lambda t: t @ self.token_emb.weight.t()

        # memory tokens (like [cls]) from Memory Transformers paper
        num_memory_tokens = default(num_memory_tokens, 0)
        self.num_memory_tokens = num_memory_tokens
        if num_memory_tokens > 0:
            self.memory_tokens = nn.Parameter(torch.randn(num_memory_tokens, dim))

    def init_(self):
        """Re-initialise the token embedding weight with Kaiming-normal values."""
        nn.init.kaiming_normal_(self.token_emb.weight)

    def forward(
            self,
            x,
            return_embeddings=False,
            mask=None,
            return_hiddens=False,
            return_attn=False,
            mems=None,
            use_cache=False,
            **kwargs
    ):
        """Embed tokens, run the attention stack and produce logits.

        :param x: 2-D tensor of token ids (its shape is unpacked as ``b, n``).
        :param return_embeddings: return the normalised hidden states instead of logits.
        :param mask: optional mask; left-padded with ``True`` for the memory tokens.
        :param return_hiddens: return ``(out, hiddens)`` immediately; in that
            case ``return_attn`` and ``use_cache`` are ignored.
        :param return_attn: also return the post-softmax attention maps.
        :param mems: optional list of memories forwarded to the attention stack.
        :param use_cache: also return the stack's ``past_key_values``.
        :param kwargs: forwarded verbatim to ``self.attn_layers``.
        :return: ``out`` alone, or a tuple starting with ``out`` when extra
            outputs were requested.
        """
        b, n, device, num_mem = *x.shape, x.device, self.num_memory_tokens
        x = self.token_emb(x)
        x = x + self.pos_emb(x)
        x = self.emb_dropout(x)

        x = self.project_emb(x)

        if num_mem > 0:
            mem = repeat(self.memory_tokens, 'n d -> b n d', b=b)
            x = torch.cat((mem, x), dim=1)

            # auto-handle masking after appending memory tokens
            if exists(mask):
                mask = F.pad(mask, (num_mem, 0), value=True)

        if self.shift_mem_down and exists(mems):
            # Rotate the first `shift_mem_down` memories to the end of the list.
            mems_l, mems_r = mems[:self.shift_mem_down], mems[self.shift_mem_down:]
            mems = [*mems_r, *mems_l]

        x, intermediates = self.attn_layers(x, mask=mask, mems=mems, return_hiddens=True, **kwargs)
        x = self.norm(x)

        # Strip the memory-token positions that were prepended above.
        mem, x = x[:, :num_mem], x[:, num_mem:]

        out = self.to_logits(x) if not return_embeddings else x

        if return_hiddens:
            hiddens = intermediates.hiddens
            return out, hiddens

        res = [out]
        if return_attn:
            attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates))
            res.append(attn_maps)
        if use_cache:
            res.append(intermediates.past_key_values)

        if len(res) > 1:
            return tuple(res)
        return res[0]
+ def __init__( + self, + *, + max_seq_len, + attn_layers, + dim_in=None, + dim_out=None, + emb_dim=None, + emb_dropout=0., + use_pos_emb=True + ): + super().__init__() + assert isinstance(attn_layers, AttentionLayers), 'attention layers must be one of Encoder or Decoder' + + dim = attn_layers.dim + + self.max_seq_len = max_seq_len + + self.pos_emb = AbsolutePositionalEmbedding(dim, max_seq_len) if ( + use_pos_emb and not attn_layers.has_pos_emb) else always(0) + self.emb_dropout = nn.Dropout(emb_dropout) + + self.project_in = nn.Linear(dim_in, dim) if exists(dim_in) else nn.Identity() + + self.attn_layers = attn_layers + self.norm = nn.LayerNorm(dim) + + self.project_out = nn.Linear(dim, dim_out) if exists(dim_out) else nn.Identity() + + def forward( + self, + x, + return_embeddings=False, + mask=None, + return_attn=False, + mems=None, + use_cache=False, + **kwargs + ): + b, n, _, device = *x.shape, x.device + + x = self.project_in(x) + x = x + self.pos_emb(x) + x = self.emb_dropout(x) + + x, intermediates = self.attn_layers(x, mask=mask, mems=mems, return_hiddens=True, **kwargs) + x = self.norm(x) + + out = self.project_out(x) if not return_embeddings else x + + res = [out] + if return_attn: + attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates)) + res.append(attn_maps) + if use_cache: + res.append(intermediates.past_key_values) + + if len(res) > 1: + return tuple(res) + return res[0] + diff --git a/ruth_tts_transformer/utils/__init__.py b/ruth_tts_transformer/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ruth_tts_transformer/utils/__pycache__/__init__.cpython-310.pyc b/ruth_tts_transformer/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..476c83fd3f0c2616ca4186635b6d8b44bde4fc8d Binary files /dev/null and b/ruth_tts_transformer/utils/__pycache__/__init__.cpython-310.pyc 
differ diff --git a/ruth_tts_transformer/utils/__pycache__/__init__.cpython-38.pyc b/ruth_tts_transformer/utils/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..247bf3a618f15d634af2ca3cd84e41c3495a153d Binary files /dev/null and b/ruth_tts_transformer/utils/__pycache__/__init__.cpython-38.pyc differ diff --git a/ruth_tts_transformer/utils/__pycache__/audio.cpython-310.pyc b/ruth_tts_transformer/utils/__pycache__/audio.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a212bf920fcb2274536666560ff60f63e58d15a Binary files /dev/null and b/ruth_tts_transformer/utils/__pycache__/audio.cpython-310.pyc differ diff --git a/ruth_tts_transformer/utils/__pycache__/audio.cpython-38.pyc b/ruth_tts_transformer/utils/__pycache__/audio.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6978c15d60a0cf6ffa033b01b6fc940ec24aeb6a Binary files /dev/null and b/ruth_tts_transformer/utils/__pycache__/audio.cpython-38.pyc differ diff --git a/ruth_tts_transformer/utils/__pycache__/diffusion.cpython-310.pyc b/ruth_tts_transformer/utils/__pycache__/diffusion.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d33479b269df762557ecaca61a3fb2aea7a37104 Binary files /dev/null and b/ruth_tts_transformer/utils/__pycache__/diffusion.cpython-310.pyc differ diff --git a/ruth_tts_transformer/utils/__pycache__/diffusion.cpython-38.pyc b/ruth_tts_transformer/utils/__pycache__/diffusion.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53b22e05990022956179d23f32a3ab5ba7e1463f Binary files /dev/null and b/ruth_tts_transformer/utils/__pycache__/diffusion.cpython-38.pyc differ diff --git a/ruth_tts_transformer/utils/__pycache__/stft.cpython-310.pyc b/ruth_tts_transformer/utils/__pycache__/stft.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b755475974addb569b31e22bce4273c45536fe1 
Binary files /dev/null and b/ruth_tts_transformer/utils/__pycache__/stft.cpython-310.pyc differ diff --git a/ruth_tts_transformer/utils/__pycache__/stft.cpython-38.pyc b/ruth_tts_transformer/utils/__pycache__/stft.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1fe48d8dada00e5d2229a062c71b667a939a1d5e Binary files /dev/null and b/ruth_tts_transformer/utils/__pycache__/stft.cpython-38.pyc differ diff --git a/ruth_tts_transformer/utils/__pycache__/tokenizer.cpython-310.pyc b/ruth_tts_transformer/utils/__pycache__/tokenizer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f3b601058ee20550dc17628b19a2366246f5e43 Binary files /dev/null and b/ruth_tts_transformer/utils/__pycache__/tokenizer.cpython-310.pyc differ diff --git a/ruth_tts_transformer/utils/__pycache__/tokenizer.cpython-38.pyc b/ruth_tts_transformer/utils/__pycache__/tokenizer.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea5854d0d1069a4e73f2dc606e67da4d12c20ae6 Binary files /dev/null and b/ruth_tts_transformer/utils/__pycache__/tokenizer.cpython-38.pyc differ diff --git a/ruth_tts_transformer/utils/__pycache__/typical_sampling.cpython-310.pyc b/ruth_tts_transformer/utils/__pycache__/typical_sampling.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a66d400e064a9a79e70d7ea9c478ccd969d07d4 Binary files /dev/null and b/ruth_tts_transformer/utils/__pycache__/typical_sampling.cpython-310.pyc differ diff --git a/ruth_tts_transformer/utils/__pycache__/typical_sampling.cpython-38.pyc b/ruth_tts_transformer/utils/__pycache__/typical_sampling.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fd14d64d0f9dfbcc52d2dc3ab5856eb96fd7f60 Binary files /dev/null and b/ruth_tts_transformer/utils/__pycache__/typical_sampling.cpython-38.pyc differ diff --git a/ruth_tts_transformer/utils/__pycache__/wav2vec_alignment.cpython-310.pyc 
def load_wav_to_torch(full_path):
    """Read a WAV file and return its samples as a normalised float tensor.

    Integer PCM data is divided by its full-scale value so the result lies in
    roughly [-1, 1]; float data is passed through unscaled.

    :param full_path: path (or file-like object) accepted by
        ``scipy.io.wavfile.read``.
    :return: ``(audio, sampling_rate)`` where ``audio`` is a float32 tensor.
    :raises NotImplementedError: if the file's sample dtype is not int16,
        int32, float16 or float32.
    """
    sampling_rate, data = read(full_path)
    if data.dtype == np.int32:
        norm_fix = 2 ** 31
    elif data.dtype == np.int16:
        norm_fix = 2 ** 15
    elif data.dtype == np.float16 or data.dtype == np.float32:
        norm_fix = 1.
    else:
        # `NotImplemented` is a non-callable constant; calling it raised a
        # confusing TypeError instead of reporting the unsupported dtype.
        raise NotImplementedError(f"Provided data dtype not supported: {data.dtype}")
    return (torch.FloatTensor(data.astype(np.float32)) / norm_fix, sampling_rate)
def load_voices(voices, extra_voice_dirs=None):
    """Load several voices and merge them into one conditioning set.

    Raw-audio voices are concatenated into a single list of clips; latent
    voices are averaged component-wise. The two kinds cannot be mixed.

    :param voices: iterable of voice names. If ``'random'`` is encountered the
        function returns ``(None, None)`` immediately.
    :param extra_voice_dirs: additional directories to search for voices;
        ``None`` means no extra directories.
    :return: ``(clips, None)`` for raw-audio voices, ``(None, latents)`` for
        latent voices, or ``(None, None)`` for a random voice.
    """
    if extra_voice_dirs is None:
        extra_voice_dirs = []

    latents = []
    clips = []
    for voice in voices:
        if voice == 'random':
            if len(voices) > 1:
                print("Cannot combine a random voice with a non-random voice. Just using a random voice.")
            return None, None
        # Pass by keyword: the second positional parameter of load_voice is
        # `map_location`, so a positional call silently dropped the extra dirs.
        clip, latent = load_voice(voice, extra_voice_dirs=extra_voice_dirs)
        if latent is None:
            assert len(
                latents) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
            clips.extend(clip)
        elif clip is None:
            assert len(
                clips) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
            latents.append(latent)
    if len(latents) == 0:
        return clips, None

    # Average each of the two latent components across all loaded voices.
    latents_0 = torch.stack([l[0] for l in latents], dim=0).mean(dim=0)
    latents_1 = torch.stack([l[1] for l in latents], dim=0).mean(dim=0)
    return None, (latents_0, latents_1)
def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    KL divergence KL(N(mean1, exp(logvar1)) || N(mean2, exp(logvar2))).

    Arguments broadcast against each other, so tensors may be mixed with
    plain scalars as long as at least one argument is a tensor.
    """
    candidates = (mean1, logvar1, mean2, logvar2)
    reference = next((c for c in candidates if isinstance(c, th.Tensor)), None)
    assert reference is not None, "at least one argument must be a Tensor"

    # th.exp() needs tensor inputs, so promote scalar log-variances onto the
    # reference tensor's dtype and device.
    def _as_tensor(value):
        if isinstance(value, th.Tensor):
            return value
        return th.tensor(value).to(reference)

    logvar1 = _as_tensor(logvar1)
    logvar2 = _as_tensor(logvar2)

    variance_ratio = th.exp(logvar1 - logvar2)
    scaled_sq_diff = ((mean1 - mean2) ** 2) * th.exp(-logvar2)
    return 0.5 * (-1.0 + logvar2 - logvar1 + variance_ratio + scaled_sq_diff)


def approx_standard_normal_cdf(x):
    """
    Tanh-based fast approximation of the standard normal CDF, evaluated
    element-wise on ``x``.
    """
    cubic = x + 0.044715 * th.pow(x, 3)
    squashed = th.tanh(np.sqrt(2.0 / np.pi) * cubic)
    return 0.5 * (1.0 + squashed)
def mean_flat(tensor):
    """
    Average a tensor over every dimension except the first (batch) one.
    """
    non_batch_dims = list(range(1, tensor.dim()))
    return tensor.mean(dim=non_batch_dims)


def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
    """
    Look up a predefined beta schedule by name.

    Supported names are ``"linear"`` and ``"cosine"``. Existing schedules
    should not be altered once committed, to stay backwards compatible.

    :raises NotImplementedError: for an unrecognised ``schedule_name``.
    """
    if schedule_name == "linear":
        # Ho et al.'s linear schedule, rescaled so that it works for any
        # number of diffusion steps.
        scale = 1000 / num_diffusion_timesteps
        first_beta, last_beta = scale * 0.0001, scale * 0.02
        return np.linspace(
            first_beta, last_beta, num_diffusion_timesteps, dtype=np.float64
        )

    if schedule_name == "cosine":
        def cosine_alpha_bar(t):
            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

        return betas_for_alpha_bar(num_diffusion_timesteps, cosine_alpha_bar)

    raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
class ModelMeanType(enum.Enum):
    """
    The quantity the model's output is interpreted as.
    """

    # the model predicts x_{t-1}
    PREVIOUS_X = 'previous_x'
    # the model predicts x_0
    START_X = 'start_x'
    # the model predicts epsilon
    EPSILON = 'epsilon'


class ModelVarType(enum.Enum):
    """
    How the output variance is obtained.

    LEARNED_RANGE lets the model predict a value between FIXED_SMALL and
    FIXED_LARGE, which makes its job easier.
    """

    LEARNED = 'learned'
    FIXED_SMALL = 'fixed_small'
    FIXED_LARGE = 'fixed_large'
    LEARNED_RANGE = 'learned_range'


class LossType(enum.Enum):
    """
    The training objective.
    """

    # use raw MSE loss (and KL when learning variances)
    MSE = 'mse'
    # use raw MSE loss (with RESCALED_KL when learning variances)
    RESCALED_MSE = 'rescaled_mse'
    # use the variational lower-bound
    KL = 'kl'
    # like KL, but rescale to estimate the full VLB
    RESCALED_KL = 'rescaled_kl'

    def is_vb(self):
        """Return True for the two KL (variational-bound) loss types."""
        return self in (LossType.KL, LossType.RESCALED_KL)
+ :param model_mean_type: a ModelMeanType determining what the model outputs. + :param model_var_type: a ModelVarType determining how variance is output. + :param loss_type: a LossType determining the loss function to use. + :param rescale_timesteps: if True, pass floating point timesteps into the + model so that they are always scaled like in the + original paper (0 to 1000). + """ + + def __init__( + self, + *, + betas, + model_mean_type, + model_var_type, + loss_type, + rescale_timesteps=False, + conditioning_free=False, + conditioning_free_k=1, + ramp_conditioning_free=True, + ): + self.model_mean_type = ModelMeanType(model_mean_type) + self.model_var_type = ModelVarType(model_var_type) + self.loss_type = LossType(loss_type) + self.rescale_timesteps = rescale_timesteps + self.conditioning_free = conditioning_free + self.conditioning_free_k = conditioning_free_k + self.ramp_conditioning_free = ramp_conditioning_free + + # Use float64 for accuracy. + betas = np.array(betas, dtype=np.float64) + self.betas = betas + assert len(betas.shape) == 1, "betas must be 1-D" + assert (betas > 0).all() and (betas <= 1).all() + + self.num_timesteps = int(betas.shape[0]) + + alphas = 1.0 - betas + self.alphas_cumprod = np.cumprod(alphas, axis=0) + self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1]) + self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0) + assert self.alphas_cumprod_prev.shape == (self.num_timesteps,) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod) + self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod) + self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod) + self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod) + self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1) + + # calculations for posterior q(x_{t-1} | x_t, x_0) + self.posterior_variance = ( + betas * (1.0 - self.alphas_cumprod_prev) / 
(1.0 - self.alphas_cumprod) + ) + # log calculation clipped because the posterior variance is 0 at the + # beginning of the diffusion chain. + self.posterior_log_variance_clipped = np.log( + np.append(self.posterior_variance[1], self.posterior_variance[1:]) + ) + self.posterior_mean_coef1 = ( + betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) + ) + self.posterior_mean_coef2 = ( + (1.0 - self.alphas_cumprod_prev) + * np.sqrt(alphas) + / (1.0 - self.alphas_cumprod) + ) + + def q_mean_variance(self, x_start, t): + """ + Get the distribution q(x_t | x_0). + + :param x_start: the [N x C x ...] tensor of noiseless inputs. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :return: A tuple (mean, variance, log_variance), all of x_start's shape. + """ + mean = ( + _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + ) + variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape) + log_variance = _extract_into_tensor( + self.log_one_minus_alphas_cumprod, t, x_start.shape + ) + return mean, variance, log_variance + + def q_sample(self, x_start, t, noise=None): + """ + Diffuse the data for a given number of diffusion steps. + + In other words, sample from q(x_t | x_0). + + :param x_start: the initial data batch. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :param noise: if specified, the split-out normal noise. + :return: A noisy version of x_start. 
+ """ + if noise is None: + noise = th.randn_like(x_start) + assert noise.shape == x_start.shape + return ( + _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) + * noise + ) + + def q_posterior_mean_variance(self, x_start, x_t, t): + """ + Compute the mean and variance of the diffusion posterior: + + q(x_{t-1} | x_t, x_0) + + """ + assert x_start.shape == x_t.shape + posterior_mean = ( + _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start + + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t + ) + posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape) + posterior_log_variance_clipped = _extract_into_tensor( + self.posterior_log_variance_clipped, t, x_t.shape + ) + assert ( + posterior_mean.shape[0] + == posterior_variance.shape[0] + == posterior_log_variance_clipped.shape[0] + == x_start.shape[0] + ) + return posterior_mean, posterior_variance, posterior_log_variance_clipped + + def p_mean_variance( + self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None + ): + """ + Apply the model to get p(x_{t-1} | x_t), as well as a prediction of + the initial x, x_0. + + :param model: the model, which takes a signal and a batch of timesteps + as input. + :param x: the [N x C x ...] tensor at time t. + :param t: a 1-D Tensor of timesteps. + :param clip_denoised: if True, clip the denoised signal into [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. Applies before + clip_denoised. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :return: a dict with the following keys: + - 'mean': the model mean output. + - 'variance': the model variance output. + - 'log_variance': the log of 'variance'. + - 'pred_xstart': the prediction for x_0. 
+ """ + if model_kwargs is None: + model_kwargs = {} + + B, C = x.shape[:2] + assert t.shape == (B,) + model_output = model(x, self._scale_timesteps(t), **model_kwargs) + if self.conditioning_free: + model_output_no_conditioning = model(x, self._scale_timesteps(t), conditioning_free=True, **model_kwargs) + + if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]: + assert model_output.shape == (B, C * 2, *x.shape[2:]) + model_output, model_var_values = th.split(model_output, C, dim=1) + if self.conditioning_free: + model_output_no_conditioning, _ = th.split(model_output_no_conditioning, C, dim=1) + if self.model_var_type == ModelVarType.LEARNED: + model_log_variance = model_var_values + model_variance = th.exp(model_log_variance) + else: + min_log = _extract_into_tensor( + self.posterior_log_variance_clipped, t, x.shape + ) + max_log = _extract_into_tensor(np.log(self.betas), t, x.shape) + # The model_var_values is [-1, 1] for [min_var, max_var]. + frac = (model_var_values + 1) / 2 + model_log_variance = frac * max_log + (1 - frac) * min_log + model_variance = th.exp(model_log_variance) + else: + model_variance, model_log_variance = { + # for fixedlarge, we set the initial (log-)variance like so + # to get a better decoder log likelihood. + ModelVarType.FIXED_LARGE: ( + np.append(self.posterior_variance[1], self.betas[1:]), + np.log(np.append(self.posterior_variance[1], self.betas[1:])), + ), + ModelVarType.FIXED_SMALL: ( + self.posterior_variance, + self.posterior_log_variance_clipped, + ), + }[self.model_var_type] + model_variance = _extract_into_tensor(model_variance, t, x.shape) + model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape) + + if self.conditioning_free: + if self.ramp_conditioning_free: + assert t.shape[0] == 1 # This should only be used in inference. 
def _predict_xstart_from_eps(self, x_t, t, eps):
    """
    Recover the x_0 estimate implied by a noise prediction.

    :param x_t: the noised tensor at timestep t.
    :param t: a tensor of timestep indices.
    :param eps: the predicted noise; must have the same shape as x_t.
    :return: sqrt_recip_alphas_cumprod[t] * x_t
             - sqrt_recipm1_alphas_cumprod[t] * eps, shaped like x_t.
    """
    assert x_t.shape == eps.shape
    shape = x_t.shape
    signal_scale = _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, shape)
    noise_scale = _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, shape)
    return signal_scale * x_t - noise_scale * eps
def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
    """
    Compute the mean for the previous step, given a function cond_fn that
    computes the gradient of a conditional log probability with respect to
    x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
    condition on y.

    This uses the conditioning strategy from Sohl-Dickstein et al. (2015).

    :param cond_fn: callable invoked as cond_fn(x, scaled_t, **model_kwargs);
                    its result is used as the gradient.
    :param p_mean_var: dict with at least the keys "mean" and "variance".
    :param x: the tensor passed through to cond_fn.
    :param t: timesteps; scaled with self._scale_timesteps before the call.
    :param model_kwargs: if not None, a dict of extra keyword arguments to
                         pass to cond_fn.
    :return: mean + variance * gradient, computed in float.
    """
    if model_kwargs is None:
        # None is the advertised default; unpacking it with ** would raise
        # TypeError, so fall back to "no extra arguments".
        model_kwargs = {}
    gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs)
    new_mean = (
        p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
    )
    return new_mean
+ :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. + :param cond_fn: if not None, this is a gradient function that acts + similarly to the model. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :return: a dict containing the following keys: + - 'sample': a random sample from the model. + - 'pred_xstart': a prediction of x_0. + """ + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + noise = th.randn_like(x) + nonzero_mask = ( + (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) + ) # no noise when t == 0 + if cond_fn is not None: + out["mean"] = self.condition_mean( + cond_fn, out, x, t, model_kwargs=model_kwargs + ) + sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise + return {"sample": sample, "pred_xstart": out["pred_xstart"]} + + def p_sample_loop( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + ): + """ + Generate samples from the model. + + :param model: the model module. + :param shape: the shape of the samples, (N, C, H, W). + :param noise: if specified, the noise from the encoder to sample. + Should be of the same shape as `shape`. + :param clip_denoised: if True, clip x_start predictions to [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. + :param cond_fn: if not None, this is a gradient function that acts + similarly to the model. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :param device: if specified, the device to create the samples on. + If not specified, use a model parameter's device. 
def p_sample_loop_progressive(
    self,
    model,
    shape,
    noise=None,
    clip_denoised=True,
    denoised_fn=None,
    cond_fn=None,
    model_kwargs=None,
    device=None,
    progress=False,
):
    """
    Run the reverse process step by step, yielding after every timestep.

    Takes the same arguments as p_sample_loop(). Each yielded item is the
    dict returned by p_sample() for that timestep; its "sample" entry is
    fed back in as the input of the next step.
    """
    if device is None:
        # Fall back to wherever the model's first parameter lives.
        device = next(model.parameters()).device
    assert isinstance(shape, (tuple, list))

    img = noise if noise is not None else th.randn(*shape, device=device)

    # Walk the timesteps from the last one down to 0.
    steps = list(reversed(range(self.num_timesteps)))
    for step in tqdm(steps, disable=not progress):
        t = th.tensor([step] * shape[0], device=device)
        with th.no_grad():
            out = self.p_sample(
                model,
                img,
                t,
                clip_denoised=clip_denoised,
                denoised_fn=denoised_fn,
                cond_fn=cond_fn,
                model_kwargs=model_kwargs,
            )
            yield out
            img = out["sample"]
def ddim_reverse_sample(
    self,
    model,
    x,
    t,
    clip_denoised=True,
    denoised_fn=None,
    model_kwargs=None,
    eta=0.0,
):
    """
    Sample x_{t+1} from the model using the DDIM reverse ODE.

    Only the deterministic path is implemented, so eta must be 0.0.

    :return: a dict with "sample" (the predicted x_{t+1}) and
             "pred_xstart" (the x_0 prediction from p_mean_variance).
    """
    assert eta == 0.0, "Reverse ODE only for deterministic path"
    out = self.p_mean_variance(
        model,
        x,
        t,
        clip_denoised=clip_denoised,
        denoised_fn=denoised_fn,
        model_kwargs=model_kwargs,
    )
    pred_xstart = out["pred_xstart"]

    # Re-derive epsilon from the x_0 prediction so this works regardless of
    # which quantity the model itself outputs.
    recip = _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape)
    recipm1 = _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
    eps = (recip * x - pred_xstart) / recipm1

    alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)

    # Equation 12, reversed.
    signal_part = pred_xstart * th.sqrt(alpha_bar_next)
    noise_part = th.sqrt(1 - alpha_bar_next) * eps
    mean_pred = signal_part + noise_part

    return {"sample": mean_pred, "pred_xstart": pred_xstart}
def ddim_sample_loop_progressive(
    self,
    model,
    shape,
    noise=None,
    clip_denoised=True,
    denoised_fn=None,
    cond_fn=None,
    model_kwargs=None,
    device=None,
    progress=False,
    eta=0.0,
):
    """
    Run DDIM sampling step by step, yielding after every timestep.

    Same usage as p_sample_loop_progressive(), plus eta, which is forwarded
    to ddim_sample(). Each yielded item is the dict returned by
    ddim_sample(); its "sample" entry becomes the next step's input.
    """
    if device is None:
        # Fall back to wherever the model's first parameter lives.
        device = next(model.parameters()).device
    assert isinstance(shape, (tuple, list))

    img = noise if noise is not None else th.randn(*shape, device=device)

    # Walk the timesteps from the last one down to 0.
    steps = list(reversed(range(self.num_timesteps)))

    if progress:
        # Lazy import so that we don't depend on tqdm.
        from tqdm.auto import tqdm

        steps = tqdm(steps, disable=not progress)

    for step in steps:
        t = th.tensor([step] * shape[0], device=device)
        with th.no_grad():
            out = self.ddim_sample(
                model,
                img,
                t,
                clip_denoised=clip_denoised,
                denoised_fn=denoised_fn,
                cond_fn=cond_fn,
                model_kwargs=model_kwargs,
                eta=eta,
            )
            yield out
            img = out["sample"]
def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
    """
    Compute training losses for a single timestep.

    :param model: the model to evaluate loss on.
    :param x_start: the [N x C x ...] tensor of inputs.
    :param t: a batch of timestep indices.
    :param model_kwargs: if not None, a dict of extra keyword arguments to
        pass to the model. This can be used for conditioning.
    :param noise: if specified, the specific Gaussian noise to try to remove.
    :return: a dict with the key "loss" containing a tensor of shape [N].
             Some mean or variance settings may also have other keys
             ("mse", "vb", "x_start_predicted", "extra_outputs").
    :raises NotImplementedError: for an unknown loss type or mean type.
    """
    if model_kwargs is None:
        model_kwargs = {}
    if noise is None:
        noise = th.randn_like(x_start)
    x_t = self.q_sample(x_start, t, noise=noise)

    terms = {}

    if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
        # TODO: support multiple model outputs for this mode.
        terms["loss"] = self._vb_terms_bpd(
            model=model,
            x_start=x_start,
            x_t=x_t,
            t=t,
            clip_denoised=False,
            model_kwargs=model_kwargs,
        )["output"]
        if self.loss_type == LossType.RESCALED_KL:
            terms["loss"] *= self.num_timesteps
    elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
        model_outputs = model(x_t, self._scale_timesteps(t), **model_kwargs)
        if isinstance(model_outputs, tuple):
            # The first element is the diffusion output; anything else is
            # handed back to the caller untouched.
            model_output = model_outputs[0]
            terms['extra_outputs'] = model_outputs[1:]
        else:
            model_output = model_outputs

        if self.model_var_type in [
            ModelVarType.LEARNED,
            ModelVarType.LEARNED_RANGE,
        ]:
            B, C = x_t.shape[:2]
            assert model_output.shape == (B, C * 2, *x_t.shape[2:])
            model_output, model_var_values = th.split(model_output, C, dim=1)
            # Learn the variance using the variational bound, but don't let
            # it affect our mean prediction.
            frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
            terms["vb"] = self._vb_terms_bpd(
                model=lambda *args, r=frozen_out: r,
                x_start=x_start,
                x_t=x_t,
                t=t,
                clip_denoised=False,
            )["output"]
            if self.loss_type == LossType.RESCALED_MSE:
                # Divide by 1000 for equivalence with initial implementation.
                # Without a factor of 1/1000, the VB term hurts the MSE term.
                terms["vb"] *= self.num_timesteps / 1000.0

        if self.model_mean_type == ModelMeanType.PREVIOUS_X:
            target = self.q_posterior_mean_variance(
                x_start=x_start, x_t=x_t, t=t
            )[0]
            # An x_0 prediction is not supported for this mean type; emit a
            # zero placeholder shaped like x_start so the key still exists.
            # (The previous torch.zeros(x_start) passed a tensor as a size
            # and raised TypeError.)
            x_start_pred = th.zeros_like(x_start)
        elif self.model_mean_type == ModelMeanType.START_X:
            target = x_start
            x_start_pred = model_output
        elif self.model_mean_type == ModelMeanType.EPSILON:
            target = noise
            x_start_pred = self._predict_xstart_from_eps(x_t, t, model_output)
        else:
            raise NotImplementedError(self.model_mean_type)
        assert model_output.shape == target.shape == x_start.shape
        terms["mse"] = mean_flat((target - model_output) ** 2)
        terms["x_start_predicted"] = x_start_pred
        if "vb" in terms:
            terms["loss"] = terms["mse"] + terms["vb"]
        else:
            terms["loss"] = terms["mse"]
    else:
        raise NotImplementedError(self.loss_type)

    return terms
+ elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE: + model_outputs = model(x_t, x_start, self._scale_timesteps(t), **model_kwargs) + terms.update({k: o for k, o in zip(model_output_keys, model_outputs)}) + model_output = terms[gd_out_key] + if self.model_var_type in [ + ModelVarType.LEARNED, + ModelVarType.LEARNED_RANGE, + ]: + B, C = x_t.shape[:2] + assert model_output.shape == (B, C, 2, *x_t.shape[2:]) + model_output, model_var_values = model_output[:, :, 0], model_output[:, :, 1] + # Learn the variance using the variational bound, but don't let + # it affect our mean prediction. + frozen_out = th.cat([model_output.detach(), model_var_values], dim=1) + terms["vb"] = self._vb_terms_bpd( + model=lambda *args, r=frozen_out: r, + x_start=x_start, + x_t=x_t, + t=t, + clip_denoised=False, + )["output"] + if self.loss_type == LossType.RESCALED_MSE: + # Divide by 1000 for equivalence with initial implementation. + # Without a factor of 1/1000, the VB term hurts the MSE term. + terms["vb"] *= self.num_timesteps / 1000.0 + + if self.model_mean_type == ModelMeanType.PREVIOUS_X: + target = self.q_posterior_mean_variance( + x_start=x_start, x_t=x_t, t=t + )[0] + x_start_pred = torch.zeros(x_start) # Not supported. 
def _prior_bpd(self, x_start):
    """
    Get the prior KL term for the variational lower-bound, measured in
    bits-per-dim.

    This term can't be optimized, as it only depends on the encoder.

    :param x_start: the [N x C x ...] tensor of inputs.
    :return: a batch of [N] KL values (in bits), one per batch element.
    """
    # Evaluate q at the final timestep for every element of the batch.
    last_step = self.num_timesteps - 1
    t = th.tensor([last_step] * x_start.shape[0], device=x_start.device)
    qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)

    # KL against a Gaussian with mean 0 and log-variance 0.
    kl_nats = normal_kl(
        mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
    )
    # Dividing by ln(2) converts nats to bits.
    return mean_flat(kl_nats) / np.log(2.0)
def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
    """
    Look up one of the pre-defined beta schedules by name.

    The schedules in this library are meant to stay similar in the limit of
    num_diffusion_timesteps. New schedules may be added, but existing ones
    should not be removed or changed once committed, to keep backwards
    compatibility.

    :param schedule_name: "linear" or "cosine".
    :param num_diffusion_timesteps: the number of betas to produce.
    :return: for "linear", a float64 numpy array of that many betas; for
             "cosine", whatever betas_for_alpha_bar returns.
    :raises NotImplementedError: for any other schedule name.
    """
    if schedule_name == "cosine":
        return betas_for_alpha_bar(
            num_diffusion_timesteps,
            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
        )
    if schedule_name != "linear":
        raise NotImplementedError(f"unknown beta schedule: {schedule_name}")

    # Linear schedule from Ho et al, extended to work for any number of
    # diffusion steps.
    scale = 1000 / num_diffusion_timesteps
    first_beta = scale * 0.0001
    last_beta = scale * 0.02
    return np.linspace(
        first_beta, last_beta, num_diffusion_timesteps, dtype=np.float64
    )
**kwargs) + + def condition_score(self, cond_fn, *args, **kwargs): + return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs) + + def _wrap_model(self, model, autoregressive=False): + if isinstance(model, _WrappedModel) or isinstance(model, _WrappedAutoregressiveModel): + return model + mod = _WrappedAutoregressiveModel if autoregressive else _WrappedModel + return mod( + model, self.timestep_map, self.rescale_timesteps, self.original_num_steps + ) + + def _scale_timesteps(self, t): + # Scaling is done by the wrapped model. + return t + + +def space_timesteps(num_timesteps, section_counts): + """ + Create a list of timesteps to use from an original diffusion process, + given the number of timesteps we want to take from equally-sized portions + of the original process. + + For example, if there's 300 timesteps and the section counts are [10,15,20] + then the first 100 timesteps are strided to be 10 timesteps, the second 100 + are strided to be 15 timesteps, and the final 100 are strided to be 20. + + If the stride is a string starting with "ddim", then the fixed striding + from the DDIM paper is used, and only one section is allowed. + + :param num_timesteps: the number of diffusion steps in the original + process to divide up. + :param section_counts: either a list of numbers, or a string containing + comma-separated numbers, indicating the step count + per section. As a special case, use "ddimN" where N + is a number of steps to use the striding from the + DDIM paper. + :return: a set of diffusion steps from the original process to use. 
+ """ + if isinstance(section_counts, str): + if section_counts.startswith("ddim"): + desired_count = int(section_counts[len("ddim") :]) + for i in range(1, num_timesteps): + if len(range(0, num_timesteps, i)) == desired_count: + return set(range(0, num_timesteps, i)) + raise ValueError( + f"cannot create exactly {num_timesteps} steps with an integer stride" + ) + section_counts = [int(x) for x in section_counts.split(",")] + size_per = num_timesteps // len(section_counts) + extra = num_timesteps % len(section_counts) + start_idx = 0 + all_steps = [] + for i, section_count in enumerate(section_counts): + size = size_per + (1 if i < extra else 0) + if size < section_count: + raise ValueError( + f"cannot divide section of {size} steps into {section_count}" + ) + if section_count <= 1: + frac_stride = 1 + else: + frac_stride = (size - 1) / (section_count - 1) + cur_idx = 0.0 + taken_steps = [] + for _ in range(section_count): + taken_steps.append(start_idx + round(cur_idx)) + cur_idx += frac_stride + all_steps += taken_steps + start_idx += size + return set(all_steps) + + +class _WrappedModel: + def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps): + self.model = model + self.timestep_map = timestep_map + self.rescale_timesteps = rescale_timesteps + self.original_num_steps = original_num_steps + + def __call__(self, x, ts, **kwargs): + map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype) + new_ts = map_tensor[ts] + if self.rescale_timesteps: + new_ts = new_ts.float() * (1000.0 / self.original_num_steps) + return self.model(x, new_ts, **kwargs) + + +class _WrappedAutoregressiveModel: + def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps): + self.model = model + self.timestep_map = timestep_map + self.rescale_timesteps = rescale_timesteps + self.original_num_steps = original_num_steps + + def __call__(self, x, x0, ts, **kwargs): + map_tensor = th.tensor(self.timestep_map, device=ts.device, 
def _extract_into_tensor(arr, timesteps, broadcast_shape):
    """
    Look up per-timestep values and broadcast them to a target shape.

    :param arr: the 1-D numpy array to index.
    :param timesteps: a tensor of indices into the array to extract.
    :param broadcast_shape: a larger shape of K dimensions with the batch
                            dimension equal to the length of timesteps.
    :return: a float tensor of shape broadcast_shape.
    """
    table = th.from_numpy(arr).to(device=timesteps.device)
    res = table[timesteps].float()

    # Append trailing singleton axes until the rank matches the target.
    missing_dims = len(broadcast_shape) - len(res.shape)
    if missing_dims > 0:
        res = res[(...,) + (None,) * missing_dims]
    return res.expand(broadcast_shape)
def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
                     n_fft=800, dtype=np.float32, norm=None):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`

    n_frames : int > 0
        The number of analysis frames

    hop_length : int > 0
        The number of samples to advance between frames

    win_length : [optional]
        The length of the window function. If `None`, `n_fft` is used.

    n_fft : int > 0
        The length of each analysis frame.

    dtype : np.dtype
        The data type of the output

    norm : [optional]
        Passed through to `librosa.util.normalize` as its `norm` argument
        before the window is squared.

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm)**2
    # `size` is passed by keyword: newer librosa releases make it
    # keyword-only, and STFT.__init__ in this module already calls it so.
    win_sq = librosa_util.pad_center(win_sq, size=n_fft)

    # Fill the envelope by overlap-adding the squared window at every hop;
    # the slices are clipped so the last frames never run past the buffer.
    for i in range(n_frames):
        sample = i * hop_length
        x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
    return x
inverse_basis.float()) + + def transform(self, input_data): + num_batches = input_data.size(0) + num_samples = input_data.size(1) + + self.num_samples = num_samples + + # similar to librosa, reflect-pad the input + input_data = input_data.view(num_batches, 1, num_samples) + input_data = F.pad( + input_data.unsqueeze(1), + (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), + mode='reflect') + input_data = input_data.squeeze(1) + + forward_transform = F.conv1d( + input_data, + Variable(self.forward_basis, requires_grad=False), + stride=self.hop_length, + padding=0) + + cutoff = int((self.filter_length / 2) + 1) + real_part = forward_transform[:, :cutoff, :] + imag_part = forward_transform[:, cutoff:, :] + + magnitude = torch.sqrt(real_part**2 + imag_part**2) + phase = torch.autograd.Variable( + torch.atan2(imag_part.data, real_part.data)) + + return magnitude, phase + + def inverse(self, magnitude, phase): + recombine_magnitude_phase = torch.cat( + [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) + + inverse_transform = F.conv_transpose1d( + recombine_magnitude_phase, + Variable(self.inverse_basis, requires_grad=False), + stride=self.hop_length, + padding=0) + + if self.window is not None: + window_sum = window_sumsquare( + self.window, magnitude.size(-1), hop_length=self.hop_length, + win_length=self.win_length, n_fft=self.filter_length, + dtype=np.float32) + # remove modulation effects + approx_nonzero_indices = torch.from_numpy( + np.where(window_sum > tiny(window_sum))[0]) + window_sum = torch.autograd.Variable( + torch.from_numpy(window_sum), requires_grad=False) + window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum + inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] + + # scale by hop ratio + inverse_transform *= float(self.filter_length) / self.hop_length + + inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] + inverse_transform = inverse_transform[:, :, 
def split_and_recombine_text(text, desired_length=200, max_length=300):
    """Cut ``text`` into chunks, preferring to break at sentence boundaries.

    A chunk is emitted once it reaches ``desired_length`` at a boundary, and a
    split is forced when the running chunk reaches ``max_length``. Chunks that
    are empty or consist only of whitespace/punctuation are dropped.
    """
    # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
    for pattern, replacement in ((r'\n\n+', '\n'), (r'\s+', ' '), (r'[“”]', '"')):
        text = re.sub(pattern, replacement, text)

    chunks = []
    inside_quote = False
    buffer = ""
    boundaries = []
    cursor = -1
    last_index = len(text) - 1

    def seek(delta):
        # Move the cursor |delta| characters, growing/shrinking the buffer and
        # toggling the quote flag for every '"' the cursor lands on.
        nonlocal cursor, inside_quote, buffer
        backwards = delta < 0
        remaining = abs(delta)
        while remaining:
            remaining -= 1
            if backwards:
                cursor -= 1
                buffer = buffer[:-1]
            else:
                cursor += 1
                buffer += text[cursor]
            if text[cursor] == '"':
                inside_quote = not inside_quote
        return text[cursor]

    def peek(delta):
        # Look at a character relative to the cursor; "" outside [0, last_index).
        target = cursor + delta
        if 0 <= target < last_index:
            return text[target]
        return ""

    def commit():
        # Flush the running buffer as a finished chunk.
        nonlocal buffer, boundaries
        chunks.append(buffer)
        buffer = ""
        boundaries = []

    while cursor < last_index:
        ch = seek(1)
        if len(buffer) >= max_length:
            # a split has to be forced here
            if boundaries and len(buffer) > (desired_length / 2):
                # rewind to the most recent sentence boundary
                seek(boundaries[-1] - cursor)
            else:
                # no usable boundary: back up until we are not inside a word
                while ch not in '!?.\n ' and cursor > 0 and len(buffer) > desired_length:
                    ch = seek(-1)
            commit()
        elif not inside_quote and (ch in '!?\n' or (ch == '.' and peek(1) in '\n ')):
            # sentence boundary; swallow runs of consecutive boundary markers
            while cursor < len(text) - 1 and len(buffer) < max_length and peek(1) in '!?.':
                ch = seek(1)
            boundaries.append(cursor)
            if len(buffer) >= desired_length:
                commit()
        elif inside_quote and peek(1) == '"' and peek(2) in '\n ':
            # a closing quote followed by whitespace also counts as a boundary
            seek(2)
            boundaries.append(cursor)
    chunks.append(buffer)

    # clean up, remove lines with only whitespace or punctuation
    trimmed = (piece.strip() for piece in chunks)
    return [piece for piece in trimmed
            if len(piece) > 0 and not re.match(r'^[\s\.,;:!?]*$', piece)]
Is this a good thing to do?!?!?! + I don't know but we should handle this situation.......................... + """ + self.assertEqual(split_and_recombine_text(text, desired_length=30, max_length=50), + ['When you are really angry sometimes you use', + 'consecutive exclamation marks!!!!!!', + 'Is this a good thing to do?!?!?!', + 'I don\'t know but we should handle this situation.']) + + def test_split_and_recombine_text_3(self): + text_src = os.path.join(os.path.dirname(__file__), '../data/riding_hood.txt') + with open(text_src, 'r') as f: + text = f.read() + self.assertEqual( + split_and_recombine_text(text), + [ + 'Once upon a time there lived in a certain village a little country girl, the prettiest creature who was ever seen. Her mother was excessively fond of her; and her grandmother doted on her still more. This good woman had a little red riding hood made for her.', + 'It suited the girl so extremely well that everybody called her Little Red Riding Hood. One day her mother, having made some cakes, said to her, "Go, my dear, and see how your grandmother is doing, for I hear she has been very ill. Take her a cake, and this little pot of butter."', + 'Little Red Riding Hood set out immediately to go to her grandmother, who lived in another village. As she was going through the wood, she met with a wolf, who had a very great mind to eat her up, but he dared not, because of some woodcutters working nearby in the forest.', + 'He asked her where she was going. The poor child, who did not know that it was dangerous to stay and talk to a wolf, said to him, "I am going to see my grandmother and carry her a cake and a little pot of butter from my mother." "Does she live far off?" said the wolf "Oh I say,"', + 'answered Little Red Riding Hood; "it is beyond that mill you see there, at the first house in the village." "Well," said the wolf, "and I\'ll go and see her too. 
I\'ll go this way and go you that, and we shall see who will be there first."', + 'The wolf ran as fast as he could, taking the shortest path, and the little girl took a roundabout way, entertaining herself by gathering nuts, running after butterflies, and gathering bouquets of little flowers.', + 'It was not long before the wolf arrived at the old woman\'s house. He knocked at the door: tap, tap. "Who\'s there?" "Your grandchild, Little Red Riding Hood," replied the wolf, counterfeiting her voice; "who has brought you a cake and a little pot of butter sent you by mother."', + 'The good grandmother, who was in bed, because she was somewhat ill, cried out, "Pull the bobbin, and the latch will go up."', + 'The wolf pulled the bobbin, and the door opened, and then he immediately fell upon the good woman and ate her up in a moment, for it been more than three days since he had eaten.', + 'He then shut the door and got into the grandmother\'s bed, expecting Little Red Riding Hood, who came some time afterwards and knocked at the door: tap, tap. "Who\'s there?"', + 'Little Red Riding Hood, hearing the big voice of the wolf, was at first afraid; but believing her grandmother had a cold and was hoarse, answered, "It is your grandchild Little Red Riding Hood, who has brought you a cake and a little pot of butter mother sends you."', + 'The wolf cried out to her, softening his voice as much as he could, "Pull the bobbin, and the latch will go up." Little Red Riding Hood pulled the bobbin, and the door opened.', + 'The wolf, seeing her come in, said to her, hiding himself under the bedclothes, "Put the cake and the little pot of butter upon the stool, and come get into bed with me." Little Red Riding Hood took off her clothes and got into bed.', + 'She was greatly amazed to see how her grandmother looked in her nightclothes, and said to her, "Grandmother, what big arms you have!" "All the better to hug you with, my dear." "Grandmother, what big legs you have!" 
"All the better to run with, my child." "Grandmother, what big ears you have!"', + '"All the better to hear with, my child." "Grandmother, what big eyes you have!" "All the better to see with, my child." "Grandmother, what big teeth you have got!" "All the better to eat you up with." And, saying these words, this wicked wolf fell upon Little Red Riding Hood, and ate her all up.', + ] + ) + + unittest.main() diff --git a/ruth_tts_transformer/utils/tokenizer.py b/ruth_tts_transformer/utils/tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..3ab1c31223c990e79a4e01fb1b7bb0996d0f66af --- /dev/null +++ b/ruth_tts_transformer/utils/tokenizer.py @@ -0,0 +1,193 @@ +import os +import re + +import inflect +import torch +from tokenizers import Tokenizer + + +# Regular expression matching whitespace: +from unidecode import unidecode + +_whitespace_re = re.compile(r'\s+') + + +# List of (regular expression, replacement) pairs for abbreviations: +_abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ + ('mrs', 'misess'), + ('mr', 'mister'), + ('dr', 'doctor'), + ('st', 'saint'), + ('co', 'company'), + ('jr', 'junior'), + ('maj', 'major'), + ('gen', 'general'), + ('drs', 'doctors'), + ('rev', 'reverend'), + ('lt', 'lieutenant'), + ('hon', 'honorable'), + ('sgt', 'sergeant'), + ('capt', 'captain'), + ('esq', 'esquire'), + ('ltd', 'limited'), + ('col', 'colonel'), + ('ft', 'fort'), +]] + + +def expand_abbreviations(text): + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text + + +_inflect = inflect.engine() +_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') +_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') +_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') +_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +_number_re = re.compile(r'[0-9]+') + + +def _remove_commas(m): + return m.group(1).replace(',', '') + + +def _expand_decimal_point(m): + return m.group(1).replace('.', ' point ') + + +def _expand_dollars(m): + match = m.group(1) + parts = match.split('.') + if len(parts) > 2: + return match + ' dollars' # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) + elif dollars: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + return '%s %s' % (dollars, dollar_unit) + elif cents: + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s' % (cents, cent_unit) + else: + return 'zero dollars' + + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + + +def _expand_number(m): + num = int(m.group(0)) + if num > 1000 and num < 3000: + if num == 2000: + return 'two thousand' + elif num > 2000 and num < 2010: + return 'two thousand ' + 
def english_cleaners(text):
    '''Pipeline for English text, including number and abbreviation expansion.

    Applies, in order: ASCII transliteration, lowercasing, number expansion,
    abbreviation expansion and whitespace collapsing, then strips every
    double-quote character.
    '''
    pipeline = (
        convert_to_ascii,
        lowercase,
        expand_numbers,
        expand_abbreviations,
        collapse_whitespace,
    )
    for stage in pipeline:
        text = stage(text)
    return text.replace('"', '')
class TypicalLogitsWarper(LogitsWarper):
    """Logits warper implementing typical sampling.

    Tokens are ranked by how far their negative log-probability lies from the
    entropy of the distribution; the closest tokens are kept until their
    cumulative probability reaches ``mass`` and all others are set to
    ``filter_value``.
    """

    def __init__(self, mass: float = 0.9, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
        self.filter_value = filter_value
        self.mass = mass
        self.min_tokens_to_keep = min_tokens_to_keep

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Return ``scores`` with atypical tokens replaced by ``filter_value``.

        ``scores`` is treated as 2-D (the code gathers/scatters along dim 1);
        ``input_ids`` is not used.
        """
        # calculate entropy
        normalized = torch.nn.functional.log_softmax(scores, dim=-1)
        p = torch.exp(normalized)
        ent = -(normalized * p).nansum(-1, keepdim=True)

        # shift and sort
        shifted_scores = torch.abs((-normalized) - ent)
        sorted_scores, sorted_indices = torch.sort(shifted_scores, descending=False)
        sorted_logits = scores.gather(-1, sorted_indices)
        cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)

        # Remove tokens with cumulative mass above the threshold
        last_ind = (cumulative_probs < self.mass).sum(dim=1)
        # A count of booleans is never negative, but it CAN equal the number
        # of tokens (mass >= 1, or the cumulative sum rounding just below the
        # mass), which would index one past the end in the gather below.
        last_ind = last_ind.clamp(max=sorted_scores.shape[-1] - 1)
        sorted_indices_to_remove = sorted_scores > sorted_scores.gather(1, last_ind.view(-1, 1))
        if self.min_tokens_to_keep > 1:
            # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
            sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0
        # Map the removal mask from sorted order back to vocabulary order.
        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)

        scores = scores.masked_fill(indices_to_remove, self.filter_value)
        return scores
class Wav2VecAlignment:
    """
    Uses wav2vec2 to perform audio<->text alignment.
    """
    def __init__(self, device='cuda'):
        # The model is parked on the CPU and only moved to `device` while aligning.
        self.model = Wav2Vec2ForCTC.from_pretrained("jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli").cpu()
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-960h")
        self.tokenizer = Wav2Vec2CTCTokenizer.from_pretrained('jbetker/tacotron-symbols')
        self.device = device

    def align(self, audio, expected_text, audio_sample_rate=24000):
        """Return one sample offset into ``audio`` per character of ``expected_text``.

        Offsets for characters the model did not predict are interpolated
        between their aligned neighbours. Raises ``AssertionError`` (after
        dumping ``alignment_debug.pth``) when the alignment is inconsistent.
        """
        orig_len = audio.shape[-1]

        with torch.no_grad():
            self.model = self.model.to(self.device)
            try:
                audio = audio.to(self.device)
                audio = torchaudio.functional.resample(audio, audio_sample_rate, 16000)
                clip_norm = (audio - audio.mean()) / torch.sqrt(audio.var() + 1e-7)
                logits = self.model(clip_norm).logits
            finally:
                # Always release the device, even when the forward pass fails.
                self.model = self.model.cpu()

        logits = logits[0]
        pred_string = self.tokenizer.decode(logits.argmax(-1).tolist())

        fixed_expectation = max_alignment(expected_text.lower(), pred_string)
        # Number of original audio samples represented by one logit frame.
        w2v_compression = orig_len // logits.shape[0]
        expected_tokens = self.tokenizer.encode(fixed_expectation)
        expected_chars = list(fixed_expectation)
        if len(expected_tokens) == 1:
            return [0]  # The alignment is simple; there is only one token.
        expected_tokens.pop(0)  # The first token is a given.
        expected_chars.pop(0)

        alignments = [0]

        def pop_till_you_win():
            # Pop the next expected token, recording -1 for every skipped ('~') character.
            if len(expected_tokens) == 0:
                return None
            popped = expected_tokens.pop(0)
            popped_char = expected_chars.pop(0)
            while popped_char == '~':
                alignments.append(-1)
                if len(expected_tokens) == 0:
                    return None
                popped = expected_tokens.pop(0)
                popped_char = expected_chars.pop(0)
            return popped

        next_expected_token = pop_till_you_win()
        for i, logit in enumerate(logits):
            top = logit.argmax()
            if next_expected_token == top:
                alignments.append(i * w2v_compression)
                if len(expected_tokens) > 0:
                    next_expected_token = pop_till_you_win()
                else:
                    break

        pop_till_you_win()
        if not (len(expected_tokens) == 0 and len(alignments) == len(expected_text)):
            torch.save([audio, expected_text], 'alignment_debug.pth')
            # Raised explicitly so the check survives `python -O`.
            raise AssertionError(
                "Something went wrong with the alignment algorithm. I've dumped a file, 'alignment_debug.pth' to "
                "your current working directory. Please report this along with the file so it can get fixed."
            )

        # Now fix up alignments. Anything with -1 should be interpolated.
        alignments.append(orig_len)  # This'll get removed but makes the algorithm below more readable.
        for i in range(len(alignments)):
            if alignments[i] == -1:
                for j in range(i+1, len(alignments)):
                    if alignments[j] != -1:
                        next_found_token = j
                        break
                for j in range(i, next_found_token):
                    gap = alignments[next_found_token] - alignments[i-1]
                    alignments[j] = (j-i+1) * gap // (next_found_token-i+1) + alignments[i-1]

        return alignments[:-1]

    def redact(self, audio, expected_text, audio_sample_rate=24000):
        """Cut the audio for every ``[bracketed]`` span of ``expected_text`` out of ``audio``.

        Returns ``audio`` itself when the text contains no ``[``; otherwise the
        kept segments concatenated along the last dimension.
        """
        if '[' not in expected_text:
            return audio
        splitted = expected_text.split('[')
        fully_split = [splitted[0]]
        for spl in splitted[1:]:
            assert ']' in spl, 'Every "[" character must be paired with a "]" with no nesting.'
            fully_split.extend(spl.split(']'))

        # At this point, fully_split is a list of strings, with every other string being something that should be redacted.
        non_redacted_intervals = []
        last_point = 0
        for i, segment in enumerate(fully_split):
            # Empty kept segments contribute no audio; skipping them also avoids
            # indexing past the alignment list when the text ends with a redaction.
            if i % 2 == 0 and segment:
                end_interval = max(0, last_point + len(segment) - 1)
                non_redacted_intervals.append((last_point, end_interval))
            last_point += len(segment)

        if not non_redacted_intervals:
            # Everything was redacted: nothing to keep.
            return audio[:, :0]

        bare_text = ''.join(fully_split)
        alignments = self.align(audio, bare_text, audio_sample_rate)

        output_audio = [audio[:, alignments[start]:alignments[stop]]
                        for start, stop in non_redacted_intervals]
        return torch.cat(output_audio, dim=-1)
class Gpt:
    """Autoregressive stage: wraps a ``UnifiedVoice`` model loaded onto CUDA."""

    def __init__(
        self,
        num_autoregressive_samples: int,
        top_p: float,
        temperature: float,
        length_penalty: int,
        repetition_penalty: float,
        max_mel_tokens: int,
        autoregressive_batch_size: int,
    ):
        # Sampling hyper-parameters, stored verbatim for parse_inference().
        self.num_autoregressive_samples = num_autoregressive_samples
        self.top_p = top_p
        self.temperature = temperature
        self.length_penalty = length_penalty
        self.repetition_penalty = repetition_penalty
        self.max_mel_tokens = max_mel_tokens
        self.autoregressive_batch_size = autoregressive_batch_size

        # Build on CPU in eval mode, load the checkpoint, then move to the GPU.
        model = UnifiedVoice(
            max_mel_tokens=604,
            max_text_tokens=402,
            max_conditioning_inputs=2,
            layers=30,
            model_dim=1024,
            heads=16,
            number_text_tokens=255,
            start_text_token=255,
            checkpointing=False,
            train_solo_embeddings=False,
        )
        self.gpt = model.cpu().eval()
        checkpoint = torch.load(get_model_path("autoregressive.pth", MODELS_DIR))
        self.gpt.load_state_dict(checkpoint)
        self.gpt = self.gpt.to("cuda")

    def __num_batches(self):
        """Number of full sampling batches (integer division)."""
        return self.num_autoregressive_samples // self.autoregressive_batch_size

    @staticmethod
    def deterministic_state(seed=None):
        """Seed torch and ``random``; falls back to the current time when ``seed`` is None."""
        if seed is None:
            seed = int(time())
        torch.manual_seed(seed)
        random.seed(seed)
        return seed

    def parse(self, auto_conditioning, text_tokens, best_results, seed, k=1):
        """Run the model with ``return_latent=True`` on the chosen codes and return its output."""
        self.deterministic_state(seed=seed)
        cond = copy(auto_conditioning).to("cuda")
        tokens = copy(text_tokens).to("cuda")
        codes = copy(best_results).to("cuda")

        text_lengths = torch.tensor([tokens.shape[-1]], device=tokens.device)
        wav_lengths = torch.tensor(
            [codes.shape[-1] * self.gpt.mel_length_compression],
            device=tokens.device,
        )
        latents = self.gpt(
            cond.repeat(k, 1),
            tokens.repeat(k, 1),
            text_lengths,
            codes,
            wav_lengths,
            return_latent=True,
            clip_inputs=False,
        )
        return latents

    def parse_inference(
        self, auto_conditioning: torch.Tensor, text_tokens: torch.Tensor, seed
    ) -> Tuple[List[torch.Tensor], int]:
        """Sample code batches and return ``(batches, stop_mel_token)``.

        Every batch is padded with the stop token to ``max_mel_tokens`` columns.
        """
        self.deterministic_state(seed=seed)
        cond = copy(auto_conditioning).to("cuda")
        tokens = copy(text_tokens).to("cuda")
        stop_token = self.gpt.stop_mel_token

        batches = []
        with torch.no_grad():
            for _ in range(self.__num_batches()):
                generated = self.gpt.inference_speech(
                    cond,
                    tokens,
                    do_sample=True,
                    top_p=self.top_p,
                    temperature=self.temperature,
                    num_return_sequences=self.autoregressive_batch_size,
                    length_penalty=self.length_penalty,
                    repetition_penalty=self.repetition_penalty,
                    max_generate_length=self.max_mel_tokens,
                )
                missing = self.max_mel_tokens - generated.shape[1]
                batches.append(F.pad(generated, (0, missing), value=stop_token))

        return batches, self.gpt.stop_mel_token
= K + + @staticmethod + def fix_gpt_output(codes, stop_token, complain=True): + stop_token_indices = (codes == stop_token).nonzero() + if len(stop_token_indices) == 0: + if complain: + print( + "No stop tokens found in one of the generated voice clips. This typically means the spoken audio " + "is " + "too long. In some cases, the output will still be good, though. Listen to it and if it is " + "missing words, " + "try breaking up your input text." + ) + return codes + else: + codes[stop_token_indices] = 83 + stm = stop_token_indices.min().item() + codes[stm:] = 83 + if stm - 3 < codes.shape[0]: + codes[-3] = 45 + codes[-2] = 45 + codes[-1] = 248 + + return codes + + def parse( + self, + text_tokens: torch.Tensor, + samples: List[torch.Tensor], + stop_mel_token: int, + seed: int, + ) -> torch.Tensor: + self.deterministic_state(seed=seed) + clip_results = [] + text_tokens = copy(text_tokens).to("cuda") + samples = [copy(batch).to("cuda") for batch in samples] + for batch in samples: + for i in range(batch.shape[0]): + batch[i] = self.fix_gpt_output(batch[i], stop_mel_token) + + clvp = self.clvp( + text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False + ) + clip_results.append(clvp) + + clip_results = torch.cat(clip_results, dim=0) + samples = torch.cat(samples, dim=0) + # return samples[torch.topk(clip_results, self.K).indices].cpu().detach().numpy() + return samples[torch.topk(clip_results, self.K).indices] + + @staticmethod + def deterministic_state(seed=None): + seed = int(time()) if seed is None else seed + torch.manual_seed(seed) + random.seed(seed) + return seed + + +class Diffusion: + def __init__( + self, + diffusion_temperature, + diffusion_iterations=30, + cond_free=True, + cond_free_k=2, + ): + self.diffusion_temperature = diffusion_temperature + self.diffusion = ( + DiffusionTts( + model_channels=1024, + num_layers=10, + in_channels=100, + out_channels=200, + in_latent_channels=1024, + in_tokens=8193, + dropout=0, + use_fp16=False, + 
num_heads=16, + layer_drop=0, + unconditioned_percentage=0, + ) + .cpu() + .eval() + ) + self.diffusion.load_state_dict( + torch.load(get_model_path("diffusion_decoder.pth", MODELS_DIR)) + ) + self.diffuser = load_discrete_vocoder_diffuser( + desired_diffusion_steps=diffusion_iterations, + cond_free=cond_free, + cond_free_k=cond_free_k, + ) + + self.vocoder = UnivNetGenerator().cpu() + self.vocoder.load_state_dict( + torch.load( + get_model_path("vocoder.pth", MODELS_DIR), + map_location=torch.device("cpu"), + )["model_g"] + ) + self.vocoder.eval(inference=True) + self.diffusion.to("cuda") + self.vocoder.to("cuda") + self.aligner = Wav2VecAlignment() + # state = self.deterministic_state(seed=0) #Remove after testing + self.TACOTRON_MEL_MAX = 2.3143386840820312 + self.TACOTRON_MEL_MIN = -11.512925148010254 + + def denormalize_tacotron_mel(self, norm_mel): + return ((norm_mel + 1) / 2) * ( + self.TACOTRON_MEL_MAX - self.TACOTRON_MEL_MIN + ) + self.TACOTRON_MEL_MIN + + def potentially_redact(self, clip, text): + return self.aligner.redact(clip.squeeze(1), text).unsqueeze(1) + + @staticmethod + def deterministic_state(seed=None): + seed = int(time()) if seed is None else seed + torch.manual_seed(seed) + random.seed(seed) + return seed + + def do_spectrogram_diffusion( + self, + diffusion_model, + diffuser, + latents, + conditioning_latents, + seed, + temperature=1, + verbose=False, + ): + self.deterministic_state(seed=seed) + with torch.no_grad(): + output_seq_len = ( + latents.shape[1] * 4 * 24000 // 22050 + ) # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal. 
def parse(
    self, best_results, best_latents, calm_token, diffusion_conditioning, text, seed
):
    """Turn the selected code/latent candidates into redacted waveforms.

    Args:
        best_results: Tensor of candidate codes; one candidate per index
            along the first dimension.
        best_latents: Tensor of latents indexed in parallel with
            ``best_results``.
        calm_token: Code value whose long runs decide where the latents
            are cut.
        diffusion_conditioning: Conditioning tensor forwarded to
            ``do_spectrogram_diffusion``.
        text: Text forwarded to ``potentially_redact``.
        seed: Seed used to reset the random state here and again inside
            ``do_spectrogram_diffusion``.

    Returns:
        A list with one ``potentially_redact`` result per candidate.
    """
    self.deterministic_state(seed=seed)

    # Work on CUDA-side copies of the three inputs.
    results_gpu = copy(best_results).to("cuda")
    latents_gpu = copy(best_latents).to("cuda")
    conditioning_gpu = copy(diffusion_conditioning).to("cuda")

    raw_wavs = []
    for index in range(results_gpu.shape[0]):
        codes = results_gpu[index].unsqueeze(0)
        latents = latents_gpu[index].unsqueeze(0)

        # Cut the latents at the position where a run of more than eight
        # consecutive calm tokens is reached.
        run_length = 0
        for position in range(codes.shape[-1]):
            run_length = run_length + 1 if codes[0, position] == calm_token else 0
            if run_length > 8:
                latents = latents[:, :position]
                break

        mel = self.do_spectrogram_diffusion(
            self.diffusion,
            self.diffuser,
            latents,
            conditioning_gpu,
            seed,
            temperature=self.diffusion_temperature,
            verbose=False,
        )
        raw_wavs.append(self.vocoder.inference(mel))

    # TODO: Check whether wav candidates should be in numpy
    # Redaction runs only after every candidate has been vocoded.
    return [self.potentially_redact(wav, text) for wav in raw_wavs]
@staticmethod
def __check_for_long_sentence(text_tokens):
    """Reject token sequences that are too long for a single inference.

    The check is an explicit ``raise`` rather than an ``assert`` statement
    so that it still runs when Python is started with ``-O`` (which strips
    asserts). The exception type is kept as ``AssertionError`` so existing
    callers see the same error as before.

    Args:
        text_tokens: Object exposing ``shape``; only the size of its last
            dimension is inspected.

    Raises:
        AssertionError: If the last dimension holds 400 or more tokens.
    """
    if text_tokens.shape[-1] >= 400:
        raise AssertionError(
            "Too much text provided. Break the text up into separate segments and re-try inference."
        )
def generate(self, text, voice, seed):
    """Synthesize ``text`` with the requested voice.

    The pipeline is: tokenize the text, sample candidates with the GPT
    stage, pick the best sample with CLVP, compute its latents, and run
    the diffusion stage to obtain waveforms.

    Args:
        text: Sentence to synthesize.
        voice: ``"gabby_reading"`` or ``"gabby_conversation"``.
            ``"gabby_convo"`` is accepted as an alias of the latter
            because ``model1_deployment`` lets that spelling through.
        seed: Seed used to reset the random state before generation and
            forwarded to every stage.

    Returns:
        The waveform moved to the CPU when the diffusion stage yields a
        single candidate, otherwise a list of CPU waveforms (one per
        candidate).

    Raises:
        ValueError: If ``voice`` is not one of the known voices.
        AssertionError: If the tokenized text is too long (raised by the
            long-sentence check).
    """
    if voice == "gabby_reading":
        self.auto_conditioning = self.auto_conditioning1
        self.diffusion_conditioning = self.diffusion_conditioning1
    elif voice in ("gabby_conversation", "gabby_convo"):
        self.auto_conditioning = self.auto_conditioning2
        self.diffusion_conditioning = self.diffusion_conditioning2
    else:
        # Previously an unknown voice fell through and silently reused the
        # conditioning of the last request (or None on the first one).
        raise ValueError(
            f"Unknown voice {voice!r}; expected 'gabby_reading' or 'gabby_conversation'."
        )

    self.deterministic_state(seed=seed)
    # preprocess the in-coming text into tokens
    text_tokens = self.preprocess_text(text)
    self.__check_for_long_sentence(text_tokens)

    samples, stop_mel_token = self.gpt.parse_inference(
        self.auto_conditioning, text_tokens, seed
    )
    best_sample = self.clvp.parse(text_tokens, samples, stop_mel_token, seed)
    best_latent = self.gpt.parse(
        self.auto_conditioning, text_tokens, best_sample, seed
    )
    wav_candidates = self.diffusion.parse(
        best_sample,
        best_latent,
        self.calm_token,
        self.diffusion_conditioning,
        text,
        seed,
    )

    # A list has no ``.cpu()``; move each candidate individually instead of
    # calling ``.cpu()`` on the list as the old code did.
    if len(wav_candidates) > 1:
        return [candidate.cpu() for candidate in wav_candidates]
    return wav_candidates[0].cpu()
def model1_deployment(voice="gabby_reading", text="hello how are you!", seed=3):
    """Deploy the orchestrator through Ray Serve and synthesize ``text``.

    The text is split on ``". "``; every non-blank sentence is generated
    remotely, and the results are merged into one WAV file by the
    orchestrator's ``parse`` method.

    Args:
        voice: ``"gabby_reading"`` or ``"gabby_conversation"``. The legacy
            spelling ``"gabby_convo"`` is still accepted and mapped to
            ``"gabby_conversation"``, the only name ``Orchestractor.generate``
            recognizes for that voice.
        text: Text to synthesize.
        seed: Seed forwarded to every generation call.

    Returns:
        A ``FileResponse`` for the written WAV file, or the string
        ``"<voice> not available!"`` when the voice is unknown.
    """
    if voice == "gabby_reading":
        canonical_voice = "gabby_reading"
    elif voice in ("gabby_conversation", "gabby_convo"):
        canonical_voice = "gabby_conversation"
    else:
        # Validate before deploying so a bad request does not start Serve
        # and load the models only to report an error.
        return f"{voice} not available!"

    serve.start(detached=True)
    config = get_config_file(Path("config-model.yaml"))
    # NOTE(review): the ``@serve.deployment`` decorator on Orchestractor is
    # commented out in this file, so ``Orchestractor.deploy`` presumably does
    # not exist as written -- confirm before relying on this entry point.
    Orchestractor.deploy(config)
    orchestrator = serve.get_deployment("orchestrator").get_handle()
    app.deploy = orchestrator

    # Drop blank fragments (e.g. from text ending in ". ") so no empty
    # sentence is sent to generation; fall back to the whole text if
    # nothing is left.
    sentences = [part for part in text.split(". ") if part.strip()] or [text]
    values = ray.get(
        [
            orchestrator.generate.remote(
                text=sentence, voice=canonical_voice, seed=seed
            )
            for sentence in sentences
        ]
    )
    file_name = ray.get(orchestrator.parse.remote(values))
    return FileResponse(f"./{file_name}.wav")
def get_config_file(file_path: Path) -> Dict[Text, Any]:
    """Load a YAML configuration file.

    Args:
        file_path: Location of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's contents (a
        mapping for files shaped like ``config-model.yaml``).
    """
    # Pin the encoding: without it ``open`` falls back to the locale's
    # default, so the same file could decode differently across machines.
    with open(file_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)
def get_model_path(model_name, models_dir=MODELS_DIR):
    """Return the on-disk path of a known model, fetching it if needed.

    Args:
        model_name: Key of the model in ``MODELS``.
        models_dir: Directory expected to hold the model file. A download
            is only attempted when this is the default ``MODELS_DIR``.

    Returns:
        ``models_dir`` joined with ``model_name``.

    Raises:
        ValueError: If ``model_name`` is not a key of ``MODELS``.
    """
    is_known = model_name in MODELS
    if not is_known:
        raise ValueError(f"Model {model_name} not found in available models.")

    target = os.path.join(models_dir, model_name)
    is_missing = not os.path.exists(target)
    # Custom directories are left alone; only the default cache is filled.
    if is_missing and models_dir == MODELS_DIR:
        download_models([model_name])
    return target