Spaces:

Karayakar
/

chtrbx

Sleeping

App Files Community

Karay Akar committed on Aug 9

Commit

9b1625c

1 Parent(s): 650e533

f1

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +0 -35
README.md +0 -13
app.py +0 -230
chatterbox/src/chatterbox/__init__.py +0 -10
chatterbox/src/chatterbox/__pycache__/__init__.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/__pycache__/tts.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/__pycache__/vc.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/__init__.py +0 -0
chatterbox/src/chatterbox/models/__pycache__/__init__.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/__pycache__/model_v2.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/__pycache__/utils.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/model_v2.py +0 -405
chatterbox/src/chatterbox/models/s3gen/__init__.py +0 -2
chatterbox/src/chatterbox/models/s3gen/__pycache__/__init__.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/__pycache__/configs.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/__pycache__/const.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/__pycache__/decoder.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/__pycache__/f0_predictor.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/__pycache__/flow.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/__pycache__/flow_matching.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/__pycache__/hifigan.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/__pycache__/s3gen.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/__pycache__/xvector.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/configs.py +0 -10
chatterbox/src/chatterbox/models/s3gen/const.py +0 -1
chatterbox/src/chatterbox/models/s3gen/decoder.py +0 -317
chatterbox/src/chatterbox/models/s3gen/f0_predictor.py +0 -55
chatterbox/src/chatterbox/models/s3gen/flow.py +0 -282
chatterbox/src/chatterbox/models/s3gen/flow_matching.py +0 -218
chatterbox/src/chatterbox/models/s3gen/hifigan.py +0 -474
chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/decoder.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/flow_matching.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/transformer.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/matcha/decoder.py +0 -443
chatterbox/src/chatterbox/models/s3gen/matcha/flow_matching.py +0 -129
chatterbox/src/chatterbox/models/s3gen/matcha/text_encoder.py +0 -413
chatterbox/src/chatterbox/models/s3gen/matcha/transformer.py +0 -316
chatterbox/src/chatterbox/models/s3gen/s3gen.py +0 -298
chatterbox/src/chatterbox/models/s3gen/transformer/__init__.py +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/__init__.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/activation.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/attention.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/convolution.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/embedding.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/encoder_layer.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/subsampling.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-310.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/activation.py +0 -84
chatterbox/src/chatterbox/models/s3gen/transformer/attention.py +0 -330

.gitattributes DELETED Viewed

@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md DELETED Viewed

@@ -1,13 +0,0 @@
----
-title: Chtrbx
-emoji: ⚡
-colorFrom: red
-colorTo: purple
-sdk: gradio
-sdk_version: 5.42.0
-app_file: app.py
-pinned: false
-license: cc-by-nc-4.0
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py DELETED Viewed

@@ -1,230 +0,0 @@
-import random
-import numpy as np
-import torch
-import gradio as gr
-from chatterbox.tts import ChatterboxTTS
-import json
-from pathlib import Path
-from torch import nn, Tensor
-import threading
-# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
-# NOTE(review): load_model_init() recomputes its own local `device`, so this
-# module-level value is not read by the code visible in this file.
-device = "cuda" if torch.cuda.is_available() else "cpu"
-# Remote locations of the T3 weights and the tokenizer definition.
-# NOTE(review): load_model_init() re-declares the same two values locally, so
-# these module-level copies are not read by the code visible in this file.
-t3_model_path="https://huggingface.co/Karayakar/PRVCHTBX/resolve/main/model.safetensors"
-tokenizer_path="https://huggingface.co/Karayakar/PRVCHTBX/resolve/main/tokenizer.json"
-# NOTE(review): not read anywhere in the visible code; generate() binds its
-# own local `model`.
-model=None
-# NOTE(review): functools is not referenced in the visible code.
-import functools
-# Process-wide model cache, filled on first use by get_model_singleton().
-_MODEL_SINGLETON = None
-# Guards the first load so concurrent callers do not load the model twice.
-_MODEL_LOCK = threading.Lock()
-def get_model_singleton():
-    """Return the process-wide model, creating it on the first call.
-    The fast path returns the cached instance without taking the lock. On a
-    cache miss the lock is taken and the cache is checked again, so only one
-    caller ever runs ``load_model_init()``.
-    """
-    global _MODEL_SINGLETON
-    if _MODEL_SINGLETON is not None:
-        return _MODEL_SINGLETON
-    with _MODEL_LOCK:
-        # Another thread may have finished loading while we waited for the lock.
-        if _MODEL_SINGLETON is None:
-            _MODEL_SINGLETON = load_model_init()
-    return _MODEL_SINGLETON
-def set_seed(seed: int):
-    """Seed every random number generator the app draws from.
-    Args:
-        seed: Value applied to Python's ``random``, NumPy's global generator
-            and torch (CPU, the current CUDA device and all CUDA devices).
-    """
-    # The generators are independent of each other, so the order is irrelevant.
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)
-def set_inference_flags(tts, device="cuda", dtype=torch.bfloat16):
-    """Put the sub-models of *tts* into eval mode and move them to *device*/*dtype*.
-    The attributes ``t3`` (LLM side), ``s3gen`` (speech generator) and ``ve``
-    (voice encoder) are handled when present and when they are ``nn.Module``
-    instances. Every step is best-effort: a failing move or compile is
-    reported on stdout and the module is left as it was.
-    Args:
-        tts: Object that may expose ``t3``, ``s3gen`` and ``ve`` attributes.
-        device: Target device passed to ``Module.to``.
-        dtype: Target dtype passed to ``Module.to``.
-    """
-    for attr in ("t3", "s3gen", "ve"):
-        module = getattr(tts, attr, None)
-        if not isinstance(module, nn.Module):
-            continue
-        module.eval()
-        try:
-            module.to(device=device, dtype=dtype)
-        except Exception as e:
-            # Best-effort on purpose, but say so instead of failing silently.
-            print(f"{attr}: move to {device}/{dtype} skipped:", e)
-        # Compile the HF backbone of t3 when it has one.
-        if attr == "t3" and hasattr(module, "tfmr"):
-            try:
-                module.tfmr = torch.compile(module.tfmr, mode="reduce-overhead")
-            except Exception as e:
-                print("compile skipped:", e)
-def load_model():
-    """Return the shared model instance, loading it if it is not cached yet."""
-    shared_model = get_model_singleton()
-    return shared_model
-def _ensure_local_file(path_or_url):
-    """Return a local filesystem path for *path_or_url*.
-    Plain paths and empty values are returned unchanged. ``http``/``https``
-    URLs are downloaded once into a cache folder under the system temp
-    directory and the path of the cached copy is returned.
-    Raises:
-        OSError: If the download fails (``urllib.error.URLError`` is a subclass).
-    """
-    import hashlib
-    import tempfile
-    from urllib.parse import urlparse
-    from urllib.request import urlretrieve
-    if not path_or_url:
-        return path_or_url
-    parsed = urlparse(str(path_or_url))
-    if parsed.scheme not in ("http", "https"):
-        return path_or_url
-    # One folder per URL so equal file names from different URLs cannot clash.
-    url_key = hashlib.sha1(str(path_or_url).encode("utf-8")).hexdigest()[:12]
-    cache_dir = Path(tempfile.gettempdir()) / "chtrbx_model_cache" / url_key
-    cache_dir.mkdir(parents=True, exist_ok=True)
-    target = cache_dir / Path(parsed.path).name
-    if not target.exists():
-        # Download under a temporary name first so an interrupted transfer is
-        # never mistaken for a complete file on the next start.
-        partial = target.with_name(target.name + ".part")
-        urlretrieve(str(path_or_url), partial)
-        partial.replace(target)
-    return str(target)
-def load_model_init():
-    """Build the ChatterboxTTS model from the configured checkpoint files.
-    Component paths (voice encoder, s3gen, conds and a fallback tokenizer) are
-    read from ``/model_path.json``. The T3 weights and the tokenizer are given
-    as URLs; they are downloaded to local files first, because
-    ``Path(url).exists()`` is always False and the validity check below would
-    otherwise reject them on every call.
-    Returns:
-        The object returned by ``ChatterboxTTS.from_specified``.
-    Raises:
-        gr.Error: If the T3 model file cannot be downloaded or does not exist.
-    """
-    print("LOAD MODEL CALLED")
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    t3_model_path="https://huggingface.co/Karayakar/PRVCHTBX/resolve/main/model.safetensors"
-    tokenizer_path="https://huggingface.co/Karayakar/PRVCHTBX/resolve/main/tokenizer.json"
-    with open("/model_path.json", "r") as f:
-        model_path = json.load(f)
-    try:
-        t3_model_path = _ensure_local_file(t3_model_path)
-    except OSError as err:
-        raise gr.Error(f"Could not download the T3 model file: {err}") from err
-    if not t3_model_path or not Path(t3_model_path).exists():
-        raise gr.Error("Please select a valid T3 model file")
-    voice_encoder_path = model_path["voice_encoder_path"]
-    s3gen_path = model_path["s3gen_path"]
-    try:
-        tokenizer_path = _ensure_local_file(tokenizer_path)
-    except OSError as err:
-        # The tokenizer has a configured fallback, so a failed download is not fatal.
-        print("tokenizer download skipped:", err)
-        tokenizer_path = None
-    if not tokenizer_path or not Path(tokenizer_path).exists():
-        tokenizer_path = model_path["tokenizer_path"]
-    conds_path = Path(model_path["conds_path"])
-    model = ChatterboxTTS.from_specified(
-            voice_encoder_path=voice_encoder_path,
-            t3_path=t3_model_path,
-            s3gen_path=s3gen_path,
-            tokenizer_path=tokenizer_path,
-            conds_path=conds_path,
-            device=device)
-    # --- SPEED SETTINGS: applied right here, after the model is built ---
-    # speed flags (3090)
-    torch.set_float32_matmul_precision("high")
-    # NOTE(review): the previous code called sdp_kernel(enable_flash=True,
-    # enable_math=False, enable_mem_efficient=False) as a plain function.
-    # sdp_kernel is a context manager, so outside a `with` block the call
-    # changed nothing and has been dropped. Restricting attention to the flash
-    # kernel is deliberately NOT enforced here: presumably it needs
-    # half-precision inputs while the weights stay float32 - confirm first.
-    # Original note: weights float32 doesn't work -->
-    # set_inference_flags(model, device=device, dtype=torch.bfloat16)  # <- here
-    torch.backends.cudnn.benchmark = True
-    return model
-def generate(text, audio_prompt_path, exaggeration, temperature, seed_num, cfgw, min_p, top_p, repetition_penalty):
-    """Synthesize speech for *text* with the shared model.
-    Args:
-        text: Text to synthesize.
-        audio_prompt_path: File path of the reference audio, or None.
-        exaggeration: Forwarded to ``model.generate`` as ``exaggeration``.
-        temperature: Sampling temperature forwarded to ``model.generate``.
-        seed_num: Random seed; 0 (or an empty field) leaves the generators unseeded.
-        cfgw: Forwarded to ``model.generate`` as ``cfg_weight``.
-        min_p: Forwarded to ``model.generate`` as ``min_p``.
-        top_p: Forwarded to ``model.generate`` as ``top_p``.
-        repetition_penalty: Forwarded to ``model.generate`` as ``repetition_penalty``.
-    Returns:
-        ``(sample_rate, samples)`` with ``samples`` as a NumPy array.
-    """
-    model = get_model_singleton()
-    # A cleared number field arrives as None; treat it like 0 ("random").
-    if seed_num:
-        set_seed(int(seed_num))
-    wav = model.generate(
-        text,
-        repetition_penalty=repetition_penalty,
-        min_p=min_p,
-        top_p=top_p,
-        audio_prompt_path=audio_prompt_path,
-        exaggeration=exaggeration,
-        cfg_weight=cfgw,
-        temperature=temperature,
-    )
-    # Tensor.numpy() only works for CPU tensors that do not require grad.
-    return (model.sr, wav.squeeze(0).detach().cpu().numpy())
-# Gradio UI: input controls in the left column, generated audio in the right one.
-with gr.Blocks() as demo:
-    #model_state = gr.State(None)  # Loaded once per session/user
-    with gr.Row():
-        with gr.Column():
-            # NOTE(review): the label advertises a 300-character limit, but nothing
-            # in the visible code enforces it.
-            text = gr.Textbox(
-                value="Yapzek.ai, çok yakında ses klonlama özelliğiyle sınırları bir adım daha öteye taşıyor! Kendi sesinizi birkaç dakikalık örnekle klonlayabilecek, dilediğiniz metni kendi sesinizle, üstelik duygulu ve doğal bir şekilde seslendirebileceksiniz.",
-                label="Text to synthesize (max chars 300)",
-                max_lines=5
-            )
-            # type="filepath": generate() receives the reference audio as a path.
-            ref_wav = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Reference Audio File", value=None)
-            exaggeration = gr.Slider(0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5, extreme values can be unstable)", value=.5)
-            cfg_weight = gr.Slider(0.0, 1, step=.05, label="CFG/Pace", value=0.5)
-            with gr.Accordion("More options", open=False):
-                # generate() only seeds the generators when this value is not 0.
-                seed_num = gr.Number(value=0, label="Random seed (0 for random)")
-                temp = gr.Slider(0.05, 5, step=.05, label="temperature", value=.8)
-                min_p = gr.Slider(0.00, 1.00, step=0.01, label="min_p || Newer Sampler. Recommend 0.02 > 0.1. Handles Higher Temperatures better. 0.00 Disables", value=0.05)
-                top_p = gr.Slider(0.00, 1.00, step=0.01, label="top_p || Original Sampler. 1.0 Disables(recommended). Original 0.8", value=1.00)
-                repetition_penalty = gr.Slider(1.00, 2.00, step=0.1, label="repetition_penalty", value=1.2)
-            run_btn = gr.Button("Generate", variant="primary")
-        with gr.Column():
-            audio_output = gr.Audio(label="Output Audio")
-    # load_model() fills the process-wide model cache; its return value is not
-    # bound to any output component.
-    demo.load(fn=load_model, inputs=[])#, outputs=model_state)
-    # The order of `inputs` must match the positional parameters of generate():
-    # text, audio_prompt_path, exaggeration, temperature, seed_num, cfgw,
-    # min_p, top_p, repetition_penalty.
-    run_btn.click(
-        fn=generate,
-        inputs=[
-            #model_state,
-            text,
-            ref_wav,
-            exaggeration,
-            temp,
-            seed_num,
-            cfg_weight,
-            min_p,
-            top_p,
-            repetition_penalty,
-        ],
-        outputs=audio_output,
-    )
-# Script entry point: enable the request queue, then start the app.
-if __name__ == "__main__":
-    # Up to 50 queued requests; default_concurrency_limit=1 is set so events
-    # run one at a time by default.
-    # share=True additionally requests a public share link.
-    demo.queue(
-        max_size=50,
-        default_concurrency_limit=1,
-    ).launch(share=True)

chatterbox/src/chatterbox/__init__.py DELETED Viewed

@@ -1,10 +0,0 @@
-# Use the standard-library importlib.metadata when available and fall back to
-# the importlib_metadata backport otherwise.
-try:
-    from importlib.metadata import version
-except ImportError:
-    from importlib_metadata import version  # For Python <3.8
-# Version string of the installed "chatterbox-tts" distribution.
-# NOTE(review): presumably this raises when that distribution is not installed
-# (e.g. the package is only present as vendored source) - confirm.
-__version__ = version("chatterbox-tts")
-# Public entry points of the package.
-from .tts import ChatterboxTTS
-from .vc import ChatterboxVC

chatterbox/src/chatterbox/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (391 Bytes)

chatterbox/src/chatterbox/__pycache__/tts.cpython-310.pyc DELETED Viewed

Binary file (17.4 kB)

chatterbox/src/chatterbox/__pycache__/vc.cpython-310.pyc DELETED Viewed

Binary file (7.16 kB)

chatterbox/src/chatterbox/models/__init__.py DELETED Viewed

File without changes

chatterbox/src/chatterbox/models/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (147 Bytes)

chatterbox/src/chatterbox/models/__pycache__/model_v2.cpython-310.pyc DELETED Viewed

Binary file (12.4 kB)

chatterbox/src/chatterbox/models/__pycache__/utils.cpython-310.pyc DELETED Viewed

Binary file (520 Bytes)

chatterbox/src/chatterbox/models/model_v2.py DELETED Viewed

@@ -1,405 +0,0 @@
-# Copyright (c)  (Mddct: Dinghao Zhou)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from dataclasses import dataclass
-from typing import Optional, Tuple
-import torch
-from einops import rearrange
-from s3tokenizer.model import Conv1d, LayerNorm, Linear, MultiHeadAttention
-from s3tokenizer.utils import make_non_pad_mask, mask_to_bias, onnx2torch
-@dataclass
-class ModelConfig:
-    """Hyper-parameters read by S3TokenizerV2 when it builds its encoder and quantizer."""
-    n_mels: int = 128  # passed to AudioEncoderV2 as n_mels (conv1 input channels)
-    n_audio_ctx: int = 1500  # not read by the code visible in this file
-    n_audio_state: int = 1280  # encoder width (n_state) and quantizer input dim
-    n_audio_head: int = 20  # attention heads per ResidualAttentionBlock
-    n_audio_layer: int = 6  # number of ResidualAttentionBlock layers
-    n_codebook_size: int = 3**8  # FSQVectorQuantization asserts exactly this value
-    use_sdpa: bool = False  # True selects torch's scaled_dot_product_attention path
-def precompute_freqs_cis(dim: int,
-                         end: int,
-                         theta: float = 10000.0,
-                         scaling=None):
-    """Build the complex rotary-position table.
-    Args:
-        dim: Rotary dimension; ``dim // 2`` distinct frequencies are produced.
-        end: Number of positions (rows of the table).
-        theta: Base of the geometric frequency progression.
-        scaling: Optional factor multiplied into the position indices.
-    Returns:
-        Complex tensor of shape ``(end, 2 * (dim // 2))`` holding unit-magnitude
-        values; the second half of the last axis repeats the first half.
-    """
-    exponents = torch.arange(0, dim, 2)[:(dim // 2)].float() / dim
-    inv_freq = 1.0 / (theta**exponents)
-    positions = torch.arange(end, device=inv_freq.device)  # type: ignore
-    if scaling is not None:
-        positions = positions * scaling
-    angles = torch.outer(positions, inv_freq).float()  # type: ignore
-    unit = torch.polar(torch.ones_like(angles), angles)  # complex64
-    return torch.cat((unit, unit), dim=-1)
-def apply_rotary_emb(
-    xq: torch.Tensor,
-    xk: torch.Tensor,
-    freqs_cis: torch.Tensor,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    """Rotate query and key tensors with the given complex rotary table.
-    Args:
-        xq: Query tensor, indexed as 4-D with the rotary axis last.
-        xk: Key tensor, same layout as ``xq``.
-        freqs_cis: 2-D complex table; its real/imaginary parts supply cos/sin
-            and are broadcast over axes 0 and 2 of ``xq``/``xk``.
-    Returns:
-        The rotated ``(xq, xk)`` pair.
-    """
-    def _swap_halves(x: torch.Tensor) -> torch.Tensor:
-        # (left, right) -> (-right, left) along the last axis.
-        mid = x.shape[-1] // 2
-        return torch.cat((-x[:, :, :, mid:], x[:, :, :, :mid]), dim=-1)
-    parts = torch.view_as_real(freqs_cis)
-    cos = parts[:, :, 0].unsqueeze(0).unsqueeze(2)
-    sin = parts[:, :, 1].unsqueeze(0).unsqueeze(2)
-    return xq * cos + _swap_halves(xq) * sin, xk * cos + _swap_halves(xk) * sin
-def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
-    """View *freqs_cis* so that it broadcasts against *x*.
-    Axis 1 and the last axis of ``x`` keep their size; every other axis
-    becomes 1.
-    Raises:
-        AssertionError: If ``x`` has fewer than two axes or ``freqs_cis`` is
-            not shaped ``(x.shape[1], x.shape[-1])``.
-    """
-    ndim = x.ndim
-    assert ndim > 1
-    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
-    kept_axes = (1, ndim - 1)
-    shape = [
-        size if axis in kept_axes else 1 for axis, size in enumerate(x.shape)
-    ]
-    return freqs_cis.view(*shape)
-class FSQCodebook(torch.nn.Module):
-    """Encode-only codebook: projects features to 8 values and packs them into one index."""
-    def __init__(self, dim: int, level: int = 3):
-        super().__init__()
-        self.project_down = torch.nn.Linear(dim, 8)
-        self.level = level
-        self.embed = None
-    @torch.inference_mode()
-    def preprocess(self, x: torch.Tensor) -> torch.Tensor:
-        """Flatten every leading axis of *x* into one, keeping the last axis."""
-        return rearrange(x, "... d -> (...) d")
-    @torch.inference_mode()
-    def encode(self, x: torch.Tensor) -> torch.Tensor:
-        """Return one integer code per position, shaped ``(x.shape[0], x.shape[1])``."""
-        orig_shape = x.shape
-        flat = self.preprocess(x)
-        # tanh followed by the 0.999 factor keeps values inside (-1, 1), so
-        # rounding gives -1, 0 or 1; the +1 shifts that to the digits 0, 1, 2.
-        squashed = self.project_down(flat).float().tanh() * 0.9990000128746033
-        digits = squashed.round() + 1
-        # Positional weights level**0 ... level**(2**level - 1).
-        weights = torch.pow(
-            self.level,
-            torch.arange(2**self.level, device=x.device, dtype=digits.dtype))
-        codes = torch.sum(digits * weights.unsqueeze(0), dim=-1)
-        return codes.reshape(orig_shape[0], orig_shape[1]).int()
-    @torch.inference_mode()
-    def decode(self, embed_ind: torch.Tensor) -> torch.Tensor:
-        """Not available: always raises ``NotImplementedError``."""
-        raise NotImplementedError(
-            'There is no official up project component provided')
-class FSQVectorQuantization(torch.nn.Module):
-    """Inference-only vector quantizer that delegates to an ``FSQCodebook``.
-    Args:
-        dim (int): Dimension of the incoming features.
-        codebook_size (int): Codebook size; must equal ``3**8``.
-    """
-    def __init__(
-        self,
-        dim: int,
-        codebook_size: int,
-    ):
-        super().__init__()
-        assert 3**8 == codebook_size
-        self._codebook = FSQCodebook(dim=dim, level=3)
-        self.codebook_size = codebook_size
-    @property
-    def codebook(self):
-        """The ``embed`` attribute of the wrapped codebook."""
-        return self._codebook.embed
-    @torch.inference_mode()
-    def encode(self, x: torch.Tensor) -> torch.Tensor:
-        """Return the codes produced by the wrapped codebook for *x*."""
-        codes = self._codebook.encode(x)
-        return codes
-    @torch.inference_mode()
-    def decode(self, embed_ind: torch.Tensor) -> torch.Tensor:
-        """Decode through the wrapped codebook and swap the last two axes."""
-        return rearrange(self._codebook.decode(embed_ind), "b n d -> b d n")
-class FSMNMultiHeadAttention(MultiHeadAttention):
-    """Multi-head attention with an additional depthwise-convolution memory branch.
-    The memory branch (``forward_fsmn``) runs a per-channel 1-D convolution
-    over the value projection and is added to the attention output in
-    ``forward``.
-    """
-    def __init__(
-        self,
-        n_state: int,
-        n_head: int,
-        kernel_size: int = 31,
-        use_sdpa: bool = False,
-    ):
-        super().__init__(n_state, n_head)
-        # groups=n_state: every channel is convolved independently.
-        self.fsmn_block = torch.nn.Conv1d(n_state,
-                                          n_state,
-                                          kernel_size,
-                                          stride=1,
-                                          padding=0,
-                                          groups=n_state,
-                                          bias=False)
-        # Explicit zero padding of kernel_size - 1 in total keeps the length unchanged.
-        self.left_padding = (kernel_size - 1) // 2
-        self.right_padding = kernel_size - 1 - self.left_padding
-        self.pad_fn = torch.nn.ConstantPad1d(
-            (self.left_padding, self.right_padding), 0.0)
-        self.use_sdpa = use_sdpa
-    def forward_fsmn(self,
-                     inputs: torch.Tensor,
-                     mask: Optional[torch.Tensor] = None):
-        """Run the memory branch over *inputs* (4-D, heads are merged back).
-        Args:
-            inputs: Value tensor; its last two axes are flattened into one.
-            mask: Optional padding mask multiplied into the input and output.
-                ``None`` or a mask whose axis 2 is empty means "no masking".
-        Returns:
-            Convolved tensor plus the (masked) input, with the same 3-D shape.
-        """
-        b, t, _, _ = inputs.size()
-        inputs = inputs.view(b, t, -1)
-        # `mask` is optional, so both multiplications must be guarded the same
-        # way; previously the final one ran unconditionally and failed on None.
-        use_mask = mask is not None and mask.size(2) > 0  # time2 > 0
-        if use_mask:
-            inputs = inputs * mask
-        x = inputs.transpose(1, 2)
-        x = self.pad_fn(x)
-        x = self.fsmn_block(x)
-        x = x.transpose(1, 2)
-        x += inputs
-        if use_mask:
-            x = x * mask
-        return x
-    def qkv_attention(self,
-                      q: torch.Tensor,
-                      k: torch.Tensor,
-                      v: torch.Tensor,
-                      mask: Optional[torch.Tensor] = None,
-                      mask_pad: Optional[torch.Tensor] = None,
-                      freqs_cis: Optional[torch.Tensor] = None):
-        """Compute attention plus the memory branch.
-        Args:
-            q, k, v: Projected query/key/value tensors (3-D, last axis split into heads here).
-            mask: Additive attention bias; required when ``use_sdpa`` is True.
-            mask_pad: Padding mask handed to ``forward_fsmn``.
-            freqs_cis: Optional rotary table applied to ``q`` and ``k``.
-        Returns:
-            ``(attention_output, qk, fsm_memory)``; ``qk`` holds the detached
-            attention logits, or None on the SDPA path.
-        """
-        _, _, D = q.shape
-        # Applied to both q and k, so their product carries head_dim ** -0.5.
-        scale = (D // self.n_head)**-0.25
-        q = q.view(*q.shape[:2], self.n_head, -1)
-        k = k.view(*k.shape[:2], self.n_head, -1)
-        v = v.view(*v.shape[:2], self.n_head, -1)
-        if freqs_cis is not None:
-            q, k = apply_rotary_emb(q, k, freqs_cis=freqs_cis)
-        fsm_memory = self.forward_fsmn(v, mask_pad)
-        q = q.permute(0, 2, 1, 3) * scale
-        v = v.permute(0, 2, 1, 3)
-        if not self.use_sdpa:
-            k = k.permute(0, 2, 3, 1) * scale
-            qk = q @ k  # (B, n_head, T, T)
-            if mask is not None:
-                qk = qk + mask
-            # Softmax is evaluated in float32 and cast back to q's dtype.
-            qk = qk.float()
-            w = torch.nn.functional.softmax(qk, dim=-1).to(q.dtype)
-            return (w @ v).permute(
-                0, 2, 1, 3).flatten(start_dim=2), qk.detach(), fsm_memory
-        else:
-            k = k.permute(0, 2, 1, 3) * scale
-            assert mask is not None
-            # scale=1.: q and k were already scaled above.
-            output = torch.nn.functional.scaled_dot_product_attention(
-                q,
-                k,
-                v,
-                attn_mask=mask,
-                dropout_p=0.,
-                scale=1.,
-            )
-            output = (output.transpose(1,
-                                       2).contiguous().view(q.size(0), -1, D)
-                      )  # (batch, time1, d_model)
-            return output, None, fsm_memory
-    def forward(self,
-                x: torch.Tensor,
-                mask: Optional[torch.Tensor] = None,
-                mask_pad: Optional[torch.Tensor] = None,
-                freqs_cis: Optional[torch.Tensor] = None):
-        """Project *x*, attend, and add the memory branch.
-        Returns:
-            ``(output, qk)`` where ``output`` is the projected attention result
-            plus the memory branch and ``qk`` is as returned by ``qkv_attention``.
-        """
-        q = self.query(x)
-        k = self.key(x)
-        v = self.value(x)
-        wv, qk, fsm_memory = self.qkv_attention(q, k, v, mask, mask_pad,
-                                                freqs_cis)
-        return self.out(wv) + fsm_memory, qk
-class ResidualAttentionBlock(torch.nn.Module):
-    """Pre-norm block: attention with a residual, then an MLP with a residual."""
-    def __init__(
-        self,
-        n_state: int,
-        n_head: int,
-        kernel_size: int = 31,
-        use_sdpa: bool = False,
-    ):
-        super().__init__()
-        self.attn = FSMNMultiHeadAttention(n_state,
-                                           n_head,
-                                           kernel_size,
-                                           use_sdpa=use_sdpa)
-        self.attn_ln = LayerNorm(n_state, eps=1e-6)
-        # The hidden layer of the MLP is four times as wide as the model.
-        hidden = n_state * 4
-        self.mlp = torch.nn.Sequential(
-            Linear(n_state, hidden),
-            torch.nn.GELU(),
-            Linear(hidden, n_state),
-        )
-        self.mlp_ln = LayerNorm(n_state)
-    def forward(
-        self,
-        x: torch.Tensor,
-        mask: Optional[torch.Tensor] = None,
-        mask_pad: Optional[torch.Tensor] = None,
-        freqs_cis: Optional[torch.Tensor] = None,
-    ):
-        """Apply the block to *x* and return a tensor of the same shape."""
-        # Only the first element of the attention result is used here.
-        attn_out = self.attn(
-            self.attn_ln(x), mask=mask, mask_pad=mask_pad,
-            freqs_cis=freqs_cis)[0]
-        x = x + attn_out
-        mlp_out = self.mlp(self.mlp_ln(x))
-        return x + mlp_out
-class AudioEncoderV2(torch.nn.Module):
-    """Two strided convolutions followed by a stack of ResidualAttentionBlock layers."""
-    def __init__(
-        self,
-        n_mels: int,
-        n_state: int,
-        n_head: int,
-        n_layer: int,
-        stride: int,
-        use_sdpa: bool,
-    ):
-        super().__init__()
-        self.stride = stride
-        self.conv1 = Conv1d(n_mels,
-                            n_state,
-                            kernel_size=3,
-                            stride=stride,
-                            padding=1)
-        self.conv2 = Conv1d(n_state,
-                            n_state,
-                            kernel_size=3,
-                            stride=2,
-                            padding=1)
-        # Rotary table for 2048 positions; forward() slices it to the sequence
-        # length. NOTE(review): 64 is presumably n_state // n_head - confirm.
-        self.freqs_cis = precompute_freqs_cis(64, 1024 * 2)
-        self.blocks = torch.nn.ModuleList([
-            ResidualAttentionBlock(n_state, n_head, use_sdpa=use_sdpa)
-            for _ in range(n_layer)
-        ])
-    def forward(self, x: torch.Tensor,
-                x_len: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        x : torch.Tensor, shape = (batch_size, n_mels, T)
-            the mel spectrogram of the audio
-        x_len: torch.Tensor, shape = (batch_size,)
-            length of each audio in x
-        Returns the encoded sequence and the length of each item after both
-        convolutions.
-        """
-        mask = make_non_pad_mask(x_len).unsqueeze(1)
-        x = torch.nn.functional.gelu(self.conv1(x * mask))
-        # Output length of a conv with kernel 3, padding 1, dilation 1.
-        x_len = (x_len + 2 - 1 * (3 - 1) - 1) // self.stride + 1
-        mask = make_non_pad_mask(x_len).unsqueeze(1)
-        x = torch.nn.functional.gelu(self.conv2(x * mask))
-        x_len = (x_len + 2 - 1 * (3 - 1) - 1) // 2 + 1
-        mask = make_non_pad_mask(x_len).unsqueeze(1)
-        x = x.permute(0, 2, 1)  # (B, T // 2, n_state)
-        freqs_cis = self.freqs_cis.to(x.device)
-        mask_pad = mask.transpose(1, 2)
-        mask = mask_to_bias(mask, x.dtype)
-        # The cos/sin tensors that used to be built here were never read
-        # (apply_rotary_emb derives them from freqs_cis), so they were removed.
-        for block in self.blocks:
-            x = block(x, mask.unsqueeze(1), mask_pad, freqs_cis[:x.size(1)])
-        return x, x_len
-class S3TokenizerV2(torch.nn.Module):
-    """S3 tokenizer v2 implementation (inference-only).
-    Args:
-        name (str): Model name; must contain ``'v1'`` or ``'v2'``.
-        config (ModelConfig): Config. A fresh ``ModelConfig()`` is used when
-            omitted. The tokenizer keeps its own copy, so neither a shared
-            default nor the caller's object is modified.
-    """
-    def __init__(self, name: str, config: Optional[ModelConfig] = None):
-        super().__init__()
-        import copy
-        # The old default `ModelConfig()` was created once at definition time
-        # and then mutated below, so every instance shared and changed it.
-        config = ModelConfig() if config is None else copy.copy(config)
-        if 'v1' not in name:
-            assert 'v2' in name
-            # TODO(Mddct): make it configureable
-            config.n_codebook_size = 3**8
-        self.config = config
-        self.encoder = AudioEncoderV2(
-            self.config.n_mels,
-            self.config.n_audio_state,
-            self.config.n_audio_head,
-            self.config.n_audio_layer,
-            2,
-            self.config.use_sdpa,
-        )
-        self.quantizer = FSQVectorQuantization(
-            self.config.n_audio_state,
-            self.config.n_codebook_size,
-        )
-    def forward(self, mel: torch.Tensor,
-                mel_len: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Alias of ``quantize``."""
-        return self.quantize(mel, mel_len)
-    @torch.inference_mode()
-    def quantize(self, mel: torch.Tensor,
-                 mel_len: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Encode *mel* and return ``(codes, code_lengths)``."""
-        hidden, code_len = self.encoder(mel, mel_len)
-        code = self.quantizer.encode(hidden)
-        return code, code_len
-    @property
-    def device(self):
-        """Device of the first parameter of this module."""
-        return next(self.parameters()).device
-    def init_from_onnx(self, onnx_path: str):
-        """Load weights converted from the ONNX file at *onnx_path* (strict key match)."""
-        ckpt = onnx2torch(onnx_path, None, False)
-        self.load_state_dict(ckpt, strict=True)
-    def init_from_pt(self, ckpt_path: str):
-        """Load weights from the torch checkpoint at *ckpt_path* (strict key match)."""
-        # NOTE(review): torch.load unpickles the file; only load checkpoints
-        # from a trusted source (consider weights_only=True).
-        ckpt = torch.load(ckpt_path, map_location="cpu", mmap=True)
-        self.load_state_dict(ckpt, strict=True)
-    def freeze(self):
-        """Disable gradients for every parameter of this module."""
-        for _, param in self.named_parameters():
-            param.requires_grad = False

chatterbox/src/chatterbox/models/s3gen/__init__.py DELETED Viewed

@@ -1,2 +0,0 @@
-from .s3gen import S3Token2Wav as S3Gen
-from .const import S3GEN_SR

chatterbox/src/chatterbox/models/s3gen/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (239 Bytes)

chatterbox/src/chatterbox/models/s3gen/__pycache__/configs.cpython-310.pyc DELETED Viewed

Binary file (374 Bytes)

chatterbox/src/chatterbox/models/s3gen/__pycache__/const.cpython-310.pyc DELETED Viewed

Binary file (166 Bytes)

chatterbox/src/chatterbox/models/s3gen/__pycache__/decoder.cpython-310.pyc DELETED Viewed

Binary file (8.31 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/f0_predictor.cpython-310.pyc DELETED Viewed

Binary file (1.37 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/flow.cpython-310.pyc DELETED Viewed

Binary file (6.3 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/flow_matching.cpython-310.pyc DELETED Viewed

Binary file (6.91 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/hifigan.cpython-310.pyc DELETED Viewed

Binary file (13.8 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/s3gen.cpython-310.pyc DELETED Viewed

Binary file (7.87 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/xvector.cpython-310.pyc DELETED Viewed

Binary file (12.1 kB)

chatterbox/src/chatterbox/models/s3gen/configs.py DELETED Viewed

@@ -1,10 +0,0 @@
-from ..utils import AttrDict
-# Parameter set wrapped in an AttrDict.
-# NOTE(review): the code that reads these keys is outside this file; judging by
-# the names they configure a flow-matching decoder (solver, time scheduler,
-# classifier-free-guidance rates, loss type) - confirm against the consumer.
-CFM_PARAMS = AttrDict({
-    "sigma_min": 1e-06,
-    "solver": "euler",
-    "t_scheduler": "cosine",
-    "training_cfg_rate": 0.2,
-    "inference_cfg_rate": 0.7,
-    "reg_loss_type": "l1"
-})

chatterbox/src/chatterbox/models/s3gen/const.py DELETED Viewed

@@ -1 +0,0 @@
-S3GEN_SR = 24000

chatterbox/src/chatterbox/models/s3gen/decoder.py DELETED Viewed

@@ -1,317 +0,0 @@
-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from einops import pack, rearrange, repeat
-from .utils.mask import add_optional_chunk_mask
-from .matcha.decoder import SinusoidalPosEmb, Block1D, ResnetBlock1D, Downsample1D, \
-    TimestepEmbedding, Upsample1D
-from .matcha.transformer import BasicTransformerBlock
-def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
-    """Convert a boolean mask into an attention-mask bias tensor.
-    True entries become 0 and False entries become ``1 * -1.0e+10`` evaluated in ``dtype``.
-    Args:
-        mask: boolean tensor (asserted).
-        dtype: one of float32, bfloat16 or float16 (asserted).
-    Returns:
-        Tensor with the shape of ``mask`` and the requested dtype.
-    """
-    assert mask.dtype == torch.bool
-    assert dtype in [torch.float32, torch.bfloat16, torch.float16]
-    # NOTE(Mddct): a literal constant is used rather than torch.finfo(dtype).min
-    # because of torch.finfo jit issues.
-    keep = mask.to(dtype)
-    return (1.0 - keep) * -1.0e+10
-class Transpose(torch.nn.Module):
-    """Module form of ``torch.transpose`` so the swap can sit inside a ``Sequential``."""
-    def __init__(self, dim0: int, dim1: int):
-        super().__init__()
-        # The two axes exchanged on every forward call.
-        self.dim0, self.dim1 = dim0, dim1
-    def forward(self, x: torch.Tensor):
-        return x.transpose(self.dim0, self.dim1)
-class CausalBlock1D(Block1D):
-    """Block1D whose ``block`` is replaced by CausalConv1d -> LayerNorm -> Mish."""
-    def __init__(self, dim: int, dim_out: int):
-        super().__init__(dim, dim_out)
-        # LayerNorm normalises the last axis, so the channel axis is swapped
-        # into last position before it and swapped back afterwards.
-        layers = [
-            CausalConv1d(dim, dim_out, 3),
-            Transpose(1, 2),
-            nn.LayerNorm(dim_out),
-            Transpose(1, 2),
-            nn.Mish(),
-        ]
-        self.block = torch.nn.Sequential(*layers)
-    def forward(self, x: torch.Tensor, mask: torch.Tensor):
-        """Apply the block to ``x * mask`` and mask the result again."""
-        masked_in = x * mask
-        return self.block(masked_in) * mask
-class CausalResnetBlock1D(ResnetBlock1D):
-    """ResnetBlock1D with ``block1`` and ``block2`` replaced by CausalBlock1D instances."""
-    def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8):
-        super().__init__(dim, dim_out, time_emb_dim, groups)
-        # Overwrite the two blocks set up by the parent constructor.
-        first = CausalBlock1D(dim, dim_out)
-        second = CausalBlock1D(dim_out, dim_out)
-        self.block1 = first
-        self.block2 = second
-class CausalConv1d(torch.nn.Conv1d):
-    """Conv1d that pads only on the left, so output step ``i`` depends on inputs ``<= i``.
-    The output keeps the input length. Only ``stride == 1`` is supported (asserted).
-    The left padding is always zero-filled via ``F.pad``; ``padding_mode`` is merely
-    forwarded to ``Conv1d``, whose own padding is 0.
-    """
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        kernel_size: int,
-        stride: int = 1,
-        dilation: int = 1,
-        groups: int = 1,
-        bias: bool = True,
-        padding_mode: str = 'zeros',
-        device=None,
-        dtype=None
-    ) -> None:
-        # Reject unsupported strides before any parameters are allocated.
-        assert stride == 1
-        super().__init__(in_channels, out_channels,
-                         kernel_size, stride,
-                         padding=0, dilation=dilation,
-                         groups=groups, bias=bias,
-                         padding_mode=padding_mode,
-                         device=device, dtype=dtype)
-        # A dilated kernel reaches dilation * (kernel_size - 1) steps into the past;
-        # padding by that amount keeps the length for any dilation (for the default
-        # dilation of 1 this is the usual kernel_size - 1).
-        self.causal_padding = (dilation * (kernel_size - 1), 0)
-    def forward(self, x: torch.Tensor):
-        """Left-pad the last axis of ``x`` and run the regular convolution."""
-        x = F.pad(x, self.causal_padding)
-        return super().forward(x)
-class ConditionalDecoder(nn.Module):
-    """U-Net style 1-D network: down, mid and up stages of resnet + transformer blocks driven by a timestep embedding."""
-    def __init__(
-        self,
-        in_channels=320,
-        out_channels=80,
-        causal=True,
-        channels=(256,),
-        dropout=0.0,
-        attention_head_dim=64,
-        n_blocks=4,
-        num_mid_blocks=12,
-        num_heads=8,
-        act_fn="gelu",
-    ):
-        """
-        This decoder requires an input with the same shape as the target. So, if your text content
-        is shorter or longer than the outputs, please resample it before feeding it to the decoder.
-        Args:
-            in_channels: channel count of the packed input fed to the first resnet block.
-            out_channels: channel count produced by the final 1x1 projection.
-            causal: if true, use the causal resnet blocks / convolutions instead of the regular ones.
-            channels: width of each U-Net level; any sequence, copied into a tuple.
-            dropout: dropout passed to every transformer block.
-            attention_head_dim: head dimension passed to every transformer block.
-            n_blocks: number of transformer blocks after each resnet block.
-            num_mid_blocks: number of (resnet, transformers) pairs in the middle stage.
-            num_heads: attention head count passed to every transformer block.
-            act_fn: activation name passed to every transformer block.
-        """
-        super().__init__()
-        channels = tuple(channels)
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.causal = causal
-        self.time_embeddings = SinusoidalPosEmb(in_channels)
-        time_embed_dim = channels[0] * 4
-        self.time_mlp = TimestepEmbedding(
-            in_channels=in_channels,
-            time_embed_dim=time_embed_dim,
-            act_fn="silu",
-        )
-        self.down_blocks = nn.ModuleList([])
-        self.mid_blocks = nn.ModuleList([])
-        self.up_blocks = nn.ModuleList([])
-        # NOTE jrm: `static_chunk_size` is missing?
-        self.static_chunk_size = 0
-        # The three stages build the same kinds of sub-modules; these local
-        # factories keep the construction order (resnet, transformers, resampler).
-        resnet_cls = CausalResnetBlock1D if self.causal else ResnetBlock1D
-        def make_transformers(dim):
-            return nn.ModuleList(
-                [
-                    BasicTransformerBlock(
-                        dim=dim,
-                        num_attention_heads=num_heads,
-                        attention_head_dim=attention_head_dim,
-                        dropout=dropout,
-                        activation_fn=act_fn,
-                    )
-                    for _ in range(n_blocks)
-                ]
-            )
-        def make_edge_conv(dim):
-            # Length-preserving conv used in place of a resampler on the last level.
-            if self.causal:
-                return CausalConv1d(dim, dim, 3)
-            return nn.Conv1d(dim, dim, 3, padding=1)
-        # Down stage.
-        output_channel = in_channels
-        last_level = len(channels) - 1
-        for i, width in enumerate(channels):
-            input_channel = output_channel
-            output_channel = width
-            resnet = resnet_cls(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
-            transformer_blocks = make_transformers(output_channel)
-            downsample = Downsample1D(output_channel) if i != last_level else make_edge_conv(output_channel)
-            self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
-        # Mid stage: every block maps channels[-1] -> channels[-1].
-        mid_channel = channels[-1]
-        for _ in range(num_mid_blocks):
-            resnet = resnet_cls(dim=mid_channel, dim_out=mid_channel, time_emb_dim=time_embed_dim)
-            transformer_blocks = make_transformers(mid_channel)
-            self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
-        # Up stage: the resnet input is doubled because the skip connection is
-        # packed onto the channel axis in forward().
-        channels = channels[::-1] + (channels[0],)
-        for i in range(len(channels) - 1):
-            input_channel = channels[i] * 2
-            output_channel = channels[i + 1]
-            is_last = i == len(channels) - 2
-            resnet = resnet_cls(
-                dim=input_channel,
-                dim_out=output_channel,
-                time_emb_dim=time_embed_dim,
-            )
-            transformer_blocks = make_transformers(output_channel)
-            upsample = (
-                Upsample1D(output_channel, use_conv_transpose=True)
-                if not is_last
-                else make_edge_conv(output_channel)
-            )
-            self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
-        self.final_block = CausalBlock1D(channels[-1], channels[-1]) if self.causal else Block1D(channels[-1], channels[-1])
-        self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
-        self.initialize_weights()
-    def initialize_weights(self):
-        """Kaiming-normal init for Conv1d/Linear weights, zero biases, and unit/zero GroupNorm affine."""
-        for m in self.modules():
-            if isinstance(m, (nn.Conv1d, nn.Linear)):
-                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
-                if m.bias is not None:
-                    nn.init.constant_(m.bias, 0)
-            elif isinstance(m, nn.GroupNorm):
-                nn.init.constant_(m.weight, 1)
-                nn.init.constant_(m.bias, 0)
-    def _run_transformers(self, x, mask, t, transformer_blocks):
-        """Run ``transformer_blocks`` over ``x`` (b, c, t) under ``mask`` and return a (b, c, t) tensor."""
-        x = rearrange(x, "b c t -> b t c").contiguous()
-        # attn_mask = torch.matmul(mask.transpose(1, 2).contiguous(), mask)
-        attn_mask = add_optional_chunk_mask(x, mask.bool(), False, False, 0, self.static_chunk_size, -1)
-        attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
-        for transformer_block in transformer_blocks:
-            x = transformer_block(
-                hidden_states=x,
-                attention_mask=attn_mask,
-                timestep=t,
-            )
-        return rearrange(x, "b t c -> b c t").contiguous()
-    def forward(self, x, mask, mu, t, spks=None, cond=None):
-        """Forward pass of the UNet1DConditional model.
-        Args:
-            x (torch.Tensor): shape (batch_size, in_channels, time)
-            mask (torch.Tensor): shape (batch_size, 1, time)
-            mu (torch.Tensor): packed with ``x`` along the channel axis, so it must
-                match ``x`` on the batch and time axes.
-            t (torch.Tensor): shape (batch_size)
-            spks (torch.Tensor, optional): shape: (batch_size, condition_channels);
-                repeated along time and packed onto the channel axis. Defaults to None.
-            cond (torch.Tensor, optional): packed onto the channel axis when given.
-                Defaults to None.
-        Returns:
-            torch.Tensor: output of the final projection multiplied by ``mask``.
-        """
-        t = self.time_embeddings(t).to(t.dtype)
-        t = self.time_mlp(t)
-        x = pack([x, mu], "b * t")[0]
-        if spks is not None:
-            spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
-            x = pack([x, spks], "b * t")[0]
-        if cond is not None:
-            x = pack([x, cond], "b * t")[0]
-        hiddens = []
-        masks = [mask]
-        for resnet, transformer_blocks, downsample in self.down_blocks:
-            mask_down = masks[-1]
-            x = resnet(x, mask_down, t)
-            x = self._run_transformers(x, mask_down, t, transformer_blocks)
-            hiddens.append(x)  # Save hidden states for skip connections
-            x = downsample(x * mask_down)
-            masks.append(mask_down[:, :, ::2])
-        # Drop the mask pushed after the last level.
-        masks = masks[:-1]
-        mask_mid = masks[-1]
-        for resnet, transformer_blocks in self.mid_blocks:
-            x = resnet(x, mask_mid, t)
-            x = self._run_transformers(x, mask_mid, t, transformer_blocks)
-        for resnet, transformer_blocks, upsample in self.up_blocks:
-            mask_up = masks.pop()
-            skip = hiddens.pop()
-            # Crop x to the skip length before stacking the two on the channel axis.
-            x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
-            x = resnet(x, mask_up, t)
-            x = self._run_transformers(x, mask_up, t, transformer_blocks)
-            x = upsample(x * mask_up)
-        x = self.final_block(x, mask_up)
-        output = self.final_proj(x * mask_up)
-        return output * mask

chatterbox/src/chatterbox/models/s3gen/f0_predictor.py DELETED Viewed

@@ -1,55 +0,0 @@
-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-from torch.nn.utils.parametrizations import weight_norm
-class ConvRNNF0Predictor(nn.Module):
-    """Stack of five weight-normalised Conv1d + ELU layers followed by a linear head; outputs absolute values."""
-    def __init__(self,
-                 num_class: int = 1,
-                 in_channels: int = 80,
-                 cond_channels: int = 512
-                 ):
-        super().__init__()
-        self.num_class = num_class
-        # Five conv/ELU pairs: the first maps in_channels -> cond_channels,
-        # the remaining four keep cond_channels.
-        layers = []
-        widths = [in_channels] + [cond_channels] * 4
-        for width in widths:
-            conv = nn.Conv1d(width, cond_channels, kernel_size=3, padding=1)
-            layers.append(weight_norm(conv))
-            layers.append(nn.ELU())
-        self.condnet = nn.Sequential(*layers)
-        self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Run the conv stack, apply the linear head over the channel axis and return ``abs`` of the result."""
-        hidden = self.condnet(x).transpose(1, 2)
-        scores = self.classifier(hidden).squeeze(-1)
-        return torch.abs(scores)

chatterbox/src/chatterbox/models/s3gen/flow.py DELETED Viewed

@@ -1,282 +0,0 @@
-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import logging
-import random
-from typing import Dict, Optional
-import torch
-import torch.nn as nn
-from torch.nn import functional as F
-from .utils.mask import make_pad_mask
-from .configs import CFM_PARAMS
-class MaskedDiffWithXvec(torch.nn.Module):
-    """Token-to-feature flow model: embeds tokens, encodes them, regulates length and hands off to a flow decoder."""
-    def __init__(
-        self,
-        input_size: int = 512,
-        output_size: int = 80,
-        spk_embed_dim: int = 192,
-        output_type: str = "mel",
-        vocab_size: int = 4096,
-        input_frame_rate: int = 50,
-        only_mask_loss: bool = True,
-        encoder: torch.nn.Module = None,
-        length_regulator: torch.nn.Module = None,
-        decoder: torch.nn.Module = None,
-        decoder_conf: Dict = {
-            'in_channels': 240,
-            'out_channel': 80,
-            'spk_emb_dim': 80,
-            'n_spks': 1,
-            'cfm_params': CFM_PARAMS,
-            'decoder_params': {
-                'channels': [256, 256],
-                'dropout': 0.0,
-                'attention_head_dim': 64,
-                'n_blocks': 4,
-                'num_mid_blocks': 12,
-                'num_heads': 8,
-                'act_fn': 'gelu',
-            }
-        },
-        mel_feat_conf: Dict = {
-            'n_fft': 1024,
-            'num_mels': 80,
-            'sampling_rate': 22050,
-            'hop_size': 256,
-            'win_size': 1024,
-            'fmin': 0,
-            'fmax': 8000
-        }
-    ):
-        """Store the configuration and sub-modules.
-        ``encoder`` must expose ``output_size()``; it sizes ``encoder_proj``.
-        The dict defaults are shared between instances; this class only stores
-        them and never mutates them.
-        """
-        super().__init__()
-        self.input_size = input_size
-        self.output_size = output_size
-        self.decoder_conf = decoder_conf
-        self.mel_feat_conf = mel_feat_conf
-        self.vocab_size = vocab_size
-        self.output_type = output_type
-        self.input_frame_rate = input_frame_rate
-        logging.info(f"input frame rate={self.input_frame_rate}")
-        self.input_embedding = nn.Embedding(vocab_size, input_size)
-        self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
-        self.encoder = encoder
-        self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
-        self.decoder = decoder
-        self.length_regulator = length_regulator
-        self.only_mask_loss = only_mask_loss
-        # inference() reads this flag; without a default it raised AttributeError.
-        self.fp16 = False
-    def forward(
-            self,
-            batch: dict,
-            device: torch.device,
-    ) -> Dict[str, Optional[torch.Tensor]]:
-        """Compute the decoder loss for one batch.
-        Args:
-            batch: dict with keys 'speech_token', 'speech_token_len', 'speech_feat',
-                'speech_feat_len' and 'embedding'; each value is moved to ``device``.
-            device: device the computation runs on.
-        Returns:
-            ``{'loss': loss}`` as returned by ``decoder.compute_loss``.
-        """
-        token = batch['speech_token'].to(device)
-        token_len = batch['speech_token_len'].to(device)
-        feat = batch['speech_feat'].to(device)
-        feat_len = batch['speech_feat_len'].to(device)
-        embedding = batch['embedding'].to(device)
-        # xvec projection
-        embedding = F.normalize(embedding, dim=1)
-        embedding = self.spk_embed_affine_layer(embedding)
-        # embed tokens (negative ids clamped to 0) and zero the padded positions
-        mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device)
-        token = self.input_embedding(torch.clamp(token, min=0)) * mask
-        # text encode
-        h, h_lengths = self.encoder(token, token_len)
-        h = self.encoder_proj(h)
-        h, h_lengths = self.length_regulator(h, feat_len)
-        # get conditions: for about half of the items, copy a random-length
-        # prefix (up to 30% of the item's length) of the target features
-        conds = torch.zeros(feat.shape, device=token.device)
-        for i, j in enumerate(feat_len):
-            if random.random() < 0.5:
-                continue
-            index = random.randint(0, int(0.3 * j))
-            conds[i, :index] = feat[i, :index]
-        conds = conds.transpose(1, 2)
-        mask = (~make_pad_mask(feat_len)).to(h)
-        feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1)
-        loss, _ = self.decoder.compute_loss(
-            feat.transpose(1, 2).contiguous(),
-            mask.unsqueeze(1),
-            h.transpose(1, 2).contiguous(),
-            embedding,
-            cond=conds
-        )
-        return {'loss': loss}
-    @torch.inference_mode()
-    def inference(self,
-                  token,
-                  token_len,
-                  prompt_token,
-                  prompt_token_len,
-                  prompt_feat,
-                  prompt_feat_len,
-                  embedding,
-                  flow_cache):
-        """Generate features for ``token`` conditioned on a prompt (batch size 1 only).
-        Args:
-            token, token_len: tokens to synthesise and their length.
-            prompt_token, prompt_token_len: prompt tokens, prepended to ``token``.
-            prompt_feat: prompt features used as the conditioning prefix.
-            prompt_feat_len: not used by this method.
-            embedding: speaker embedding; normalised and projected here.
-            flow_cache: passed through to the decoder.
-        Returns:
-            ``(feat, flow_cache)``: float32 features with the prompt part removed,
-            and the cache returned by the decoder.
-        """
-        if self.fp16 is True:
-            prompt_feat = prompt_feat.half()
-            embedding = embedding.half()
-        assert token.shape[0] == 1
-        # xvec projection
-        embedding = F.normalize(embedding, dim=1)
-        embedding = self.spk_embed_affine_layer(embedding)
-        # concat text and prompt_text
-        token_len1, token_len2 = prompt_token.shape[1], token.shape[1]
-        token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
-        mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
-        token = self.input_embedding(torch.clamp(token, min=0)) * mask
-        # text encode
-        h, h_lengths = self.encoder(token, token_len)
-        h = self.encoder_proj(h)
-        # NOTE(review): 22050 / 256 looks like sampling_rate / hop_size from
-        # mel_feat_conf, but is hard-coded here -- confirm before changing.
-        mel_len1, mel_len2 = prompt_feat.shape[1], int(token_len2 / self.input_frame_rate * 22050 / 256)
-        h, h_lengths = self.length_regulator.inference(h[:, :token_len1], h[:, token_len1:], mel_len1, mel_len2, self.input_frame_rate)
-        # get conditions
-        conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device).to(h.dtype)
-        conds[:, :mel_len1] = prompt_feat
-        conds = conds.transpose(1, 2)
-        mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
-        feat, flow_cache = self.decoder(
-            mu=h.transpose(1, 2).contiguous(),
-            mask=mask.unsqueeze(1),
-            spks=embedding,
-            cond=conds,
-            n_timesteps=10,
-            prompt_len=mel_len1,
-            flow_cache=flow_cache
-        )
-        feat = feat[:, :, mel_len1:]
-        assert feat.shape[2] == mel_len2
-        return feat.float(), flow_cache
-class CausalMaskedDiffWithXvec(torch.nn.Module):
-    """Inference-only token-to-feature flow model with optional look-ahead trimming for non-final chunks."""
-    def __init__(
-        self,
-        input_size: int = 512,
-        output_size: int = 80,
-        spk_embed_dim: int = 192,
-        output_type: str = "mel",
-        vocab_size: int = 6561,
-        input_frame_rate: int = 25,
-        only_mask_loss: bool = True,
-        token_mel_ratio: int = 2,
-        pre_lookahead_len: int = 3,
-        encoder: torch.nn.Module = None,
-        decoder: torch.nn.Module = None,
-        decoder_conf: Dict = {
-            'in_channels': 240,
-            'out_channel': 80,
-            'spk_emb_dim': 80,
-            'n_spks': 1,
-            'cfm_params': CFM_PARAMS,
-            'decoder_params': {
-                'channels': [256, 256],
-                'dropout': 0.0,
-                'attention_head_dim': 64,
-                'n_blocks': 4,
-                'num_mid_blocks': 12,
-                'num_heads': 8,
-                'act_fn': 'gelu',
-            }
-        },
-        mel_feat_conf: Dict = {
-            'n_fft': 1024,
-            'num_mels': 80,
-            'sampling_rate': 22050,
-            'hop_size': 256,
-            'win_size': 1024,
-            'fmin': 0,
-            'fmax': 8000
-        }
-    ):
-        """Store the configuration and sub-modules.
-        ``encoder`` must expose ``output_size()``; it sizes ``encoder_proj``.
-        The dict defaults are shared between instances; this class only stores
-        them and never mutates them.
-        """
-        super().__init__()
-        self.input_size = input_size
-        self.output_size = output_size
-        self.decoder_conf = decoder_conf
-        self.mel_feat_conf = mel_feat_conf
-        self.vocab_size = vocab_size
-        self.output_type = output_type
-        self.input_frame_rate = input_frame_rate
-        logging.info(f"input frame rate={self.input_frame_rate}")
-        self.input_embedding = nn.Embedding(vocab_size, input_size)
-        self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
-        self.encoder = encoder
-        self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
-        self.decoder = decoder
-        self.only_mask_loss = only_mask_loss
-        self.token_mel_ratio = token_mel_ratio
-        self.pre_lookahead_len = pre_lookahead_len
-        # FIXME: this was missing - just putting it in as false
-        self.fp16 = False
-    @torch.inference_mode()
-    def inference(self,
-                  token,
-                  token_len,
-                  prompt_token,
-                  prompt_token_len,
-                  prompt_feat,
-                  prompt_feat_len,
-                  embedding,
-                  finalize):
-        """Generate features for ``token`` conditioned on a prompt (batch size 1 only).
-        Args:
-            token, token_len: tokens to synthesise and their length.
-            prompt_token, prompt_token_len: prompt tokens, prepended to ``token``.
-            prompt_feat: prompt features used as the conditioning prefix.
-            prompt_feat_len: not used by this method.
-            embedding: speaker embedding; normalised and projected here.
-            finalize: when exactly ``False``, the trailing
-                ``pre_lookahead_len * token_mel_ratio`` encoder frames are dropped.
-        Returns:
-            ``(feat, None)``: float32 features with the prompt part removed.
-        """
-        if self.fp16 is True:
-            prompt_feat = prompt_feat.half()
-            embedding = embedding.half()
-        assert token.shape[0] == 1
-        # xvec projection
-        embedding = F.normalize(embedding, dim=1)
-        embedding = self.spk_embed_affine_layer(embedding)
-        # concat text and prompt_text
-        token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
-        mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
-        token = self.input_embedding(torch.clamp(token, min=0)) * mask
-        # text encode
-        h, h_lengths = self.encoder(token, token_len)
-        lookahead = self.pre_lookahead_len * self.token_mel_ratio
-        # Guard the zero case: h[:, :-0] is h[:, :0], which would drop every frame.
-        if finalize is False and lookahead > 0:
-            h = h[:, :-lookahead]
-        mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1]
-        h = self.encoder_proj(h)
-        # get conditions
-        conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device).to(h.dtype)
-        conds[:, :mel_len1] = prompt_feat
-        conds = conds.transpose(1, 2)
-        mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
-        feat, _ = self.decoder(
-            mu=h.transpose(1, 2).contiguous(),
-            mask=mask.unsqueeze(1),
-            spks=embedding,
-            cond=conds,
-            n_timesteps=10
-        )
-        feat = feat[:, :, mel_len1:]
-        assert feat.shape[2] == mel_len2
-        return feat.float(), None  # NOTE jrm: why are they returning None here?

chatterbox/src/chatterbox/models/s3gen/flow_matching.py DELETED Viewed

@@ -1,218 +0,0 @@
-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import threading
-import torch
-import torch.nn.functional as F
-from .matcha.flow_matching import BASECFM
-from .configs import CFM_PARAMS
-class ConditionalCFM(BASECFM):
-    """Conditional flow matching around an ``estimator``: Euler sampling with classifier-free guidance plus the training loss."""
-    def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None):
-        """Forward the sizes to ``BASECFM`` and keep the scheduler and guidance settings from ``cfm_params``.
-        Args:
-            in_channels: passed to the base class as ``n_feats``.
-            cfm_params: object with ``t_scheduler``, ``training_cfg_rate`` and
-                ``inference_cfg_rate`` attributes (also handed to the base class).
-            n_spks, spk_emb_dim: passed to the base class.
-            estimator: network called by ``forward_estimator``; either a
-                ``torch.nn.Module`` or an engine object driven through
-                ``set_input_shape`` / ``execute_v2``.
-        """
-        super().__init__(
-            n_feats=in_channels,
-            cfm_params=cfm_params,
-            n_spks=n_spks,
-            spk_emb_dim=spk_emb_dim,
-        )
-        self.t_scheduler = cfm_params.t_scheduler
-        self.training_cfg_rate = cfm_params.training_cfg_rate
-        self.inference_cfg_rate = cfm_params.inference_cfg_rate
-        # Just change the architecture of the estimator here
-        self.estimator = estimator
-        # Serialises use of the non-Module (engine) estimator in forward_estimator.
-        self.lock = threading.Lock()
-    @torch.inference_mode()
-    def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, prompt_len=0, flow_cache=torch.zeros(1, 80, 0, 2)):
-        """Forward diffusion
-        Args:
-            mu (torch.Tensor): output of encoder
-                shape: (batch_size, n_feats, mel_timesteps)
-                NOTE: overwritten in place on its first ``flow_cache.shape[2]``
-                frames when the cache is non-empty.
-            mask (torch.Tensor): output_mask
-                shape: (batch_size, 1, mel_timesteps)
-            n_timesteps (int): number of diffusion steps
-            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
-            spks (torch.Tensor, optional): speaker embedding, forwarded to the solver.
-                shape: (batch_size, spk_emb_dim)
-            cond (torch.Tensor, optional): conditioning tensor, forwarded to the solver.
-            prompt_len (int): number of leading frames kept in the returned cache.
-            flow_cache (torch.Tensor): noise (``[..., 0]``) and ``mu`` (``[..., 1]``)
-                saved by a previous call; an empty time axis means no cache.
-        Returns:
-            sample: generated mel-spectrogram
-                shape: (batch_size, n_feats, mel_timesteps)
-            flow_cache: new cache built from the first ``prompt_len`` and the
-                last 34 frames of the noise and of ``mu``.
-        """
-        z = torch.randn_like(mu).to(mu.device).to(mu.dtype) * temperature
-        cache_size = flow_cache.shape[2]
-        # fix prompt and overlap part mu and z
-        if cache_size != 0:
-            z[:, :, :cache_size] = flow_cache[:, :, :, 0]
-            mu[:, :, :cache_size] = flow_cache[:, :, :, 1]
-        z_cache = torch.concat([z[:, :, :prompt_len], z[:, :, -34:]], dim=2)
-        mu_cache = torch.concat([mu[:, :, :prompt_len], mu[:, :, -34:]], dim=2)
-        flow_cache = torch.stack([z_cache, mu_cache], dim=-1)
-        t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
-        if self.t_scheduler == 'cosine':
-            t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
-        return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), flow_cache
-    def solve_euler(self, x, t_span, mu, mask, spks, cond):
-        """
-        Fixed euler solver for ODEs.
-        Args:
-            x (torch.Tensor): random noise
-            t_span (torch.Tensor): n_timesteps interpolated
-                shape: (n_timesteps + 1,)
-            mu (torch.Tensor): output of encoder
-                shape: (batch_size, n_feats, mel_timesteps)
-            mask (torch.Tensor): output_mask
-                shape: (batch_size, 1, mel_timesteps)
-            spks (torch.Tensor): speaker embedding, written into row 0 of the batch
-                shape: (batch_size, spk_emb_dim)
-            cond (torch.Tensor): conditioning tensor, written into row 0 of the batch
-        Returns:
-            The state after the last step, cast to float32.
-        The estimator is always called on a batch of 2 with 80 channels: row 0
-        carries the conditions and row 1 keeps zeros for mu/spks/cond.
-        """
-        t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
-        t = t.unsqueeze(dim=0)
-        # I am storing this because I can later plot it by putting a debugger here and saving it to a file
-        # Or in future might add like a return_all_steps flag
-        sol = []
-        # Do not use concat, it may cause memory format changed and trt infer with wrong results!
-        x_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
-        mask_in = torch.zeros([2, 1, x.size(2)], device=x.device, dtype=x.dtype)
-        mu_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
-        t_in = torch.zeros([2], device=x.device, dtype=x.dtype)
-        spks_in = torch.zeros([2, 80], device=x.device, dtype=x.dtype)
-        cond_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
-        for step in range(1, len(t_span)):
-            # Classifier-Free Guidance inference introduced in VoiceBox
-            x_in[:] = x
-            mask_in[:] = mask
-            mu_in[0] = mu
-            t_in[:] = t.unsqueeze(0)
-            spks_in[0] = spks
-            cond_in[0] = cond
-            dphi_dt = self.forward_estimator(
-                x_in, mask_in,
-                mu_in, t_in,
-                spks_in,
-                cond_in
-            )
-            dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0)
-            dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt)
-            x = x + dt * dphi_dt
-            t = t + dt
-            sol.append(x)
-            if step < len(t_span) - 1:
-                dt = t_span[step + 1] - t
-        return sol[-1].float()
-    def forward_estimator(self, x, mask, mu, t, spks, cond):
-        """Call the estimator: directly for a ``torch.nn.Module``, otherwise through the engine interface under ``self.lock``.
-        In the engine branch ``x``'s buffer is also passed as the last binding
-        and ``x`` itself is returned.
-        """
-        if isinstance(self.estimator, torch.nn.Module):
-            return self.estimator.forward(x, mask, mu, t, spks, cond)
-        else:
-            with self.lock:
-                self.estimator.set_input_shape('x', (2, 80, x.size(2)))
-                self.estimator.set_input_shape('mask', (2, 1, x.size(2)))
-                self.estimator.set_input_shape('mu', (2, 80, x.size(2)))
-                self.estimator.set_input_shape('t', (2,))
-                self.estimator.set_input_shape('spks', (2, 80))
-                self.estimator.set_input_shape('cond', (2, 80, x.size(2)))
-                # run trt engine
-                self.estimator.execute_v2([x.contiguous().data_ptr(),
-                                           mask.contiguous().data_ptr(),
-                                           mu.contiguous().data_ptr(),
-                                           t.contiguous().data_ptr(),
-                                           spks.contiguous().data_ptr(),
-                                           cond.contiguous().data_ptr(),
-                                           x.data_ptr()])
-            return x
-    def compute_loss(self, x1, mask, mu, spks=None, cond=None):
-        """Computes diffusion loss
-        Args:
-            x1 (torch.Tensor): Target
-                shape: (batch_size, n_feats, mel_timesteps)
-            mask (torch.Tensor): target mask
-                shape: (batch_size, 1, mel_timesteps)
-            mu (torch.Tensor): output of encoder
-                shape: (batch_size, n_feats, mel_timesteps)
-            spks (torch.Tensor, optional): speaker embedding. Defaults to None.
-                shape: (batch_size, spk_emb_dim)
-            cond (torch.Tensor, optional): conditioning tensor. Defaults to None.
-        Returns:
-            loss: conditional flow matching loss
-            y: conditional flow
-                shape: (batch_size, n_feats, mel_timesteps)
-        """
-        b, _, _ = mu.shape
-        # random timestep
-        t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
-        if self.t_scheduler == 'cosine':
-            t = 1 - torch.cos(t * 0.5 * torch.pi)
-        # sample noise p(x_0)
-        z = torch.randn_like(x1)
-        y = (1 - (1 - self.sigma_min) * t) * z + t * x1
-        u = x1 - (1 - self.sigma_min) * z
-        # during training, we randomly drop condition to trade off mode coverage and sample fidelity
-        if self.training_cfg_rate > 0:
-            cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
-            mu = mu * cfg_mask.view(-1, 1, 1)
-            # spks and cond default to None; only mask the ones actually given.
-            if spks is not None:
-                spks = spks * cfg_mask.view(-1, 1)
-            if cond is not None:
-                cond = cond * cfg_mask.view(-1, 1, 1)
-        pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
-        loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
-        return loss, y
-class CausalConditionalCFM(ConditionalCFM):
-    """ConditionalCFM variant that samples from a noise tensor drawn once at construction."""
-    def __init__(self, in_channels=240, cfm_params=CFM_PARAMS, n_spks=1, spk_emb_dim=80, estimator=None):
-        super().__init__(in_channels, cfm_params, n_spks, spk_emb_dim, estimator)
-        # Drawn once; forward() slices a prefix of it instead of sampling new noise.
-        self.rand_noise = torch.randn([1, 80, 50 * 300])
-    @torch.inference_mode()
-    def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
-        """Forward diffusion
-        Args:
-            mu (torch.Tensor): output of encoder
-                shape: (batch_size, n_feats, mel_timesteps)
-            mask (torch.Tensor): output_mask
-                shape: (batch_size, 1, mel_timesteps)
-            n_timesteps (int): number of diffusion steps
-            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
-            spks (torch.Tensor, optional): speaker embedding, forwarded to the solver.
-                shape: (batch_size, spk_emb_dim)
-            cond (torch.Tensor, optional): conditioning tensor, forwarded to the solver.
-        Returns:
-            sample: generated mel-spectrogram
-                shape: (batch_size, n_feats, mel_timesteps)
-            None: second element of the returned tuple is always None.
-        """
-        n_frames = mu.size(2)
-        noise = self.rand_noise[:, :, :n_frames]
-        z = noise.to(mu.device).to(mu.dtype) * temperature
-        t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
-        if self.t_scheduler == 'cosine':
-            t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
-        sample = self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond)
-        return sample, None

chatterbox/src/chatterbox/models/s3gen/hifigan.py DELETED Viewed

@@ -1,474 +0,0 @@
-# jrm: adapted from CosyVoice/cosyvoice/hifigan/generator.py
-#      most modules should be reusable, but I found their SineGen changed a bit.
-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""HIFI-GAN"""
-from typing import Dict, Optional, List
-import numpy as np
-from scipy.signal import get_window
-import torch
-import torch.nn.functional as F
-from torch.nn import Conv1d
-from torch.nn import ConvTranspose1d
-from torch.nn.utils import remove_weight_norm
-from torch.nn.utils.parametrizations import weight_norm
-from torch.distributions.uniform import Uniform
-from torch import nn, sin, pow
-from torch.nn import Parameter
-class Snake(nn.Module):
-    '''
-    Sine-based periodic activation with one ``alpha`` value per channel.
-    Computes ``x + sin^2(alpha * x) / alpha`` elementwise; a tiny constant is
-    added to the denominator to avoid division by zero.
-    Shape:
-        - Input: (B, C, T)
-        - Output: (B, C, T), same shape as the input
-    Parameters:
-        - alpha - per-channel parameter, trainable unless ``alpha_trainable=False``
-    References:
-        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
-        https://arxiv.org/abs/2006.08195
-    Examples:
-        >>> act = Snake(256)
-        >>> x = torch.randn(1, 256, 10)
-        >>> x = act(x)
-    '''
-    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
-        '''
-        Initialization.
-        INPUT:
-            - in_features: number of channels (length of the alpha vector)
-            - alpha: multiplier applied to the initial alpha vector; in linear mode
-              this is the initial value itself, higher values = higher frequency.
-            - alpha_trainable: whether alpha receives gradients.
-            - alpha_logscale: store alpha in log space (``exp`` is applied in forward).
-        '''
-        super().__init__()
-        self.in_features = in_features
-        self.alpha_logscale = alpha_logscale
-        # Log-scale storage starts from zeros (exp(0) == 1); linear storage starts from ones.
-        seed = torch.zeros(in_features) if alpha_logscale else torch.ones(in_features)
-        self.alpha = Parameter(seed * alpha)
-        self.alpha.requires_grad = alpha_trainable
-        self.no_div_by_zero = 0.000000001
-    def forward(self, x):
-        '''
-        Apply the activation elementwise.
-        Snake := x + 1/a * sin^2(x * a)
-        '''
-        # Reshape alpha from [C] to [1, C, 1] so it broadcasts over batch and time.
-        rate = self.alpha[None, :, None]
-        if self.alpha_logscale:
-            rate = torch.exp(rate)
-        periodic = pow(sin(x * rate), 2)
-        return x + (1.0 / (rate + self.no_div_by_zero)) * periodic
-def get_padding(kernel_size, dilation=1):
-    """Return half of the dilated kernel span, ``dilation * (kernel_size - 1) / 2``, truncated to int."""
-    dilated_span = kernel_size * dilation - dilation
-    return int(dilated_span / 2)
-def init_weights(m, mean=0.0, std=0.01):
-    """Fill ``m.weight`` in place from N(mean, std) when the module's class name contains "Conv".
-    Modules whose class name does not contain "Conv" are left untouched.
-    """
-    if "Conv" not in m.__class__.__name__:
-        return
-    m.weight.data.normal_(mean, std)
-"""hifigan based generator implementation.
-This code is modified from https://github.com/jik876/hifi-gan
- ,https://github.com/kan-bayashi/ParallelWaveGAN and
- https://github.com/NVIDIA/BigVGAN
-"""
-class ResBlock(torch.nn.Module):
-    """Residual block module in HiFiGAN/BigVGAN.
-    For every entry in ``dilations`` the block applies
-    ``Snake -> dilated Conv1d -> Snake -> Conv1d (dilation 1)`` and adds the result
-    back onto its input, so channel count and length are preserved.
-    Args:
-        channels: number of input/output channels of every convolution.
-        kernel_size: kernel size shared by all convolutions.
-        dilations: dilation of the first convolution in each residual stage;
-            ``None`` selects the historical default ``[1, 3, 5]``.
-    """
-    def __init__(
-        self,
-        channels: int = 512,
-        kernel_size: int = 3,
-        dilations: Optional[List[int]] = None,
-    ):
-        super(ResBlock, self).__init__()
-        # A None sentinel replaces the former mutable list default.
-        if dilations is None:
-            dilations = [1, 3, 5]
-        self.convs1 = nn.ModuleList()
-        self.convs2 = nn.ModuleList()
-        for dilation in dilations:
-            self.convs1.append(
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation,
-                        padding=get_padding(kernel_size, dilation)
-                    )
-                )
-            )
-            self.convs2.append(
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1)
-                    )
-                )
-            )
-        self.convs1.apply(init_weights)
-        self.convs2.apply(init_weights)
-        self.activations1 = nn.ModuleList([
-            Snake(channels, alpha_logscale=False)
-            for _ in range(len(self.convs1))
-        ])
-        self.activations2 = nn.ModuleList([
-            Snake(channels, alpha_logscale=False)
-            for _ in range(len(self.convs2))
-        ])
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Run all residual stages in order and return a tensor shaped like ``x``."""
-        stages = zip(self.activations1, self.convs1, self.activations2, self.convs2)
-        for act1, conv1, act2, conv2 in stages:
-            xt = conv1(act1(x))
-            xt = conv2(act2(xt))
-            x = xt + x
-        return x
-    def remove_weight_norm(self):
-        """Strip weight normalization from every convolution, keeping the current weights.
-        The convolutions are wrapped with ``torch.nn.utils.parametrizations.weight_norm``,
-        which the legacy ``torch.nn.utils.remove_weight_norm`` cannot undo (it looks for a
-        forward pre-hook and raises ValueError). Parametrized modules are therefore handled
-        with ``remove_parametrizations``; the legacy remover is kept as a fallback for
-        modules that were wrapped with the hook-based API.
-        """
-        # Local import keeps this block independent of the file's import header.
-        from torch.nn.utils import parametrize
-        for conv in list(self.convs1) + list(self.convs2):
-            if parametrize.is_parametrized(conv, "weight"):
-                parametrize.remove_parametrizations(conv, "weight")
-            else:
-                remove_weight_norm(conv)
-class SineGen(torch.nn.Module):
-    """ Definition of sine generator
-    SineGen(samp_rate, harmonic_num = 0,
-            sine_amp = 0.1, noise_std = 0.003,
-            voiced_threshold = 0)
-    samp_rate: sampling rate in Hz
-    harmonic_num: number of harmonic overtones (default 0)
-    sine_amp: amplitude of sine-waveform (default 0.1)
-    noise_std: std of Gaussian noise in voiced regions (default 0.003)
-    voiced_threshold: F0 threshold for U/V classification (default 0);
-        frames with F0 strictly above it are treated as voiced
-    """
-    def __init__(self, samp_rate, harmonic_num=0,
-                 sine_amp=0.1, noise_std=0.003,
-                 voiced_threshold=0):
-        super().__init__()
-        self.sine_amp = sine_amp
-        self.noise_std = noise_std
-        self.harmonic_num = harmonic_num
-        self.sampling_rate = samp_rate
-        self.voiced_threshold = voiced_threshold
-    def _f02uv(self, f0):
-        """Return a float32 voiced/unvoiced mask: 1 where ``f0 > voiced_threshold``, else 0."""
-        return (f0 > self.voiced_threshold).type(torch.float32)
-    @torch.no_grad()
-    def forward(self, f0):
-        """
-        :param f0: [B, 1, sample_len], Hz
-        :return: tuple ``(sine_waves, uv, noise)``; ``sine_waves`` and ``noise`` are
-            [B, harmonic_num + 1, sample_len], ``uv`` is [B, 1, sample_len]
-        """
-        n_batch = f0.size(0)
-        n_waves = self.harmonic_num + 1
-        # Per-sample normalized frequency (cycles per sample) of the fundamental and each overtone.
-        freq = torch.zeros(n_batch, n_waves, f0.size(-1), device=f0.device)
-        for k in range(n_waves):
-            freq[:, k:k + 1, :] = f0 * (k + 1) / self.sampling_rate
-        # Integrate frequency into phase; the modulo keeps the running sum within one cycle.
-        theta = 2 * np.pi * (torch.cumsum(freq, dim=-1) % 1)
-        # Random initial phase per overtone; the fundamental always starts at phase 0.
-        phase = Uniform(low=-np.pi, high=np.pi).sample(sample_shape=(n_batch, n_waves, 1)).to(freq.device)
-        phase[:, 0, :] = 0
-        harmonics = self.sine_amp * torch.sin(theta + phase)
-        voiced = self._f02uv(f0)
-        # Noise std: noise_std in voiced frames, sine_amp / 3 in unvoiced frames
-        # (so the unvoiced noise peaks at roughly sine_amp).
-        noise_scale = voiced * self.noise_std + (1 - voiced) * self.sine_amp / 3
-        noise = noise_scale * torch.randn_like(harmonics)
-        # Zero the sines in unvoiced frames, then add the noise everywhere.
-        return harmonics * voiced + noise, voiced, noise
-class SourceModuleHnNSF(torch.nn.Module):
-    """ SourceModule for hn-nsf
-    SourceModule(sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
-                 add_noise_std=0.003, voiced_threshod=0)
-    sampling_rate: sampling_rate in Hz
-    upsample_scale: accepted for interface compatibility; not used by this class
-    harmonic_num: number of harmonics above F0 (default: 0)
-    sine_amp: amplitude of sine source signal (default: 0.1)
-    add_noise_std: std of additive Gaussian noise (default: 0.003)
-        note that amplitude of noise in unvoiced is decided
-        by sine_amp
-    voiced_threshod: threshold to set U/V given F0 (default: 0)
-        (the keyword keeps its historical spelling)
-    Sine_source, noise_source, uv = SourceModuleHnNSF(F0_sampled)
-    F0_sampled (batchsize, length, 1)
-    Sine_source (batchsize, length, 1)
-    noise_source (batchsize, length, 1)
-    uv (batchsize, length, 1)
-    """
-    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
-                 add_noise_std=0.003, voiced_threshod=0):
-        super().__init__()
-        self.sine_amp = sine_amp
-        self.noise_std = add_noise_std
-        # Generates the fundamental plus `harmonic_num` overtones.
-        self.l_sin_gen = SineGen(
-            samp_rate=sampling_rate,
-            harmonic_num=harmonic_num,
-            sine_amp=sine_amp,
-            noise_std=add_noise_std,
-            voiced_threshold=voiced_threshod,
-        )
-        # Collapses the harmonic channels into a single excitation channel.
-        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
-        self.l_tanh = torch.nn.Tanh()
-    def forward(self, x):
-        """
-        Sine_source, noise_source, uv = SourceModuleHnNSF(F0_sampled)
-        F0_sampled (batchsize, length, 1)
-        Sine_source (batchsize, length, 1)
-        noise_source (batchsize, length, 1)
-        uv (batchsize, length, 1)
-        """
-        # Harmonic branch: SineGen works channel-first, so transpose in and back out.
-        with torch.no_grad():
-            harmonics, uv, _ = self.l_sin_gen(x.transpose(1, 2))
-            harmonics, uv = harmonics.transpose(1, 2), uv.transpose(1, 2)
-        sine_merge = self.l_tanh(self.l_linear(harmonics))
-        # Noise branch: Gaussian noise shaped like uv, scaled by sine_amp / 3.
-        noise = torch.randn_like(uv) * self.sine_amp / 3
-        return sine_merge, noise, uv
-class HiFTGenerator(nn.Module):
-    """
-    HiFTNet Generator: Neural Source Filter + ISTFTNet
-    https://arxiv.org/abs/2309.09493
-    Pipeline as implemented here: the F0 predictor runs on the mel input, the F0 track is
-    upsampled and turned into a harmonic source by ``SourceModuleHnNSF``, the STFT of that
-    source is fused into every upsampling stage, and the last convolution predicts
-    log-magnitude and phase channels that are inverted with an ISTFT.
-    """
-    def __init__(
-            self,
-            in_channels: int = 80,
-            base_channels: int = 512,
-            nb_harmonics: int = 8,
-            sampling_rate: int = 22050,
-            nsf_alpha: float = 0.1,
-            nsf_sigma: float = 0.003,
-            nsf_voiced_threshold: float = 10,
-            upsample_rates: Optional[List[int]] = None,
-            upsample_kernel_sizes: Optional[List[int]] = None,
-            istft_params: Optional[Dict[str, int]] = None,
-            resblock_kernel_sizes: Optional[List[int]] = None,
-            resblock_dilation_sizes: Optional[List[List[int]]] = None,
-            source_resblock_kernel_sizes: Optional[List[int]] = None,
-            source_resblock_dilation_sizes: Optional[List[List[int]]] = None,
-            lrelu_slope: float = 0.1,
-            audio_limit: float = 0.99,
-            f0_predictor: torch.nn.Module = None,
-    ):
-        """Build the generator.
-        Args:
-            in_channels: channels of the mel input fed to ``conv_pre``.
-            base_channels: width after ``conv_pre``; halved by every upsampling stage.
-            nb_harmonics: harmonics above F0 produced by the source module.
-            sampling_rate: sampling rate handed to the source module.
-            nsf_alpha: sine amplitude of the source module.
-            nsf_sigma: additive noise std of the source module.
-            nsf_voiced_threshold: F0 threshold for the voiced/unvoiced decision.
-            upsample_rates: stride of each transposed convolution (default ``[8, 8]``).
-            upsample_kernel_sizes: kernel of each transposed convolution (default ``[16, 16]``).
-            istft_params: ``{"n_fft", "hop_len"}`` (default ``{"n_fft": 16, "hop_len": 4}``).
-            resblock_kernel_sizes: kernels of the per-stage ResBlocks (default ``[3, 7, 11]``).
-            resblock_dilation_sizes: dilations of the per-stage ResBlocks
-                (default ``[[1, 3, 5]] * 3``).
-            source_resblock_kernel_sizes: kernels of the source ResBlocks (default ``[7, 11]``).
-            source_resblock_dilation_sizes: dilations of the source ResBlocks
-                (default ``[[1, 3, 5]] * 2``).
-            lrelu_slope: negative slope used before each upsampling layer.
-            audio_limit: output waveform is clamped to ``[-audio_limit, audio_limit]``.
-            f0_predictor: module mapping the mel input to an F0 track.
-        """
-        super(HiFTGenerator, self).__init__()
-        # None sentinels replace the former mutable list/dict defaults; the values are unchanged.
-        if upsample_rates is None:
-            upsample_rates = [8, 8]
-        if upsample_kernel_sizes is None:
-            upsample_kernel_sizes = [16, 16]
-        if istft_params is None:
-            istft_params = {"n_fft": 16, "hop_len": 4}
-        if resblock_kernel_sizes is None:
-            resblock_kernel_sizes = [3, 7, 11]
-        if resblock_dilation_sizes is None:
-            resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-        if source_resblock_kernel_sizes is None:
-            source_resblock_kernel_sizes = [7, 11]
-        if source_resblock_dilation_sizes is None:
-            source_resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5]]
-        self.out_channels = 1
-        self.nb_harmonics = nb_harmonics
-        self.sampling_rate = sampling_rate
-        self.istft_params = istft_params
-        self.lrelu_slope = lrelu_slope
-        self.audio_limit = audio_limit
-        self.num_kernels = len(resblock_kernel_sizes)
-        self.num_upsamples = len(upsample_rates)
-        self.m_source = SourceModuleHnNSF(
-            sampling_rate=sampling_rate,
-            upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
-            harmonic_num=nb_harmonics,
-            sine_amp=nsf_alpha,
-            add_noise_std=nsf_sigma,
-            voiced_threshod=nsf_voiced_threshold)
-        # F0 is predicted per mel frame; upsample it to one value per output sample.
-        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"])
-        self.conv_pre = weight_norm(
-            Conv1d(in_channels, base_channels, 7, 1, padding=3)
-        )
-        # Up
-        self.ups = nn.ModuleList()
-        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
-            self.ups.append(
-                weight_norm(
-                    ConvTranspose1d(
-                        base_channels // (2**i),
-                        base_channels // (2**(i + 1)),
-                        k,
-                        u,
-                        padding=(k - u) // 2,
-                    )
-                )
-            )
-        # Down: bring the source STFT to the time resolution of each upsampling stage.
-        self.source_downs = nn.ModuleList()
-        self.source_resblocks = nn.ModuleList()
-        downsample_rates = [1] + upsample_rates[::-1][:-1]
-        downsample_cum_rates = np.cumprod(downsample_rates)
-        for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes, source_resblock_dilation_sizes)):
-            if u == 1:
-                self.source_downs.append(
-                    Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1)
-                )
-            else:
-                self.source_downs.append(
-                    Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2))
-                )
-            self.source_resblocks.append(
-                ResBlock(base_channels // (2 ** (i + 1)), k, d)
-            )
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = base_channels // (2**(i + 1))
-            for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
-                self.resblocks.append(ResBlock(ch, k, d))
-        # n_fft + 2 output channels = (n_fft / 2 + 1) magnitude bins + (n_fft / 2 + 1) phase bins.
-        self.conv_post = weight_norm(Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3))
-        self.ups.apply(init_weights)
-        self.conv_post.apply(init_weights)
-        self.reflection_pad = nn.ReflectionPad1d((1, 0))
-        # Plain tensor attribute (not a buffer); moved to the input's device on every STFT/ISTFT call.
-        self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
-        self.f0_predictor = f0_predictor
-    def remove_weight_norm(self):
-        """Strip weight normalization from the generator's own layers, keeping current weights.
-        The previous implementation could not run: it called ``remove_weight_norm`` on
-        ``self.m_source`` (which defines no such method), applied the legacy hook-based
-        remover to layers wrapped with ``torch.nn.utils.parametrizations.weight_norm``,
-        and also to the ``source_downs`` convolutions, which are never weight-normalized.
-        This version walks the generator's sub-modules and removes the ``weight``
-        parametrization wherever one is registered. ``f0_predictor`` is left untouched,
-        as before.
-        """
-        # Local import keeps this block independent of the file's import header.
-        from torch.nn.utils import parametrize
-        print('Removing weight norm...')
-        roots = (
-            self.conv_pre,
-            self.ups,
-            self.resblocks,
-            self.conv_post,
-            self.m_source,
-            self.source_downs,
-            self.source_resblocks,
-        )
-        for root in roots:
-            # Snapshot the module list first: removing a parametrization edits the module tree.
-            for module in list(root.modules()):
-                if parametrize.is_parametrized(module, "weight"):
-                    parametrize.remove_parametrizations(module, "weight")
-    def _stft(self, x):
-        """Return the real and imaginary STFT parts of ``x``, each shaped [B, F, TT]."""
-        spec = torch.stft(
-            x,
-            self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device),
-            return_complex=True)
-        spec = torch.view_as_real(spec)  # [B, F, TT, 2]
-        return spec[..., 0], spec[..., 1]
-    def _istft(self, magnitude, phase):
-        """Invert a magnitude/phase pair to a waveform; magnitude is clipped at 1e2 first."""
-        magnitude = torch.clip(magnitude, max=1e2)
-        real = magnitude * torch.cos(phase)
-        img = magnitude * torch.sin(phase)
-        inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"],
-                                        self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
-        return inverse_transform
-    def decode(self, x: torch.Tensor, s: Optional[torch.Tensor] = None) -> torch.Tensor:
-        """Turn a mel input and a source excitation into a waveform.
-        Args:
-            x: mel features, channel-first (``conv_pre`` expects ``in_channels`` channels).
-            s: source excitation; dimension 1 is squeezed before the STFT.
-                ``None`` stands for the historical default, an empty ``(1, 1, 0)`` tensor.
-        Returns:
-            Waveform produced by the ISTFT, clamped to ``[-audio_limit, audio_limit]``.
-        """
-        if s is None:
-            s = torch.zeros(1, 1, 0)
-        s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
-        s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
-        x = self.conv_pre(x)
-        for i in range(self.num_upsamples):
-            x = F.leaky_relu(x, self.lrelu_slope)
-            x = self.ups[i](x)
-            if i == self.num_upsamples - 1:
-                # One extra left-padded frame on the last stage.
-                x = self.reflection_pad(x)
-            # fusion
-            si = self.source_downs[i](s_stft)
-            si = self.source_resblocks[i](si)
-            x = x + si
-            # Average the outputs of this stage's parallel ResBlocks.
-            xs = None
-            for j in range(self.num_kernels):
-                if xs is None:
-                    xs = self.resblocks[i * self.num_kernels + j](x)
-                else:
-                    xs += self.resblocks[i * self.num_kernels + j](x)
-            x = xs / self.num_kernels
-        # NOTE: uses leaky_relu's default slope here, not self.lrelu_slope.
-        x = F.leaky_relu(x)
-        x = self.conv_post(x)
-        magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
-        phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :])  # actually, sin is redundant
-        x = self._istft(magnitude, phase)
-        x = torch.clamp(x, -self.audio_limit, self.audio_limit)
-        return x
-    def forward(
-            self,
-            batch: dict,
-            device: torch.device,
-    ) -> Dict[str, Optional[torch.Tensor]]:
-        """Training-style pass: read ``batch['speech_feat']``, predict F0, build the source, decode.
-        NOTE: despite the annotation, the value returned is the tuple ``(generated_speech, f0)``.
-        """
-        speech_feat = batch['speech_feat'].transpose(1, 2).to(device)
-        # mel->f0
-        f0 = self.f0_predictor(speech_feat)
-        # f0->source
-        s = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
-        s, _, _ = self.m_source(s)
-        s = s.transpose(1, 2)
-        # mel+source->speech
-        generated_speech = self.decode(x=speech_feat, s=s)
-        return generated_speech, f0
-    @torch.inference_mode()
-    def inference(self, speech_feat: torch.Tensor, cache_source: Optional[torch.Tensor] = None) -> torch.Tensor:
-        """Synthesize a waveform from mel features.
-        Args:
-            speech_feat: mel features, channel-first.
-            cache_source: source excitation from a previous call; when non-empty it
-                overwrites the start of the freshly generated source to avoid a glitch
-                at the seam. ``None`` behaves like the historical empty default.
-        NOTE: despite the annotation, the value returned is the tuple
-        ``(generated_speech, source)``.
-        """
-        # mel->f0
-        f0 = self.f0_predictor(speech_feat)
-        # f0->source
-        s = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
-        s, _, _ = self.m_source(s)
-        s = s.transpose(1, 2)
-        # use cache_source to avoid glitch
-        if cache_source is not None and cache_source.shape[2] != 0:
-            s[:, :, :cache_source.shape[2]] = cache_source
-        generated_speech = self.decode(x=speech_feat, s=s)
-        return generated_speech, s

chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/decoder.cpython-310.pyc DELETED Viewed

Binary file (11.1 kB)

chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/flow_matching.cpython-310.pyc DELETED Viewed

Binary file (4.52 kB)

chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/transformer.cpython-310.pyc DELETED Viewed

Binary file (9.35 kB)

chatterbox/src/chatterbox/models/s3gen/matcha/decoder.py DELETED Viewed

@@ -1,443 +0,0 @@
-import math
-from typing import Optional
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from conformer import ConformerBlock
-from diffusers.models.activations import get_activation
-from einops import pack, rearrange, repeat
-from .transformer import BasicTransformerBlock
-class SinusoidalPosEmb(torch.nn.Module):
-    """Sinusoidal embedding of a 1-D batch of scalars (e.g. timesteps).
-    The output has ``dim`` features: the first half are sines, the second half
-    cosines, over ``dim // 2`` geometrically spaced frequencies from 1 down to 1/10000.
-    """
-    def __init__(self, dim):
-        super().__init__()
-        self.dim = dim
-        assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even"
-    def forward(self, x, scale=1000):
-        """Embed ``x`` (shape ``(B,)`` or a 0-d scalar) into shape ``(B, dim)``.
-        ``x`` is multiplied by ``scale`` before the frequencies are applied.
-        """
-        # Promote a 0-d tensor to a batch of one.
-        if x.ndim < 1:
-            x = x.unsqueeze(0)
-        n_freqs = self.dim // 2
-        log_step = math.log(10000) / (n_freqs - 1)
-        freqs = torch.exp(torch.arange(n_freqs, device=x.device).float() * -log_step)
-        angles = scale * x.unsqueeze(1) * freqs.unsqueeze(0)
-        return torch.cat((angles.sin(), angles.cos()), dim=-1)
-class Block1D(torch.nn.Module):
-    """Masked ``Conv1d(k=3) -> GroupNorm -> Mish`` block.
-    The mask is applied both to the input and to the output, so positions where
-    the mask is zero stay zero.
-    """
-    def __init__(self, dim, dim_out, groups=8):
-        super().__init__()
-        conv = torch.nn.Conv1d(dim, dim_out, 3, padding=1)
-        norm = torch.nn.GroupNorm(groups, dim_out)
-        self.block = torch.nn.Sequential(conv, norm, nn.Mish())
-    def forward(self, x, mask):
-        """Apply the block to ``x * mask`` and re-mask the result."""
-        masked_in = x * mask
-        return self.block(masked_in) * mask
-class ResnetBlock1D(torch.nn.Module):
-    """Two masked ``Block1D`` layers with a time-embedding bias and a 1x1-conv shortcut."""
-    def __init__(self, dim, dim_out, time_emb_dim, groups=8):
-        super().__init__()
-        # Projects the time embedding to one bias value per output channel.
-        self.mlp = torch.nn.Sequential(nn.Mish(), torch.nn.Linear(time_emb_dim, dim_out))
-        self.block1 = Block1D(dim, dim_out, groups=groups)
-        self.block2 = Block1D(dim_out, dim_out, groups=groups)
-        # 1x1 convolution so the shortcut matches the output channel count.
-        self.res_conv = torch.nn.Conv1d(dim, dim_out, 1)
-    def forward(self, x, mask, time_emb):
-        """Return ``block2(block1(x) + time_bias) + res_conv(x * mask)``."""
-        hidden = self.block1(x, mask)
-        # Broadcast the per-channel time bias over the time axis.
-        hidden += self.mlp(time_emb).unsqueeze(-1)
-        hidden = self.block2(hidden, mask)
-        shortcut = self.res_conv(x * mask)
-        return hidden + shortcut
-class Downsample1D(nn.Module):
-    """Strided convolution (kernel 3, stride 2, padding 1) that keeps the channel count."""
-    def __init__(self, dim):
-        super().__init__()
-        self.conv = torch.nn.Conv1d(dim, dim, kernel_size=3, stride=2, padding=1)
-    def forward(self, x):
-        """Return ``x`` downsampled by the strided convolution."""
-        downsampled = self.conv(x)
-        return downsampled
-class TimestepEmbedding(nn.Module):
-    """Two-layer MLP applied to a timestep embedding.
-    ``linear_1 -> act -> linear_2 -> optional post_act``, with an optional bias-free
-    linear projection that adds a conditioning vector to the input first.
-    """
-    def __init__(
-        self,
-        in_channels: int,
-        time_embed_dim: int,
-        act_fn: str = "silu",
-        out_dim: int = None,
-        post_act_fn: Optional[str] = None,
-        cond_proj_dim=None,
-    ):
-        """
-        Args:
-            in_channels: size of the incoming embedding.
-            time_embed_dim: hidden size after ``linear_1``.
-            act_fn: activation name resolved through ``get_activation``.
-            out_dim: output size of ``linear_2``; falls back to ``time_embed_dim`` when None.
-            post_act_fn: optional activation name applied after ``linear_2``.
-            cond_proj_dim: size of the conditioning vector; None disables the projection.
-        """
-        super().__init__()
-        self.linear_1 = nn.Linear(in_channels, time_embed_dim)
-        self.cond_proj = (
-            nn.Linear(cond_proj_dim, in_channels, bias=False) if cond_proj_dim is not None else None
-        )
-        self.act = get_activation(act_fn)
-        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim if out_dim is None else out_dim)
-        self.post_act = get_activation(post_act_fn) if post_act_fn is not None else None
-    def forward(self, sample, condition=None):
-        """Embed ``sample``; when ``condition`` is given its projection is added to ``sample`` first."""
-        if condition is not None:
-            sample = sample + self.cond_proj(condition)
-        hidden = self.linear_1(sample)
-        if self.act is not None:
-            hidden = self.act(hidden)
-        hidden = self.linear_2(hidden)
-        return hidden if self.post_act is None else self.post_act(hidden)
-class Upsample1D(nn.Module):
-    """A 1D layer that doubles the time axis.
-    Parameters:
-        channels (`int`):
-            number of channels in the inputs.
-        use_conv (`bool`, default `False`):
-            when the transposed convolution is disabled, apply a kernel-3 convolution
-            after nearest-neighbour interpolation.
-        use_conv_transpose (`bool`, default `True`):
-            upsample with a transposed convolution (kernel 4, stride 2, padding 1);
-            takes precedence over `use_conv`.
-        out_channels (`int`, optional):
-            number of output channels. Defaults to `channels`.
-        name (`str`, default `"conv"`):
-            stored on the instance; not otherwise used by this class.
-    """
-    def __init__(self, channels, use_conv=False, use_conv_transpose=True, out_channels=None, name="conv"):
-        super().__init__()
-        self.channels = channels
-        self.out_channels = out_channels or channels
-        self.use_conv = use_conv
-        self.use_conv_transpose = use_conv_transpose
-        self.name = name
-        layer = None
-        if use_conv_transpose:
-            layer = nn.ConvTranspose1d(channels, self.out_channels, 4, 2, 1)
-        elif use_conv:
-            layer = nn.Conv1d(self.channels, self.out_channels, 3, padding=1)
-        self.conv = layer
-    def forward(self, inputs):
-        """Return ``inputs`` with its last dimension doubled."""
-        assert inputs.shape[1] == self.channels
-        if self.use_conv_transpose:
-            return self.conv(inputs)
-        stretched = F.interpolate(inputs, scale_factor=2.0, mode="nearest")
-        if not self.use_conv:
-            return stretched
-        return self.conv(stretched)
-class ConformerWrapper(ConformerBlock):
-    """Adapter that lets a ``ConformerBlock`` be called with transformer-block keyword names.
-    The constructor forwards every argument unchanged to ``ConformerBlock``; ``forward``
-    accepts the keyword names used for the transformer blocks and maps them onto the
-    base class's ``x`` / ``mask`` parameters.
-    """
-    def __init__(  # pylint: disable=useless-super-delegation
-        self,
-        *,
-        dim,
-        dim_head=64,
-        heads=8,
-        ff_mult=4,
-        conv_expansion_factor=2,
-        conv_kernel_size=31,
-        attn_dropout=0,
-        ff_dropout=0,
-        conv_dropout=0,
-        conv_causal=False,
-    ):
-        # Pure pass-through: all arguments are keyword-only and handed to the base class as-is.
-        super().__init__(
-            dim=dim,
-            dim_head=dim_head,
-            heads=heads,
-            ff_mult=ff_mult,
-            conv_expansion_factor=conv_expansion_factor,
-            conv_kernel_size=conv_kernel_size,
-            attn_dropout=attn_dropout,
-            ff_dropout=ff_dropout,
-            conv_dropout=conv_dropout,
-            conv_causal=conv_causal,
-        )
-    def forward(
-        self,
-        hidden_states,
-        attention_mask,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        timestep=None,
-    ):
-        """Run the conformer block on ``hidden_states`` with ``attention_mask`` cast to bool.
-        ``encoder_hidden_states``, ``encoder_attention_mask`` and ``timestep`` are accepted
-        for call-signature compatibility and ignored.
-        """
-        return super().forward(x=hidden_states, mask=attention_mask.bool())
-class Decoder(nn.Module):
-    """1-D U-Net used as the flow-matching estimator.
-    Each down, mid and up stage is a ``ResnetBlock1D`` followed by ``n_blocks``
-    attention blocks (transformer or conformer). Down stages halve the time axis
-    (except the last), up stages double it (except the last) and consume the skip
-    connection saved by the matching down stage.
-    """
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        channels=(256, 256),
-        dropout=0.05,
-        attention_head_dim=64,
-        n_blocks=1,
-        num_mid_blocks=2,
-        num_heads=4,
-        act_fn="snake",
-        down_block_type="transformer",
-        mid_block_type="transformer",
-        up_block_type="transformer",
-    ):
-        """
-        Args:
-            in_channels: channels of the tensor entering the first ResNet block, i.e. the
-                concatenation built in ``forward``; also the size of the sinusoidal
-                time embedding.
-            out_channels: channels produced by the final 1x1 projection.
-            channels: width of each down stage; reversed for the up path.
-            dropout: dropout passed to the attention blocks.
-            attention_head_dim: per-head width of the attention blocks.
-            n_blocks: attention blocks per stage.
-            num_mid_blocks: number of mid stages.
-            num_heads: attention heads per block.
-            act_fn: activation name passed to the transformer blocks.
-            down_block_type, mid_block_type, up_block_type: ``"transformer"`` or ``"conformer"``.
-        """
-        super().__init__()
-        channels = tuple(channels)
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.time_embeddings = SinusoidalPosEmb(in_channels)
-        time_embed_dim = channels[0] * 4
-        self.time_mlp = TimestepEmbedding(
-            in_channels=in_channels,
-            time_embed_dim=time_embed_dim,
-            act_fn="silu",
-        )
-        self.down_blocks = nn.ModuleList([])
-        self.mid_blocks = nn.ModuleList([])
-        self.up_blocks = nn.ModuleList([])
-        output_channel = in_channels
-        for i in range(len(channels)):  # pylint: disable=consider-using-enumerate
-            input_channel = output_channel
-            output_channel = channels[i]
-            is_last = i == len(channels) - 1
-            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
-            transformer_blocks = nn.ModuleList(
-                [
-                    self.get_block(
-                        down_block_type,
-                        output_channel,
-                        attention_head_dim,
-                        num_heads,
-                        dropout,
-                        act_fn,
-                    )
-                    for _ in range(n_blocks)
-                ]
-            )
-            # The last down stage keeps the resolution (plain conv instead of a strided one).
-            downsample = (
-                Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1)
-            )
-            self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
-        for i in range(num_mid_blocks):
-            input_channel = channels[-1]
-            # Previously assigned to `out_channels` by mistake, which shadowed the constructor
-            # argument and left this value unused; the resulting layers are unchanged because
-            # `output_channel` already equals channels[-1] after the down loop.
-            output_channel = channels[-1]
-            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
-            transformer_blocks = nn.ModuleList(
-                [
-                    self.get_block(
-                        mid_block_type,
-                        output_channel,
-                        attention_head_dim,
-                        num_heads,
-                        dropout,
-                        act_fn,
-                    )
-                    for _ in range(n_blocks)
-                ]
-            )
-            self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
-        # Up path: reversed widths, plus one extra entry so the last stage has an output width.
-        channels = channels[::-1] + (channels[0],)
-        for i in range(len(channels) - 1):
-            input_channel = channels[i]
-            output_channel = channels[i + 1]
-            is_last = i == len(channels) - 2
-            # Input width is doubled because the skip connection is concatenated on channels.
-            resnet = ResnetBlock1D(
-                dim=2 * input_channel,
-                dim_out=output_channel,
-                time_emb_dim=time_embed_dim,
-            )
-            transformer_blocks = nn.ModuleList(
-                [
-                    self.get_block(
-                        up_block_type,
-                        output_channel,
-                        attention_head_dim,
-                        num_heads,
-                        dropout,
-                        act_fn,
-                    )
-                    for _ in range(n_blocks)
-                ]
-            )
-            upsample = (
-                Upsample1D(output_channel, use_conv_transpose=True)
-                if not is_last
-                else nn.Conv1d(output_channel, output_channel, 3, padding=1)
-            )
-            self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
-        self.final_block = Block1D(channels[-1], channels[-1])
-        self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
-        self.initialize_weights()
-    @staticmethod
-    def get_block(block_type, dim, attention_head_dim, num_heads, dropout, act_fn):
-        """Build one attention block.
-        Raises:
-            ValueError: if ``block_type`` is neither ``"conformer"`` nor ``"transformer"``.
-        """
-        if block_type == "conformer":
-            block = ConformerWrapper(
-                dim=dim,
-                dim_head=attention_head_dim,
-                heads=num_heads,
-                ff_mult=1,
-                conv_expansion_factor=2,
-                ff_dropout=dropout,
-                attn_dropout=dropout,
-                conv_dropout=dropout,
-                conv_kernel_size=31,
-            )
-        elif block_type == "transformer":
-            block = BasicTransformerBlock(
-                dim=dim,
-                num_attention_heads=num_heads,
-                attention_head_dim=attention_head_dim,
-                dropout=dropout,
-                activation_fn=act_fn,
-            )
-        else:
-            raise ValueError(f"Unknown block type {block_type}")
-        return block
-    def initialize_weights(self):
-        """Kaiming-normal init for Conv1d/Linear weights (zero bias); GroupNorm set to weight 1, bias 0."""
-        for m in self.modules():
-            if isinstance(m, nn.Conv1d):
-                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
-                if m.bias is not None:
-                    nn.init.constant_(m.bias, 0)
-            elif isinstance(m, nn.GroupNorm):
-                nn.init.constant_(m.weight, 1)
-                nn.init.constant_(m.bias, 0)
-            elif isinstance(m, nn.Linear):
-                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
-                if m.bias is not None:
-                    nn.init.constant_(m.bias, 0)
-    def forward(self, x, mask, mu, t, spks=None, cond=None):
-        """Forward pass of the U-Net.
-        Args:
-            x (torch.Tensor): noisy sample, shape (batch_size, channels, time)
-            mask (torch.Tensor): shape (batch_size, 1, time)
-            mu (torch.Tensor): conditioning features, concatenated with ``x`` on the
-                channel axis; shape (batch_size, channels, time)
-            t (torch.Tensor): timesteps, shape (batch_size)
-            spks (torch.Tensor, optional): shape (batch_size, condition_channels); repeated
-                along time and concatenated on the channel axis. Defaults to None.
-            cond (optional): accepted but not used by this implementation. Defaults to None.
-        Returns:
-            torch.Tensor: shape (batch_size, out_channels, time), multiplied by ``mask``.
-        """
-        t = self.time_embeddings(t)
-        t = self.time_mlp(t)
-        # Channel-wise concatenation of the sample with its conditioning.
-        x = pack([x, mu], "b * t")[0]
-        if spks is not None:
-            spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
-            x = pack([x, spks], "b * t")[0]
-        hiddens = []
-        masks = [mask]
-        for resnet, transformer_blocks, downsample in self.down_blocks:
-            mask_down = masks[-1]
-            x = resnet(x, mask_down, t)
-            # Attention blocks work on (batch, time, channels).
-            x = rearrange(x, "b c t -> b t c")
-            mask_down = rearrange(mask_down, "b 1 t -> b t")
-            for transformer_block in transformer_blocks:
-                x = transformer_block(
-                    hidden_states=x,
-                    attention_mask=mask_down,
-                    timestep=t,
-                )
-            x = rearrange(x, "b t c -> b c t")
-            mask_down = rearrange(mask_down, "b t -> b 1 t")
-            hiddens.append(x)  # Save hidden states for skip connections
-            x = downsample(x * mask_down)
-            # Keep every second mask entry to follow the stride-2 downsampling.
-            masks.append(mask_down[:, :, ::2])
-        # Drop the mask appended after the last down stage, which does not change resolution.
-        masks = masks[:-1]
-        mask_mid = masks[-1]
-        for resnet, transformer_blocks in self.mid_blocks:
-            x = resnet(x, mask_mid, t)
-            x = rearrange(x, "b c t -> b t c")
-            mask_mid = rearrange(mask_mid, "b 1 t -> b t")
-            for transformer_block in transformer_blocks:
-                x = transformer_block(
-                    hidden_states=x,
-                    attention_mask=mask_mid,
-                    timestep=t,
-                )
-            x = rearrange(x, "b t c -> b c t")
-            mask_mid = rearrange(mask_mid, "b t -> b 1 t")
-        for resnet, transformer_blocks, upsample in self.up_blocks:
-            mask_up = masks.pop()
-            # Concatenate the matching skip connection on the channel axis.
-            x = resnet(pack([x, hiddens.pop()], "b * t")[0], mask_up, t)
-            x = rearrange(x, "b c t -> b t c")
-            mask_up = rearrange(mask_up, "b 1 t -> b t")
-            for transformer_block in transformer_blocks:
-                x = transformer_block(
-                    hidden_states=x,
-                    attention_mask=mask_up,
-                    timestep=t,
-                )
-            x = rearrange(x, "b t c -> b c t")
-            mask_up = rearrange(mask_up, "b t -> b 1 t")
-            x = upsample(x * mask_up)
-        x = self.final_block(x, mask_up)
-        output = self.final_proj(x * mask_up)
-        return output * mask

chatterbox/src/chatterbox/models/s3gen/matcha/flow_matching.py DELETED Viewed

@@ -1,129 +0,0 @@
-from abc import ABC
-import torch
-import torch.nn.functional as F
-from .decoder import Decoder
-class BASECFM(torch.nn.Module, ABC):
-    """Base class for conditional flow matching (CFM).
-
-    Holds the solver configuration and implements sampling (fixed-step Euler)
-    and the training loss. ``self.estimator`` is ``None`` here and must be
-    assigned by a subclass before ``forward`` or ``compute_loss`` is called.
-    """
-    def __init__(
-        self,
-        n_feats,
-        cfm_params,
-        n_spks=1,
-        spk_emb_dim=128,
-    ):
-        super().__init__()
-        self.n_feats = n_feats
-        self.n_spks = n_spks
-        self.spk_emb_dim = spk_emb_dim
-        self.solver = cfm_params.solver
-        # Fall back to 1e-4 when the config object does not define sigma_min.
-        self.sigma_min = getattr(cfm_params, "sigma_min", 1e-4)
-        self.estimator = None
-    @torch.inference_mode()
-    def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
-        """Forward diffusion
-        Args:
-            mu (torch.Tensor): output of encoder
-                shape: (batch_size, n_feats, mel_timesteps)
-            mask (torch.Tensor): output_mask
-                shape: (batch_size, 1, mel_timesteps)
-            n_timesteps (int): number of diffusion steps
-            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
-            spks (torch.Tensor, optional): speaker ids. Defaults to None.
-                shape: (batch_size, spk_emb_dim)
-            cond: Not used but kept for future purposes
-        Returns:
-            sample: generated mel-spectrogram
-                shape: (batch_size, n_feats, mel_timesteps)
-        """
-        # Start from Gaussian noise shaped like mu, scaled by the temperature.
-        z = torch.randn_like(mu) * temperature
-        t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
-        return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond)
-    def solve_euler(self, x, t_span, mu, mask, spks, cond):
-        """
-        Fixed euler solver for ODEs.
-        Args:
-            x (torch.Tensor): random noise
-            t_span (torch.Tensor): n_timesteps interpolated
-                shape: (n_timesteps + 1,)
-            mu (torch.Tensor): output of encoder
-                shape: (batch_size, n_feats, mel_timesteps)
-            mask (torch.Tensor): output_mask
-                shape: (batch_size, 1, mel_timesteps)
-            spks (torch.Tensor, optional): speaker ids. Defaults to None.
-                shape: (batch_size, spk_emb_dim)
-            cond: Not used but kept for future purposes
-        Returns:
-            The state after the final Euler step (same shape as ``x``).
-        Raises:
-            IndexError: if ``t_span`` has fewer than two entries.
-        """
-        t, dt = t_span[0], t_span[1] - t_span[0]
-        last_step = len(t_span) - 1
-        # Only the final state is returned, so intermediate states are not
-        # retained; keeping them would hold one full tensor per step alive.
-        for step in range(1, len(t_span)):
-            dphi_dt = self.estimator(x, mask, mu, t, spks, cond)
-            x = x + dt * dphi_dt
-            t = t + dt
-            # Recompute the step size from the grid for the next iteration.
-            if step < last_step:
-                dt = t_span[step + 1] - t
-        return x
-    def compute_loss(self, x1, mask, mu, spks=None, cond=None):
-        """Computes diffusion loss
-        Args:
-            x1 (torch.Tensor): Target
-                shape: (batch_size, n_feats, mel_timesteps)
-            mask (torch.Tensor): target mask
-                shape: (batch_size, 1, mel_timesteps)
-            mu (torch.Tensor): output of encoder
-                shape: (batch_size, n_feats, mel_timesteps)
-            spks (torch.Tensor, optional): speaker embedding. Defaults to None.
-                shape: (batch_size, spk_emb_dim)
-            cond: accepted for interface symmetry; not forwarded to the estimator here.
-        Returns:
-            loss: conditional flow matching loss
-            y: conditional flow
-                shape: (batch_size, n_feats, mel_timesteps)
-        """
-        # mu must be 3-D; only the batch size is needed from its shape.
-        b, _, _ = mu.shape
-        # random timestep
-        t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
-        # sample noise p(x_0)
-        z = torch.randn_like(x1)
-        # Interpolate between noise and target; u is the regression target.
-        y = (1 - (1 - self.sigma_min) * t) * z + t * x1
-        u = x1 - (1 - self.sigma_min) * z
-        # Sum of squared errors normalised by (number of unmasked frames * channels).
-        loss = F.mse_loss(self.estimator(y, mask, mu, t.squeeze(), spks), u, reduction="sum") / (
-            torch.sum(mask) * u.shape[1]
-        )
-        return loss, y
-class CFM(BASECFM):
-    """Flow-matching module whose vector-field estimator is a ``Decoder``."""
-    def __init__(self, in_channels, out_channel, cfm_params, decoder_params, n_spks=1, spk_emb_dim=64):
-        super().__init__(
-            n_feats=in_channels,
-            cfm_params=cfm_params,
-            n_spks=n_spks,
-            spk_emb_dim=spk_emb_dim,
-        )
-        # The estimator input is widened by the speaker embedding only when
-        # there is more than one speaker.
-        speaker_channels = spk_emb_dim if n_spks > 1 else 0
-        estimator_in_channels = in_channels + speaker_channels
-        # Just change the architecture of the estimator here
-        self.estimator = Decoder(in_channels=estimator_in_channels, out_channels=out_channel, **decoder_params)

chatterbox/src/chatterbox/models/s3gen/matcha/text_encoder.py DELETED Viewed

@@ -1,413 +0,0 @@
-""" from https://github.com/jaywalnut310/glow-tts """
-import math
-import torch
-import torch.nn as nn
-from einops import rearrange
-def sequence_mask(length, max_length=None):
-    """Return a boolean mask that is True at positions below each entry of ``length``.
-
-    When ``max_length`` is omitted the mask is as wide as the largest length.
-    """
-    width = length.max() if max_length is None else max_length
-    positions = torch.arange(width, dtype=length.dtype, device=length.device)
-    # Broadcast positions (1, width) against lengths (batch, 1).
-    return positions[None] < length[:, None]
-class LayerNorm(nn.Module):
-    """Layer normalisation over dimension 1 with a learnable per-channel scale and shift."""
-    def __init__(self, channels, eps=1e-4):
-        super().__init__()
-        self.channels = channels
-        self.eps = eps
-        self.gamma = torch.nn.Parameter(torch.ones(channels))
-        self.beta = torch.nn.Parameter(torch.zeros(channels))
-    def forward(self, x):
-        """Normalise ``x`` along dim 1, then apply ``gamma`` and ``beta``."""
-        centered = x - x.mean(dim=1, keepdim=True)
-        var = (centered ** 2).mean(dim=1, keepdim=True)
-        normed = centered * torch.rsqrt(var + self.eps)
-        # Reshape the (channels,) parameters so they broadcast along dim 1.
-        param_shape = [1, -1] + [1] * (x.dim() - 2)
-        scale = self.gamma.view(*param_shape)
-        shift = self.beta.view(*param_shape)
-        return normed * scale + shift
-class ConvReluNorm(nn.Module):
-    """Residual stack of Conv1d -> LayerNorm -> ReLU/Dropout layers followed by a 1x1 projection.
-
-    The projection is zero-initialised, so right after construction the
-    residual branch adds nothing to the input.
-    """
-    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
-        super().__init__()
-        self.in_channels = in_channels
-        self.hidden_channels = hidden_channels
-        self.out_channels = out_channels
-        self.kernel_size = kernel_size
-        self.n_layers = n_layers
-        self.p_dropout = p_dropout
-        self.conv_layers = torch.nn.ModuleList()
-        self.norm_layers = torch.nn.ModuleList()
-        # First layer maps in_channels -> hidden_channels; it is always created.
-        self.conv_layers.append(torch.nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
-        self.norm_layers.append(LayerNorm(hidden_channels))
-        # A single ReLU + Dropout module is shared by every layer.
-        self.relu_drop = torch.nn.Sequential(torch.nn.ReLU(), torch.nn.Dropout(p_dropout))
-        # Remaining layers keep the hidden width.
-        for _ in range(n_layers - 1):
-            self.conv_layers.append(
-                torch.nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2)
-            )
-            self.norm_layers.append(LayerNorm(hidden_channels))
-        self.proj = torch.nn.Conv1d(hidden_channels, out_channels, 1)
-        # Zero-init the projection so the residual branch starts as a no-op.
-        self.proj.weight.data.zero_()
-        self.proj.bias.data.zero_()
-    def forward(self, x, x_mask):
-        """Run the conv stack on the masked input and add the projected result back onto ``x``.
-
-        NOTE(review): the residual add requires the projection output to be
-        broadcast-compatible with ``x`` (presumably in_channels == out_channels) -- confirm.
-        """
-        x_org = x
-        for i in range(self.n_layers):
-            # Re-apply the mask before every convolution.
-            x = self.conv_layers[i](x * x_mask)
-            x = self.norm_layers[i](x)
-            x = self.relu_drop(x)
-        x = x_org + self.proj(x)
-        return x * x_mask
-class DurationPredictor(nn.Module):
-    """Two Conv1d -> ReLU -> LayerNorm -> Dropout stages followed by a 1x1 projection to one channel."""
-    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout):
-        super().__init__()
-        self.in_channels = in_channels
-        self.filter_channels = filter_channels
-        self.p_dropout = p_dropout
-        # One Dropout module is reused after both normalisation layers.
-        self.drop = torch.nn.Dropout(p_dropout)
-        self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
-        self.norm_1 = LayerNorm(filter_channels)
-        self.conv_2 = torch.nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
-        self.norm_2 = LayerNorm(filter_channels)
-        self.proj = torch.nn.Conv1d(filter_channels, 1, 1)
-    def forward(self, x, x_mask):
-        """Return the masked single-channel prediction for ``x``.
-
-        The mask is applied to the input of every convolution and to the output.
-        """
-        x = self.conv_1(x * x_mask)
-        x = torch.relu(x)
-        x = self.norm_1(x)
-        x = self.drop(x)
-        x = self.conv_2(x * x_mask)
-        x = torch.relu(x)
-        x = self.norm_2(x)
-        x = self.drop(x)
-        x = self.proj(x * x_mask)
-        return x * x_mask
-class RotaryPositionalEmbeddings(nn.Module):
-    r"""
-    ## RoPE module
-    Rotary encoding transforms pairs of features by rotating in the 2D plane.
-    That is, it organizes the $d$ features as $\frac{d}{2}$ pairs.
-    Each pair can be considered a coordinate in a 2D plane, and the encoding will rotate it
-    by an angle depending on the position of the token.
-    """
-    def __init__(self, d: int, base: int = 10_000):
-        r"""
-        * `d` is the number of features $d$ (truncated to an integer)
-        * `base` is the constant used for calculating $\Theta$
-        """
-        super().__init__()
-        self.base = base
-        self.d = int(d)
-        # The cos/sin tables are plain attributes (not registered buffers),
-        # built lazily on the first forward pass.
-        self.cos_cached = None
-        self.sin_cached = None
-    def _build_cache(self, x: torch.Tensor):
-        r"""
-        Cache $\cos$ and $\sin$ values for sequences up to `x.shape[0]` on `x.device`.
-        """
-        # Reuse the cache only if it is long enough AND lives on the same device as `x`.
-        # Because the tables are not buffers, moving the module does not move them, so
-        # a device change must trigger a rebuild to avoid a device-mismatch error.
-        if (
-            self.cos_cached is not None
-            and x.shape[0] <= self.cos_cached.shape[0]
-            and self.cos_cached.device == x.device
-        ):
-            return
-        # Get sequence length
-        seq_len = x.shape[0]
-        # $\Theta = {\theta_i = 10000^{-\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
-        theta = 1.0 / (self.base ** (torch.arange(0, self.d, 2).float() / self.d)).to(x.device)
-        # Create position indexes `[0, 1, ..., seq_len - 1]`
-        seq_idx = torch.arange(seq_len, device=x.device).float()
-        # Calculate the product of position index and $\theta_i$
-        idx_theta = torch.einsum("n,d->nd", seq_idx, theta)
-        # Concatenate so that for row $m$ we have
-        # $[m \theta_0, m \theta_1, ..., m \theta_{\frac{d}{2}}, m \theta_0, m \theta_1, ..., m \theta_{\frac{d}{2}}]$
-        idx_theta2 = torch.cat([idx_theta, idx_theta], dim=1)
-        # Cache them with singleton batch/head axes so they broadcast over `[t, b, h, d]`
-        self.cos_cached = idx_theta2.cos()[:, None, None, :]
-        self.sin_cached = idx_theta2.sin()[:, None, None, :]
-    def _neg_half(self, x: torch.Tensor):
-        # $\frac{d}{2}$
-        d_2 = self.d // 2
-        # Calculate $[-x^{(\frac{d}{2} + 1)}, -x^{(\frac{d}{2} + 2)}, ..., -x^{(d)}, x^{(1)}, x^{(2)}, ..., x^{(\frac{d}{2})}]$
-        return torch.cat([-x[:, :, :, d_2:], x[:, :, :, :d_2]], dim=-1)
-    def forward(self, x: torch.Tensor):
-        """
-        * `x` is the Tensor at the head of a key or a query with shape `[batch_size, n_heads, seq_len, d]`;
-          it is moved to `[seq_len, batch_size, n_heads, d]` internally and returned in the input layout.
-        """
-        # Put the sequence axis first, then cache $\cos$ and $\sin$ values
-        x = rearrange(x, "b h t d -> t b h d")
-        self._build_cache(x)
-        # Split the features, we can choose to apply rotary embeddings only to a partial set of features.
-        x_rope, x_pass = x[..., : self.d], x[..., self.d :]
-        # Calculate
-        # $[-x^{(\frac{d}{2} + 1)}, -x^{(\frac{d}{2} + 2)}, ..., -x^{(d)}, x^{(1)}, x^{(2)}, ..., x^{(\frac{d}{2})}]$
-        neg_half_x = self._neg_half(x_rope)
-        # The cache may be longer than the current sequence, so slice it to `seq_len`.
-        x_rope = (x_rope * self.cos_cached[: x.shape[0]]) + (neg_half_x * self.sin_cached[: x.shape[0]])
-        return rearrange(torch.cat((x_rope, x_pass), dim=-1), "t b h d -> b h t d")
-class MultiHeadAttention(nn.Module):
-    """Multi-head attention over channel-first tensors using 1x1 Conv1d projections.
-
-    Rotary positional embeddings are applied to queries and keys, covering
-    half of each head's channels (``k_channels * 0.5``). The attention
-    probabilities of the most recent call are kept in ``self.attn``.
-    """
-    def __init__(
-        self,
-        channels,
-        out_channels,
-        n_heads,
-        heads_share=True,
-        p_dropout=0.0,
-        proximal_bias=False,
-        proximal_init=False,
-    ):
-        super().__init__()
-        assert channels % n_heads == 0
-        self.channels = channels
-        self.out_channels = out_channels
-        self.n_heads = n_heads
-        # Stored only; nothing in this class reads heads_share.
-        self.heads_share = heads_share
-        self.proximal_bias = proximal_bias
-        self.p_dropout = p_dropout
-        self.attn = None
-        # Channels per head.
-        self.k_channels = channels // n_heads
-        self.conv_q = torch.nn.Conv1d(channels, channels, 1)
-        self.conv_k = torch.nn.Conv1d(channels, channels, 1)
-        self.conv_v = torch.nn.Conv1d(channels, channels, 1)
-        # from https://nn.labml.ai/transformers/rope/index.html
-        self.query_rotary_pe = RotaryPositionalEmbeddings(self.k_channels * 0.5)
-        self.key_rotary_pe = RotaryPositionalEmbeddings(self.k_channels * 0.5)
-        self.conv_o = torch.nn.Conv1d(channels, out_channels, 1)
-        self.drop = torch.nn.Dropout(p_dropout)
-        torch.nn.init.xavier_uniform_(self.conv_q.weight)
-        torch.nn.init.xavier_uniform_(self.conv_k.weight)
-        if proximal_init:
-            # Start the key projection as an exact copy of the query projection.
-            self.conv_k.weight.data.copy_(self.conv_q.weight.data)
-            self.conv_k.bias.data.copy_(self.conv_q.bias.data)
-        torch.nn.init.xavier_uniform_(self.conv_v.weight)
-    def forward(self, x, c, attn_mask=None):
-        """Attend from ``x`` (queries) to ``c`` (keys and values) and project the result."""
-        q = self.conv_q(x)
-        k = self.conv_k(c)
-        v = self.conv_v(c)
-        x, self.attn = self.attention(q, k, v, mask=attn_mask)
-        x = self.conv_o(x)
-        return x
-    def attention(self, query, key, value, mask=None):
-        """Scaled dot-product attention; returns ``(output, attention_probabilities)``."""
-        # key is (b, d, t_s); t_t is the query length.
-        b, d, t_s, t_t = (*key.size(), query.size(2))
-        # Split channels into heads: (b, h*c, t) -> (b, h, t, c).
-        query = rearrange(query, "b (h c) t-> b h t c", h=self.n_heads)
-        key = rearrange(key, "b (h c) t-> b h t c", h=self.n_heads)
-        value = rearrange(value, "b (h c) t-> b h t c", h=self.n_heads)
-        query = self.query_rotary_pe(query)
-        key = self.key_rotary_pe(key)
-        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.k_channels)
-        if self.proximal_bias:
-            assert t_s == t_t, "Proximal bias is only available for self-attention."
-            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
-        if mask is not None:
-            # Positions where the mask is 0 get a large negative score before the softmax.
-            scores = scores.masked_fill(mask == 0, -1e4)
-        p_attn = torch.nn.functional.softmax(scores, dim=-1)
-        p_attn = self.drop(p_attn)
-        output = torch.matmul(p_attn, value)
-        # Merge heads back: (b, h, t_t, c) -> (b, h, c, t_t) -> (b, d, t_t).
-        output = output.transpose(2, 3).contiguous().view(b, d, t_t)
-        return output, p_attn
-    @staticmethod
-    def _attention_bias_proximal(length):
-        """Return a (1, 1, length, length) bias equal to ``-log(1 + |i - j|)``."""
-        r = torch.arange(length, dtype=torch.float32)
-        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
-        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
-class FFN(nn.Module):
-    """Masked two-layer convolutional feed-forward network (Conv1d -> ReLU -> Dropout -> Conv1d)."""
-    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0.0):
-        super().__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.filter_channels = filter_channels
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        pad = kernel_size // 2
-        self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size, padding=pad)
-        self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size, padding=pad)
-        self.drop = torch.nn.Dropout(p_dropout)
-    def forward(self, x, x_mask):
-        """Apply both convolutions, masking the input of each and the final output."""
-        hidden = torch.relu(self.conv_1(x * x_mask))
-        hidden = self.drop(hidden)
-        projected = self.conv_2(hidden * x_mask)
-        return projected * x_mask
-class Encoder(nn.Module):
-    """Stack of ``n_layers`` post-norm blocks: self-attention + LayerNorm, then FFN + LayerNorm.
-
-    Extra keyword arguments are accepted and ignored.
-    """
-    def __init__(
-        self,
-        hidden_channels,
-        filter_channels,
-        n_heads,
-        n_layers,
-        kernel_size=1,
-        p_dropout=0.0,
-        **kwargs,
-    ):
-        super().__init__()
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        # One Dropout module is shared by the attention and FFN branches of every layer.
-        self.drop = torch.nn.Dropout(p_dropout)
-        self.attn_layers = torch.nn.ModuleList()
-        self.norm_layers_1 = torch.nn.ModuleList()
-        self.ffn_layers = torch.nn.ModuleList()
-        self.norm_layers_2 = torch.nn.ModuleList()
-        for _ in range(self.n_layers):
-            self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
-            self.norm_layers_1.append(LayerNorm(hidden_channels))
-            self.ffn_layers.append(
-                FFN(
-                    hidden_channels,
-                    hidden_channels,
-                    filter_channels,
-                    kernel_size,
-                    p_dropout=p_dropout,
-                )
-            )
-            self.norm_layers_2.append(LayerNorm(hidden_channels))
-    def forward(self, x, x_mask):
-        """Run every layer on ``x`` and return the masked result.
-
-        NOTE(review): assumes x_mask is (batch, 1, time), as built by TextEncoder.forward -- confirm for other callers.
-        """
-        # Pairwise mask: entry (i, j) is non-zero only when both positions are valid.
-        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
-        for i in range(self.n_layers):
-            x = x * x_mask
-            # Self-attention: queries and keys/values both come from x.
-            y = self.attn_layers[i](x, x, attn_mask)
-            y = self.drop(y)
-            # Residual connection followed by normalisation (post-norm).
-            x = self.norm_layers_1[i](x + y)
-            y = self.ffn_layers[i](x, x_mask)
-            y = self.drop(y)
-            x = self.norm_layers_2[i](x + y)
-        x = x * x_mask
-        return x
-class TextEncoder(nn.Module):
-    """Embeds token ids, optionally applies a conv prenet, runs the Encoder, and predicts ``mu`` and log-durations."""
-    def __init__(
-        self,
-        encoder_type,
-        encoder_params,
-        duration_predictor_params,
-        n_vocab,
-        n_spks=1,
-        spk_emb_dim=128,
-    ):
-        super().__init__()
-        # Stored only; this class does not branch on encoder_type.
-        self.encoder_type = encoder_type
-        self.n_vocab = n_vocab
-        self.n_feats = encoder_params.n_feats
-        self.n_channels = encoder_params.n_channels
-        self.spk_emb_dim = spk_emb_dim
-        self.n_spks = n_spks
-        self.emb = torch.nn.Embedding(n_vocab, self.n_channels)
-        torch.nn.init.normal_(self.emb.weight, 0.0, self.n_channels**-0.5)
-        if encoder_params.prenet:
-            self.prenet = ConvReluNorm(
-                self.n_channels,
-                self.n_channels,
-                self.n_channels,
-                kernel_size=5,
-                n_layers=3,
-                p_dropout=0.5,
-            )
-        else:
-            # Identity stand-in with the same call signature as ConvReluNorm.
-            self.prenet = lambda x, x_mask: x
-        # With more than one speaker, the speaker embedding is concatenated on the
-        # channel axis in forward(), so every downstream module is widened by spk_emb_dim.
-        self.encoder = Encoder(
-            encoder_params.n_channels + (spk_emb_dim if n_spks > 1 else 0),
-            encoder_params.filter_channels,
-            encoder_params.n_heads,
-            encoder_params.n_layers,
-            encoder_params.kernel_size,
-            encoder_params.p_dropout,
-        )
-        self.proj_m = torch.nn.Conv1d(self.n_channels + (spk_emb_dim if n_spks > 1 else 0), self.n_feats, 1)
-        self.proj_w = DurationPredictor(
-            self.n_channels + (spk_emb_dim if n_spks > 1 else 0),
-            duration_predictor_params.filter_channels_dp,
-            duration_predictor_params.kernel_size,
-            duration_predictor_params.p_dropout,
-        )
-    def forward(self, x, x_lengths, spks=None):
-        """Run forward pass to the transformer based encoder and duration predictor
-        Args:
-            x (torch.Tensor): text input
-                shape: (batch_size, max_text_length)
-            x_lengths (torch.Tensor): text input lengths
-                shape: (batch_size,)
-            spks (torch.Tensor, optional): speaker embeddings, only read when n_spks > 1. Defaults to None.
-                shape: (batch_size, spk_emb_dim) -- it is tiled along time and concatenated on the channel axis
-        Returns:
-            mu (torch.Tensor): average output of the encoder
-                shape: (batch_size, n_feats, max_text_length)
-            logw (torch.Tensor): log duration predicted by the duration predictor
-                shape: (batch_size, 1, max_text_length)
-            x_mask (torch.Tensor): mask for the text input
-                shape: (batch_size, 1, max_text_length)
-        """
-        # Scale embeddings by sqrt(n_channels), then move channels to dim 1.
-        x = self.emb(x) * math.sqrt(self.n_channels)
-        x = torch.transpose(x, 1, -1)
-        x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
-        x = self.prenet(x, x_mask)
-        if self.n_spks > 1:
-            # Repeat the speaker vector for every time step and append it as extra channels.
-            x = torch.cat([x, spks.unsqueeze(-1).repeat(1, 1, x.shape[-1])], dim=1)
-        x = self.encoder(x, x_mask)
-        mu = self.proj_m(x) * x_mask
-        # Detach so the duration predictor's gradients do not flow back into the encoder.
-        x_dp = torch.detach(x)
-        logw = self.proj_w(x_dp, x_mask)
-        return mu, logw, x_mask

chatterbox/src/chatterbox/models/s3gen/matcha/transformer.py DELETED Viewed

@@ -1,316 +0,0 @@
-from typing import Any, Dict, Optional
-import torch
-import torch.nn as nn
-from diffusers.models.attention import (
-    GEGLU,
-    GELU,
-    AdaLayerNorm,
-    AdaLayerNormZero,
-    ApproximateGELU,
-)
-from diffusers.models.attention_processor import Attention
-from diffusers.models.lora import LoRACompatibleLinear
-from diffusers.utils.torch_utils import maybe_allow_in_graph
-class SnakeBeta(nn.Module):
-    """
-    A modified Snake function which uses separate parameters for the magnitude of the periodic components.
-    The input is first passed through a linear projection (in_features -> out_features).
-    Shape:
-        - Input: (B, C, T)  (NOTE(review): the projection acts on the last axis -- confirm the expected layout)
-        - Output: same as the input except for the projected axis
-    Parameters:
-        - alpha - trainable parameter that controls frequency
-        - beta - trainable parameter that controls magnitude
-    References:
-        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
-        https://arxiv.org/abs/2006.08195
-    Examples:
-        >>> a1 = SnakeBeta(256, 256)
-        >>> x = torch.randn(256)
-        >>> x = a1(x)
-    """
-    def __init__(self, in_features, out_features, alpha=1.0, alpha_trainable=True, alpha_logscale=True):
-        """
-        Initialization.
-        INPUT:
-            - in_features: size of the input to the linear projection
-            - out_features: size of the projection output; also the shape of alpha and beta
-            - alpha: initial-value multiplier applied to both alpha and beta
-            - alpha_trainable: whether alpha and beta require gradients
-            - alpha_logscale: if True, alpha and beta are stored in log scale (initialised to zeros)
-              and exponentiated in forward; otherwise they are initialised to ones * alpha
-        """
-        super().__init__()
-        # Despite its name, this attribute holds out_features (as a list); it is the parameter shape.
-        self.in_features = out_features if isinstance(out_features, list) else [out_features]
-        self.proj = LoRACompatibleLinear(in_features, out_features)
-        # initialize alpha
-        self.alpha_logscale = alpha_logscale
-        if self.alpha_logscale:  # log scale alphas initialized to zeros
-            self.alpha = nn.Parameter(torch.zeros(self.in_features) * alpha)
-            self.beta = nn.Parameter(torch.zeros(self.in_features) * alpha)
-        else:  # linear scale alphas initialized to ones
-            self.alpha = nn.Parameter(torch.ones(self.in_features) * alpha)
-            self.beta = nn.Parameter(torch.ones(self.in_features) * alpha)
-        self.alpha.requires_grad = alpha_trainable
-        self.beta.requires_grad = alpha_trainable
-        # Small constant added to beta before taking its reciprocal.
-        self.no_div_by_zero = 0.000000001
-    def forward(self, x):
-        """
-        Forward pass of the function.
-        Projects the input, then applies the function elementwise.
-        SnakeBeta ∶= x + 1/b * sin^2 (xa)
-        """
-        x = self.proj(x)
-        if self.alpha_logscale:
-            alpha = torch.exp(self.alpha)
-            beta = torch.exp(self.beta)
-        else:
-            alpha = self.alpha
-            beta = self.beta
-        x = x + (1.0 / (beta + self.no_div_by_zero)) * torch.pow(torch.sin(x * alpha), 2)
-        return x
-class FeedForward(nn.Module):
-    r"""
-    A feed-forward layer.
-    Parameters:
-        dim (`int`): The number of channels in the input.
-        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
-        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
-        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
-        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
-            One of `"gelu"`, `"gelu-approximate"`, `"geglu"`, `"geglu-approximate"` or `"snakebeta"`.
-        final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
-    Raises:
-        ValueError: If `activation_fn` is not one of the supported names.
-    """
-    def __init__(
-        self,
-        dim: int,
-        dim_out: Optional[int] = None,
-        mult: int = 4,
-        dropout: float = 0.0,
-        activation_fn: str = "geglu",
-        final_dropout: bool = False,
-    ):
-        super().__init__()
-        inner_dim = int(dim * mult)
-        dim_out = dim_out if dim_out is not None else dim
-        # Single if/elif chain with an explicit error branch: an unknown name
-        # would otherwise fail later with an unbound `act_fn`.
-        if activation_fn == "gelu":
-            act_fn = GELU(dim, inner_dim)
-        elif activation_fn == "gelu-approximate":
-            act_fn = GELU(dim, inner_dim, approximate="tanh")
-        elif activation_fn == "geglu":
-            act_fn = GEGLU(dim, inner_dim)
-        elif activation_fn == "geglu-approximate":
-            act_fn = ApproximateGELU(dim, inner_dim)
-        elif activation_fn == "snakebeta":
-            act_fn = SnakeBeta(dim, inner_dim)
-        else:
-            raise ValueError(f"Unsupported activation_fn: {activation_fn!r}")
-        self.net = nn.ModuleList([])
-        # project in
-        self.net.append(act_fn)
-        # project dropout
-        self.net.append(nn.Dropout(dropout))
-        # project out
-        self.net.append(LoRACompatibleLinear(inner_dim, dim_out))
-        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
-        if final_dropout:
-            self.net.append(nn.Dropout(dropout))
-    def forward(self, hidden_states):
-        """Apply the modules in ``self.net`` in order and return the result."""
-        for module in self.net:
-            hidden_states = module(hidden_states)
-        return hidden_states
-@maybe_allow_in_graph
-class BasicTransformerBlock(nn.Module):
-    r"""
-    A basic Transformer block.
-    Parameters:
-        dim (`int`): The number of channels in the input and output.
-        num_attention_heads (`int`): The number of heads to use for multi-head attention.
-        attention_head_dim (`int`): The number of channels in each head.
-        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
-        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
-        only_cross_attention (`bool`, *optional*):
-            Whether to use only cross-attention layers. In this case two cross attention layers are used.
-        double_self_attention (`bool`, *optional*):
-            Whether to use two self-attention layers. In this case no cross attention layers are used.
-        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
-        num_embeds_ada_norm (:
-            obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
-        attention_bias (:
-            obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
-        upcast_attention (`bool`, *optional*, defaults to `False`): Forwarded to both `Attention` layers.
-        norm_elementwise_affine (`bool`, *optional*, defaults to `True`): Passed as `elementwise_affine` to the
-            plain `nn.LayerNorm` layers.
-        norm_type (`str`, *optional*, defaults to `"layer_norm"`): `"ada_norm"` or `"ada_norm_zero"` select the
-            adaptive norms (both require `num_embeds_ada_norm`); anything else uses `nn.LayerNorm`.
-        final_dropout (`bool`, *optional*, defaults to `False`): Forwarded to the feed-forward layer.
-    """
-    def __init__(
-        self,
-        dim: int,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        dropout=0.0,
-        cross_attention_dim: Optional[int] = None,
-        activation_fn: str = "geglu",
-        num_embeds_ada_norm: Optional[int] = None,
-        attention_bias: bool = False,
-        only_cross_attention: bool = False,
-        double_self_attention: bool = False,
-        upcast_attention: bool = False,
-        norm_elementwise_affine: bool = True,
-        norm_type: str = "layer_norm",
-        final_dropout: bool = False,
-    ):
-        super().__init__()
-        self.only_cross_attention = only_cross_attention
-        # The adaptive norms are enabled only when num_embeds_ada_norm is also given.
-        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
-        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
-        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
-            raise ValueError(
-                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
-                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
-            )
-        # Define 3 blocks. Each block has its own normalization layer.
-        # 1. Self-Attn
-        if self.use_ada_layer_norm:
-            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
-        elif self.use_ada_layer_norm_zero:
-            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
-        else:
-            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
-        self.attn1 = Attention(
-            query_dim=dim,
-            heads=num_attention_heads,
-            dim_head=attention_head_dim,
-            dropout=dropout,
-            bias=attention_bias,
-            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
-            upcast_attention=upcast_attention,
-        )
-        # 2. Cross-Attn
-        if cross_attention_dim is not None or double_self_attention:
-            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
-            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
-            # the second cross attention block.
-            self.norm2 = (
-                AdaLayerNorm(dim, num_embeds_ada_norm)
-                if self.use_ada_layer_norm
-                else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
-            )
-            self.attn2 = Attention(
-                query_dim=dim,
-                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
-                heads=num_attention_heads,
-                dim_head=attention_head_dim,
-                dropout=dropout,
-                bias=attention_bias,
-                upcast_attention=upcast_attention,
-                # scale_qk=False, # uncomment this to not to use flash attention
-            )  # is self-attn if encoder_hidden_states is none
-        else:
-            self.norm2 = None
-            self.attn2 = None
-        # 3. Feed-forward
-        self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
-        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
-        # let chunk size default to None
-        self._chunk_size = None
-        self._chunk_dim = 0
-    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
-        """Enable (or, with ``chunk_size=None``, disable) chunked feed-forward along dimension ``dim``."""
-        # Sets chunk feed-forward
-        self._chunk_size = chunk_size
-        self._chunk_dim = dim
-    def forward(
-        self,
-        hidden_states: torch.FloatTensor,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        timestep: Optional[torch.LongTensor] = None,
-        cross_attention_kwargs: Dict[str, Any] = None,
-        class_labels: Optional[torch.LongTensor] = None,
-    ):
-        """Apply self-attention, optional second attention, and feed-forward, each with a residual connection.
-
-        ``timestep`` and ``class_labels`` are only read by the adaptive norm layers.
-        ``cross_attention_kwargs`` (treated as ``{}`` when None) is forwarded to both attention layers.
-        Raises:
-            ValueError: if chunked feed-forward is enabled and the chunked dimension is not
-                divisible by the chunk size.
-        """
-        # Notice that normalization is always applied before the real computation in the following blocks.
-        # 1. Self-Attention
-        if self.use_ada_layer_norm:
-            norm_hidden_states = self.norm1(hidden_states, timestep)
-        elif self.use_ada_layer_norm_zero:
-            # AdaLayerNormZero also returns the gate/shift/scale terms used further below.
-            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
-                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
-            )
-        else:
-            norm_hidden_states = self.norm1(hidden_states)
-        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
-        # In only_cross_attention mode the first attention attends to the encoder states
-        # and therefore uses the encoder mask.
-        attn_output = self.attn1(
-            norm_hidden_states,
-            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
-            attention_mask=encoder_attention_mask if self.only_cross_attention else attention_mask,
-            **cross_attention_kwargs,
-        )
-        if self.use_ada_layer_norm_zero:
-            attn_output = gate_msa.unsqueeze(1) * attn_output
-        hidden_states = attn_output + hidden_states
-        # 2. Cross-Attention
-        if self.attn2 is not None:
-            norm_hidden_states = (
-                self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
-            )
-            attn_output = self.attn2(
-                norm_hidden_states,
-                encoder_hidden_states=encoder_hidden_states,
-                attention_mask=encoder_attention_mask,
-                **cross_attention_kwargs,
-            )
-            hidden_states = attn_output + hidden_states
-        # 3. Feed-forward
-        norm_hidden_states = self.norm3(hidden_states)
-        if self.use_ada_layer_norm_zero:
-            norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
-        if self._chunk_size is not None:
-            # "feed_forward_chunk_size" can be used to save memory
-            if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
-                raise ValueError(
-                    f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
-                )
-            num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
-            # Run the feed-forward on each slice separately and stitch the results back together.
-            ff_output = torch.cat(
-                [self.ff(hid_slice) for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)],
-                dim=self._chunk_dim,
-            )
-        else:
-            ff_output = self.ff(norm_hidden_states)
-        if self.use_ada_layer_norm_zero:
-            ff_output = gate_mlp.unsqueeze(1) * ff_output
-        hidden_states = ff_output + hidden_states
-        return hidden_states

chatterbox/src/chatterbox/models/s3gen/s3gen.py DELETED Viewed

@@ -1,298 +0,0 @@
-# Modified from CosyVoice https://github.com/FunAudioLLM/CosyVoice
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import logging
-import numpy as np
-import torch
-import torchaudio as ta
-from functools import lru_cache
-from typing import Optional
-from ..s3tokenizer import S3_SR, SPEECH_VOCAB_SIZE, S3Tokenizer
-from .const import S3GEN_SR
-from .flow import CausalMaskedDiffWithXvec
-from .xvector import CAMPPlus
-from .utils.mel import mel_spectrogram
-from .f0_predictor import ConvRNNF0Predictor
-from .hifigan import HiFTGenerator
-from .transformer.upsample_encoder import UpsampleConformerEncoder
-from .flow_matching import CausalConditionalCFM
-from .decoder import ConditionalDecoder
-from .configs import CFM_PARAMS
-def drop_invalid_tokens(x):
-    """Return the entries of `x` that are valid S3 speech-token ids.
-    Args:
-        x: token tensor, either 1-D `(T,)` or 2-D with a single row `(1, T)`;
-            batches larger than one are not supported.
-    Returns:
-        A 1-D tensor holding every element of `x` that is strictly below
-        `SPEECH_VOCAB_SIZE`, in the original order.
-    Raises:
-        AssertionError: if `x` is neither 1-D nor a single-row 2-D tensor.
-    """
-    # The previous check (`len(x.shape) <= 2 and x.shape[0] == 1`) rejected every
-    # 1-D sequence longer than one token, because for 1-D input `shape[0]` is the
-    # sequence length rather than a batch size.
-    n_dims = len(x.shape)
-    is_single_sequence = n_dims == 1 or (n_dims == 2 and x.shape[0] == 1)
-    assert is_single_sequence, "only batch size of one allowed for now"
-    # Boolean-mask indexing flattens the result to 1-D.
-    return x[x < SPEECH_VOCAB_SIZE]
-# TODO: global resampler cache
-@lru_cache(maxsize=100)
-def get_resampler(src_sr, dst_sr, device):
-    """Build (or fetch from the LRU cache) a resampler placed on `device`.
-    Results are memoized per `(src_sr, dst_sr, device)` argument triple, with at
-    most 100 entries kept.
-    """
-    resampler = ta.transforms.Resample(orig_freq=src_sr, new_freq=dst_sr)
-    return resampler.to(device)
-class S3Token2Mel(torch.nn.Module):
-    """
-    CosyVoice2's CFM decoder maps S3 speech tokens to mel-spectrograms.
-    The module bundles the speech tokenizer, the mel extractor, the speaker
-    encoder and the flow model (Conformer encoder + conditional flow-matching
-    decoder) that are needed for that mapping.
-    TODO: make these modules configurable?
-    """
-    def __init__(self):
-        """Instantiate every sub-module with its hard-coded hyper-parameters."""
-        super().__init__()
-        self.tokenizer = S3Tokenizer("speech_tokenizer_v2_25hz")
-        self.mel_extractor = mel_spectrogram # TODO: make it a torch module?
-        self.speaker_encoder = CAMPPlus()  # use default args
-        encoder = UpsampleConformerEncoder(
-            output_size=512,
-            attention_heads=8,
-            linear_units=2048,
-            num_blocks=6,
-            dropout_rate=0.1,
-            positional_dropout_rate=0.1,
-            attention_dropout_rate=0.1,
-            normalize_before=True,
-            input_layer='linear',
-            pos_enc_layer_type='rel_pos_espnet',
-            selfattention_layer_type='rel_selfattn',
-            input_size=512,
-            use_cnn_module=False,
-            macaron_style=False,
-        )
-        estimator = ConditionalDecoder(
-            in_channels=320,
-            out_channels=80,
-            causal=True,
-            channels=[256],
-            dropout=0.0,
-            attention_head_dim=64,
-            n_blocks=4,
-            num_mid_blocks=12,
-            num_heads=8,
-            act_fn='gelu',
-        )
-        cfm_params = CFM_PARAMS
-        decoder = CausalConditionalCFM(
-            spk_emb_dim=80,
-            cfm_params=cfm_params,
-            estimator=estimator,
-        )
-        self.flow = CausalMaskedDiffWithXvec(
-            encoder=encoder,
-            decoder=decoder
-        )
-        self.resamplers = {}
-    @property
-    def device(self):
-        """Device of the tokenizer's first parameter, used as the module's device."""
-        params = self.tokenizer.parameters()
-        return next(params).device
-    def embed_ref(
-        self,
-        ref_wav: torch.Tensor,
-        ref_sr: int,
-        device="auto",
-        ref_fade_out=True,
-    ):
-        """Compute the conditioning dict for a reference waveform.
-        Args:
-            ref_wav: reference audio as a tensor or numpy array, shaped `(L,)` or `(B, L)`.
-            ref_sr: sample rate of `ref_wav` in Hz.
-            device: target device; `"auto"` selects this module's own device.
-            ref_fade_out: accepted but not used by this method.
-        Returns:
-            dict with the keys `prompt_token`, `prompt_token_len`, `prompt_feat`,
-            `prompt_feat_len` (always None) and `embedding`; the keys match the
-            keyword arguments that `forward` passes on to `self.flow.inference`.
-        """
-        device = self.device if device == "auto" else device
-        if isinstance(ref_wav, np.ndarray):
-            ref_wav = torch.from_numpy(ref_wav).float()
-        # `.to` returns the tensor itself when it already lives on `device`,
-        # so no explicit device comparison is needed.
-        ref_wav = ref_wav.to(device)
-        if len(ref_wav.shape) == 1:
-            ref_wav = ref_wav.unsqueeze(0)  # (B, L)
-        if ref_wav.size(1) > 10 * ref_sr:
-            # Reported through logging, like the length-mismatch warning below.
-            logging.warning("cosydec received ref longer than 10s")
-        ref_wav_24 = ref_wav
-        if ref_sr != S3GEN_SR:
-            ref_wav_24 = get_resampler(ref_sr, S3GEN_SR, device)(ref_wav)
-        ref_mels_24 = self.mel_extractor(ref_wav_24).transpose(1, 2).to(device)
-        ref_mels_24_len = None
-        # Resample to 16kHz
-        ref_wav_16 = get_resampler(ref_sr, S3_SR, device)(ref_wav).to(device)
-        # Speaker embedding
-        ref_x_vector = self.speaker_encoder.inference(ref_wav_16)
-        # Tokenize 16khz reference
-        ref_speech_tokens, ref_speech_token_lens = self.tokenizer(ref_wav_16)
-        # Make sure mel_len = 2 * stoken_len (happens when the input is not padded to multiple of 40ms)
-        if ref_mels_24.shape[1] != 2 * ref_speech_tokens.shape[1]:
-            logging.warning(
-                "Reference mel length is not equal to 2 * reference token length.\n"
-            )
-            ref_speech_tokens = ref_speech_tokens[:, :ref_mels_24.shape[1] // 2]
-            ref_speech_token_lens[0] = ref_speech_tokens.shape[1]
-        return dict(
-            prompt_token=ref_speech_tokens.to(device),
-            prompt_token_len=ref_speech_token_lens,
-            prompt_feat=ref_mels_24,
-            prompt_feat_len=ref_mels_24_len,
-            embedding=ref_x_vector,
-        )
-    def forward(
-        self,
-        speech_tokens: torch.LongTensor,
-        # locally-computed ref embedding (mutex with ref_dict)
-        ref_wav: Optional[torch.Tensor],
-        ref_sr: Optional[int],
-        # pre-computed ref embedding (prod API)
-        ref_dict: Optional[dict] = None,
-        finalize: bool = False,
-    ):
-        """
-        Generate mel-spectrograms from S3 speech tokens and a reference, which the speaker timbre is inferred from.
-        NOTE:
-        - The speaker encoder accepts 16 kHz waveform.
-        - S3TokenizerV2 accepts 16 kHz waveform.
-        - The mel-spectrogram for the reference assumes 24 kHz input signal.
-        - This function is designed for batch_size=1 only.
-        Args
-        ----
-        - `speech_tokens`: S3 speech tokens [B=1, T]
-        - `ref_wav`: reference waveform (`torch.Tensor` with shape=[B=1, T])
-        - `ref_sr`: reference sample rate
-        - `ref_dict`: pre-computed output of `embed_ref`; exactly one of `ref_wav` / `ref_dict`
-          must be given. The caller's dict is left untouched.
-        - `finalize`: whether streaming is finished or not. Note that if False, the last 3 tokens will be ignored.
-        Returns
-        -------
-        The first element of the tuple returned by `self.flow.inference` (the output mels).
-        """
-        assert (ref_wav is None) ^ (ref_dict is None), f"Must provide exactly one of ref_wav or ref_dict (got {ref_wav} and {ref_dict})"
-        device = self.device
-        if ref_dict is None:
-            ref_dict = self.embed_ref(ref_wav, ref_sr)
-        else:
-            # type/device casting (all values will be numpy if it's from a prod API call).
-            # The converted values go into a fresh dict so the caller's mapping is
-            # not modified as a side effect of running inference.
-            converted = {}
-            for rk, rv in ref_dict.items():
-                if isinstance(rv, np.ndarray):
-                    rv = torch.from_numpy(rv)
-                if torch.is_tensor(rv):
-                    rv = rv.to(device)
-                converted[rk] = rv
-            ref_dict = converted
-        if len(speech_tokens.shape) == 1:
-            speech_tokens = speech_tokens.unsqueeze(0)
-        # assert speech_tokens.shape[0] == 1, "only batch size of one allowed for now"
-        speech_token_lens = torch.LongTensor([speech_tokens.size(1)]).to(device)
-        output_mels, _ = self.flow.inference(
-            token=speech_tokens,
-            token_len=speech_token_lens,
-            finalize=finalize,
-            **ref_dict,
-        )
-        return output_mels
-class S3Token2Wav(S3Token2Mel):
-    """
-    Full CosyVoice2 decoder: the inherited token-to-mel stage (CFM) chained with a
-    mel-to-waveform vocoder (HiFiGAN).
-    TODO: make these modules configurable?
-    """
-    def __init__(self):
-        """Create the vocoder and the fade-in window applied to generated audio."""
-        super().__init__()
-        self.mel2wav = HiFTGenerator(
-            sampling_rate=S3GEN_SR,
-            upsample_rates=[8, 5, 3],
-            upsample_kernel_sizes=[16, 11, 7],
-            source_resblock_kernel_sizes=[7, 7, 11],
-            source_resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-            f0_predictor=ConvRNNF0Predictor(),
-        )
-        # silence out a few ms and fade audio in to reduce artifacts
-        n_trim = S3GEN_SR // 50  # 20ms = half of a frame
-        # first half stays at zero, second half ramps up along a raised cosine
-        fade_in = (torch.cos(torch.linspace(torch.pi, 0, n_trim)) + 1) / 2
-        trim_fade = torch.cat([torch.zeros(n_trim), fade_in])
-        # registered as a non-persistent buffer (buffers get automatic device casting)
-        self.register_buffer("trim_fade", trim_fade, persistent=False)
-    def forward(
-        self,
-        speech_tokens,
-        # locally-computed ref embedding (mutex with ref_dict)
-        ref_wav: Optional[torch.Tensor],
-        ref_sr: Optional[int],
-        # pre-computed ref embedding (prod API)
-        ref_dict: Optional[dict] = None,
-        finalize: bool = False
-    ):
-        """Turn speech tokens into waveforms: parent token-to-mel pass, then the vocoder.
-        Outside training mode the start of the waveform is multiplied in place by `trim_fade`.
-        """
-        mels = super().forward(speech_tokens, ref_wav=ref_wav, ref_sr=ref_sr, ref_dict=ref_dict, finalize=finalize)
-        # TODO jrm: ignoring the speed control (mel interpolation) and the HiFTGAN caching mechanisms for now.
-        empty_source = torch.zeros(1, 1, 0).to(self.device)
-        wavs, *_ = self.mel2wav.inference(speech_feat=mels, cache_source=empty_source)
-        if self.training:
-            return wavs
-        # NOTE: ad-hoc method to reduce "spillover" from the reference clip.
-        wavs[:, :len(self.trim_fade)] *= self.trim_fade
-        return wavs
-    @torch.inference_mode()
-    def flow_inference(
-        self,
-        speech_tokens,
-        # locally-computed ref embedding (mutex with ref_dict)
-        ref_wav: Optional[torch.Tensor] = None,
-        ref_sr: Optional[int] = None,
-        # pre-computed ref embedding (prod API)
-        ref_dict: Optional[dict] = None,
-        finalize: bool = False,
-    ):
-        """Run only the token-to-mel stage (the parent class's forward) under inference mode."""
-        mels = super().forward(speech_tokens, ref_wav=ref_wav, ref_sr=ref_sr, ref_dict=ref_dict, finalize=finalize)
-        return mels
-    @torch.inference_mode()
-    def hift_inference(self, speech_feat, cache_source: torch.Tensor = None):
-        """Run only the vocoder; an empty `(1, 1, 0)` source cache is used when none is given."""
-        source = torch.zeros(1, 1, 0).to(self.device) if cache_source is None else cache_source
-        return self.mel2wav.inference(speech_feat=speech_feat, cache_source=source)
-    @torch.inference_mode()
-    def inference(
-        self,
-        speech_tokens,
-        # locally-computed ref embedding (mutex with ref_dict)
-        ref_wav: Optional[torch.Tensor] = None,
-        ref_sr: Optional[int] = None,
-        # pre-computed ref embedding (prod API)
-        ref_dict: Optional[dict] = None,
-        cache_source: torch.Tensor = None, # NOTE: this arg is for streaming, it can probably be removed here
-        finalize: bool = True,
-    ):
-        """Token-to-waveform inference: `flow_inference`, then `hift_inference`, then the fade-in.
-        Returns the pair `(output_wavs, output_sources)` produced by the vocoder.
-        """
-        mels = self.flow_inference(speech_tokens, ref_wav=ref_wav, ref_sr=ref_sr, ref_dict=ref_dict, finalize=finalize)
-        wavs, sources = self.hift_inference(mels, cache_source)
-        # NOTE: ad-hoc method to reduce "spillover" from the reference clip.
-        wavs[:, :len(self.trim_fade)] *= self.trim_fade
-        return wavs, sources

chatterbox/src/chatterbox/models/s3gen/transformer/__init__.py DELETED Viewed

File without changes

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (165 Bytes)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/activation.cpython-310.pyc DELETED Viewed

Binary file (2.49 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/attention.cpython-310.pyc DELETED Viewed

Binary file (9.34 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/convolution.cpython-310.pyc DELETED Viewed

Binary file (3.07 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/embedding.cpython-310.pyc DELETED Viewed

Binary file (9.53 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/encoder_layer.cpython-310.pyc DELETED Viewed

Binary file (7.34 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-310.pyc DELETED Viewed

Binary file (3.8 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/subsampling.cpython-310.pyc DELETED Viewed

Binary file (9.84 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-310.pyc DELETED Viewed

Binary file (10 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/activation.py DELETED Viewed

@@ -1,84 +0,0 @@
-# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe)
-#               2020 Northwestern Polytechnical University (Pengcheng Guo)
-#               2020 Mobvoi Inc (Binbin Zhang)
-#               2024 Alibaba Inc (Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Swish() activation function for Conformer."""
-import torch
-from torch import nn, sin, pow
-from torch.nn import Parameter
-class Swish(torch.nn.Module):
-    """Swish activation: the input scaled by its own sigmoid."""
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Apply `x * sigmoid(x)` elementwise and return the result."""
-        gate = torch.sigmoid(x)
-        return x * gate
-# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
-#   LICENSE is in incl_licenses directory.
-class Snake(nn.Module):
-    '''
-    Sine-based periodic activation with one frequency value (alpha) per channel.
-    Shape:
-        - Input: (B, C, T)
-        - Output: (B, C, T), same shape as the input
-    Parameters:
-        - alpha - per-channel parameter, trainable unless disabled
-    References:
-        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
-        https://arxiv.org/abs/2006.08195
-    Examples:
-        >>> a1 = Snake(256)
-        >>> x = torch.randn(4, 256, 100)
-        >>> x = a1(x)
-    '''
-    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
-        '''
-        Set up the activation.
-        INPUT:
-            - in_features: size of the alpha vector (the channel dimension C of the input)
-            - alpha: multiplier applied to the initial alpha values, which are ones on the
-              linear scale and zeros on the log scale; higher values = higher-frequency.
-            - alpha_trainable: whether alpha receives gradients
-            - alpha_logscale: if True, alpha is exponentiated before use in forward
-        '''
-        super().__init__()
-        self.in_features = in_features
-        self.alpha_logscale = alpha_logscale
-        # log scale alphas start from zeros, linear scale alphas from ones
-        make_base = torch.zeros if self.alpha_logscale else torch.ones
-        self.alpha = Parameter(make_base(in_features) * alpha)
-        self.alpha.requires_grad = alpha_trainable
-        # small constant added to alpha before taking its reciprocal
-        self.no_div_by_zero = 0.000000001
-    def forward(self, x):
-        '''
-        Apply the activation elementwise.
-        Snake ∶= x + 1/a * sin^2 (xa)
-        '''
-        freq = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
-        if self.alpha_logscale:
-            freq = torch.exp(freq)
-        inv_freq = 1.0 / (freq + self.no_div_by_zero)
-        periodic = pow(sin(x * freq), 2)
-        return x + inv_freq * periodic

chatterbox/src/chatterbox/models/s3gen/transformer/attention.py DELETED Viewed

@@ -1,330 +0,0 @@
-# Copyright (c) 2019 Shigeki Karita
-#               2020 Mobvoi Inc (Binbin Zhang)
-#               2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
-#               2024 Alibaba Inc (Xiang Lyu)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Multi-Head Attention layer definition."""
-import math
-from typing import Tuple
-import torch
-from torch import nn
-class MultiHeadedAttention(nn.Module):
-    """Multi-Head Attention layer.
-    Projects query/key/value with separate linear layers, applies scaled
-    dot-product attention per head, and can prepend cached keys/values to the
-    freshly computed ones.
-    Args:
-        n_head (int): The number of heads.
-        n_feat (int): The number of features.
-        dropout_rate (float): Dropout rate.
-        key_bias (bool): Whether the key projection has a bias term.
-    """
-    def __init__(self,
-                 n_head: int,
-                 n_feat: int,
-                 dropout_rate: float,
-                 key_bias: bool = True):
-        """Construct a MultiHeadedAttention object."""
-        super().__init__()
-        # every head gets an equal slice of the feature dimension
-        assert n_feat % n_head == 0
-        # We assume d_v always equals d_k
-        self.d_k = n_feat // n_head
-        self.h = n_head
-        self.linear_q = nn.Linear(n_feat, n_feat)
-        self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias)
-        self.linear_v = nn.Linear(n_feat, n_feat)
-        self.linear_out = nn.Linear(n_feat, n_feat)
-        self.dropout = nn.Dropout(p=dropout_rate)
-    def forward_qkv(
-        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Transform query, key and value.
-        Args:
-            query (torch.Tensor): Query tensor (#batch, time1, size).
-            key (torch.Tensor): Key tensor (#batch, time2, size).
-            value (torch.Tensor): Value tensor (#batch, time2, size).
-        Returns:
-            torch.Tensor: Transformed query tensor, size
-                (#batch, n_head, time1, d_k).
-            torch.Tensor: Transformed key tensor, size
-                (#batch, n_head, time2, d_k).
-            torch.Tensor: Transformed value tensor, size
-                (#batch, n_head, time2, d_k).
-        """
-        n_batch = query.size(0)
-        # project, then split the feature axis into (head, d_k)
-        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
-        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
-        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
-        q = q.transpose(1, 2)  # (batch, head, time1, d_k)
-        k = k.transpose(1, 2)  # (batch, head, time2, d_k)
-        v = v.transpose(1, 2)  # (batch, head, time2, d_k)
-        return q, k, v
-    def forward_attention(
-        self,
-        value: torch.Tensor,
-        scores: torch.Tensor,
-        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool)
-    ) -> torch.Tensor:
-        """Compute attention context vector.
-        Args:
-            value (torch.Tensor): Transformed value, size
-                (#batch, n_head, time2, d_k).
-            scores (torch.Tensor): Attention score, size
-                (#batch, n_head, time1, time2).
-            mask (torch.Tensor): Mask, size (#batch, 1, time2) or
-                (#batch, time1, time2), (0, 0, 0) means fake mask.
-        Returns:
-            torch.Tensor: Transformed value (#batch, time1, d_model)
-                weighted by the attention score (#batch, time1, time2).
-        """
-        # NOTE: the default `mask` tensor is created once at definition time and
-        #   shared by all calls; it is only read here, never modified in place.
-        n_batch = value.size(0)
-        # NOTE(xcsong): When will `if mask.size(2) > 0` be True?
-        #   1. onnx(16/4) [WHY? Because we feed real cache & real mask for the
-        #           1st chunk to ease the onnx export.]
-        #   2. pytorch training
-        if mask.size(2) > 0:  # time2 > 0
-            # True now marks the positions to be masked out
-            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
-            # For last chunk, time2 might be larger than scores.size(-1)
-            mask = mask[:, :, :, :scores.size(-1)]  # (batch, 1, *, time2)
-            scores = scores.masked_fill(mask, -float('inf'))
-            # masked positions are forced to exactly 0 after the softmax; this
-            # also overwrites the NaNs a fully masked row would produce
-            attn = torch.softmax(scores, dim=-1).masked_fill(
-                mask, 0.0)  # (batch, head, time1, time2)
-        # NOTE(xcsong): When will `if mask.size(2) > 0` be False?
-        #   1. onnx(16/-1, -1/-1, 16/0)
-        #   2. jit (16/-1, -1/-1, 16/0, 16/4)
-        else:
-            attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)
-        p_attn = self.dropout(attn)
-        x = torch.matmul(p_attn, value)  # (batch, head, time1, d_k)
-        # merge the heads back into a single feature axis
-        x = (x.transpose(1, 2).contiguous().view(n_batch, -1,
-                                                 self.h * self.d_k)
-             )  # (batch, time1, d_model)
-        return self.linear_out(x)  # (batch, time1, d_model)
-    def forward(
-        self,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
-        pos_emb: torch.Tensor = torch.empty(0),
-        cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Compute scaled dot product attention.
-        Args:
-            query (torch.Tensor): Query tensor (#batch, time1, size).
-            key (torch.Tensor): Key tensor (#batch, time2, size).
-            value (torch.Tensor): Value tensor (#batch, time2, size).
-            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
-                (#batch, time1, time2).
-                1.When applying cross attention between decoder and encoder,
-                the batch padding mask for input is in (#batch, 1, T) shape.
-                2.When applying self attention of encoder,
-                the mask is in (#batch, T, T)  shape.
-                3.When applying self attention of decoder,
-                the mask is in (#batch, L, L)  shape.
-                4.If the different position in decoder see different block
-                of the encoder, such as Mocha, the passed in mask could be
-                in (#batch, L, T) shape. But there is no such case in current
-                CosyVoice.
-            pos_emb (torch.Tensor): Not used by this implementation (the
-                relative-position subclass consumes it).
-            cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
-                where `cache_t == chunk_size * num_decoding_left_chunks`
-                and `head * d_k == size`
-        Returns:
-            torch.Tensor: Output tensor (#batch, time1, d_model).
-            torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
-                where `cache_t == chunk_size * num_decoding_left_chunks`
-                and `head * d_k == size`
-        """
-        q, k, v = self.forward_qkv(query, key, value)
-        # NOTE(xcsong):
-        #   when export onnx model, for 1st chunk, we feed
-        #       cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
-        #       or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
-        #       In all modes, `if cache.size(0) > 0` will always be `True`
-        #       and we will always do splitting and
-        #       concatenation(this will simplify onnx export). Note that
-        #       it's OK to concat & split zero-shaped tensors(see code below).
-        #   when export jit  model, for 1st chunk, we always feed
-        #       cache(0, 0, 0, 0) since jit supports dynamic if-branch.
-        # >>> a = torch.ones((1, 2, 0, 4))
-        # >>> b = torch.ones((1, 2, 3, 4))
-        # >>> c = torch.cat((a, b), dim=2)
-        # >>> torch.equal(b, c)        # True
-        # >>> d = torch.split(a, 2, dim=-1)
-        # >>> torch.equal(d[0], d[1])  # True
-        if cache.size(0) > 0:
-            # the cache stores keys and values side by side on the last axis
-            key_cache, value_cache = torch.split(cache,
-                                                 cache.size(-1) // 2,
-                                                 dim=-1)
-            # cached entries go in front of the new ones on the time axis
-            k = torch.cat([key_cache, k], dim=2)
-            v = torch.cat([value_cache, v], dim=2)
-        # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
-        #   non-trivial to calculate `next_cache_start` here.
-        new_cache = torch.cat((k, v), dim=-1)
-        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
-        return self.forward_attention(v, scores, mask), new_cache
-class RelPositionMultiHeadedAttention(MultiHeadedAttention):
-    """Multi-Head Attention layer with relative position encoding.
-    Extends the parent layer with a projection of the positional embedding and
-    two learnable per-head biases that are added to the query.
-    Paper: https://arxiv.org/abs/1901.02860
-    Args:
-        n_head (int): The number of heads.
-        n_feat (int): The number of features.
-        dropout_rate (float): Dropout rate.
-        key_bias (bool): Whether the key projection has a bias term.
-    """
-    def __init__(self,
-                 n_head: int,
-                 n_feat: int,
-                 dropout_rate: float,
-                 key_bias: bool = True):
-        """Construct a RelPositionMultiHeadedAttention object."""
-        super().__init__(n_head, n_feat, dropout_rate, key_bias)
-        # linear transformation for positional encoding
-        self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
-        # these two learnable bias are used in matrix c and matrix d
-        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
-        # (allocated uninitialized, then filled by the Xavier init just below)
-        self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
-        self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
-        torch.nn.init.xavier_uniform_(self.pos_bias_u)
-        torch.nn.init.xavier_uniform_(self.pos_bias_v)
-    def rel_shift(self, x: torch.Tensor) -> torch.Tensor:
-        """Compute relative positional encoding.
-        Args:
-            x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
-            time1 means the length of query vector.
-        Returns:
-            torch.Tensor: Output tensor, the input realigned and cut to its
-                first `x.size(-1) // 2 + 1` columns on the last axis.
-        """
-        # prepend one zero column on the last axis
-        zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1),
-                               device=x.device,
-                               dtype=x.dtype)
-        x_padded = torch.cat([zero_pad, x], dim=-1)
-        # reinterpret the padded data with the last two axes swapped in size
-        x_padded = x_padded.view(x.size()[0],
-                                 x.size()[1],
-                                 x.size(3) + 1, x.size(2))
-        # drop the first row, view the rest in the original shape, then truncate
-        x = x_padded[:, :, 1:].view_as(x)[
-            :, :, :, : x.size(-1) // 2 + 1
-        ]  # only keep the positions from 0 to time2
-        return x
-    def forward(
-        self,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
-        pos_emb: torch.Tensor = torch.empty(0),
-        cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
-        Args:
-            query (torch.Tensor): Query tensor (#batch, time1, size).
-            key (torch.Tensor): Key tensor (#batch, time2, size).
-            value (torch.Tensor): Value tensor (#batch, time2, size).
-            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
-                (#batch, time1, time2), (0, 0, 0) means fake mask.
-            pos_emb (torch.Tensor): Positional embedding tensor
-                (#batch, time2, size).
-                NOTE(review): `rel_shift` documents an input whose last axis is
-                2*time1-1, so the embedding presumably has that length when the
-                shift is applied -- confirm against the caller.
-            cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
-                where `cache_t == chunk_size * num_decoding_left_chunks`
-                and `head * d_k == size`
-        Returns:
-            torch.Tensor: Output tensor (#batch, time1, d_model).
-            torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
-                where `cache_t == chunk_size * num_decoding_left_chunks`
-                and `head * d_k == size`
-        """
-        q, k, v = self.forward_qkv(query, key, value)
-        # move the head axis next to d_k so the (head, d_k) biases broadcast
-        q = q.transpose(1, 2)  # (batch, time1, head, d_k)
-        # NOTE(xcsong):
-        #   when export onnx model, for 1st chunk, we feed
-        #       cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
-        #       or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
-        #       In all modes, `if cache.size(0) > 0` will always be `True`
-        #       and we will always do splitting and
-        #       concatenation(this will simplify onnx export). Note that
-        #       it's OK to concat & split zero-shaped tensors(see code below).
-        #   when export jit  model, for 1st chunk, we always feed
-        #       cache(0, 0, 0, 0) since jit supports dynamic if-branch.
-        # >>> a = torch.ones((1, 2, 0, 4))
-        # >>> b = torch.ones((1, 2, 3, 4))
-        # >>> c = torch.cat((a, b), dim=2)
-        # >>> torch.equal(b, c)        # True
-        # >>> d = torch.split(a, 2, dim=-1)
-        # >>> torch.equal(d[0], d[1])  # True
-        if cache.size(0) > 0:
-            # the cache stores keys and values side by side on the last axis
-            key_cache, value_cache = torch.split(cache,
-                                                 cache.size(-1) // 2,
-                                                 dim=-1)
-            k = torch.cat([key_cache, k], dim=2)
-            v = torch.cat([value_cache, v], dim=2)
-        # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
-        #   non-trivial to calculate `next_cache_start` here.
-        new_cache = torch.cat((k, v), dim=-1)
-        n_batch_pos = pos_emb.size(0)
-        # project the positional embedding and split it into heads
-        p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
-        p = p.transpose(1, 2)  # (batch, head, time1, d_k)
-        # (batch, head, time1, d_k)
-        q_with_bias_u = (q + self.pos_bias_u.to(q.device)).transpose(1, 2)
-        # (batch, head, time1, d_k)
-        q_with_bias_v = (q + self.pos_bias_v.to(q.device)).transpose(1, 2)
-        # compute attention score
-        # first compute matrix a and matrix c
-        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
-        # (batch, head, time1, time2)
-        matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))
-        # compute matrix b and matrix d
-        # (batch, head, time1, time2)
-        matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
-        # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used
-        # the shift is applied only when the positional term does not already
-        # have the same shape as the content term
-        if matrix_ac.shape != matrix_bd.shape:
-            matrix_bd = self.rel_shift(matrix_bd)
-        scores = (matrix_ac + matrix_bd) / math.sqrt(
-            self.d_k)  # (batch, head, time1, time2)
-        return self.forward_attention(v, scores, mask), new_cache