Spaces: Running on T4
j committed · commit f98c92f · 1 parent: af25078

update to upstream chatterbox implementation, fixes token filtering/clamping
(This view is limited to 50 files because the commit contains too many changes; the raw diff has the full change set.)
- README.md +1 -0
- chatterbox/src/chatterbox/__init__.py +15 -0
- chatterbox/src/chatterbox/__pycache__/__init__.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/__pycache__/tts.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/__pycache__/utils.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/__pycache__/vc.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/__init__.py +0 -0
- chatterbox/src/chatterbox/models/s3gen/__pycache__/__init__.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/__pycache__/const.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/__pycache__/decoder.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/__pycache__/flow.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/__pycache__/flow_matching.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/__pycache__/hifigan.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/__pycache__/s3gen.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/__pycache__/xvector.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/configs.py +10 -0
- chatterbox/src/chatterbox/models/s3gen/flow.py +89 -41
- chatterbox/src/chatterbox/models/s3gen/flow_matching.py +1 -11
- chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/s3gen.py +2 -9
- chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mask.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mel.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3gen/utils/mel.py +8 -4
- chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/t3/__pycache__/__init__.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/t3/__pycache__/llama_configs.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/t3/__pycache__/t3.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/t3/inference/__pycache__/alignment_stream_analyzer.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/t3/inference/alignment_stream_analyzer.py +56 -32
- chatterbox/src/chatterbox/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/t3/modules/__pycache__/perceiver.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/t3/modules/__pycache__/t3_config.cpython-311.pyc +0 -0
- chatterbox/src/chatterbox/models/t3/modules/t3_config.py +30 -16
- chatterbox/src/chatterbox/models/t3/t3.py +56 -34
README.md
CHANGED

@@ -8,6 +8,7 @@ sdk_version: 5.29.0
 app_file: app.py
 pinned: false
 short_description: Expressive Zeroshot TTS
+python_version: 3.10
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

chatterbox/src/chatterbox/__init__.py
CHANGED

@@ -1,2 +1,17 @@
+try:
+    from importlib.metadata import version, PackageNotFoundError
+    try:
+        __version__ = version("chatterbox-tts")
+    except PackageNotFoundError:
+        __version__ = "0.1.4"  # Default fallback version
+except ImportError:
+    from importlib_metadata import version, PackageNotFoundError  # For Python <3.8
+    try:
+        __version__ = version("chatterbox-tts")
+    except PackageNotFoundError:
+        __version__ = "0.1.4"
+
+
 from .tts import ChatterboxTTS
 from .vc import ChatterboxVC
+from .mtl_tts import ChatterboxMultilingualTTS, SUPPORTED_LANGUAGES

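The new block resolves `__version__` from installed package metadata and falls back to a pinned string otherwise. A minimal smoke test of that behavior (assuming the src layout above is installed or importable as `chatterbox`):

import chatterbox

# Prints the distribution version when chatterbox-tts is installed;
# otherwise the except branch exposes the hard-coded fallback "0.1.4".
print(chatterbox.__version__)
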
chatterbox/src/chatterbox/__pycache__/__init__.cpython-311.pyc
DELETED — Binary file (275 Bytes)

chatterbox/src/chatterbox/__pycache__/tts.cpython-311.pyc
DELETED — Binary file (13.3 kB)

chatterbox/src/chatterbox/__pycache__/utils.cpython-311.pyc
DELETED — Binary file (858 Bytes)

chatterbox/src/chatterbox/__pycache__/vc.cpython-311.pyc
DELETED — Binary file (5.44 kB)

chatterbox/src/chatterbox/models/__init__.py
ADDED — File without changes

chatterbox/src/chatterbox/models/s3gen/__pycache__/__init__.cpython-311.pyc
DELETED — Binary file (294 Bytes)

chatterbox/src/chatterbox/models/s3gen/__pycache__/const.cpython-311.pyc
DELETED — Binary file (190 Bytes)

chatterbox/src/chatterbox/models/s3gen/__pycache__/decoder.cpython-311.pyc
DELETED — Binary file (16.9 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc
DELETED — Binary file (2.7 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/flow.cpython-311.pyc
DELETED — Binary file (13.7 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/flow_matching.cpython-311.pyc
DELETED — Binary file (13.3 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/hifigan.cpython-311.pyc
DELETED — Binary file (26.3 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/s3gen.cpython-311.pyc
DELETED — Binary file (13.7 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/xvector.cpython-311.pyc
DELETED — Binary file (24 kB)

chatterbox/src/chatterbox/models/s3gen/configs.py
ADDED

@@ -0,0 +1,10 @@
+from ..utils import AttrDict
+
+CFM_PARAMS = AttrDict({
+    "sigma_min": 1e-06,
+    "solver": "euler",
+    "t_scheduler": "cosine",
+    "training_cfg_rate": 0.2,
+    "inference_cfg_rate": 0.7,
+    "reg_loss_type": "l1"
+})

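`CFM_PARAMS` centralizes the CFM hyperparameters that were previously duplicated as omegaconf objects in flow_matching.py and s3gen.py (both removed below). `AttrDict` comes from a shared utils module; a minimal sketch of the pattern, mirroring the inline definition this commit removes from t3.py further down:

class AttrDict(dict):
    # dict subclass whose keys double as attributes; self.__dict__ aliases
    # the dict storage, so d["solver"] and d.solver stay in sync
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__dict__ = self

params = AttrDict({"sigma_min": 1e-06, "solver": "euler"})
assert params.solver == params["solver"] == "euler"

This dual access is what lets the plain-dict `CFM_PARAMS` stand in for the removed `DictConfig`/`OmegaConf.create` objects without touching call sites.
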
chatterbox/src/chatterbox/models/s3gen/flow.py
CHANGED

@@ -14,32 +14,54 @@
 import logging
 import random
 from typing import Dict, Optional
+
+logger = logging.getLogger(__name__)
 import torch
 import torch.nn as nn
 from torch.nn import functional as F
-from omegaconf import DictConfig
 from .utils.mask import make_pad_mask
+from .configs import CFM_PARAMS
 
 
 class MaskedDiffWithXvec(torch.nn.Module):
-    def __init__(
-        …  (old argument list elided in this view)
+    def __init__(
+        self,
+        input_size: int = 512,
+        output_size: int = 80,
+        spk_embed_dim: int = 192,
+        output_type: str = "mel",
+        vocab_size: int = 4096,
+        input_frame_rate: int = 50,
+        only_mask_loss: bool = True,
+        encoder: torch.nn.Module = None,
+        length_regulator: torch.nn.Module = None,
+        decoder: torch.nn.Module = None,
+        decoder_conf: Dict = {
+            'in_channels': 240,
+            'out_channel': 80,
+            'spk_emb_dim': 80,
+            'n_spks': 1,
+            'cfm_params': CFM_PARAMS,
+            'decoder_params': {
+                'channels': [256, 256],
+                'dropout': 0.0,
+                'attention_head_dim': 64,
+                'n_blocks': 4,
+                'num_mid_blocks': 12,
+                'num_heads': 8,
+                'act_fn': 'gelu',
+            }
+        },
+        mel_feat_conf: Dict = {
+            'n_fft': 1024,
+            'num_mels': 80,
+            'sampling_rate': 22050,
+            'hop_size': 256,
+            'win_size': 1024,
+            'fmin': 0,
+            'fmax': 8000
+        }
+    ):
         super().__init__()
         self.input_size = input_size
         self.output_size = output_size

@@ -74,7 +96,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
 
         # concat text and prompt_text
         mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device)
-        token = self.input_embedding(torch.clamp(token, min=0)) * mask
+        token = self.input_embedding(torch.clamp(token, min=0, max=self.input_embedding.num_embeddings-1)) * mask
 
         # text encode
         h, h_lengths = self.encoder(token, token_len)

@@ -124,7 +146,13 @@ class MaskedDiffWithXvec(torch.nn.Module):
         token_len1, token_len2 = prompt_token.shape[1], token.shape[1]
         token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
         mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
-        token = self.input_embedding(torch.clamp(token, min=0)) * mask
+
+        # Check for out-of-bounds token IDs
+        vocab_size = self.input_embedding.num_embeddings
+        if token.max() >= vocab_size or token.min() < 0:
+            logging.warning(f"S3Gen: Token IDs out of bounds: min={token.min().item()}, max={token.max().item()}, vocab_size={vocab_size}")
+
+        token = self.input_embedding(torch.clamp(token, min=0, max=vocab_size-1)) * mask
 
         # text encode
         h, h_lengths = self.encoder(token, token_len)

@@ -153,25 +181,45 @@ class MaskedDiffWithXvec(torch.nn.Module):
 
 
 class CausalMaskedDiffWithXvec(torch.nn.Module):
-    def __init__(
-        …  (old argument list elided in this view)
+    def __init__(
+        self,
+        input_size: int = 512,
+        output_size: int = 80,
+        spk_embed_dim: int = 192,
+        output_type: str = "mel",
+        vocab_size: int = 6561,
+        input_frame_rate: int = 25,
+        only_mask_loss: bool = True,
+        token_mel_ratio: int = 2,
+        pre_lookahead_len: int = 3,
+        encoder: torch.nn.Module = None,
+        decoder: torch.nn.Module = None,
+        decoder_conf: Dict = {
+            'in_channels': 240,
+            'out_channel': 80,
+            'spk_emb_dim': 80,
+            'n_spks': 1,
+            'cfm_params': CFM_PARAMS,
+            'decoder_params': {
+                'channels': [256, 256],
+                'dropout': 0.0,
+                'attention_head_dim': 64,
+                'n_blocks': 4,
+                'num_mid_blocks': 12,
+                'num_heads': 8,
+                'act_fn': 'gelu',
+            }
+        },
+        mel_feat_conf: Dict = {
+            'n_fft': 1024,
+            'num_mels': 80,
+            'sampling_rate': 22050,
+            'hop_size': 256,
+            'win_size': 1024,
+            'fmin': 0,
+            'fmax': 8000
+        }
+    ):
         super().__init__()
         self.input_size = input_size
         self.output_size = output_size

@@ -215,7 +263,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
         # concat text and prompt_text
         token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
         mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
-        token = self.input_embedding(torch.clamp(token, min=0)) * mask
+        token = self.input_embedding(torch.clamp(token, min=0, max=self.input_embedding.num_embeddings-1)) * mask
 
         # text encode
         h, h_lengths = self.encoder(token, token_len)

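The substantive fix in this file is the clamp upper bound: `torch.clamp(token, min=0)` guarded only against negative IDs, so any ID at or above `num_embeddings` still produced an out-of-range embedding lookup. A standalone illustration with toy sizes (not project code):

import torch
import torch.nn as nn

emb = nn.Embedding(num_embeddings=8, embedding_dim=4)
token = torch.tensor([[1, 5, 9, -2]])  # 9 and -2 are both out of bounds

# Old behavior: clamp(min=0) lets 9 through, and emb(...) raises IndexError.
# New behavior: clamp into [0, num_embeddings - 1] before the lookup.
safe = torch.clamp(token, min=0, max=emb.num_embeddings - 1)
out = emb(safe)  # (1, 4, 4); out-of-range IDs map to the boundary rows
print(safe.tolist(), tuple(out.shape))
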
chatterbox/src/chatterbox/models/s3gen/flow_matching.py
CHANGED

@@ -15,17 +15,7 @@ import threading
 import torch
 import torch.nn.functional as F
 from .matcha.flow_matching import BASECFM
-from omegaconf import OmegaConf
-
-
-CFM_PARAMS = OmegaConf.create({
-    "sigma_min": 1e-06,
-    "solver": "euler",
-    "t_scheduler": "cosine",
-    "training_cfg_rate": 0.2,
-    "inference_cfg_rate": 0.7,
-    "reg_loss_type": "l1"
-})
+from .configs import CFM_PARAMS
 
 
 class ConditionalCFM(BASECFM):

chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc
DELETED — Binary file (21.3 kB)

chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc
DELETED — Binary file (6.46 kB)

chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc
DELETED — Binary file (14.7 kB)

chatterbox/src/chatterbox/models/s3gen/s3gen.py
CHANGED

@@ -19,7 +19,6 @@ import torch
 import torchaudio as ta
 from functools import lru_cache
 from typing import Optional
-from omegaconf import DictConfig
 
 from ..s3tokenizer import S3_SR, SPEECH_VOCAB_SIZE, S3Tokenizer
 from .const import S3GEN_SR

@@ -31,6 +30,7 @@ from .hifigan import HiFTGenerator
 from .transformer.upsample_encoder import UpsampleConformerEncoder
 from .flow_matching import CausalConditionalCFM
 from .decoder import ConditionalDecoder
+from .configs import CFM_PARAMS
 
 
 def drop_invalid_tokens(x):

@@ -85,14 +85,7 @@ class S3Token2Mel(torch.nn.Module):
             num_heads=8,
             act_fn='gelu',
         )
-        cfm_params = DictConfig({
-            "sigma_min": 1e-06,
-            "solver": 'euler',
-            "t_scheduler": 'cosine',
-            "training_cfg_rate": 0.2,
-            "inference_cfg_rate": 0.7,
-            "reg_loss_type": 'l1',
-        })
+        cfm_params = CFM_PARAMS
        decoder = CausalConditionalCFM(
             spk_emb_dim=80,
             cfm_params=cfm_params,

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc
DELETED — Binary file (190 Bytes)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc
DELETED — Binary file (3.58 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc
DELETED — Binary file (15.7 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc
DELETED — Binary file (5.54 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc
DELETED — Binary file (17.3 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc
DELETED — Binary file (11.2 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc
DELETED — Binary file (6.24 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc
DELETED — Binary file (18.9 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc
DELETED — Binary file (15.6 kB)

chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc
DELETED — Binary file (1.93 kB)

chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mask.cpython-311.pyc
DELETED — Binary file (6.25 kB)

chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mel.cpython-311.pyc
DELETED — Binary file (4.05 kB)

chatterbox/src/chatterbox/models/s3gen/utils/mel.py
CHANGED

@@ -1,8 +1,11 @@
 """mel-spectrogram extraction in Matcha-TTS"""
+import logging
 from librosa.filters import mel as librosa_mel_fn
 import torch
 import numpy as np
 
+logger = logging.getLogger(__name__)
+
 
 # NOTE: they decalred these global vars
 mel_basis = {}

@@ -42,10 +45,11 @@ def mel_spectrogram(y, n_fft=1920, num_mels=80, sampling_rate=24000, hop_size=48
     if len(y.shape) == 1:
         y = y[None, ]
 
-    if torch.min(y) < -1.0:
-        print("min value is ", torch.min(y))
-    if torch.max(y) > 1.0:
-        print("max value is ", torch.max(y))
+    # Debug: Check for audio clipping (values outside [-1.0, 1.0] range)
+    min_val = torch.min(y)
+    max_val = torch.max(y)
+    if min_val < -1.0 or max_val > 1.0:
+        logger.warning(f"Audio values outside normalized range: min={min_val.item():.4f}, max={max_val.item():.4f}")
 
     global mel_basis, hann_window  # pylint: disable=global-statement,global-variable-not-assigned
     if f"{str(fmax)}_{str(y.device)}" not in mel_basis:

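The clipping check moves from per-bound `print` calls to a single combined comparison on a module-level logger. The same logic in isolation, on a synthetic tensor:

import logging
import torch

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("mel")

y = torch.tensor([0.2, -1.3, 0.9, 1.1])  # synthetic audio with clipped samples
min_val, max_val = torch.min(y), torch.max(y)
if min_val < -1.0 or max_val > 1.0:
    logger.warning(f"Audio values outside normalized range: min={min_val.item():.4f}, max={max_val.item():.4f}")
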
chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc
DELETED — Binary file (1.37 kB)

chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc
DELETED — Binary file (7.94 kB)

chatterbox/src/chatterbox/models/t3/__pycache__/__init__.cpython-311.pyc
DELETED — Binary file (218 Bytes)

chatterbox/src/chatterbox/models/t3/__pycache__/llama_configs.cpython-311.pyc
DELETED — Binary file (1.34 kB)

chatterbox/src/chatterbox/models/t3/__pycache__/t3.cpython-311.pyc
DELETED — Binary file (15.8 kB)

chatterbox/src/chatterbox/models/t3/inference/__pycache__/alignment_stream_analyzer.cpython-311.pyc
DELETED — Binary file (7.08 kB)

chatterbox/src/chatterbox/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc
DELETED — Binary file (4.65 kB)

chatterbox/src/chatterbox/models/t3/inference/alignment_stream_analyzer.py
CHANGED

@@ -10,6 +10,9 @@ from types import MethodType
 logger = logging.getLogger(__name__)
 
 
+LLAMA_ALIGNED_HEADS = [(12, 15), (13, 11), (9, 2)]
+
+
 @dataclass
 class AlignmentAnalysisResult:
     # was this frame detected as being part of a noisy beginning chunk with potential hallucinations?

@@ -49,21 +52,22 @@ class AlignmentStreamAnalyzer:
 
         self.complete = False
         self.completed_at = None
+
+        # Track generated tokens for repetition detection
+        self.generated_tokens = []
 
         # Using `output_attentions=True` is incompatible with optimized attention kernels, so
         # using it for all layers slows things down too much. We can apply it to just one layer
         # by intercepting the kwargs and adding a forward hook (credit: jrm)
-        self.…
-        …  (old single-buffer spy setup elided in this view)
+        self.last_aligned_attns = []
+        for i, (layer_idx, head_idx) in enumerate(LLAMA_ALIGNED_HEADS):
+            self.last_aligned_attns += [None]
+            self._add_attention_spy(tfmr, i, layer_idx, head_idx)
 
-    def _add_attention_spy(self, tfmr, …
+    def _add_attention_spy(self, tfmr, buffer_idx, layer_idx, head_idx):
         """
         Adds a forward hook to a specific attention layer to collect outputs.
-        Using `output_attentions=True` is incompatible with optimized attention kernels, so
-        using it for all layers slows things down too much.
-        (credit: jrm)
         """
-
         def attention_forward_hook(module, input, output):
             """
             See `LlamaAttention.forward`; the output is a 3-tuple: `attn_output, attn_weights, past_key_value`.

@@ -71,27 +75,23 @@ class AlignmentStreamAnalyzer:
             - When `output_attentions=True`, `LlamaSdpaAttention.forward` calls `LlamaAttention.forward`.
             - `attn_output` has shape [B, H, T0, T0] for the 0th entry, and [B, H, 1, T0+i] for the rest i-th.
             """
-            …  (old hook body and forward monkey-patching elided in this view)
-            # TODO: how to unpatch it?
-            target_layer.forward = MethodType(patched_forward, target_layer)
-
-    def step(self, logits):
+            if isinstance(output, tuple) and len(output) > 1 and output[1] is not None:
+                step_attention = output[1].cpu()  # (B, n_heads, T0, Ti)
+                self.last_aligned_attns[buffer_idx] = step_attention[0, head_idx]  # (T0, Ti)
+
+        target_layer = tfmr.layers[layer_idx].self_attn
+        # Register hook and store the handle
+        target_layer.register_forward_hook(attention_forward_hook)
+        if hasattr(tfmr, 'config') and hasattr(tfmr.config, 'output_attentions'):
+            self.original_output_attentions = tfmr.config.output_attentions
+            tfmr.config.output_attentions = True
+
+    def step(self, logits, next_token=None):
         """
         Emits an AlignmentAnalysisResult into the output queue, and potentially modifies the logits to force an EOS.
         """
         # extract approximate alignment matrix chunk (1 frame at a time after the first chunk)
-        aligned_attn = self.…
+        aligned_attn = torch.stack(self.last_aligned_attns).mean(dim=0)  # (N, N)
         i, j = self.text_tokens_slice
         if self.curr_frame_pos == 0:
             # first chunk has conditioning info, text tokens, and BOS token

@@ -133,22 +133,46 @@ class AlignmentStreamAnalyzer:
         last_text_token_duration = A[15:, -3:].sum()
 
         # Activations for the final token that last too long are likely hallucinations.
-        long_tail = self.complete and (A[self.completed_at:, -3:].sum(dim=0).max() >= …
+        long_tail = self.complete and (A[self.completed_at:, -3:].sum(dim=0).max() >= 5)  # 200ms
 
         # If there are activations in previous tokens after generation has completed, assume this is a repetition error.
-        …
+        alignment_repetition = self.complete and (A[self.completed_at:, :-5].max(dim=1).values.sum() > 5)
+
+        # Track generated tokens for repetition detection
+        if next_token is not None:
+            # Convert tensor to scalar if needed
+            if isinstance(next_token, torch.Tensor):
+                token_id = next_token.item() if next_token.numel() == 1 else next_token.view(-1)[0].item()
+            else:
+                token_id = next_token
+            self.generated_tokens.append(token_id)
+
+            # Keep only last 8 tokens to prevent memory issues
+            if len(self.generated_tokens) > 8:
+                self.generated_tokens = self.generated_tokens[-8:]
+
+        # Check for excessive token repetition (3x same token in a row)
+        token_repetition = (
+            # self.complete and
+            len(self.generated_tokens) >= 3 and
+            len(set(self.generated_tokens[-2:])) == 1
+        )
+
+        if token_repetition:
+            repeated_token = self.generated_tokens[-1]
+            logger.warning(f"🚨 Detected 2x repetition of token {repeated_token}")
+
+        # Suppress EoS to prevent early termination
+        if cur_text_posn < S - 3 and S > 5:  # Only suppress if text is longer than 5 tokens
+            logits[..., self.eos_idx] = -2**15
 
         # If a bad ending is detected, force emit EOS by modifying logits
         # NOTE: this means logits may be inconsistent with latents!
-        if long_tail or …
-            logger.…
+        if long_tail or alignment_repetition or token_repetition:
+            logger.warning(f"forcing EOS token, {long_tail=}, {alignment_repetition=}, {token_repetition=}")
             # (±2**15 is safe for all dtypes >= 16bit)
             logits = -(2**15) * torch.ones_like(logits)
             logits[..., self.eos_idx] = 2**15
 
-        # Suppress EoS to prevent early termination
-        if cur_text_posn < S - 3:  # FIXME: arbitrary
-            logits[..., self.eos_idx] = -2**15
-
         self.curr_frame_pos += 1
         return logits

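Note that despite the "3x same token in a row" comment, the committed condition fires once the last two window entries match (consistent with the "2x" wording in the warning). A standalone sketch of the check as written, with hypothetical token IDs:

def detect_repetition(history, next_token, window=8):
    # mirrors AlignmentStreamAnalyzer.step: append, trim to the last
    # `window` tokens, then compare only the final two entries
    history.append(next_token)
    if len(history) > window:
        del history[:-window]
    return len(history) >= 3 and len(set(history[-2:])) == 1

history = []
hits = [detect_repetition(history, t) for t in [101, 101, 7, 7]]
print(hits)  # [False, False, False, True]
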
chatterbox/src/chatterbox/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc
DELETED — Binary file (5.37 kB)

chatterbox/src/chatterbox/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc
DELETED — Binary file (2.54 kB)

chatterbox/src/chatterbox/models/t3/modules/__pycache__/perceiver.cpython-311.pyc
DELETED — Binary file (12.6 kB)

chatterbox/src/chatterbox/models/t3/modules/__pycache__/t3_config.cpython-311.pyc
DELETED — Binary file (1.27 kB)

chatterbox/src/chatterbox/models/t3/modules/t3_config.py
CHANGED

@@ -2,26 +2,40 @@ from ..llama_configs import LLAMA_CONFIGS
 
 
 class T3Config:
-    …  (old class-level attribute defaults elided in this view)
-    emotion_adv = True
+    def __init__(self, text_tokens_dict_size=704):
+        self.start_text_token = 255
+        self.stop_text_token = 0
+        self.text_tokens_dict_size = text_tokens_dict_size
+        self.max_text_tokens = 2048
+
+        self.start_speech_token = 6561
+        self.stop_speech_token = 6562
+        self.speech_tokens_dict_size = 8194
+        self.max_speech_tokens = 4096
+
+        self.llama_config_name = "Llama_520M"
+        self.input_pos_emb = "learned"
+        self.speech_cond_prompt_len = 150
+
+        self.encoder_type = "voice_encoder"
+        self.speaker_embed_size = 256
+        self.use_perceiver_resampler = True
+        self.emotion_adv = True
 
     @property
     def n_channels(self):
         return LLAMA_CONFIGS[self.llama_config_name]["hidden_size"]
+
+    @property
+    def is_multilingual(self):
+        return self.text_tokens_dict_size == 2454
+
+    @classmethod
+    def english_only(cls):
+        """Create configuration for English-only TTS model."""
+        return cls(text_tokens_dict_size=704)
+
+    @classmethod
+    def multilingual(cls):
+        """Create configuration for multilingual TTS model."""
+        return cls(text_tokens_dict_size=2454)

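The config is now instance-based, with the text vocabulary size selecting the variant and `is_multilingual` derived from it. A short usage sketch (the import path assumes this repo's src layout):

from chatterbox.models.t3.modules.t3_config import T3Config

en = T3Config.english_only()   # text_tokens_dict_size == 704
ml = T3Config.multilingual()   # text_tokens_dict_size == 2454
assert not en.is_multilingual
assert ml.is_multilingual
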
chatterbox/src/chatterbox/models/t3/t3.py
CHANGED

@@ -3,12 +3,14 @@
 import logging
 from typing import Union, Optional, List
 
+logger = logging.getLogger(__name__)
+
 from tqdm import tqdm
 import torch
 import torch.nn.functional as F
 from torch import nn, Tensor
 from transformers import LlamaModel, LlamaConfig
-from transformers.generation.logits_process import TopPLogitsWarper, RepetitionPenaltyLogitsProcessor
+from transformers.generation.logits_process import TopPLogitsWarper, RepetitionPenaltyLogitsProcessor, MinPLogitsWarper
 
 from .modules.learned_pos_emb import LearnedPositionEmbeddings

@@ -17,17 +19,12 @@ from .modules.t3_config import T3Config
 from .llama_configs import LLAMA_CONFIGS
 from .inference.t3_hf_backend import T3HuggingfaceBackend
 from .inference.alignment_stream_analyzer import AlignmentStreamAnalyzer
+from ..utils import AttrDict
 
 
 logger = logging.getLogger(__name__)
 
 
-class AttrDict(dict):
-    def __init__(self, *args, **kwargs):
-        super(AttrDict, self).__init__(*args, **kwargs)
-        self.__dict__ = self
-
-
 def _ensure_BOT_EOT(text_tokens: Tensor, hp):
     B = text_tokens.size(0)
     assert (text_tokens == hp.start_text_token).int().sum() >= B, "missing start_text_token"

@@ -44,7 +41,9 @@ class T3(nn.Module):
     different PE embedding space for speech.
     """
 
-    def __init__(self, hp=…
+    def __init__(self, hp=None):
+        if hp is None:
+            hp = T3Config.english_only()  # Default to English-only config for backward compatibility
         super().__init__()
         self.hp = hp
         self.cfg = LlamaConfig(**LLAMA_CONFIGS[hp.llama_config_name])

@@ -89,11 +88,13 @@
         t3_cond: T3Cond,
         text_tokens: torch.LongTensor,
         speech_tokens: torch.LongTensor,
+        cfg_weight: float = 0.0,
     ):
         # prepare input embeddings (skip backbone tranformer embeddings)
         cond_emb = self.prepare_conditioning(t3_cond)  # (B, len_cond, dim)
         text_emb = self.text_emb(text_tokens)  # (B, len_text, dim)
+        if cfg_weight > 0.0:
+            text_emb[1].zero_()  # CFG uncond
 
         speech_emb = self.speech_emb(speech_tokens)  # (B, len_speech, dim)
         if self.hp.input_pos_emb == "learned":

@@ -221,10 +222,11 @@
         stop_on_eos=True,
         do_sample=True,
         temperature=0.8,
-        top_p=0.…
+        top_p=0.95,
+        min_p=0.05,
         length_penalty=1.0,
-        repetition_penalty=2…
-        cfg_weight=0,
+        repetition_penalty=1.2,
+        cfg_weight=0.5,
     ):
         """
         Args:

@@ -244,6 +246,7 @@
             t3_cond=t3_cond,
             text_tokens=text_tokens,
             speech_tokens=initial_speech_tokens,
+            cfg_weight=cfg_weight,
         )
 
         # In order to use the standard HF generate method, we need to extend some methods to inject our custom logic

@@ -254,19 +257,24 @@
         # TODO? synchronize the expensive compile function
         # with self.compile_lock:
         if not self.compiled:
-            # …  (old analyzer setup elided in this view)
+            # Default to None for English models, only create for multilingual
+            alignment_stream_analyzer = None
+            if self.hp.is_multilingual:
+                alignment_stream_analyzer = AlignmentStreamAnalyzer(
+                    self.tfmr,
+                    None,
+                    text_tokens_slice=(len_cond, len_cond + text_tokens.size(-1)),
+                    alignment_layer_idx=9,  # TODO: hparam or something?
+                    eos_idx=self.hp.stop_speech_token,
+                )
+                assert alignment_stream_analyzer.eos_idx == self.hp.stop_speech_token
+
             patched_model = T3HuggingfaceBackend(
                 config=self.cfg,
                 llama=self.tfmr,
                 speech_enc=self.speech_emb,
                 speech_head=self.speech_head,
-                …
+                alignment_stream_analyzer=alignment_stream_analyzer,
             )
             self.patched_model = patched_model
             self.compiled = True

@@ -281,7 +289,7 @@
             # max_new_tokens=max_new_tokens or self.hp.max_speech_tokens,
             # num_return_sequences=num_return_sequences,
             # temperature=temperature,
-            # …
+            # min_p=min_p,
             # length_penalty=length_penalty,
             # repetition_penalty=repetition_penalty,
             # do_sample=do_sample,

@@ -306,7 +314,9 @@
 
         # Instantiate the logits processors.
         top_p_warper = TopPLogitsWarper(top_p=top_p)
-        …
+        min_p_warper = MinPLogitsWarper(min_p=min_p)
+        top_p_warper = TopPLogitsWarper(top_p=top_p)
+        repetition_penalty_processor = RepetitionPenaltyLogitsProcessor(penalty=float(repetition_penalty))
 
         # ---- Initial Forward Pass (no kv_cache yet) ----
         output = self.patched_model(

@@ -322,21 +332,32 @@
 
         # ---- Generation Loop using kv_cache ----
         for i in tqdm(range(max_new_tokens), desc="Sampling", dynamic_ncols=True):
-            …  (old per-step CFG and logits handling elided in this view)
-            logits = …
+            logits_step = output.logits[:, -1, :]
+            # CFG combine → (1, V)
+            cond = logits_step[0:1, :]
+            uncond = logits_step[1:2, :]
+            cfg = torch.as_tensor(cfg_weight, device=cond.device, dtype=cond.dtype)
+            logits = cond + cfg * (cond - uncond)
+
+            # Apply alignment stream analyzer integrity checks
+            if self.patched_model.alignment_stream_analyzer is not None:
+                if logits.dim() == 1:  # guard in case something upstream squeezed
+                    logits = logits.unsqueeze(0)  # (1, V)
+                # Pass the last generated token for repetition tracking
+                last_token = generated_ids[0, -1].item() if len(generated_ids[0]) > 0 else None
+                logits = self.patched_model.alignment_stream_analyzer.step(logits, next_token=last_token)  # (1, V)
+
+            # Apply repetition penalty
+            ids_for_proc = generated_ids[:1, ...]  # batch = 1
+            logits = repetition_penalty_processor(ids_for_proc, logits)  # expects (B,V)
+
             # Apply temperature scaling.
             if temperature != 1.0:
                 logits = logits / temperature
 
-            # Apply …
-            logits = …
-            logits = top_p_warper(…
+            # Apply min_p and top_p filtering
+            logits = min_p_warper(ids_for_proc, logits)
+            logits = top_p_warper(ids_for_proc, logits)
 
             # Convert logits to probabilities and sample the next token.
             probs = torch.softmax(logits, dim=-1)

@@ -347,6 +368,7 @@
 
             # Check for EOS token.
             if next_token.view(-1) == self.hp.stop_speech_token:
+                logger.info(f"✅ EOS token detected! Stopping generation at step {i+1}")
                 break
 
             # Get embedding for the new token.

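The sampling loop now applies classifier-free guidance manually: the batch carries a conditional row and an unconditional row (whose text embedding was zeroed in `forward` when `cfg_weight > 0`), and the two are combined per step before the logits processors run. The combination rule in isolation (toy shapes, not project code):

import torch

def cfg_combine(logits_step: torch.Tensor, cfg_weight: float) -> torch.Tensor:
    cond = logits_step[0:1, :]    # conditional pass, (1, V)
    uncond = logits_step[1:2, :]  # unconditional pass, (1, V)
    # guided logits: push the distribution away from the unconditional one
    return cond + cfg_weight * (cond - uncond)

logits_step = torch.randn(2, 8194)  # (B=2, V), as produced by the duplicated batch
print(tuple(cfg_combine(logits_step, cfg_weight=0.5).shape))  # (1, 8194)
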