Spaces: mossttsd-space (Running on Zero)

mossttsd-space (#2)
- update (dbd498f5b6a8eb7aff9ea559070143b6f55c6315)
- update2 (8008b4069c3a9980a0137258cf9aab4f866c1d98)
Co-authored-by: zyq <rulerman@users.noreply.huggingface.co>

Files changed:
- XY_Tokenizer/config/MOSS_TTSD_tokenizer.yaml +114 -0
- XY_Tokenizer/xy_tokenizer/model.py +242 -150
- app.py +5 -4
- generation_utils.py +478 -165
XY_Tokenizer/config/MOSS_TTSD_tokenizer.yaml
ADDED
@@ -0,0 +1,114 @@
generator_params:
  input_sample_rate: 16000
  output_sample_rate: 32000
  encoder_downsample_rate: 1280
  decoder_upsample_rate: 2560

  feature_extractor_kwargs:
    chunk_length: 30
    feature_size: 80
    hop_length: 160
    n_fft: 400
    n_samples: 480000
    nb_max_frames: 3000
    padding_side: right
    padding_value: 0.0
    return_attention_mask: false
    sampling_rate: 16000

  # Codec / model architecture (inference required)
  semantic_encoder_kwargs: # 100hz -> 50hz
    num_mel_bins: 80
    sampling_rate: 16000
    hop_length: 160
    stride_size: 2
    kernel_size: 3
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    encoder_layers: 12
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072
    activation_function: "gelu"

  semantic_encoder_adapter_kwargs: # 50hz
    input_dim: 768
    output_dim: 768
    d_model: 768
    max_source_positions: 1500
    encoder_layers: 4
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072

  acoustic_encoder_kwargs: # 100hz -> 50hz
    num_mel_bins: 80
    sampling_rate: 16000
    hop_length: 160
    stride_size: 2
    kernel_size: 3
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    encoder_layers: 12
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072
    activation_function: "gelu"

  pre_rvq_adapter_kwargs: # 50hz
    input_dim: 1536
    output_dim: 768
    d_model: 768
    max_source_positions: 1500
    encoder_layers: 4
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072

  downsample_kwargs: # 50hz -> 12.5hz
    d_model: 768
    avg_pooler: 4

  quantizer_kwargs: # 12.5hz
    input_dim: 3072
    rvq_dim: 512
    output_dim: 3072
    num_quantizers: 8
    codebook_size: 1024
    codebook_dim: 512
    quantizer_dropout: 0.0
    commitment: 1

  post_rvq_adapter_kwargs: # 12.5hz
    input_dim: 3072
    output_dim: 3072
    d_model: 768
    max_source_positions: 375
    encoder_layers: 4
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072

  upsample_kwargs: # 12.5hz -> 50hz
    d_model: 768
    stride: 4

  acoustic_decoder_kwargs: # 50hz -> 100hz
    num_mel_bins: 80
    sampling_rate: 16000
    hop_length: 160
    stride_size: 2
    kernel_size: 3
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    decoder_layers: 12
    decoder_attention_heads: 12
    decoder_ffn_dim: 3072
    activation_function: "gelu"

  vocos_kwargs: # 100hz -> 32khz
    input_channels: 80
    dim: 512
    intermediate_dim: 4096
    num_layers: 30
    n_fft: 1280
    hop_size: 320
    padding: "same"
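The rate fields above pin down the codec's frame rates: 16 kHz input divided by the encoder_downsample_rate of 1280 gives 12.5 Hz codes, and multiplying back by the decoder_upsample_rate of 2560 gives the 32 kHz output rate. A minimal sanity-check sketch (illustrative only, not part of this commit):

import yaml

# Derive the frame rates implied by the config values above.
with open("XY_Tokenizer/config/MOSS_TTSD_tokenizer.yaml") as f:
    gp = yaml.safe_load(f)["generator_params"]

code_rate_hz = gp["input_sample_rate"] / gp["encoder_downsample_rate"]  # 16000 / 1280 = 12.5
output_rate_hz = code_rate_hz * gp["decoder_upsample_rate"]             # 12.5 * 2560 = 32000

assert output_rate_hz == gp["output_sample_rate"]
print(code_rate_hz, output_rate_hz)  # 12.5 Hz codes, 32 kHz audio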
XY_Tokenizer/xy_tokenizer/model.py
CHANGED
@@ -1,146 +1,198 @@
# -*- coding: utf-8 -*-
import logging

import torch
import torch.nn as nn
import torch.nn.functional as F
import yaml

from .nn.feature_extractor import MelFeatureExtractor
from .nn.modules import (
    OmniAudioDecoder,
    OmniAudioEncoder,
    ResidualDownConv,
    Transformer,
    UpConv,
    Vocos,
)
from .nn.quantizer import ResidualVQ


class XY_Tokenizer(nn.Module):
    def __init__(self, generator_params):
        super().__init__()
        # Basic parameters
        self.input_sample_rate = generator_params["input_sample_rate"]
        self.output_sample_rate = generator_params["output_sample_rate"]

        self.encoder_downsample_rate = generator_params["encoder_downsample_rate"]
        self.decoder_upsample_rate = generator_params["decoder_upsample_rate"]
        self.code_dim = generator_params["quantizer_kwargs"]["input_dim"]

        ## Codec part

        ## Semantic channel
        self.semantic_encoder = OmniAudioEncoder(
            **generator_params["semantic_encoder_kwargs"]
        )

        self.semantic_encoder_adapter = Transformer(
            **generator_params["semantic_encoder_adapter_kwargs"]
        )

        ## Acoustic channel
        self.acoustic_encoder = OmniAudioEncoder(
            **generator_params["acoustic_encoder_kwargs"]
        )

        ## Semantic & acoustic shared parameters
        self.pre_rvq_adapter = Transformer(**generator_params["pre_rvq_adapter_kwargs"])

        self.downsample = ResidualDownConv(**generator_params["downsample_kwargs"])

        self.quantizer = ResidualVQ(**generator_params["quantizer_kwargs"])
        self.nq = generator_params["quantizer_kwargs"]["num_quantizers"]

        self.post_rvq_adapter = Transformer(
            **generator_params["post_rvq_adapter_kwargs"]
        )

        ## Acoustic channel
        self.upsample = UpConv(**generator_params["upsample_kwargs"])

        self.acoustic_decoder = OmniAudioDecoder(
            **generator_params["acoustic_decoder_kwargs"]
        )

        self.enhanced_vocos = Vocos(**generator_params["vocos_kwargs"])

        ## Feature extractor
        fe_kwargs = generator_params.get("feature_extractor_kwargs", {})
        self.feature_extractor = MelFeatureExtractor(**fe_kwargs)

    @torch.inference_mode()
    def inference_tokenize(self, x, input_lengths):
        """
        Input:
            x: Waveform tensor # (B, 1, T), T <= 30s * sample_rate
            input_lengths: Valid length for each sample # (B,)
        Output:
            dict: Contains the following key-value pairs
                "zq": Quantized embeddings # (B, D, T)
                "codes": Quantization codes # (nq, B, T)
                "codes_lengths": Quantization code lengths # (B,)
        """
        list_x = [
            xi[:, :x_len].reshape(-1).cpu().numpy()
            for xi, x_len in zip(x, input_lengths)
        ]
        features = self.feature_extractor(
            list_x,
            sampling_rate=self.input_sample_rate,
            return_tensors="pt",
            return_attention_mask=True,
        )
        input_mel = features["input_features"].to(x.device).to(x.dtype)  # (B, D, 3000)
        audio_attention_mask = features["attention_mask"].to(x.device)  # (B, 3000)

        # Get batch size and sequence length of the input
        mel_output_length = torch.sum(audio_attention_mask, dim=-1).long()  # (B,)

        # Semantic channel
        semantic_encoder_output, semantic_encoder_output_length = self.semantic_encoder(
            input_mel, mel_output_length
        )  # (B, D, T), 100hz -> 50hz

        semantic_encoder_adapter_output, semantic_encoder_adapter_output_length = (
            self.semantic_encoder_adapter(
                semantic_encoder_output, semantic_encoder_output_length
            )
        )  # (B, D, T), 50hz

        # Acoustic channel
        acoustic_encoder_output, acoustic_encoder_output_length = self.acoustic_encoder(
            input_mel, mel_output_length
        )  # (B, D, T), 100hz -> 50hz

        # Semantic & acoustic mixing
        concated_semantic_acoustic_channel = torch.concat(
            [semantic_encoder_adapter_output, acoustic_encoder_output], dim=1
        )  # (B, D, T)
        concated_semantic_acoustic_channel_length = acoustic_encoder_output_length

        pre_rvq_adapter_output, pre_rvq_adapter_output_length = self.pre_rvq_adapter(
            concated_semantic_acoustic_channel,
            concated_semantic_acoustic_channel_length,
        )  # (B, D, T), 50hz

        downsample_output, downsample_output_length = self.downsample(
            pre_rvq_adapter_output, pre_rvq_adapter_output_length
        )  # (B, D, T), 50hz -> 12.5hz

        zq, codes, vq_loss, _, quantizer_output_length = self.quantizer(
            downsample_output, downsample_output_length
        )  # (B, D, T), (nq, B, T), (nq,), (nq, B, D, T), (B,)

        return {
            "zq": zq,  # (B, D, T)
            "codes": codes,  # (nq, B, T)
            "codes_lengths": quantizer_output_length,  # (B,)
        }

    @torch.inference_mode()
    def inference_detokenize(self, codes, codes_lengths):
        """
        Input:
            codes: Quantization codes # (nq, B, T)
            codes_lengths: Quantization code lengths for each sample # (B,)
        Output:
            dict: Contains the following key-value pairs
                "y": Synthesized audio waveform # (B, 1, T)
                "output_length": Output lengths # (B,)
        """
        zq = self.quantizer.decode_codes(codes)  # (B, D, T)

        post_rvq_adapter_output, post_rvq_adapter_output_length = self.post_rvq_adapter(
            zq, codes_lengths
        )  # (B, D, T), 12.5hz

        # Acoustic channel
        upsample_output, upsample_output_length = self.upsample(
            post_rvq_adapter_output, post_rvq_adapter_output_length
        )  # (B, D, T), 12.5hz -> 50hz

        acoustic_decoder_output, acoustic_decoder_output_length = self.acoustic_decoder(
            upsample_output, upsample_output_length
        )  # (B, D, T), 50hz -> 100hz

        y, vocos_output_length = self.enhanced_vocos(
            acoustic_decoder_output, acoustic_decoder_output_length
        )  # (B, 1, T), 100hz -> 16khz

        return {
            "y": y,  # (B, 1, T)
            "output_length": vocos_output_length,  # (B,)
        }

    @torch.inference_mode()
    def encode(self, wav_list, overlap_seconds=10):
        """
        Input:
            wav_list: List of audio waveforms, each with potentially different length, may exceed 30 seconds # B * (T,)
            overlap_seconds: Overlap in seconds, process 30 seconds at a time, keeping (30 - overlap_seconds) seconds of valid output
        Output:
            dict: Contains the following key-value pairs
                "codes_list": List of quantization codes # B * (nq, T)
        """
        device = wav_list[0].device
        duration_seconds = 30 - overlap_seconds
        chunk_size = int(30 * self.input_sample_rate)  # Maximum samples per chunk
        duration_size = int(
            duration_seconds * self.input_sample_rate
        )  # Valid output samples per chunk
        code_duration_length = (
            duration_size // self.encoder_downsample_rate
        )  # Valid code length per chunk

        # Get maximum waveform length
        max_length = max(len(wav) for wav in wav_list)
@@ -148,8 +200,8 @@ class XY_Tokenizer(nn.Module):
        wav_tensor = torch.zeros(batch_size, 1, max_length, device=device)
        input_lengths = torch.zeros(batch_size, dtype=torch.long, device=device)
        for i, wav in enumerate(wav_list):
            wav_tensor[i, 0, : len(wav)] = wav
            input_lengths[i] = len(wav)  # (B,)

        # Calculate number of chunks needed
        max_chunks = (max_length + duration_size - 1) // duration_size
@@ -159,121 +211,161 @@
        for chunk_idx in range(max_chunks):
            start = chunk_idx * duration_size
            end = min(start + chunk_size, max_length)
            chunk = wav_tensor[:, :, start:end]  # (B, 1, T')
            chunk_lengths = torch.clamp(input_lengths - start, 0, end - start)  # (B,)

            # Skip empty chunks
            if chunk_lengths.max() == 0:
                continue

            # Encode
            result = self.inference_tokenize(
                chunk, chunk_lengths
            )  # {"zq": (B, D, T'), "codes": (nq, B, T'), "codes_lengths": (B,)}
            chunk_codes = result["codes"]  # (nq, B, T')
            chunk_code_lengths = result["codes_lengths"]  # (B,)

            # Extract valid portion
            valid_code_lengths = torch.clamp(
                chunk_code_lengths, 0, code_duration_length
            )  # (B,)
            valid_chunk_codes = torch.zeros(
                self.nq,
                batch_size,
                code_duration_length,
                device=device,
                dtype=chunk_codes.dtype,
            )
            for b in range(batch_size):
                if valid_code_lengths[b] > 0:
                    valid_chunk_codes[:, b, : valid_code_lengths[b]] = chunk_codes[
                        :, b, : valid_code_lengths[b]
                    ]  # (nq, B, valid_code_length)

            codes_list.append(valid_chunk_codes)  # (nq, B, valid_code_length)

        # Concatenate all chunks
        if codes_list:
            codes_tensor = torch.cat(codes_list, dim=-1)  # (nq, B, T_total)
            codes_list = [
                codes_tensor[:, i, : input_lengths[i] // self.encoder_downsample_rate]
                for i in range(batch_size)
            ]  # B * (nq, T)
        else:
            codes_list = [
                torch.zeros(self.nq, 0, device=device, dtype=torch.long)
                for _ in range(batch_size)
            ]  # B * (nq, 0)

        return {"codes_list": codes_list}  # B * (nq, T)

    @torch.inference_mode()
    def decode(self, codes_list, overlap_seconds=10):
        """
        Input:
            codes_list: List of quantization codes # B * (nq, T)
            overlap_seconds: Overlap in seconds, process 30 seconds at a time, keeping (30 - overlap_seconds) seconds of valid output
        Output:
            dict: Contains the following key-value pairs
                "syn_wav_list": List of synthesized audio waveforms # B * (T,)
        """
        device = codes_list[0].device
        duration_seconds = 30 - overlap_seconds
        chunk_code_length = int(
            30 * self.input_sample_rate // self.encoder_downsample_rate
        )  # Maximum code length per chunk
        duration_code_length = int(
            duration_seconds * self.input_sample_rate // self.encoder_downsample_rate
        )  # Valid code length per chunk
        duration_wav_length = (
            duration_code_length * self.decoder_upsample_rate
        )  # Valid waveform length per chunk

        # Get maximum code length
        max_code_length = max(codes.shape[-1] for codes in codes_list)
        batch_size = len(codes_list)
        codes_tensor = torch.zeros(
            self.nq, batch_size, max_code_length, device=device, dtype=torch.long
        )
        code_lengths = torch.zeros(batch_size, dtype=torch.long, device=device)
        for i, codes in enumerate(codes_list):
            codes_tensor[:, i, : codes.shape[-1]] = codes.to(device)
            code_lengths[i] = codes.shape[-1]  # (B,)

        # Calculate number of chunks needed
        max_chunks = (
            max_code_length + duration_code_length - 1
        ) // duration_code_length
        wav_list = []

        # Process the entire batch in chunks
        for chunk_idx in range(max_chunks):
            start = chunk_idx * duration_code_length
            end = min(start + chunk_code_length, max_code_length)
            chunk_codes = codes_tensor[:, :, start:end]  # (nq, B, T')
            chunk_code_lengths = torch.clamp(
                code_lengths - start, 0, end - start
            )  # (B,)

            # Skip empty chunks
            if chunk_code_lengths.max() == 0:
                continue

            # Decode
            result = self.inference_detokenize(
                chunk_codes, chunk_code_lengths
            )  # {"y": (B, 1, T'), "output_length": (B,)}
            chunk_wav = result["y"]  # (B, 1, T')
            chunk_wav_lengths = result["output_length"]  # (B,)

            # Extract valid portion
            valid_wav_lengths = torch.clamp(
                chunk_wav_lengths, 0, duration_wav_length
            )  # (B,)
            valid_chunk_wav = torch.zeros(
                batch_size, 1, duration_wav_length, device=device
            )
            for b in range(batch_size):
                if valid_wav_lengths[b] > 0:
                    valid_chunk_wav[b, :, : valid_wav_lengths[b]] = chunk_wav[
                        b, :, : valid_wav_lengths[b]
                    ]  # (B, 1, valid_wav_length)

            wav_list.append(valid_chunk_wav)  # (B, 1, valid_wav_length)

        # Concatenate all chunks
        if wav_list:
            wav_tensor = torch.cat(wav_list, dim=-1)  # (B, 1, T_total)
            syn_wav_list = [
                wav_tensor[i, 0, : code_lengths[i] * self.decoder_upsample_rate]
                for i in range(batch_size)
            ]  # B * (T,)
        else:
            syn_wav_list = [
                torch.zeros(0, device=device) for _ in range(batch_size)
            ]  # B * (0,)

        return {"syn_wav_list": syn_wav_list}  # B * (T,)

    @classmethod
    def load_from_checkpoint(cls, config_path: str, ckpt_path: str):
        # Load model from configuration file and checkpoint
        logging.info(f"Loading model from {config_path} and {ckpt_path}")

        # Load configuration
        with open(config_path, "r") as f:
            config = yaml.safe_load(f)

        # Create model instance
        model = cls(config["generator_params"])

        # Load checkpoint
        checkpoint = torch.load(ckpt_path, map_location="cpu")

        # Check if checkpoint contains 'generator' key
        if "generator" in checkpoint:
            model.load_state_dict(checkpoint["generator"])
        else:
            model.load_state_dict(checkpoint)

        return model
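A minimal usage sketch of the chunked API above (illustrative only; the checkpoint path is a placeholder taken from the commented-out path in app.py):

import torch
from XY_Tokenizer.xy_tokenizer.model import XY_Tokenizer

spt = XY_Tokenizer.load_from_checkpoint(
    config_path="XY_Tokenizer/config/MOSS_TTSD_tokenizer.yaml",
    ckpt_path="XY_Tokenizer/weights/xy_tokenizer.ckpt",  # placeholder checkpoint path
).eval()

# Two mono 16 kHz waveforms of different lengths; the 40 s one exercises the
# 30 s chunking with 10 s overlap. Random noise stands in for real audio.
wavs = [torch.randn(16000 * 40), torch.randn(16000 * 7)]

codes_list = spt.encode(wavs, overlap_seconds=10)["codes_list"]        # B * (nq, T), nq = 8
syn_wavs = spt.decode(codes_list, overlap_seconds=10)["syn_wav_list"]  # B * (T,)

print([c.shape for c in codes_list])  # about 12.5 codes per second of input
print([w.shape for w in syn_wavs])    # waveforms at output_sample_rate (32 kHz)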
app.py
CHANGED
@@ -131,15 +131,15 @@ LANGUAGES = {
# Model configuration
SYSTEM_PROMPT = "You are a speech synthesizer that generates natural, realistic, and human-like conversational audio from dialogue text."
MODEL_PATH = os.environ["MODEL_REPO_ID"]
SPT_CONFIG_PATH = "XY_Tokenizer/config/MOSS_TTSD_tokenizer.yaml"
# SPT_CHECKPOINT_PATH = "XY_Tokenizer/weights/xy_tokenizer.ckpt"
MAX_CHANNELS = 8

from huggingface_hub import hf_hub_download

SPT_CHECKPOINT_PATH = hf_hub_download(
    repo_id="OpenMOSS-Team/MOSS_TTSD_tokenizer",
    filename="MOSS_TTSD_tokenizer",
    cache_dir="XY_Tokenizer/weights"
)

@@ -245,7 +245,8 @@ def process_single_audio_generation(
        device=device,
        system_prompt=SYSTEM_PROMPT,
        start_idx=0,
        use_normalize=use_normalize,
        silence_duration=0.1,
    )

    # Check results
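These constants are what generation_utils.load_model consumes; a minimal wiring sketch (illustrative only; the MODEL_REPO_ID value is a placeholder, since the Space reads it from the environment):

import os

from huggingface_hub import hf_hub_download

from generation_utils import load_model

os.environ.setdefault("MODEL_REPO_ID", "your-org/your-MOSS-TTSD-model")  # placeholder repo id

SPT_CONFIG_PATH = "XY_Tokenizer/config/MOSS_TTSD_tokenizer.yaml"
SPT_CHECKPOINT_PATH = hf_hub_download(
    repo_id="OpenMOSS-Team/MOSS_TTSD_tokenizer",
    filename="MOSS_TTSD_tokenizer",
    cache_dir="XY_Tokenizer/weights",
)

tokenizer, model, spt = load_model(
    os.environ["MODEL_REPO_ID"], SPT_CONFIG_PATH, SPT_CHECKPOINT_PATH
)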
generation_utils.py
CHANGED
|
@@ -1,86 +1,181 @@
|
|
| 1 |
import os
|
| 2 |
import re
|
| 3 |
|
|
|
|
| 4 |
import torch
|
| 5 |
import torchaudio
|
| 6 |
-
import numpy as np
|
| 7 |
-
|
| 8 |
-
from transformers import AutoTokenizer
|
| 9 |
-
from modeling_asteroid import AsteroidTTSInstruct
|
| 10 |
-
from XY_Tokenizer.xy_tokenizer.model import XY_Tokenizer
|
| 11 |
|
| 12 |
MAX_CHANNELS = 8
|
| 13 |
-
SILENCE_DURATION = 0.0 # Fixed silence duration: 0 seconds
|
| 14 |
|
| 15 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
spt = XY_Tokenizer.load_from_checkpoint(config_path=spt_config_path, ckpt_path=spt_checkpoint_path)
|
| 21 |
-
|
| 22 |
model.eval()
|
| 23 |
spt.eval()
|
| 24 |
return tokenizer, model, spt
|
| 25 |
|
| 26 |
|
| 27 |
def process_jsonl_item(item):
|
| 28 |
-
"""
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
text = item.get("text", "")
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
if isinstance(
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
else:
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
speaker2_audio = os.path.join(base_path, prompt_audio_speaker2) if base_path and prompt_audio_speaker2 else prompt_audio_speaker2
|
| 58 |
else:
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
prompt_audio = {
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
if
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
"
|
| 76 |
-
|
| 77 |
-
"prompt_audio": prompt_audio
|
| 78 |
-
}
|
| 79 |
|
| 80 |
|
| 81 |
def load_audio_data(prompt_audio, target_sample_rate=16000):
|
| 82 |
"""Load audio data and return processed audio tensor
|
| 83 |
-
|
| 84 |
Args:
|
| 85 |
prompt_audio: Can be in the following formats:
|
| 86 |
- String: audio file path
|
|
@@ -89,10 +184,14 @@ def load_audio_data(prompt_audio, target_sample_rate=16000):
|
|
| 89 |
"""
|
| 90 |
if prompt_audio is None:
|
| 91 |
return None
|
| 92 |
-
|
| 93 |
try:
|
| 94 |
# Check if prompt_audio is a dictionary (containing speaker1 and speaker2)
|
| 95 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
# Process audio from both speakers separately
|
| 97 |
wav1, sr1 = _load_single_audio(prompt_audio["speaker1"])
|
| 98 |
wav2, sr2 = _load_single_audio(prompt_audio["speaker2"])
|
|
@@ -104,14 +203,14 @@ def load_audio_data(prompt_audio, target_sample_rate=16000):
|
|
| 104 |
# Single audio
|
| 105 |
wav, sr = _load_single_audio(prompt_audio)
|
| 106 |
# Resample to 16k
|
| 107 |
-
if sr != target_sample_rate:
|
| 108 |
wav = torchaudio.functional.resample(wav, sr, target_sample_rate)
|
| 109 |
# Ensure mono channel
|
| 110 |
if wav.shape[0] > 1:
|
| 111 |
wav = wav.mean(dim=0, keepdim=True) # Convert multi-channel to mono
|
| 112 |
-
if len(wav.shape) == 1:
|
| 113 |
wav = wav.unsqueeze(0)
|
| 114 |
-
|
| 115 |
return wav
|
| 116 |
except Exception as e:
|
| 117 |
print(f"Error loading audio data: {e}")
|
|
@@ -120,10 +219,10 @@ def load_audio_data(prompt_audio, target_sample_rate=16000):
|
|
| 120 |
|
| 121 |
def _load_single_audio(audio_input):
|
| 122 |
"""Load single audio, supports file path or (wav, sr) tuple
|
| 123 |
-
|
| 124 |
Args:
|
| 125 |
audio_input: String (file path) or tuple (wav, sr)
|
| 126 |
-
|
| 127 |
Returns:
|
| 128 |
tuple: (wav, sr)
|
| 129 |
"""
|
|
@@ -150,8 +249,8 @@ def merge_speaker_audios(wav1, sr1, wav2, sr2, target_sample_rate=16000):
|
|
| 150 |
wav1 = wav1.mean(dim=0, keepdim=True) # Convert multi-channel to mono
|
| 151 |
if len(wav1.shape) == 1:
|
| 152 |
wav1 = wav1.unsqueeze(0)
|
| 153 |
-
|
| 154 |
-
# Process second audio
|
| 155 |
if sr2 != target_sample_rate:
|
| 156 |
wav2 = torchaudio.functional.resample(wav2, sr2, target_sample_rate)
|
| 157 |
# Ensure mono channel
|
|
@@ -159,7 +258,7 @@ def merge_speaker_audios(wav1, sr1, wav2, sr2, target_sample_rate=16000):
|
|
| 159 |
wav2 = wav2.mean(dim=0, keepdim=True) # Convert multi-channel to mono
|
| 160 |
if len(wav2.shape) == 1:
|
| 161 |
wav2 = wav2.unsqueeze(0)
|
| 162 |
-
|
| 163 |
# Concatenate audio
|
| 164 |
merged_wav = torch.cat([wav1, wav2], dim=1)
|
| 165 |
return merged_wav
|
|
@@ -168,34 +267,48 @@ def merge_speaker_audios(wav1, sr1, wav2, sr2, target_sample_rate=16000):
|
|
| 168 |
raise
|
| 169 |
|
| 170 |
|
| 171 |
-
def process_inputs(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
seq = f"<|begin_of_style|>{prompt}<|end_of_style|>\n<|begin_of_text|>{text}<|end_of_text|>\n<|begin_of_speech|>"
|
| 173 |
inputs1 = np.array(tokenizer.encode(seq))
|
| 174 |
input_ids = np.full((inputs1.shape[0], max_channels), pad_token)
|
| 175 |
input_ids[:, 0] = inputs1
|
| 176 |
-
|
| 177 |
if audio_data is not None:
|
| 178 |
try:
|
| 179 |
# audio_data should now be a processed audio tensor
|
| 180 |
wav = audio_data
|
| 181 |
-
|
| 182 |
# Add fixed 5-second silence at the end of audio (using 16k sample rate)
|
| 183 |
-
silence_samples = int(
|
| 184 |
silence = torch.zeros(wav.shape[0], silence_samples)
|
| 185 |
wav = torch.cat([wav, silence], dim=1)
|
| 186 |
-
|
| 187 |
with torch.no_grad():
|
| 188 |
# Use SPT encoding
|
| 189 |
encode_result = spt.encode([wav.squeeze().to(device)])
|
| 190 |
-
audio_token =
|
| 191 |
-
|
|
|
|
|
|
|
| 192 |
# similar to DAC encoding adjustment
|
| 193 |
-
audio_token[:, 0] =
|
|
|
|
|
|
|
| 194 |
input_ids = np.concatenate([input_ids, audio_token])
|
| 195 |
except Exception as e:
|
| 196 |
print(f"Error processing audio data: {e}")
|
| 197 |
raise
|
| 198 |
-
|
| 199 |
return input_ids
|
| 200 |
|
| 201 |
|
|
@@ -203,7 +316,9 @@ def shifting_inputs(input_ids, tokenizer, pad_token=1024, max_channels=8):
|
|
| 203 |
seq_len = input_ids.shape[0]
|
| 204 |
new_seq_len = seq_len + max_channels - 1
|
| 205 |
shifted_input_ids = np.full((new_seq_len, max_channels), pad_token, dtype=np.int64)
|
| 206 |
-
shifted_input_ids[:, 0] = np.full(
|
|
|
|
|
|
|
| 207 |
for i in range(max_channels):
|
| 208 |
shifted_input_ids[i : (seq_len + i), i] = input_ids[:, i]
|
| 209 |
return shifted_input_ids
|
|
@@ -213,7 +328,7 @@ def rpadding(input_ids, channels, tokenizer):
|
|
| 213 |
attention_masks = [np.ones(inputs.shape[0]) for inputs in input_ids]
|
| 214 |
max_length = max(ids.shape[0] for ids in input_ids)
|
| 215 |
padded_input_ids, padded_attns = [], []
|
| 216 |
-
|
| 217 |
for ids, attn in zip(input_ids, attention_masks):
|
| 218 |
pad_len = max_length - ids.shape[0]
|
| 219 |
input_pad = np.full((pad_len, channels), 1024)
|
|
@@ -245,26 +360,23 @@ def normalize_text(text: str) -> str:
|
|
| 245 |
Normalize multi-speaker script.
|
| 246 |
|
| 247 |
1. Don't preserve line breaks.
|
| 248 |
-
2.
|
| 249 |
-
3. Remove decorative symbols:
|
| 250 |
-
4. Internal punctuation
|
| 251 |
5. Multiple 。 keep only the last one, others → ,。
|
| 252 |
6. Replace consecutive "哈" (>=2) with "(笑)".
|
| 253 |
7. Auto-recognize [S1] / [S2] … tags; if missing, treat as whole segment.
|
|
|
|
| 254 |
"""
|
| 255 |
# Replace [1], [2] etc. format with [S1], [S2] etc. format
|
| 256 |
-
text = re.sub(r
|
| 257 |
|
| 258 |
# Remove decorative characters
|
| 259 |
-
remove_chars = "【】《》()『』「」""
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
# Remove brackets for non-speaker tags (keep content, only remove brackets themselves)
|
| 263 |
-
text = re.sub(r'\[(?!S\d+\])([^\]]*)\]', r'\1', text)
|
| 264 |
|
| 265 |
# Use positive lookahead to split text by speaker tags (tags themselves are still preserved)
|
| 266 |
-
segments = re.split(r
|
| 267 |
-
|
| 268 |
|
| 269 |
for seg in segments:
|
| 270 |
seg = seg.strip()
|
|
@@ -272,42 +384,73 @@ def normalize_text(text: str) -> str:
|
|
| 272 |
continue
|
| 273 |
|
| 274 |
# Extract tags
|
| 275 |
-
m = re.match(r
|
| 276 |
-
tag, content = m.groups() if m else (
|
| 277 |
|
| 278 |
# Remove irrelevant symbols
|
| 279 |
content = re.sub(f"[{re.escape(remove_chars)}]", "", content)
|
| 280 |
|
| 281 |
# Handle consecutive "哈" characters: replace 2 or more with "(笑)"
|
| 282 |
-
content = re.sub(r
|
|
|
|
|
|
|
|
|
|
| 283 |
|
| 284 |
# First handle multi-character punctuation marks
|
| 285 |
-
content = content.replace(
|
| 286 |
-
content = content.replace(
|
| 287 |
|
| 288 |
# Handle single-character internal punctuation marks
|
| 289 |
-
internal_punct_map = str.maketrans(
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
':': ',', ':': ',',
|
| 293 |
-
'、': ',',
|
| 294 |
-
'?': ',', '?': ','
|
| 295 |
-
})
|
| 296 |
content = content.translate(internal_punct_map)
|
| 297 |
content = content.strip()
|
| 298 |
|
| 299 |
# Keep only the final period
|
| 300 |
if len(content) > 1:
|
| 301 |
-
last_ch =
|
| 302 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
content = body + last_ch
|
| 304 |
|
| 305 |
-
|
| 306 |
|
| 307 |
-
|
|
|
|
| 308 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
|
| 310 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
"""Process a batch of data items and generate audio, return audio data and metadata"""
|
| 312 |
try:
|
| 313 |
# Prepare batch data
|
|
@@ -316,64 +459,74 @@ def process_batch(batch_items, tokenizer, model, spt, device, system_prompt, sta
|
|
| 316 |
prompts = [system_prompt] * batch_size
|
| 317 |
prompt_audios = []
|
| 318 |
actual_texts_data = [] # Store actual text data used
|
| 319 |
-
|
| 320 |
print(f"Processing {batch_size} samples starting from index {start_idx}...")
|
| 321 |
-
|
| 322 |
# Extract text and audio from each sample
|
| 323 |
for i, item in enumerate(batch_items):
|
| 324 |
# Use new processing function
|
| 325 |
processed_item = process_jsonl_item(item)
|
| 326 |
-
|
| 327 |
text = processed_item["text"]
|
| 328 |
prompt_text = processed_item["prompt_text"]
|
| 329 |
-
|
| 330 |
-
# Merge text
|
| 331 |
-
full_text = prompt_text + text
|
| 332 |
original_full_text = full_text # Save original text
|
| 333 |
-
|
| 334 |
# Apply text normalization based on parameter
|
| 335 |
if use_normalize:
|
| 336 |
full_text = normalize_text(full_text)
|
| 337 |
-
|
| 338 |
# Replace speaker tags
|
| 339 |
-
final_text = full_text.replace("[S1]", "<speaker1>").replace(
|
|
|
|
|
|
|
| 340 |
texts.append(final_text)
|
| 341 |
-
|
| 342 |
# Save actual text information used
|
| 343 |
-
actual_texts_data.append(
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
# Get reference audio
|
| 352 |
prompt_audios.append(processed_item["prompt_audio"])
|
| 353 |
-
|
| 354 |
# Process inputs
|
| 355 |
input_ids_list = []
|
| 356 |
-
for i, (text, prompt, audio_path) in enumerate(
|
|
|
|
|
|
|
| 357 |
# Load audio data here
|
| 358 |
audio_data = load_audio_data(audio_path) if audio_path else None
|
| 359 |
-
inputs = process_inputs(
|
|
|
|
|
|
|
| 360 |
inputs = shifting_inputs(inputs, tokenizer)
|
| 361 |
input_ids_list.append(inputs)
|
| 362 |
-
|
| 363 |
# Pad batch inputs
|
| 364 |
input_ids, attention_mask = rpadding(input_ids_list, MAX_CHANNELS, tokenizer)
|
| 365 |
-
|
| 366 |
# Batch generation
|
| 367 |
print(f"Starting batch audio generation...")
|
| 368 |
start = input_ids.shape[1] - MAX_CHANNELS + 1
|
| 369 |
-
|
| 370 |
# Move inputs to GPU
|
| 371 |
input_ids = input_ids.to(device)
|
| 372 |
attention_mask = attention_mask.to(device)
|
| 373 |
-
|
| 374 |
# Generate model outputs
|
| 375 |
outputs = model.generate(
|
| 376 |
-
input_ids=input_ids,
|
| 377 |
attention_mask=attention_mask,
|
| 378 |
)
|
| 379 |
print(f"Original outputs shape: {outputs.shape}")
|
|
@@ -385,20 +538,19 @@ def process_batch(batch_items, tokenizer, model, spt, device, system_prompt, sta
|
|
| 385 |
outputs = outputs[:, start:]
|
| 386 |
seq_len = outputs.shape[1] - MAX_CHANNELS + 1
|
| 387 |
speech_ids = torch.full((outputs.shape[0], seq_len, MAX_CHANNELS), 0).to(device)
|
| 388 |
-
|
| 389 |
-
|
| 390 |
# Adjust output format
|
| 391 |
for j in range(MAX_CHANNELS):
|
| 392 |
speech_ids[..., j] = outputs[:, j : seq_len + j, j]
|
| 393 |
-
if j == 0:
|
| 394 |
speech_ids[..., j] = speech_ids[..., j] - 151665
|
| 395 |
-
|
| 396 |
# Find valid positions for each sample
|
| 397 |
li = find_max_valid_positions(speech_ids)
|
| 398 |
-
|
| 399 |
# Store audio result data
|
| 400 |
audio_results = []
|
| 401 |
-
|
| 402 |
# Process batch sample results individually
|
| 403 |
for i in range(batch_size):
|
| 404 |
try:
|
|
@@ -408,39 +560,200 @@ def process_batch(batch_items, tokenizer, model, spt, device, system_prompt, sta
|
|
| 408 |
print(f"Sample {start_idx + i} has no valid speech tokens")
|
| 409 |
audio_results.append(None)
|
| 410 |
continue
|
| 411 |
-
|
| 412 |
this_speech_id = speech_ids[i, :end_idx]
|
| 413 |
-
print(
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
except Exception as e:
|
| 433 |
print(f"Error processing sample {start_idx + i}: {str(e)}, skipping...")
|
| 434 |
import traceback
|
|
|
|
| 435 |
traceback.print_exc()
|
| 436 |
audio_results.append(None)
|
| 437 |
-
|
| 438 |
# Clean up GPU memory
|
| 439 |
torch.cuda.empty_cache()
|
| 440 |
-
|
| 441 |
# Return text data and audio data
|
| 442 |
return actual_texts_data, audio_results
|
| 443 |
-
|
| 444 |
except Exception as e:
|
| 445 |
print(f"Error during batch processing: {str(e)}")
|
| 446 |
-
raise
|
|
|
|
| 1 |
import os
|
| 2 |
import re
|
| 3 |
|
| 4 |
+
import numpy as np
|
| 5 |
import torch
|
| 6 |
import torchaudio
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
MAX_CHANNELS = 8
|
|
|
|
| 9 |
|
| 10 |
+
def pad_or_truncate_to_seconds(
|
| 11 |
+
wav: torch.Tensor, target_seconds: float, sr: int
|
| 12 |
+
) -> torch.Tensor:
|
| 13 |
+
"""Pad or truncate a mono waveform to target length in seconds.
|
| 14 |
+
|
| 15 |
+
Args:
|
| 16 |
+
wav: (1, T) or (T,) tensor
|
| 17 |
+
target_seconds: target duration in seconds
|
| 18 |
+
sr: sample rate
|
| 19 |
+
Returns:
|
| 20 |
+
(1, T_target) tensor
|
| 21 |
+
"""
|
| 22 |
+
if wav.dim() == 2 and wav.shape[0] == 1:
|
| 23 |
+
wav_1d = wav.squeeze(0)
|
| 24 |
+
else:
|
| 25 |
+
wav_1d = wav.reshape(-1)
|
| 26 |
+
target_len = int(round(target_seconds * sr))
|
| 27 |
+
cur_len = wav_1d.shape[-1]
|
| 28 |
+
if cur_len == target_len:
|
| 29 |
+
out = wav_1d
|
| 30 |
+
elif cur_len > target_len:
|
| 31 |
+
out = wav_1d[:target_len]
|
| 32 |
+
else:
|
| 33 |
+
pad_len = target_len - cur_len
|
| 34 |
+
out = torch.cat(
|
| 35 |
+
[wav_1d, torch.zeros(pad_len, dtype=wav_1d.dtype, device=wav_1d.device)],
|
| 36 |
+
dim=-1,
|
| 37 |
+
)
|
| 38 |
+
return out.unsqueeze(0)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def crossfade_concat(
|
| 42 |
+
segments: list, sample_rate: int, crossfade_seconds: float = 0.1
|
| 43 |
+
) -> torch.Tensor:
|
| 44 |
+
"""Concatenate segments with linear crossfade.
|
| 45 |
+
|
| 46 |
+
Args:
|
| 47 |
+
segments: list of (1, T) tensors
|
| 48 |
+
sample_rate: sampling rate
|
| 49 |
+
crossfade_seconds: overlap time for crossfade
|
| 50 |
+
Returns:
|
| 51 |
+
(1, T_total) tensor
|
| 52 |
+
"""
|
| 53 |
+
if len(segments) == 0:
|
| 54 |
+
return torch.zeros(1, 0)
|
| 55 |
+
if len(segments) == 1:
|
| 56 |
+
return segments[0]
|
| 57 |
+
out = segments[0]
|
| 58 |
+
cf_len_target = int(round(crossfade_seconds * sample_rate))
|
| 59 |
+
for k in range(1, len(segments)):
|
| 60 |
+
nxt = segments[k]
|
| 61 |
+
if cf_len_target <= 0:
|
| 62 |
+
out = torch.cat([out, nxt], dim=-1)
|
| 63 |
+
continue
|
| 64 |
+
cf_len = min(cf_len_target, out.shape[-1], nxt.shape[-1])
|
| 65 |
+
if cf_len <= 0:
|
| 66 |
+
out = torch.cat([out, nxt], dim=-1)
|
| 67 |
+
continue
|
| 68 |
+
fade_out = torch.linspace(
|
| 69 |
+
1.0, 0.0, steps=cf_len, dtype=out.dtype, device=out.device
|
| 70 |
+
)
|
| 71 |
+
fade_in = torch.linspace(
|
| 72 |
+
0.0, 1.0, steps=cf_len, dtype=nxt.dtype, device=nxt.device
|
| 73 |
+
)
|
| 74 |
+
overlap = out[0, -cf_len:] * fade_out + nxt[0, :cf_len] * fade_in
|
| 75 |
+
out = torch.cat(
|
| 76 |
+
[out[:, :-cf_len], overlap.unsqueeze(0), nxt[:, cf_len:]], dim=-1
|
| 77 |
+
)
|
| 78 |
+
return out
|
| 79 |
+
|
| 80 |
+
def load_model(
|
| 81 |
+
model_path,
|
| 82 |
+
spt_config_path,
|
| 83 |
+
spt_checkpoint_path,
|
| 84 |
+
torch_dtype=torch.bfloat16,
|
| 85 |
+
attn_implementation="sdpa",
|
| 86 |
+
):
|
| 87 |
+
from transformers import AutoTokenizer
|
| 88 |
+
|
| 89 |
+
from modeling_asteroid import AsteroidTTSInstruct
|
| 90 |
+
from XY_Tokenizer.xy_tokenizer.model import XY_Tokenizer
|
| 91 |
+
|
| 92 |
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
| 93 |
+
model = AsteroidTTSInstruct.from_pretrained(
|
| 94 |
+
model_path, torch_dtype=torch_dtype, attn_implementation=attn_implementation
|
| 95 |
+
)
|
| 96 |
+
spt = XY_Tokenizer.load_from_checkpoint(
|
| 97 |
+
config_path=spt_config_path, ckpt_path=spt_checkpoint_path
|
| 98 |
+
)
|
| 99 |
|
|
|
|
|
|
|
| 100 |
model.eval()
|
| 101 |
spt.eval()
|
| 102 |
return tokenizer, model, spt
|
| 103 |
|
| 104 |
|
| 105 |
def process_jsonl_item(item):
|
| 106 |
+
"""Parse a JSONL item enforcing prompt requirement.
|
| 107 |
+
|
| 108 |
+
Only supports Format 1 (separate speaker refs) and Format 2 (shared ref),
|
| 109 |
+
consistent with the updated README. If `base_path` is missing/empty, any
|
| 110 |
+
string paths must be absolute. Text-only input is not supported and will raise.
|
| 111 |
+
"""
|
| 112 |
+
base_path = item.get("base_path", "") or ""
|
| 113 |
text = item.get("text", "")
|
| 114 |
+
|
| 115 |
+
def _resolve_path(p: str) -> str:
|
| 116 |
+
if not isinstance(p, str) or not p:
|
| 117 |
+
return p
|
| 118 |
+
if base_path:
|
| 119 |
+
return os.path.join(base_path, p)
|
| 120 |
+
# base_path missing: require absolute path
|
| 121 |
+
if not os.path.isabs(p):
|
| 122 |
+
raise ValueError(
|
| 123 |
+
"When base_path is omitted, audio paths must be absolute. Got: " + p
|
| 124 |
+
)
|
| 125 |
+
return p
|
| 126 |
+
|
| 127 |
+
# Try Format 2 first: shared audio reference
|
| 128 |
+
prompt_audio = None
|
| 129 |
+
prompt_text = ""
|
| 130 |
+
if "prompt_audio" in item:
|
| 131 |
+
prompt_audio_val = item.get("prompt_audio")
|
| 132 |
+
if not prompt_audio_val:
|
| 133 |
+
raise ValueError("Format 2 requires non-empty 'prompt_audio'.")
|
| 134 |
+
if isinstance(prompt_audio_val, str):
|
| 135 |
+
prompt_audio = _resolve_path(prompt_audio_val)
|
| 136 |
+
else:
|
| 137 |
+
# allow tuple form for backward-compatibility
|
| 138 |
+
prompt_audio = prompt_audio_val
|
| 139 |
+
prompt_text = item.get("prompt_text", "")
|
| 140 |
+
return {"text": text, "prompt_text": prompt_text, "prompt_audio": prompt_audio}
|
| 141 |
+
|
| 142 |
+
# Try Format 1: separate speaker references
|
| 143 |
+
s1 = item.get("prompt_audio_speaker1", "")
|
| 144 |
+
s2 = item.get("prompt_audio_speaker2", "")
|
| 145 |
+
has_s1 = (isinstance(s1, str) and s1) or isinstance(s1, tuple)
|
| 146 |
+
has_s2 = (isinstance(s2, str) and s2) or isinstance(s2, tuple)
|
| 147 |
+
|
| 148 |
+
if has_s1 and has_s2:
|
| 149 |
+
if isinstance(s1, str) and s1:
|
| 150 |
+
s1_resolved = _resolve_path(s1)
|
| 151 |
else:
|
| 152 |
+
s1_resolved = s1
|
| 153 |
+
if isinstance(s2, str) and s2:
|
| 154 |
+
s2_resolved = _resolve_path(s2)
|
|
|
|
| 155 |
else:
|
| 156 |
+
s2_resolved = s2
|
| 157 |
+
# Build merged prompt audio dict
|
| 158 |
+
prompt_audio = {"speaker1": s1_resolved, "speaker2": s2_resolved}
|
| 159 |
+
# Merge texts
|
| 160 |
+
pt1 = item.get("prompt_text_speaker1", "")
|
| 161 |
+
pt2 = item.get("prompt_text_speaker2", "")
|
| 162 |
+
merged = ""
|
| 163 |
+
if pt1:
|
| 164 |
+
merged += f"[S1]{pt1}"
|
| 165 |
+
if pt2:
|
| 166 |
+
merged += f"[S2]{pt2}"
|
| 167 |
+
prompt_text = merged.strip()
|
| 168 |
+
return {"text": text, "prompt_text": prompt_text, "prompt_audio": prompt_audio}
|
| 169 |
+
|
| 170 |
+
# Otherwise, no supported prompt found → reject (text-only unsupported)
|
| 171 |
+
raise ValueError(
|
| 172 |
+
"Input must include prompt (Format 1 or 2). Text-only is not supported."
|
| 173 |
+
)
|
|
|
|
|
|
|
| 174 |
|
| 175 |
|
| 176 |
def load_audio_data(prompt_audio, target_sample_rate=16000):
|
| 177 |
"""Load audio data and return processed audio tensor
|
| 178 |
+
|
| 179 |
Args:
|
| 180 |
prompt_audio: Can be in the following formats:
|
| 181 |
- String: audio file path
|
|
|
|
| 184 |
"""
|
| 185 |
if prompt_audio is None:
|
| 186 |
return None
|
| 187 |
+
|
| 188 |
try:
|
| 189 |
# Check if prompt_audio is a dictionary (containing speaker1 and speaker2)
|
| 190 |
+
if (
|
| 191 |
+
isinstance(prompt_audio, dict)
|
| 192 |
+
and "speaker1" in prompt_audio
|
| 193 |
+
and "speaker2" in prompt_audio
|
| 194 |
+
):
|
| 195 |
# Process audio from both speakers separately
|
| 196 |
wav1, sr1 = _load_single_audio(prompt_audio["speaker1"])
|
| 197 |
wav2, sr2 = _load_single_audio(prompt_audio["speaker2"])
|
|
|
|
| 203 |
# Single audio
|
| 204 |
wav, sr = _load_single_audio(prompt_audio)
|
| 205 |
# Resample to 16k
|
| 206 |
+
if sr != target_sample_rate:
|
| 207 |
wav = torchaudio.functional.resample(wav, sr, target_sample_rate)
|
| 208 |
# Ensure mono channel
|
| 209 |
if wav.shape[0] > 1:
|
| 210 |
wav = wav.mean(dim=0, keepdim=True) # Convert multi-channel to mono
|
| 211 |
+
if len(wav.shape) == 1:
|
| 212 |
wav = wav.unsqueeze(0)
|
| 213 |
+
|
| 214 |
return wav
|
| 215 |
except Exception as e:
|
| 216 |
print(f"Error loading audio data: {e}")
|
|
|
|
| 219 |
|
| 220 |
def _load_single_audio(audio_input):
|
| 221 |
"""Load single audio, supports file path or (wav, sr) tuple
|
| 222 |
+
|
| 223 |
Args:
|
| 224 |
audio_input: String (file path) or tuple (wav, sr)
|
| 225 |
+
|
| 226 |
Returns:
|
| 227 |
tuple: (wav, sr)
|
| 228 |
"""
|
|
|
|
| 249 |
wav1 = wav1.mean(dim=0, keepdim=True) # Convert multi-channel to mono
|
| 250 |
if len(wav1.shape) == 1:
|
| 251 |
wav1 = wav1.unsqueeze(0)
|
| 252 |
+
|
| 253 |
+
# Process second audio
|
| 254 |
if sr2 != target_sample_rate:
|
| 255 |
wav2 = torchaudio.functional.resample(wav2, sr2, target_sample_rate)
|
| 256 |
# Ensure mono channel
|
|
|
|
| 258 |
wav2 = wav2.mean(dim=0, keepdim=True) # Convert multi-channel to mono
|
| 259 |
if len(wav2.shape) == 1:
|
| 260 |
wav2 = wav2.unsqueeze(0)
|
| 261 |
+
|
| 262 |
# Concatenate audio
|
| 263 |
merged_wav = torch.cat([wav1, wav2], dim=1)
|
| 264 |
return merged_wav
|
|
|
|
| 267 |
raise
|
| 268 |
|
| 269 |
|
def process_inputs(
    tokenizer,
    spt,
    prompt,
    text,
    device,
    silence_duration,
    audio_data=None,
    max_channels=8,
    pad_token=1024,
):
    seq = f"<|begin_of_style|>{prompt}<|end_of_style|>\n<|begin_of_text|>{text}<|end_of_text|>\n<|begin_of_speech|>"
    inputs1 = np.array(tokenizer.encode(seq))
    input_ids = np.full((inputs1.shape[0], max_channels), pad_token)
    input_ids[:, 0] = inputs1

    if audio_data is not None:
        try:
            # audio_data should now be a processed audio tensor
            wav = audio_data

            # Append silence_duration seconds of silence at the end of the audio (16 kHz sample rate)
            silence_samples = int(silence_duration * 16000)
            silence = torch.zeros(wav.shape[0], silence_samples)
            wav = torch.cat([wav, silence], dim=1)

            with torch.no_grad():
                # Use SPT encoding
                encode_result = spt.encode([wav.squeeze().to(device)])
                audio_token = (
                    encode_result["codes_list"][0].permute(1, 0).cpu().numpy()
                )  # Adjust dimension order

                # Similar to the DAC encoding adjustment: offset channel-0 codes
                # (the same 151665 is subtracted again after generation)
                audio_token[:, 0] = audio_token[:, 0] + 151665
                input_ids = np.concatenate([input_ids, audio_token])
        except Exception as e:
            print(f"Error processing audio data: {e}")
            raise

    return input_ids

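The matrix returned by process_inputs is laid out as (sequence_length, max_channels): the encoded style/text prompt occupies channel 0 while the remaining channels hold the pad code 1024, and, when reference audio is supplied, the 8-channel codec codes are appended row-wise with channel 0 shifted by 151665, presumably to keep them clear of the text token ids. A rough usage sketch (argument values are hypothetical, and tokenizer/spt/system_prompt are assumed to be already loaded):

# Sketch: with no reference audio, only the channel-0 text tokens are filled in
ids = process_inputs(
    tokenizer, spt, system_prompt, "<speaker1>Hello there.<speaker2>Hi!",
    device="cpu", silence_duration=0, audio_data=None,
)
print(ids.shape)                   # (num_text_tokens, 8)
print((ids[:, 1:] == 1024).all())  # True: non-text channels are still padding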
# ... (unchanged lines collapsed in the diff) ...
    seq_len = input_ids.shape[0]
    new_seq_len = seq_len + max_channels - 1
    shifted_input_ids = np.full((new_seq_len, max_channels), pad_token, dtype=np.int64)
    shifted_input_ids[:, 0] = np.full(
        new_seq_len, tokenizer.pad_token_id, dtype=np.int64
    )
    for i in range(max_channels):
        shifted_input_ids[i : (seq_len + i), i] = input_ids[:, i]
    return shifted_input_ids
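The shift above is a delay pattern in the style of MusicGen's codebook interleaving: channel j of a given audio frame is emitted j steps after channel 0, with the freed positions filled by pad tokens. A self-contained demo of the same shift (the text_pad value here is a placeholder, not the real tokenizer.pad_token_id):

import numpy as np


def shift_demo(input_ids, max_channels=4, pad_token=1024, text_pad=0):
    # Same delay pattern as shifting_inputs: channel j starts j rows later
    seq_len = input_ids.shape[0]
    new_seq_len = seq_len + max_channels - 1
    out = np.full((new_seq_len, max_channels), pad_token, dtype=np.int64)
    out[:, 0] = text_pad  # channel 0 is pre-filled with the text pad id
    for j in range(max_channels):
        out[j : seq_len + j, j] = input_ids[:, j]
    return out


demo = shift_demo(np.arange(12).reshape(3, 4))
print(demo.shape)  # (6, 4): every channel delayed by its index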
# ... (unchanged lines collapsed in the diff) ...
    attention_masks = [np.ones(inputs.shape[0]) for inputs in input_ids]
    max_length = max(ids.shape[0] for ids in input_ids)
    padded_input_ids, padded_attns = [], []

    for ids, attn in zip(input_ids, attention_masks):
        pad_len = max_length - ids.shape[0]
        input_pad = np.full((pad_len, channels), 1024)
        # ... (rest of the function and the following lines collapsed in the diff) ...
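The rest of rpadding is collapsed in the diff. From the visible setup it evidently pads every sample to max_length with the 1024 pad code and builds matching attention masks; the sketch below is one way the function could be completed, and the pad position, dtypes, and return types are assumptions rather than the PR's actual code.

import numpy as np
import torch


def rpadding_sketch(input_ids, channels, tokenizer, pad_token=1024):
    attention_masks = [np.ones(ids.shape[0]) for ids in input_ids]
    max_length = max(ids.shape[0] for ids in input_ids)
    padded_input_ids, padded_attns = [], []
    for ids, attn in zip(input_ids, attention_masks):
        pad_len = max_length - ids.shape[0]
        input_pad = np.full((pad_len, channels), pad_token)
        input_pad[:, 0] = tokenizer.pad_token_id  # assumption: channel 0 pads with the text pad id
        padded_input_ids.append(np.concatenate([ids, input_pad], axis=0))       # assumption: right padding
        padded_attns.append(np.concatenate([attn, np.zeros(pad_len)], axis=0))
    return (
        torch.tensor(np.stack(padded_input_ids), dtype=torch.long),
        torch.tensor(np.stack(padded_attns), dtype=torch.long),
    )

The diff then jumps ahead to the docstring and body of the script normalizer (normalize_text), reproduced next.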
    Normalize multi-speaker script.

    1. Don't preserve line breaks.
    2. Preserve bracketed segments like [] () <> even when they are not speaker tags.
    3. Remove decorative symbols: 【】《》()『』「」~~-_.
    4. Internal punctuation ;:、 → ,; keep ? and !.
    5. When there are multiple 。, keep only the last one; the others become ,.
    6. Replace consecutive "哈" (>= 2) with "[笑]"; English "haha" becomes "[laugh]".
    7. Auto-recognize [S1] / [S2] … tags; if missing, treat as whole segment.
    8. Merge adjacent identical speaker tags.
    """
    # Replace [1], [2] etc. format with [S1], [S2] etc. format
    text = re.sub(r"\[(\d+)\]", r"[S\1]", text)

    # Remove decorative characters
    remove_chars = "【】《》()『』「」" '"-_“”~~‘’'

    # Use positive lookahead to split text by speaker tags (tags themselves are still preserved)
    segments = re.split(r"(?=\[S\d+\])", text.replace("\n", " "))
    processed_parts = []

    for seg in segments:
        seg = seg.strip()
        # ... (unchanged line collapsed in the diff) ...
            continue

        # Extract tags
        m = re.match(r"^(\[S\d+\])\s*(.*)", seg)
        tag, content = m.groups() if m else ("", seg)

        # Remove irrelevant symbols
        content = re.sub(f"[{re.escape(remove_chars)}]", "", content)

        # Handle consecutive "哈" characters: replace 2 or more with "[笑]"
        content = re.sub(r"哈{2,}", "[笑]", content)

        # Handle English laughter (e.g., "haha", "ha ha")
        content = re.sub(r"\b(ha(\s*ha)+)\b", "[laugh]", content, flags=re.IGNORECASE)

        # First handle multi-character punctuation marks
        content = content.replace("——", ",")
        content = content.replace("……", ",")

        # Handle single-character internal punctuation marks
        internal_punct_map = str.maketrans(
            {";": ",", ";": ",", ":": ",", ":": ",", "、": ","}
        )
        content = content.translate(internal_punct_map)
        content = content.strip()

        # Keep only the final period
        if len(content) > 1:
            last_ch = (
                "。"
                if content[-1] == ","
                else ("." if content[-1] == "," else content[-1])
            )
            body = content[:-1].replace("。", ",")
            content = body + last_ch

        processed_parts.append({"tag": tag, "content": content})

    if not processed_parts:
        return ""

    # Merge consecutive same speakers
    merged_lines = []
    current_tag = processed_parts[0]["tag"]
    current_content = [processed_parts[0]["content"]]

    for part in processed_parts[1:]:
        if part["tag"] == current_tag and current_tag:
            current_content.append(part["content"])
        else:
            merged_lines.append(f"{current_tag}{''.join(current_content)}".strip())
            current_tag = part["tag"]
            current_content = [part["content"]]

    merged_lines.append(f"{current_tag}{''.join(current_content)}".strip())

    return "".join(merged_lines).replace("‘", "'").replace("’", "'")

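As a quick illustration of normalize_text (the function process_batch calls below when use_normalize is set), consider a hypothetical script; the comments describe the intended effects rather than an exact expected string:

raw = "[1]《欢迎》哈哈哈\n[1]今天怎么样;还好吗?\n[2]ha ha, not bad."
cleaned = normalize_text(raw)
# Intended effects (the exact output depends on the mappings above):
#   [1] / [2]  -> [S1] / [S2]
#   《欢迎》    -> 欢迎      (decorative brackets removed)
#   哈哈哈      -> [笑]
#   "ha ha"    -> [laugh]
#   ;          -> ,         (internal punctuation)
#   the two adjacent [S1] turns are merged into a single [S1] segment
print(cleaned)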

def process_batch(
    batch_items,
    tokenizer,
    model,
    spt,
    device,
    system_prompt,
    start_idx,
    use_normalize=False,
    silence_duration=0,
):
    """Process a batch of data items and generate audio, return audio data and metadata"""
    try:
        # Prepare batch data
        # ... (unchanged lines collapsed in the diff) ...
        prompts = [system_prompt] * batch_size
        prompt_audios = []
        actual_texts_data = []  # Store actual text data used

        print(f"Processing {batch_size} samples starting from index {start_idx}...")

        # Extract text and audio from each sample
        for i, item in enumerate(batch_items):
            # Use new processing function
            processed_item = process_jsonl_item(item)

            text = processed_item["text"]
            prompt_text = processed_item["prompt_text"]

            # Merge text, if prompt_text is empty, full_text is just text
            full_text = prompt_text + text if prompt_text else text
            original_full_text = full_text  # Save original text

            # Apply text normalization based on parameter
            if use_normalize:
                full_text = normalize_text(full_text)

            # Replace speaker tags
            final_text = full_text.replace("[S1]", "<speaker1>").replace(
                "[S2]", "<speaker2>"
            )
            texts.append(final_text)

            # Save actual text information used
            actual_texts_data.append(
                {
                    "index": start_idx + i,
                    "original_text": original_full_text,
                    "normalized_text": (
                        normalize_text(original_full_text) if use_normalize else None
                    ),
                    "final_text": final_text,
                    "use_normalize": use_normalize,
                }
            )

            # Get reference audio
            prompt_audios.append(processed_item["prompt_audio"])

        # Process inputs
        input_ids_list = []
        for i, (text, prompt, audio_path) in enumerate(
            zip(texts, prompts, prompt_audios)
        ):
            # Load audio data here
            audio_data = load_audio_data(audio_path) if audio_path else None
            inputs = process_inputs(
                tokenizer, spt, prompt, text, device, silence_duration, audio_data
            )
            inputs = shifting_inputs(inputs, tokenizer)
            input_ids_list.append(inputs)

        # Pad batch inputs
        input_ids, attention_mask = rpadding(input_ids_list, MAX_CHANNELS, tokenizer)

        # Batch generation
        print(f"Starting batch audio generation...")
        start = input_ids.shape[1] - MAX_CHANNELS + 1

        # Move inputs to GPU
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Generate model outputs
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        print(f"Original outputs shape: {outputs.shape}")
        # ... (unchanged lines collapsed in the diff) ...
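        # Note (added for clarity, not part of the original diff): `outputs` still
        # contains the prompt region plus the per-channel delay introduced by
        # shifting_inputs. The lines below trim the prompt (keeping the short overlap
        # the delay pattern needs), re-align the channels, and subtract the 151665
        # offset that was added to channel 0.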
        outputs = outputs[:, start:]
        seq_len = outputs.shape[1] - MAX_CHANNELS + 1
        speech_ids = torch.full((outputs.shape[0], seq_len, MAX_CHANNELS), 0).to(device)

        # Adjust output format
        for j in range(MAX_CHANNELS):
            speech_ids[..., j] = outputs[:, j : seq_len + j, j]
            if j == 0:
                speech_ids[..., j] = speech_ids[..., j] - 151665

        # Find valid positions for each sample
        li = find_max_valid_positions(speech_ids)

        # Store audio result data
        audio_results = []

        # Process batch sample results individually
        for i in range(batch_size):
            try:
                # ... (unchanged lines collapsed in the diff) ...
                    print(f"Sample {start_idx + i} has no valid speech tokens")
                    audio_results.append(None)
                    continue

                this_speech_id = speech_ids[i, :end_idx]
                print(
                    f"Speech token shape for sample {start_idx + i}: {this_speech_id.shape}"
                )

                # Prompt-Augmented Decode (rvq8-style); fall back to original decode if no prompt
                prompt_audio = prompt_audios[i]
                if prompt_audio is None:
                    # Fallback to original decode
                    with torch.no_grad():
                        codes_list = [this_speech_id.permute(1, 0)]
                        decode_result = spt.decode(codes_list, overlap_seconds=10)
                        audio_out = decode_result["syn_wav_list"][0].cpu().detach()
                    if audio_out.ndim == 1:
                        audio_out = audio_out.unsqueeze(0)
                    audio_results.append(
                        {
                            "audio_data": audio_out,
                            "sample_rate": spt.output_sample_rate,
                            "index": start_idx + i,
                        }
                    )
                    print(f"Audio generation completed (orig): sample {start_idx + i}")
                else:
                    # 1) Load prompt at SPT input sr and force to 20s
                    ref_sr_in = (
                        getattr(spt, "input_sample_rate", None)
                        or getattr(spt, "sampling_rate", None)
                        or 24000
                    )
                    ref_wav = load_audio_data(
                        prompt_audio, target_sample_rate=ref_sr_in
                    )
                    if ref_wav is None:
                        # If ref missing, use original decode
                        with torch.no_grad():
                            codes_list = [this_speech_id.permute(1, 0)]
                            decode_result = spt.decode(codes_list, overlap_seconds=10)
                            audio_out = decode_result["syn_wav_list"][0].cpu().detach()
                        if audio_out.ndim == 1:
                            audio_out = audio_out.unsqueeze(0)
                        audio_results.append(
                            {
                                "audio_data": audio_out,
                                "sample_rate": spt.output_sample_rate,
                                "index": start_idx + i,
                            }
                        )
                        print(
                            f"Audio generation completed (orig no-ref): sample {start_idx + i}"
                        )
                    else:
                        # Encode 20s reference to tokens
                        ref_wav_20s = pad_or_truncate_to_seconds(
                            ref_wav, 20.0, ref_sr_in
                        ).to(device)
                        with torch.no_grad():
                            enc = spt.encode([ref_wav_20s.squeeze(0)])
                            ref_codes = (
                                enc["codes_list"][0].to(device).long()
                            )  # (nq, T_ref)

                        # Prepare token-to-sample mapping and windowing params
                        out_sr = (
                            getattr(spt, "output_sample_rate", None)
                            or getattr(spt, "sample_rate", None)
                            or 24000
                        )
                        tokens_per_second = float(ref_sr_in) / float(
                            spt.encoder_downsample_rate
                        )
                        tokens_per_chunk = int(round(10.0 * tokens_per_second))
                        stride_tokens = 85
                        keep_tokens = 85
                        left_ctx_tokens = 20
                        total_tokens = this_speech_id.shape[0]
                        samples_per_token = int(round(out_sr / tokens_per_second))
                        crossfade_seconds = 0.1
                        crossfade_samples = int(round(crossfade_seconds * out_sr))

                        kept_segments = []
                        chunk_idx = 0
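                        # Note (added for clarity, not part of the original diff): with
                        # a 16 kHz tokenizer input and a 1280x encoder downsample,
                        # tokens_per_second is 12.5, so a 10 s window is 125 tokens and
                        # stride_tokens = 85 advances roughly 6.8 s per chunk, leaving
                        # overlap for the left context and the crossfade.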
                        while True:
                            st_tok = chunk_idx * stride_tokens
                            if st_tok >= total_tokens:
                                break
                            ed_tok = min(st_tok + tokens_per_chunk, total_tokens)
                            gen_chunk = this_speech_id[st_tok:ed_tok]  # (len, C)
                            if gen_chunk.shape[0] == 0:
                                break

                            # Concatenate reference tokens with current window tokens
                            combined_codes = torch.cat(
                                [ref_codes, gen_chunk.permute(1, 0).long()], dim=1
                            ).to(device)  # (nq, T_ref + T_chunk)
                            codes_lengths = torch.tensor(
                                [combined_codes.shape[-1]],
                                dtype=torch.long,
                                device=device,
                            )
                            combined_codes_batched = combined_codes.unsqueeze(1)  # (nq, 1, T)

                            with torch.no_grad():
                                detok = spt.inference_detokenize(
                                    combined_codes_batched, codes_lengths
                                )
                            y = detok["y"][0, 0]  # (T_samples)

                            # Remove 20s reference portion (in samples)
                            ref_samples = int(round(20.0 * out_sr))
                            if y.shape[-1] <= ref_samples:
                                chunk_idx += 1
                                continue
                            chunk_y = y[ref_samples:]

                            # Determine kept region within current window
                            window_len = gen_chunk.shape[0]
                            remains = total_tokens - st_tok
                            is_first = chunk_idx == 0
                            is_last = ed_tok >= total_tokens

                            if is_first:
                                keep_start_tok = 0
                                keep_end_tok = min(
                                    keep_tokens + left_ctx_tokens, window_len
                                )
                            elif is_last and remains < 105:
                                keep_start_tok = (
                                    0 if is_first else min(left_ctx_tokens, window_len)
                                )
                                keep_end_tok = window_len
                            else:
                                keep_start_tok = min(left_ctx_tokens, window_len)
                                keep_end_tok = min(
                                    left_ctx_tokens + keep_tokens, window_len
                                )

                            keep_start_smps = keep_start_tok * samples_per_token
                            keep_end_smps = keep_end_tok * samples_per_token
                            left_margin = 0
                            right_margin = crossfade_samples if not is_last else 0
                            seg_start = max(0, keep_start_smps - left_margin)
                            seg_end = min(
                                chunk_y.shape[-1], keep_end_smps + right_margin
                            )
                            if seg_end > seg_start:
                                kept_segments.append(
                                    chunk_y[seg_start:seg_end].detach().cpu().unsqueeze(0)
                                )

                            chunk_idx += 1

                        # Concatenate with crossfade; if empty, return tiny silence
                        if len(kept_segments) == 0:
                            audio_out = torch.zeros(1, int(0.01 * out_sr))
                        else:
                            audio_out = crossfade_concat(
                                kept_segments,
                                out_sr,
                                crossfade_seconds=crossfade_seconds,
                            )

                        audio_results.append(
                            {
                                "audio_data": audio_out,
                                "sample_rate": out_sr,
                                "index": start_idx + i,
                            }
                        )
                        print(
                            f"Audio generation completed (prompt-aug): sample {start_idx + i}"
                        )

            except Exception as e:
                print(f"Error processing sample {start_idx + i}: {str(e)}, skipping...")
                import traceback

                traceback.print_exc()
                audio_results.append(None)

        # Clean up GPU memory
        torch.cuda.empty_cache()

        # Return text data and audio data
        return actual_texts_data, audio_results

    except Exception as e:
        print(f"Error during batch processing: {str(e)}")
        raise
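process_batch leans on two helpers that sit outside the visible part of this diff: pad_or_truncate_to_seconds and crossfade_concat. Their contracts are clear from the call sites above; the sketches below are one plausible shape for them (zero padding, a linear crossfade, (1, T) tensors) and not necessarily the implementations shipped in this PR.

import torch


def pad_or_truncate_to_seconds(wav, seconds, sample_rate):
    """Force a (1, T) waveform to exactly `seconds` by zero-padding or truncating."""
    target = int(round(seconds * sample_rate))
    if wav.shape[-1] >= target:
        return wav[..., :target]
    pad = torch.zeros(*wav.shape[:-1], target - wav.shape[-1], dtype=wav.dtype, device=wav.device)
    return torch.cat([wav, pad], dim=-1)


def crossfade_concat(segments, sample_rate, crossfade_seconds=0.1):
    """Concatenate (1, T_i) segments, blending each junction with a linear crossfade."""
    n = int(round(crossfade_seconds * sample_rate))
    out = segments[0]
    for seg in segments[1:]:
        k = min(n, out.shape[-1], seg.shape[-1])
        if k <= 0:
            out = torch.cat([out, seg], dim=-1)
            continue
        fade = torch.linspace(0.0, 1.0, k, dtype=out.dtype, device=out.device)
        blended = out[..., -k:] * (1.0 - fade) + seg[..., :k] * fade
        out = torch.cat([out[..., :-k], blended, seg[..., k:]], dim=-1)
    return out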