import json
import math
import os
import re
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import torch

from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from transformers.utils import TensorType, logging

from .vibevoice_tokenizer_processor import AudioNormalizer

logger = logging.get_logger(__name__)


class VibeVoiceProcessor:
    r"""
    Constructs a VibeVoice processor which wraps a VibeVoice text tokenizer and an audio processor into a single
    processor.

    [`VibeVoiceProcessor`] offers all the functionalities of [`VibeVoiceTextTokenizer`] and
    [`VibeVoiceTokenizerProcessor`]. See [`~VibeVoiceProcessor.__call__`] and [`~VibeVoiceProcessor.decode`] for more
    information.

    Args:
        tokenizer (`VibeVoiceTextTokenizer` or `VibeVoiceTextTokenizerFast`):
            The tokenizer for text processing.
        audio_processor (`VibeVoiceTokenizerProcessor`):
            The audio processor for speech processing.
        speech_tok_compress_ratio (`int`, *optional*, defaults to 3200):
            The number of raw audio samples represented by one speech token. At a 24 kHz sampling rate, a ratio of
            3200 corresponds to 7.5 speech tokens per second of audio.
        db_normalize (`bool`, *optional*, defaults to `True`):
            Whether to apply decibel normalization to audio inputs.
    """

    def __init__(self, tokenizer=None, audio_processor=None, speech_tok_compress_ratio=3200, db_normalize=True, **kwargs):
        self.tokenizer = tokenizer
        self.audio_processor = audio_processor
        self.speech_tok_compress_ratio = speech_tok_compress_ratio
        self.db_normalize = db_normalize
        self.audio_normalizer = AudioNormalizer() if db_normalize else None
        self.system_prompt = " Transform the text provided by various speakers into speech output, utilizing the distinct voice of each respective speaker.\n"

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """
        Instantiate a VibeVoiceProcessor from a pretrained VibeVoice processor.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                This can be either:

                - a string, the *model id* of a pretrained model
                - a path to a *directory* containing the processor config

        Returns:
            [`VibeVoiceProcessor`]: The processor object instantiated from the pretrained model.
        """
        from transformers.utils import cached_file

        from .vibevoice_tokenizer_processor import VibeVoiceTokenizerProcessor
        from vibevoice.modular.modular_vibevoice_text_tokenizer import (
            VibeVoiceTextTokenizer,
            VibeVoiceTextTokenizerFast,
        )

        # Load the processor configuration, either from a local directory or from the hub.
        config_path = os.path.join(pretrained_model_name_or_path, "preprocessor_config.json")
        config = None

        if os.path.exists(config_path):
            with open(config_path, 'r') as f:
                config = json.load(f)
        else:
            try:
                config_file = cached_file(
                    pretrained_model_name_or_path,
                    "preprocessor_config.json",
                    **kwargs
                )
                with open(config_file, 'r') as f:
                    config = json.load(f)
            except Exception as e:
                logger.warning(f"Could not load preprocessor_config.json from {pretrained_model_name_or_path}: {e}")
                logger.warning("Using default configuration")
                config = {
                    "speech_tok_compress_ratio": 3200,
                    "db_normalize": True,
                }

        speech_tok_compress_ratio = config.get("speech_tok_compress_ratio", 3200)
        db_normalize = config.get("db_normalize", True)

        # The text tokenizer is derived from the underlying language model.
        language_model_pretrained_name = config.get("language_model_pretrained_name", None) or kwargs.pop("language_model_pretrained_name", "Qwen/Qwen2.5-1.5B")
        logger.info(f"Loading tokenizer from {language_model_pretrained_name}")
        if 'qwen' in language_model_pretrained_name.lower():
            tokenizer = VibeVoiceTextTokenizerFast.from_pretrained(
                language_model_pretrained_name,
                **kwargs
            )
        else:
            raise ValueError(f"Unsupported tokenizer type for {language_model_pretrained_name}. Only Qwen tokenizers are currently supported.")

        # Build the audio processor from the saved configuration, falling back to defaults.
        if "audio_processor" in config:
            audio_config = config["audio_processor"]
            audio_processor = VibeVoiceTokenizerProcessor(
                sampling_rate=audio_config.get("sampling_rate", 24000),
                normalize_audio=audio_config.get("normalize_audio", True),
                target_dB_FS=audio_config.get("target_dB_FS", -25),
                eps=audio_config.get("eps", 1e-6),
            )
        else:
            audio_processor = VibeVoiceTokenizerProcessor()

        return cls(
            tokenizer=tokenizer,
            audio_processor=audio_processor,
            speech_tok_compress_ratio=speech_tok_compress_ratio,
            db_normalize=db_normalize,
        )
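
    # For reference, a preprocessor_config.json consumed by `from_pretrained`
    # could look like the sketch below (the values shown are the defaults
    # assumed above):
    #
    #     {
    #       "speech_tok_compress_ratio": 3200,
    #       "db_normalize": true,
    #       "language_model_pretrained_name": "Qwen/Qwen2.5-1.5B",
    #       "audio_processor": {
    #         "feature_extractor_type": "VibeVoiceTokenizerProcessor",
    #         "sampling_rate": 24000,
    #         "normalize_audio": true,
    #         "target_dB_FS": -25,
    #         "eps": 1e-06
    #       }
    #     }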

    def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs):
        """
        Save a processor to a directory, so that it can be re-loaded using the
        [`~VibeVoiceProcessor.from_pretrained`] class method.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the processor will be saved.
        """
        os.makedirs(save_directory, exist_ok=True)

        # Serialize the processor settings; the tokenizer itself is reloaded from
        # the language model checkpoint rather than saved here.
        processor_config = {
            "processor_class": "VibeVoiceProcessor",
            "speech_tok_compress_ratio": self.speech_tok_compress_ratio,
            "db_normalize": self.db_normalize,
            "audio_processor": {
                "feature_extractor_type": "VibeVoiceTokenizerProcessor",
                "sampling_rate": getattr(self.audio_processor, 'sampling_rate', 24000),
                "normalize_audio": getattr(self.audio_processor, 'normalize_audio', True),
                "target_dB_FS": getattr(self.audio_processor, 'target_dB_FS', -25),
                "eps": getattr(self.audio_processor, 'eps', 1e-6),
            }
        }

        config_path = os.path.join(save_directory, "preprocessor_config.json")
        with open(config_path, 'w') as f:
            json.dump(processor_config, f, indent=2)

        logger.info(f"Processor configuration saved in {config_path}")
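
    # Round-trip sketch (the directory name is an assumption):
    #
    #     processor.save_pretrained("my_processor_dir")
    #     reloaded = VibeVoiceProcessor.from_pretrained("my_processor_dir")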

    def __call__(
        self,
        text: Optional[Union[str, List[str], TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        voice_samples: Optional[Union[List[Union[str, np.ndarray]], List[List[Union[str, np.ndarray]]]]] = None,
        padding: Union[bool, str, PaddingStrategy] = True,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_attention_mask: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Main method to process one or more podcast scripts with optional voice samples.

        Args:
            text (`str`, `List[str]`):
                The input text(s) to process. Can be:

                - A single script string
                - A list of script strings for batch processing
                - A path to a .json or .txt file
                - A list of paths
            voice_samples (`List[Union[str, np.ndarray]]`, `List[List[Union[str, np.ndarray]]]`, *optional*):
                Voice samples for each script. Can be:

                - A list of samples for a single script
                - A list of lists for batch processing
            padding (`bool`, `str` or `PaddingStrategy`, defaults to `True`):
                Whether to pad sequences to the same length.
            truncation (`bool`, `str` or `TruncationStrategy`, defaults to `False`):
                Whether to truncate sequences.
            max_length (`int`, *optional*):
                Maximum length of the returned sequences.
            return_tensors (`str` or `TensorType`, *optional*):
                If set, will return tensors of a particular framework.
            return_attention_mask (`bool`, defaults to `True`):
                Whether to return the attention mask.

        Returns:
            `BatchEncoding`: A BatchEncoding with the following fields:

            - **input_ids** -- List of token id sequences or tensor
            - **attention_mask** -- List of attention masks or tensor
            - **speech_tensors** -- Padded speech inputs (if voice_samples provided)
            - **speech_masks** -- Speech masks (if voice_samples provided)
            - **speech_input_mask** -- Boolean masks indicating speech token positions
        """
        # A plain string (or a pre-tokenized, non-string list) is a single script;
        # a list of strings is treated as a batch of scripts.
        if isinstance(text, str) or (isinstance(text, list) and len(text) > 0 and not isinstance(text[0], str)):
            texts = [text]
            is_batched = False
        else:
            texts = text
            is_batched = True

        # Normalize voice_samples to one list of samples per script.
        if voice_samples is not None:
            if not is_batched or (isinstance(voice_samples[0], (str, np.ndarray))):
                # A flat list of samples belongs to a single script.
                voice_samples_list = [voice_samples]
            else:
                voice_samples_list = voice_samples
        else:
            voice_samples_list = [None] * len(texts)

        # Process each script with its voice samples, then pad into a batch.
        all_encodings = []
        for text_input, voice_input in zip(texts, voice_samples_list):
            encoding = self._process_single(text_input, voice_input)
            all_encodings.append(encoding)

        batch_encoding = self._batch_encode(
            all_encodings,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            return_tensors=return_tensors,
            return_attention_mask=return_attention_mask,
        )

        return batch_encoding
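
    # Sketch of the returned fields for a batch of two scripts with one voice
    # sample each and `return_tensors="pt"` (the lengths are made up):
    #
    #     inputs["input_ids"].shape          # (2, max_seq_len), left-padded with pad_id
    #     inputs["attention_mask"].shape     # (2, max_seq_len), 0 over the left padding
    #     inputs["speech_input_mask"].shape  # (2, max_seq_len), True at speech token slots
    #     inputs["speech_tensors"].shape     # (2, max_audio_len); one row per voice
    #                                        # sample across the whole batch
    #     inputs["speech_masks"].shape       # (2, max_speech_tokens)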

    def _process_single(
        self,
        text: Union[str, TextInput],
        voice_samples: Optional[List[Union[str, np.ndarray]]] = None,
    ) -> Dict[str, Any]:
        """Process a single podcast script."""
        # Resolve the input: a .json/.txt path is converted to script format,
        # anything else is treated as the script itself.
        script = None
        if isinstance(text, str):
            if text.endswith('.json') and os.path.exists(text):
                script = self._convert_json_to_script(text)
            elif text.endswith('.txt') and os.path.exists(text):
                script = self._convert_text_to_script(text)
            else:
                script = text

        if script is None:
            raise ValueError(f"Could not process input text: {text}")

        parsed_lines = self._parse_script(script)
        all_speakers = list(set(speaker_id for speaker_id, _ in parsed_lines))

        # System prompt.
        system_tokens = self.tokenizer.encode(self.system_prompt)

        # Voice prompt: one audio segment per speaker.
        if voice_samples:
            voice_tokens, voice_speech_inputs, voice_speech_masks = self._create_voice_prompt(voice_samples[:len(all_speakers)])
        else:
            voice_tokens, voice_speech_inputs, voice_speech_masks = [], [], []

        full_tokens = system_tokens + voice_tokens
        speech_input_mask = [False] * len(system_tokens) + voice_speech_masks

        # Text input section.
        text_input_tokens = self.tokenizer.encode(' Text input:\n', add_special_tokens=False)
        full_tokens += text_input_tokens
        speech_input_mask += [False] * len(text_input_tokens)

        for speaker_id, speaker_text in parsed_lines:
            speaker_text_tokens = self.tokenizer.encode(f" Speaker {speaker_id}:{speaker_text}\n", add_special_tokens=False)
            full_tokens += speaker_text_tokens
            speech_input_mask += [False] * len(speaker_text_tokens)

        # Speech output section: the model generates speech after this marker.
        speech_output_tokens = self.tokenizer.encode(' Speech output:\n', add_special_tokens=False)
        full_tokens += speech_output_tokens + [self.tokenizer.speech_start_id]
        speech_input_mask += [False] * (len(speech_output_tokens) + 1)

        return {
            "input_ids": full_tokens,
            "speech_inputs": voice_speech_inputs if voice_speech_inputs else None,
            "speech_input_mask": speech_input_mask,
            "parsed_script": parsed_lines,
            "all_speakers": all_speakers,
        }
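
    # The resulting token sequence is laid out as follows (a schematic, not
    # literal tokens):
    #
    #     <system prompt>
    #      Voice input:
    #      Speaker 0:<speech_start><vae tokens...><speech_end>
    #      ...
    #      Text input:
    #      Speaker 0: <text>
    #      ...
    #      Speech output:
    #     <speech_start>
    #
    # `speech_input_mask` is True exactly at the vae token positions of the
    # voice prompt.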

    def _batch_encode(
        self,
        encodings: List[Dict[str, Any]],
        padding: Union[bool, str, PaddingStrategy] = True,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_attention_mask: bool = True,
    ) -> BatchEncoding:
        """Combine multiple encodings into a batch with padding."""
        input_ids_list = [enc["input_ids"] for enc in encodings]
        speech_input_masks_list = [enc["speech_input_mask"] for enc in encodings]

        # Resolve the padding strategy.
        if isinstance(padding, bool):
            padding_strategy = PaddingStrategy.LONGEST if padding else PaddingStrategy.DO_NOT_PAD
        elif isinstance(padding, str):
            padding_strategy = PaddingStrategy(padding)
        else:
            padding_strategy = padding

        if padding_strategy != PaddingStrategy.DO_NOT_PAD:
            if padding_strategy == PaddingStrategy.MAX_LENGTH and max_length is not None:
                max_len = max_length
            else:
                # LONGEST, or MAX_LENGTH without an explicit max_length.
                max_len = max(len(ids) for ids in input_ids_list)

            padded_input_ids = []
            attention_masks = []
            padded_speech_input_masks = []

            for input_ids, speech_mask in zip(input_ids_list, speech_input_masks_list):
                if truncation and len(input_ids) > max_len:
                    input_ids = input_ids[:max_len]
                    speech_mask = speech_mask[:max_len]

                # Left-pad so that generation starts right after the prompt.
                padding_length = max_len - len(input_ids)
                padded_ids = [self.tokenizer.pad_id] * padding_length + input_ids
                attention_mask = [0] * padding_length + [1] * len(input_ids)
                padded_speech_mask = [False] * padding_length + speech_mask

                padded_input_ids.append(padded_ids)
                attention_masks.append(attention_mask)
                padded_speech_input_masks.append(padded_speech_mask)

            input_ids_list = padded_input_ids
            speech_input_masks_list = padded_speech_input_masks
        else:
            attention_masks = [[1] * len(ids) for ids in input_ids_list] if return_attention_mask else None

        # Gather the voice-prompt waveforms from every encoding in the batch.
        all_speech_inputs = []
        has_speech = False
        for enc in encodings:
            if enc["speech_inputs"] is not None:
                all_speech_inputs.extend(enc["speech_inputs"])
                has_speech = True

        batch_encoding = BatchEncoding()

        if return_tensors is not None:
            batch_encoding["input_ids"] = torch.tensor(input_ids_list, dtype=torch.long)
            if return_attention_mask and attention_masks is not None:
                batch_encoding["attention_mask"] = torch.tensor(attention_masks, dtype=torch.long)
            batch_encoding["speech_input_mask"] = torch.tensor(speech_input_masks_list, dtype=torch.bool)
        else:
            batch_encoding["input_ids"] = input_ids_list
            if return_attention_mask and attention_masks is not None:
                batch_encoding["attention_mask"] = attention_masks
            batch_encoding["speech_input_mask"] = speech_input_masks_list

        if has_speech:
            speech_dict = self.prepare_speech_inputs(
                all_speech_inputs,
                return_tensors=return_tensors,
            )
            batch_encoding["speech_tensors"] = speech_dict["padded_speeches"]
            batch_encoding["speech_masks"] = speech_dict["speech_masks"]
        else:
            batch_encoding["speech_tensors"] = None
            batch_encoding["speech_masks"] = None

        batch_encoding["parsed_scripts"] = [enc["parsed_script"] for enc in encodings]
        batch_encoding["all_speakers_list"] = [enc["all_speakers"] for enc in encodings]

        return batch_encoding

    def _create_voice_prompt(
        self,
        speaker_samples: List[Union[str, np.ndarray]]
    ) -> Tuple[List[int], List[np.ndarray], List[bool]]:
        """
        Create voice prompt tokens and process audio samples.

        Returns:
            tuple: (voice_tokens, voice_speech_inputs, voice_speech_masks)
        """
        vae_token_id = self.tokenizer.speech_diffusion_id

        voice_full_tokens = self.tokenizer.encode(' Voice input:\n', add_special_tokens=False)
        voice_speech_inputs = []
        voice_speech_masks = [False] * len(voice_full_tokens)

        for speaker_id, speaker_audio in enumerate(speaker_samples):
            prefix_tokens = self.tokenizer.encode(f" Speaker {speaker_id}:", add_special_tokens=False)

            # Load audio from a path or use the provided array directly.
            if isinstance(speaker_audio, str):
                wav = self.audio_processor._load_audio_from_path(speaker_audio)
            else:
                wav = np.array(speaker_audio, dtype=np.float32)

            if self.db_normalize and self.audio_normalizer:
                wav = self.audio_normalizer(wav)

            # One placeholder vae token per `speech_tok_compress_ratio` audio samples.
            vae_tok_len = math.ceil(wav.shape[0] / self.speech_tok_compress_ratio)

            speaker_tokens = (prefix_tokens +
                              [self.tokenizer.speech_start_id] +
                              [vae_token_id] * vae_tok_len +
                              [self.tokenizer.speech_end_id] +
                              self.tokenizer.encode('\n', add_special_tokens=False))

            # Mark exactly the vae token positions; the prefix, the start/end ids,
            # and the trailing newline stay False.
            vae_input_mask = ([False] * len(prefix_tokens) +
                              [False] +
                              [True] * vae_tok_len +
                              [False] +
                              [False])

            voice_full_tokens.extend(speaker_tokens)
            voice_speech_masks.extend(vae_input_mask)
            voice_speech_inputs.append(wav)

        return voice_full_tokens, voice_speech_inputs, voice_speech_masks
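
    # Worked example: a 3-second sample at 24 kHz has 72000 samples, so with the
    # default compression ratio of 3200 it occupies ceil(72000 / 3200) = 23 vae
    # token slots (7.5 tokens per second of audio).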

    def prepare_speech_inputs(
        self,
        speech_inputs: List[np.ndarray],
        return_tensors: Optional[Union[str, TensorType]] = None,
        device: Optional[Union[str, torch.device]] = None,
        dtype: Optional[torch.dtype] = None,
    ) -> Dict[str, Any]:
        """
        Prepare speech inputs for model consumption.

        Args:
            speech_inputs: List of speech arrays.
            return_tensors: Output tensor type.
            device: Device to place tensors on.
            dtype: Data type for tensors.

        Returns:
            Dictionary with `padded_speeches` and `speech_masks`.
        """
        if not speech_inputs:
            return {"padded_speeches": None, "speech_masks": None}

        # Token-level lengths after compression, and the longest raw waveform.
        vae_tok_seqlens = [math.ceil(s.shape[0] / self.speech_tok_compress_ratio) for s in speech_inputs]
        max_speech_length = max(s.shape[0] for s in speech_inputs)

        # Zero-pad waveforms to a common length; the masks are at token resolution.
        if speech_inputs[0].ndim == 1:
            padded_speeches = np.full((len(speech_inputs), max_speech_length), fill_value=0, dtype=np.float32)
        else:
            padded_speeches = np.full((len(speech_inputs), max_speech_length, speech_inputs[0].shape[-1]), fill_value=0, dtype=np.float32)
        speech_masks = np.zeros((len(speech_inputs), max(vae_tok_seqlens)), dtype=np.bool_)

        for i, (speech, vae_tok_length) in enumerate(zip(speech_inputs, vae_tok_seqlens)):
            padded_speeches[i, :len(speech)] = speech
            speech_masks[i, :vae_tok_length] = True

        result = {
            "padded_speeches": padded_speeches,
            "speech_masks": speech_masks,
        }

        if return_tensors == "pt":
            result["padded_speeches"] = torch.tensor(padded_speeches, device=device, dtype=dtype or torch.float32)
            result["speech_masks"] = torch.tensor(speech_masks, device=device, dtype=torch.bool)

        return result
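
    # For two waveforms of 48000 and 72000 samples, `padded_speeches` has shape
    # (2, 72000) with the shorter one zero-padded, and `speech_masks` has shape
    # (2, 23) with 15 and 23 True entries respectively (ratio 3200).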

    def _convert_json_to_script(self, json_file: str) -> str:
        """
        Convert JSON format to script format.

        Expected JSON format:

        [
            {"speaker": "1", "text": "Hello everyone..."},
            {"speaker": "2", "text": "Great to be here..."}
        ]
        """
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if not isinstance(data, list):
            raise ValueError("JSON file must contain a list of speaker entries")

        script_lines = []
        for item in data:
            if not isinstance(item, dict):
                logger.warning(f"Skipping non-dict entry: {item}")
                continue

            speaker = item.get('speaker')
            text = item.get('text')

            if speaker is None or text is None:
                logger.warning(f"Skipping entry missing speaker or text: {item}")
                continue

            # Speaker IDs must be integers (possibly given as strings).
            try:
                speaker_id = int(speaker)
            except (ValueError, TypeError):
                logger.warning(f"Invalid speaker ID: {speaker}, skipping entry")
                continue

            text = text.strip()
            if text:
                script_lines.append(f"Speaker {speaker_id}: {text}")

        if not script_lines:
            raise ValueError("No valid entries found in JSON file")

        return "\n".join(script_lines)
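
    # Example: the JSON shown in the docstring above becomes
    #
    #     Speaker 1: Hello everyone...
    #     Speaker 2: Great to be here...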

    def _convert_text_to_script(self, text_file: str) -> str:
        """
        Convert a text file to script format.

        Handles multiple formats:

        1. Lines already formatted as "Speaker X: text"
        2. Plain text (assigned to Speaker 1)

        Handles edge cases like multiple colons in a line.
        """
        with open(text_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        script_lines = []
        current_speaker = 1

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Only the leading "Speaker N:" prefix is interpreted; any further
            # colons stay part of the text.
            speaker_match = re.match(r'^Speaker\s+(\d+)\s*:\s*(.*)$', line, re.IGNORECASE)

            if speaker_match:
                speaker_id = int(speaker_match.group(1))
                text = speaker_match.group(2).strip()
                if text:
                    script_lines.append(f"Speaker {speaker_id}: {text}")
            else:
                # Unlabelled lines are assigned to the current default speaker.
                script_lines.append(f"Speaker {current_speaker}: {line}")

        if not script_lines:
            raise ValueError("No valid content found in text file")

        return "\n".join(script_lines)

    def _parse_script(self, script: str) -> List[Tuple[int, str]]:
        """Parse a script into a list of (speaker_id, text) tuples."""
        lines = script.strip().split("\n")
        parsed_lines = []
        speaker_ids = []

        for line in lines:
            if not line.strip():
                continue

            match = re.match(r'^Speaker\s+(\d+)\s*:\s*(.*)$', line.strip(), re.IGNORECASE)

            if match:
                speaker_id = int(match.group(1))
                # Prepend a space so the text tokenizes consistently after "Speaker N:".
                text = ' ' + match.group(2).strip()
                parsed_lines.append((speaker_id, text))
                speaker_ids.append(speaker_id)
            else:
                logger.warning(f"Could not parse line: '{line}'")

        if not parsed_lines:
            raise ValueError("No valid speaker lines found in script")

        # Scripts numbered from 1 (or higher) are shifted down by one so speaker
        # IDs start at 0; scripts already containing Speaker 0 are left unchanged.
        min_speaker_id = min(speaker_ids)
        if min_speaker_id > 0:
            normalized_lines = []
            for speaker_id, text in parsed_lines:
                normalized_lines.append((speaker_id - 1, text))
            return normalized_lines
        else:
            return parsed_lines
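
    # Example: "Speaker 1: Hi\nSpeaker 2: Hey" parses to [(1, ' Hi'), (2, ' Hey')]
    # and is then normalized to [(0, ' Hi'), (1, ' Hey')].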

    def _merge_inputs(self, text_inputs: BatchEncoding, audio_inputs: Dict) -> BatchEncoding:
        """Merge text and audio inputs into a single BatchEncoding."""
        merged = BatchEncoding(text_inputs)

        if "audio" in audio_inputs:
            merged["speech_inputs"] = audio_inputs["audio"]
        if "streaming" in audio_inputs:
            merged["streaming"] = audio_inputs["streaming"]

        return merged

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to VibeVoiceTextTokenizer's [`~PreTrainedTokenizer.batch_decode`].
        Please refer to the docstring of that method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to VibeVoiceTextTokenizer's [`~PreTrainedTokenizer.decode`].
        Please refer to the docstring of that method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """
        Return the list of input names accepted by the model, with duplicates removed.
        """
        tokenizer_input_names = self.tokenizer.model_input_names
        audio_processor_input_names = self.audio_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + audio_processor_input_names + ["speech_inputs", "speech_input_mask"]))

    def save_audio(self,
                   audio: Union[torch.Tensor, np.ndarray, List[Union[torch.Tensor, np.ndarray]]],
                   output_path: str = "output.wav",
                   sampling_rate: Optional[int] = None,
                   normalize: bool = False,
                   batch_prefix: str = "audio_",
                   ) -> str:
        """
        Save audio data to a file.

        Args:
            audio (`Union[torch.Tensor, np.ndarray, List[Union[torch.Tensor, np.ndarray]]]`):
                The audio data to save. Can be a single tensor/array or a list of them.
            output_path (`str`, *optional*): Path to save the audio file. Defaults to "output.wav".
            sampling_rate (`int`, *optional*): Sampling rate for the audio. If None, uses the processor's default.
            normalize (`bool`, *optional*): Whether to normalize the audio before saving. Defaults to False.
            batch_prefix (`str`, *optional*): Prefix for batch audio files. Defaults to "audio_".

        Returns:
            `str`: The path to the saved audio file.
        """
        return self.audio_processor.save_audio(audio, output_path=output_path, sampling_rate=sampling_rate, normalize=normalize, batch_prefix=batch_prefix)


__all__ = [
    "VibeVoiceProcessor",
]