# VITA-Audio/tools/inference_sts.py
import json
import logging
import os
import random
import re
import sys
import time
import uuid
from threading import Thread
from typing import Optional
import torch
import tqdm
from torch import nn
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from transformers.generation import GenerationConfig
import torchaudio
from vita_audio.data.processor.audio_processor import add_audio_input_contiguous
from vita_audio.tokenizer import get_audio_tokenizer
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
torch.manual_seed(1234)
device_map = "cuda:0"
audio_tokenizer_rank = 0
torch_dtype = torch.bfloat16
# model_name_or_path = sys.argv[1]
# audio_tokenizer_path = sys.argv[2]
# flow_path = sys.argv[3]
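# Manual configuration toggles: exactly one of the blocks below should be active
# ("if True:" vs. "if False:"); the tokenizer, flow, and model paths are local
# placeholders and need to be adjusted for your environment.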
if True:
# if False:
# sensevoice glm4voice tokenizer
sys.path.append("third_party/GLM-4-Voice/")
sys.path.append("third_party/GLM-4-Voice/cosyvoice/")
sys.path.append("third_party/GLM-4-Voice/third_party/Matcha-TTS/")
audio_tokenizer_path = "/data/models/THUDM/glm-4-voice-tokenizer"
flow_path = "/data/models/THUDM/glm-4-voice-decoder"
audio_tokenizer_type = "sensevoice_glm4voice"
model_name_or_path = "VITA-MLLM/VITA-Audio-Plus-Vanilla/"
# if True:
if False:
# glm4voice tokenizer
sys.path.append("third_party/GLM-4-Voice/")
sys.path.append("third_party/GLM-4-Voice/cosyvoice/")
sys.path.append("third_party/GLM-4-Voice/third_party/Matcha-TTS/")
audio_tokenizer_path = "/data/models/THUDM/glm-4-voice-tokenizer"
flow_path = "/data/models/THUDM/glm-4-voice-decoder"
audio_tokenizer_type = "glm4voice"
# model_name_or_path = "VITA-MLLM/VITA-Audio-Balance"
model_name_or_path = "VITA-MLLM/VITA-Audio-Boost"
output_dir = "/data/output/LM/inference/"
os.makedirs(output_dir, exist_ok=True)
class TextAudioIteratorStreamer(TextIteratorStreamer):
def __init__(
self,
tokenizer: "AutoTokenizer",
skip_prompt: bool = False,
timeout: Optional[float] = None,
**decode_kwargs,
):
super().__init__(tokenizer, skip_prompt, timeout, **decode_kwargs)
# self.audio_offset = tokenizer.convert_tokens_to_ids("<|audio_0|>")
self.audio_offset = tokenizer.convert_tokens_to_ids("<|begin_of_audio|>")
self.num_decode_tokens = 0
def put(self, value):
"""
Receives tokens, decodes them, and prints them to stdout as soon as they form entire words.
"""
if len(value.shape) > 1 and value.shape[0] > 1:
raise ValueError("TextStreamer only supports batch size 1")
elif len(value.shape) > 1:
value = value[0]
if self.skip_prompt and self.next_tokens_are_prompt:
self.next_tokens_are_prompt = False
return
self.num_decode_tokens += len(value)
        # Add the new token to the cache and decode the entire thing.
self.token_cache.extend(value.tolist())
text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)
# After the symbol for a new line, we flush the cache.
if text.endswith("\n"):
printable_text = text[self.print_len :]
self.token_cache = []
self.print_len = 0
# If the last token is a CJK character, we print the characters.
elif len(text) > 0 and self._is_chinese_char(ord(text[-1])):
printable_text = text[self.print_len :]
self.print_len += len(printable_text)
        # Audio tokens (ids at or above the <|begin_of_audio|> offset) are flushed
        # immediately so consumers can detect complete audio segments token by token.
        elif self.token_cache[-1] >= self.audio_offset:
printable_text = text[self.print_len :]
self.print_len += len(printable_text)
# Otherwise, prints until the last space char (simple heuristic to avoid printing incomplete words,
# which may change with the subsequent token -- there are probably smarter ways to do this!)
else:
printable_text = text[self.print_len : text.rfind(" ") + 1]
self.print_len += len(printable_text)
self.on_finalized_text(printable_text)
while self.text_queue.qsize() > 10:
time.sleep(0.01)
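# Typical consumption pattern (used by run_infer_stream below): generation runs in a
# background thread while the caller iterates the streamer; the qsize() check above
# throttles the producer so the queue does not grow unboundedly. A minimal sketch,
# assuming a loaded `model`, `tokenizer`, and tokenized `input_ids`:
#
#     streamer = TextAudioIteratorStreamer(tokenizer, skip_prompt=True)
#     Thread(target=model.generate, kwargs=dict(input_ids=input_ids, streamer=streamer)).start()
#     for new_text in streamer:
#         ...  # text chunks and <|audio_k|> tokens arrive incrementally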
class BenchmarkIteratorStreamer(TextIteratorStreamer):
def __init__(
self,
tokenizer: "AutoTokenizer",
skip_prompt: bool = False,
timeout: Optional[float] = None,
**decode_kwargs,
):
super().__init__(tokenizer, skip_prompt, timeout, **decode_kwargs)
self.num_decode_tokens = 0
def put(self, value):
"""
Receives tokens, decodes them, and prints them to stdout as soon as they form entire words.
"""
if len(value.shape) > 1 and value.shape[0] > 1:
raise ValueError("TextStreamer only supports batch size 1")
elif len(value.shape) > 1:
value = value[0]
if self.skip_prompt and self.next_tokens_are_prompt:
self.next_tokens_are_prompt = False
return
self.num_decode_tokens += len(value)
printable_text = " ".join([str(x) for x in value.tolist()]) + " "
self.on_finalized_text(printable_text)
def find_audio_segments_regex(text):
"""
Find all substrings between <|begin_of_audio|> and <|end_of_audio|> using regex.
Args:
text (str): The input string to search through
Returns:
list: A list of all found audio segments (substrings between the delimiters)
"""
pattern = re.compile(r"<\|begin_of_audio\|>(.*?)<\|end_of_audio\|>", re.DOTALL)
segments = pattern.findall(text)
return [segment.strip() for segment in segments]
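# Illustrative example (hypothetical input):
#   find_audio_segments_regex("Hi <|begin_of_audio|><|audio_1|><|audio_2|><|end_of_audio|> bye")
#   returns ["<|audio_1|><|audio_2|>"]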
def extract_token_ids_as_int(text):
pattern = re.compile(r"<\|audio_(\d+)\|>")
token_ids = pattern.findall(text)
return [int(id) for id in token_ids]
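# Illustrative example (hypothetical input):
#   extract_token_ids_as_int("<|audio_12|><|audio_7|>") returns [12, 7]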
def custom_init_weights(module):
if isinstance(module, torch.nn.Conv2d) or isinstance(module, torch.nn.Linear):
torch.nn.init.xavier_uniform_(module.weight)
if module.bias is not None:
torch.nn.init.constant_(module.bias, 0)
elif isinstance(module, torch.nn.BatchNorm2d) or isinstance(module, torch.nn.BatchNorm1d):
torch.nn.init.constant_(module.weight, 1)
torch.nn.init.constant_(module.bias, 0)
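# Note: custom_init_weights is applied in benchmark_generate / benchmark_generate_stream to
# re-randomize the weights before timing, presumably so generation does not stop early at an
# EOS token; those benchmarks only measure decoding speed, so output quality is irrelevant.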
class S2SInference:
def __init__(
self, model_name_or_path, audio_tokenizer_path, audio_tokenizer_type, flow_path=None
):
config = AutoConfig.from_pretrained(
model_name_or_path,
trust_remote_code=True,
)
if "qwen2" in config.model_type.lower():
from evaluation.get_chat_template import qwen2_chat_template as chat_template
add_generation_prompt = True
default_system_message = []
if "hunyuan" in config.model_type.lower():
from evaluation.get_chat_template import hunyuan_chat_template as chat_template
add_generation_prompt = False
default_system_message = [
{
"role": "system",
"content": "You are a helpful AI assistant.",
}
]
luke_system_message = [
{
"role": "system",
"content": "Your Name: Luke\nYour Gender: male\n\nRespond in a text-audio interleaved manner.",
},
]
tokenizer = AutoTokenizer.from_pretrained(
model_name_or_path,
trust_remote_code=True,
chat_template=chat_template,
)
# print(f"{tokenizer=}")
print(f"{tokenizer.get_chat_template()=}")
model = AutoModelForCausalLM.from_pretrained(
model_name_or_path,
trust_remote_code=True,
device_map=device_map,
torch_dtype=torch_dtype,
attn_implementation="flash_attention_2",
).eval()
# print("model", model)
print(f"{model.config.model_type=}")
print(f"{model.hf_device_map=}")
model.generation_config = GenerationConfig.from_pretrained(
model_name_or_path, trust_remote_code=True
)
model.generation_config.max_new_tokens = 8192
model.generation_config.chat_format = "chatml"
model.generation_config.max_window_size = 8192
model.generation_config.use_cache = True
# model.generation_config.use_cache = False
model.generation_config.do_sample = False
model.generation_config.temperature = 1.0
model.generation_config.top_k = 50
model.generation_config.top_p = 1.0
model.generation_config.num_beams = 1
model.generation_config.pad_token_id = tokenizer.pad_token_id
if model.config.model_type == "hunyuan":
model.generation_config.eos_token_id = tokenizer.eos_id
print(f"{model.generation_config=}")
audio_tokenizer = get_audio_tokenizer(
audio_tokenizer_path,
audio_tokenizer_type,
flow_path=flow_path,
rank=audio_tokenizer_rank,
)
self.model = model
self.tokenizer = tokenizer
self.audio_tokenizer = audio_tokenizer
self.add_generation_prompt = add_generation_prompt
self.default_system_message = default_system_message
self.luke_system_message = luke_system_message
audio_0_id = tokenizer("<|audio_0|>").input_ids[0]
print(f"{audio_0_id=}")
def benchmark_forward(self, mtp_inference_mode):
print("-" * 100)
print("benchmark_forward...")
print(f"{mtp_inference_mode=}")
total_time = 0
past_key_values = None
use_cache = True
self.model.input_ids = None
self.model.inputs_embeds = None
self.model.hidden_states = [None] * (self.model.config.num_nextn_predict_layers + 1)
self.model.position_ids = None
self.model.attention_mask = None
self.model.mtp_idx = -1
self.model.num_prefill_tokens = -1
model_max_length = 1024
if mtp_inference_mode is not None:
ori_mtp_inference_mode = self.model.generation_config.mtp_inference_mode
self.model._prepare_mtp_for_generation(mtp_inference_mode, model_max_length)
else:
self.model._prepare_mtp_for_generation(
self.model.generation_config.mtp_inference_mode, model_max_length
)
for i in tqdm.tqdm(range(1, model_max_length + 1)):
if use_cache:
input_ids = torch.tensor([i - 1], dtype=torch.long).unsqueeze(0).to("cuda")
position_ids = torch.tensor([i - 1], dtype=torch.long).unsqueeze(0).to("cuda")
else:
input_ids = torch.arange(i, dtype=torch.long).unsqueeze(0).to("cuda")
position_ids = torch.arange(i, dtype=torch.long).unsqueeze(0).to("cuda")
attention_mask = torch.tensor([1] * i, dtype=torch.float).unsqueeze(0).to("cuda")
torch.cuda.synchronize()
start = time.time()
output = self.model(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
use_cache=use_cache,
num_logits_to_keep=1,
)
torch.cuda.synchronize()
end = time.time()
total_time += end - start
# print(f"{i=} {total_time=}")
past_key_values = output.past_key_values
print()
print(f"{total_time=}")
print(f"second/token {total_time/model_max_length=}")
print(f"token/second {model_max_length/total_time=}")
if mtp_inference_mode is not None:
self.model.mtp_inference_mode = ori_mtp_inference_mode
def benchmark_generate(self, mtp_inference_mode):
self.model.apply(custom_init_weights)
print("-" * 100)
print("benchmark_generate...")
print(f"{mtp_inference_mode=}")
total_time = 0
self.model.generation_config.use_cache = True
self.model.generation_config.max_new_tokens = 8192
if mtp_inference_mode is not None:
ori_mtp_inference_mode = self.model.generation_config.mtp_inference_mode
self.model.generation_config.mtp_inference_mode = mtp_inference_mode
input_ids = torch.tensor([0], dtype=torch.long).unsqueeze(0).to("cuda")
torch.cuda.synchronize()
start = time.time()
output = self.model.generate(
input_ids,
)
# print(f"{output.size()=}")
torch.cuda.synchronize()
end = time.time()
total_time += end - start
print()
print(f"{total_time=}")
print(f"second/token {total_time/output.size(1)=}")
print(f"token/second {output.size(1)/total_time=}")
if mtp_inference_mode is not None:
self.model.generation_config.mtp_inference_mode = ori_mtp_inference_mode
def benchmark_generate_stream(self, mtp_inference_mode):
print("-" * 100)
print("benchmark_generate_stream...")
print(f"{mtp_inference_mode=}")
self.model.apply(custom_init_weights)
total_time = 0
self.model.generation_config.use_cache = True
# model_max_length = 8192
model_max_length = 4096
# model_max_length = 2048
# model_max_length = 1024
num_prefill_tokens = 32
self.model.generation_config.max_new_tokens = model_max_length
self.model.generation_config.do_sample = False
if mtp_inference_mode is not None:
ori_mtp_inference_mode = self.model.generation_config.mtp_inference_mode
self.model.generation_config.mtp_inference_mode = mtp_inference_mode
input_ids = torch.tensor([0] * num_prefill_tokens, dtype=torch.long).unsqueeze(0).to("cuda")
streamer = BenchmarkIteratorStreamer(self.tokenizer, skip_prompt=True)
generation_kwargs = dict(input_ids=input_ids, streamer=streamer)
thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
token_decode_time = []
torch.cuda.synchronize()
start = time.time()
thread.start()
generated_text = ""
for new_text in tqdm.tqdm(streamer, total=model_max_length):
generated_text += new_text
end = time.time()
token_decode_time.append(end - start)
yield new_text
# print(f"{len(generated_text)}")
torch.cuda.synchronize()
end = time.time()
total_time += end - start
print()
print(f"{token_decode_time[-1]=}")
print(f"{streamer.num_decode_tokens=}")
print(f"second/token {token_decode_time[-1]/streamer.num_decode_tokens=}")
print(f"token/second {streamer.num_decode_tokens/token_decode_time[-1]=}")
# if mtp_inference_mode is None:
# mtp_inference_mode = []
# with open(f'token_decode_time_{str(mtp_inference_mode)}.json', 'w') as f:
# json.dump(token_decode_time, f)
if mtp_inference_mode is not None:
self.model.generation_config.mtp_inference_mode = ori_mtp_inference_mode
def run_infer(
self,
audio_path=None,
prompt_audio_path=None,
stream_stride=4,
max_returned_tokens=4096,
sample_rate=16000,
request_id="",
audio_feats=None,
message="",
use_past=False,
mode="luke",
do_sample=False,
mtp_inference_mode=None,
):
AUD_TAG_TOKEN = "<|audio|>"
AUD_CONTEXT_TOKEN = "<|context_of_audio|>"
AUD_START_TOKEN = "<|begin_of_audio|>"
AUD_END_TOKEN = "<|end_of_audio|>"
if prompt_audio_path is not None:
system_message = [
{
"role": "system",
"content": f"Your Voice: <|audio|>\n",
},
]
elif mode == "luke":
system_message = self.luke_system_message
else:
system_message = self.default_system_message
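        # Prompt/user audio is injected in one of two ways, depending on what the audio
        # tokenizer supports for the "user" role: as discrete codec tokens spliced into the
        # chat text (<|begin_of_audio|><|audio_k|>...<|end_of_audio|>), or as contiguous
        # features attached after tokenization via add_audio_input_contiguous.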
if prompt_audio_path is not None and self.audio_tokenizer.apply_to_role("user", is_discrete=True):
# discrete codec
audio_tokens = self.audio_tokenizer.encode(prompt_audio_path)
audio_tokens = "".join(f"<|audio_{i}|>" for i in audio_tokens)
system_message[-1]["content"] = system_message[-1]["content"].replace(
"<|audio|>", f"<|begin_of_audio|>{audio_tokens}<|end_of_audio|>"
)
if audio_path is not None:
messages = system_message + [
{
"role": "user",
"content": message + "\n<|audio|>",
},
]
else:
messages = system_message + [
{
"role": "user",
"content": message,
},
]
if audio_path is not None and self.audio_tokenizer.apply_to_role("user", is_discrete=True):
# discrete codec
audio_tokens = self.audio_tokenizer.encode(audio_path)
audio_tokens = "".join(f"<|audio_{i}|>" for i in audio_tokens)
messages[-1]["content"] = messages[-1]["content"].replace(
"<|audio|>", f"<|begin_of_audio|>{audio_tokens}<|end_of_audio|>"
)
input_ids = self.tokenizer.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=self.add_generation_prompt,
)
if (audio_path is not None or prompt_audio_path is not None) and self.audio_tokenizer.apply_to_role(
"user", is_contiguous=True
):
# contiguous codec
audio_paths = []
if audio_path is not None:
audio_paths.append(audio_path)
if prompt_audio_path is not None:
audio_paths.append(prompt_audio_path)
input_ids, audios, audio_indices = add_audio_input_contiguous(
input_ids, audio_paths, self.tokenizer, self.audio_tokenizer
)
else:
audios = None
audio_indices = None
input_ids = torch.tensor([input_ids], dtype=torch.long).to("cuda")
print("input", self.tokenizer.decode(input_ids[0], skip_special_tokens=False), flush=True)
self.model.generation_config.do_sample = do_sample
if mtp_inference_mode is not None:
ori_mtp_inference_mode = self.model.generation_config.mtp_inference_mode
self.model.generation_config.mtp_inference_mode = mtp_inference_mode
outputs = self.model.generate(
input_ids,
audios=audios,
audio_indices=audio_indices,
)
output = self.tokenizer.decode(outputs[0], skip_special_tokens=False)
print(f"{output=}", flush=True)
audio_offset = self.tokenizer.convert_tokens_to_ids("<|audio_0|>")
audio_tokens = []
for token_id in outputs[0]:
if token_id >= audio_offset:
audio_tokens.append(token_id - audio_offset)
if len(audio_tokens) > 0:
tts_speech = self.audio_tokenizer.decode(
audio_tokens, source_speech_16k=prompt_audio_path
)
else:
tts_speech = None
if mtp_inference_mode is not None:
self.model.generation_config.mtp_inference_mode = ori_mtp_inference_mode
return output, tts_speech
def run_infer_stream(
self,
audio_path=None,
prompt_audio_path=None,
stream_stride=4,
max_returned_tokens=4096,
sample_rate=16000,
request_id="",
audio_feats=None,
message="",
use_past=False,
mode="luke",
do_sample=False,
mtp_inference_mode=None,
):
if prompt_audio_path is not None:
system_message = [
{
"role": "system",
"content": f"Your Voice: <|audio|>\n",
},
]
elif mode == "luke":
system_message = self.luke_system_message
else:
system_message = self.default_system_message
if prompt_audio_path is not None and self.audio_tokenizer.apply_to_role("user", is_discrete=True):
# discrete codec
audio_tokens = self.audio_tokenizer.encode(prompt_audio_path)
audio_tokens = "".join(f"<|audio_{i}|>" for i in audio_tokens)
system_message[-1]["content"] = system_message[-1]["content"].replace(
"<|audio|>", f"<|begin_of_audio|>{audio_tokens}<|end_of_audio|>"
)
if audio_path is not None:
messages = system_message + [
{
"role": "user",
"content": message + "\n<|audio|>",
},
]
else:
messages = system_message + [
{
"role": "user",
"content": message,
},
]
if audio_path is not None and self.audio_tokenizer.apply_to_role("user", is_discrete=True):
# discrete codec
audio_tokens = self.audio_tokenizer.encode(audio_path)
audio_tokens = "".join(f"<|audio_{i}|>" for i in audio_tokens)
messages[-1]["content"] = messages[-1]["content"].replace(
"<|audio|>", f"<|begin_of_audio|>{audio_tokens}<|end_of_audio|>"
)
input_ids = self.tokenizer.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=self.add_generation_prompt,
)
if (audio_path is not None or prompt_audio_path is not None) and self.audio_tokenizer.apply_to_role(
"user", is_contiguous=True
):
# contiguous codec
audio_paths = []
if audio_path is not None:
audio_paths.append(audio_path)
if prompt_audio_path is not None:
audio_paths.append(prompt_audio_path)
input_ids, audios, audio_indices = add_audio_input_contiguous(
input_ids, audio_paths, self.tokenizer, self.audio_tokenizer
)
else:
audios = None
audio_indices = None
input_ids = torch.tensor([input_ids], dtype=torch.long).to("cuda")
print("input", self.tokenizer.decode(input_ids[0], skip_special_tokens=False), flush=True)
self.model.generation_config.do_sample = do_sample
if mtp_inference_mode is not None:
ori_mtp_inference_mode = self.model.generation_config.mtp_inference_mode
self.model.generation_config.mtp_inference_mode = mtp_inference_mode
streamer = TextAudioIteratorStreamer(self.tokenizer, skip_prompt=True)
generation_kwargs = dict(
input_ids=input_ids,
audios=audios,
audio_indices=audio_indices,
streamer=streamer,
)
thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
thread.start()
# generated_text = ""
for new_text in streamer:
# generated_text += new_text
yield new_text
# torch.cuda.synchronize()
if mtp_inference_mode is not None:
self.model.generation_config.mtp_inference_mode = ori_mtp_inference_mode
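# The mtp_inference_mode lists below correspond, via the paired tags, to the VITA-Audio
# variants (Vanilla / Balance / Boost / Turbo); their exact encoding is defined by the
# model's generation_config / modeling code rather than documented here.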
def benchmark_llm():
for mtp_inference_mode, tag in zip(
[
[8192, 0],
[1, 4, 3, 8, 4, 10],
[1, 10, 4, 10],
[1, 10],
],
[
"Vanilla",
"Balance",
"Boost",
"Turbo",
],
):
print("=" * 100)
print("benchmark_llm")
print(f"{tag}")
s2s_inference.benchmark_forward(mtp_inference_mode)
s2s_inference.benchmark_generate(mtp_inference_mode)
generated_text = ""
for new_text in s2s_inference.benchmark_generate_stream(
mtp_inference_mode=mtp_inference_mode
):
generated_text += new_text
# print(new_text, end="", flush=True)
def benchmark_sts():
audio_paths = [
"asset/介绍一下上海.wav",
"asset/发表一个悲伤的演讲.wav",
"asset/发表一个振奋人心的演讲.wav",
]
for _ in range(10):
print("=" * 100)
print("benchmark_sts")
audio_path = random.choice(audio_paths)
print(f"{audio_path}")
start = time.time()
audio_idx = 0
generated_text = ""
all_tts_speech = []
past_tts_speech_len = 0
for new_text in s2s_inference.run_infer_stream(audio_path=audio_path):
# print(new_text, end="", flush=True)
generated_text += new_text
if new_text == "<|end_of_audio|>":
audio_tokens = extract_token_ids_as_int(generated_text)
tts_speech = s2s_inference.audio_tokenizer.decode(audio_tokens, option_steps=1)
tts_speech = tts_speech[past_tts_speech_len:]
past_tts_speech_len += len(tts_speech)
all_tts_speech.append(tts_speech)
end = time.time()
if audio_idx == 0:
print(audio_tokens)
print(f"{audio_idx} audio chunk {end - start}")
wav_path = os.path.join(output_dir, audio_path[:-4] + f"_{audio_idx}.wav")
os.makedirs(os.path.dirname(wav_path), exist_ok=True)
torchaudio.save(wav_path, tts_speech.unsqueeze(0), 22050, format="wav")
audio_idx += 1
start = time.time()
wav_path = os.path.join(output_dir, audio_path[:-4] + ".wav")
tts_speech = torch.cat(all_tts_speech, dim=0)
os.makedirs(os.path.dirname(wav_path), exist_ok=True)
torchaudio.save(wav_path, tts_speech.unsqueeze(0), 22050, format="wav")
# ==============================================================
# Text
def text_task():
for text in [
"How many helicopters can a human eat in one sitting?",
"你叫什么名字?",
"写一首诗",
"介绍一下上海",
]:
print("=" * 100)
print("text_task")
print(f"{text=}")
output, _ = s2s_inference.run_infer(
message=text,
mode=None,
# do_sample=True,
mtp_inference_mode=[8192, 0],
)
print(f"{output=}", flush=True)
# ==============================================================
# Text stream
def text_stream_task():
for text in [
"你叫什么名字?",
]:
print("=" * 100)
print("text_stream_task")
print(f"{text=}")
generated_text = ""
for new_text in s2s_inference.run_infer_stream(
message=text,
mode=None,
# do_sample=True,
mtp_inference_mode=[8192, 0],
):
generated_text += new_text
print(new_text, end="")
print("")
# ==============================================================
# S2S
def sts_task():
for audio_path in [
"asset/介绍一下上海.wav",
"asset/发表一个悲伤的演讲.wav",
"asset/发表一个振奋人心的演讲.wav",
"asset/piano.mp3",
]:
print("=" * 100)
print("sts_task")
print(f"{audio_path=}")
output, tts_speech = s2s_inference.run_infer(
audio_path=audio_path,
)
wav_path = os.path.join(output_dir, audio_path[:-4] + ".wav")
os.makedirs(os.path.dirname(wav_path), exist_ok=True)
torchaudio.save(wav_path, tts_speech.unsqueeze(0), 22050, format="wav")
# ==============================================================
# S2S stream
def sts_stream_task():
for audio_path in [
"asset/介绍一下上海.wav",
]:
print("=" * 100)
print("sts_stream_task")
print(f"{audio_path=}")
generated_text = ""
for new_text in s2s_inference.run_infer_stream(audio_path=audio_path):
generated_text += new_text
print(new_text, end="")
print("")
audio_decode_time = []
audio_segments = find_audio_segments_regex(generated_text)
for audio_idx, audio_segment in enumerate(audio_segments):
start = time.time()
audio_tokens = extract_token_ids_as_int(audio_segment)
# print(audio_tokens)
tts_speech = s2s_inference.audio_tokenizer.decode(audio_tokens)
end = time.time()
audio_decode_time.append(end - start)
wav_path = os.path.join(output_dir, audio_path[:-4] + f"_{audio_idx}.wav")
os.makedirs(os.path.dirname(wav_path), exist_ok=True)
torchaudio.save(wav_path, tts_speech.unsqueeze(0), 22050, format="wav")
# print(f"{audio_decode_time=}")
# ==============================================================
# ASR
def asr_task():
for audio_path in [
"/data/data/wenet-e2e/wenetspeech/data/cuts_TEST_NET.00000000/TES/TEST_NET_Y0000000020_5XD21BihDd8_S00395.wav",
"/data/data/wenet-e2e/wenetspeech/data/cuts_TEST_NET.00000000/TES/TEST_NET_Y0000000000_-KTKHdZ2fb8_S00424.wav",
"/data/data/wenet-e2e/wenetspeech/data/cuts_TEST_NET.00000000/TES/TEST_NET_Y0000000050_LOLTeK1BNMo_S00045.wav",
"/data/data/fixie-ai/librispeech_asr/test.clean/2830-3980-0034.wav",
"/data/data/fixie-ai/librispeech_asr/test.clean/237-134500-0040.wav",
]:
print("=" * 100)
print("asr_task")
print(f"{audio_path=}")
output, tts_speech = s2s_inference.run_infer(
audio_path=audio_path,
# message="Translate the speech to text.",
message="Convert the speech to text.",
mode=None,
)
print(f"{output=}", flush=True)
# ==============================================================
# TTS
def tts_task():
TTS_texts = [
"我们将为全球城市的可持续发展贡献力量。",
"通天河 灵感大王",
"他本是我莲花池里养大的金鱼,每日浮头听经,修成手段。那一柄九瓣铜锤,乃是一枝未开的菡萏,被他运炼成兵。不知是那一日,海潮泛涨,走到此间。我今早扶栏看花,却不见这厮出拜,掐指巡纹,算着他在此成精,害你师父,故此未及梳妆,运神功,织个竹篮儿擒他。",
"一二三四五六七八九十",
"One Two Tree Four Five Six Seven Eight Night Ten",
"1 2 3 4 5 6 7 8 9 10",
"12345678910",
"两个黄鹂鸣翠柳,一行白鹭上青天。窗含西岭千秋雪,门泊东吴万里船。",
"坡上立着一只鹅,坡下就是一条河。宽宽的河,肥肥的鹅,鹅要过河,河要渡鹅不知是鹅过河,还是河渡鹅?",
"扁担长,板凳宽,扁担没有板凳宽,板凳没有扁担长。扁担绑在板凳上,板凳不让扁担绑在板凳上。",
"化肥会挥发,黑化肥发灰,灰化肥发黑。黑化肥发灰会挥发;灰化肥挥发会发黑。黑化肥挥发发灰会花飞;灰化肥挥发发黑会飞花,黑灰化肥会挥发发灰黑讳为花飞;灰黑化肥会挥发发黑灰为讳飞花。",
"圆桌儿、方桌儿没有腿儿,墨水瓶儿里没有水儿,花瓶里有花儿没有叶儿,练习本儿上写字儿没有准儿,甘蔗好吃净是节儿。西瓜挺大没有味儿,坛儿里的小米儿长了虫儿,鸡毛掸子成了棍儿,水缸沿儿上系围裙儿,耗子打更猫打盹儿,新买的小褂儿没钉扣儿,奶奶想说没有劲儿。",
"起床歌:小宝宝,起得早,睁开眼,眯眯笑,咿呀呀,学说话,伸伸手,要人抱。穿衣歌小胳膊,穿袖子,穿上衣,扣扣子,小脚丫,穿裤子,穿上袜子穿鞋子。小镜子-小镜子,圆又圆,看宝宝,露笑脸。闭上眼,做个梦,变月亮,挂上天。小铃铛叮铃铃,叮铃铃,一会远,一会近。小宝宝,耳朵灵,听铃声,找到铃。学画画小宝宝,学画画,大蜡笔,手中拿,画小鸭,叫嘎嘎,画小马,骑回家。大鞋子大鞋子,像只船,爸爸穿,我也穿,一二一,向前走,走呀走,翻了船。逛公园逛公园,宝宝笑,东看看,西瞧瞧,花儿香,鸟儿叫,小草绿,小树摇。看画报小娃娃,看画报,睁大眼,仔细瞧,布娃娃,哈哈笑,伸伸手,要你抱。搭积木大积木,红黄兰,小宝宝,最爱玩,搭火车,钻山洞,盖高楼,连着天。小汽车小汽车,嘀嘀嘀,开过来,开过去,小宝宝,当司机,送妈妈,上班去。藏猫猫儿歌:躲猫猫,躲猫猫, 猫猫、猫猫在哪里?喵……猫咪在这里。",
]
for text in TTS_texts:
print("=" * 100)
print("tts_task")
print(f"{text=}")
output, tts_speech = s2s_inference.run_infer(
message="Convert the text to speech.\n" + text,
mode=None,
do_sample=True,
)
wav_path = os.path.join(output_dir, text[:16] + ".wav")
os.makedirs(os.path.dirname(wav_path), exist_ok=True)
torchaudio.save(wav_path, tts_speech.unsqueeze(0), 22050, format="wav")
# ==============================================================
# Clone TTS
for text in TTS_texts:
for prompt_audio_path in [
"asset/2631296891109983590.wav",
"asset/379838640-d5ff0815-74f8-4738-b0f1-477cfc8dcc2d.wav",
"asset/4202818730519913143.wav",
]:
print("=" * 100)
print("tts_task")
print(f"{text=} {prompt_audio_path=}")
output, tts_speech = s2s_inference.run_infer(
prompt_audio_path=prompt_audio_path,
# message="Translate the text to speech.\n" + text,
message="Convert the text to speech.\n" + text,
mode=None,
do_sample=True,
)
wav_path = os.path.join(output_dir, prompt_audio_path[:16] + "_" + text[:16] + ".wav")
os.makedirs(os.path.dirname(wav_path), exist_ok=True)
torchaudio.save(wav_path, tts_speech.unsqueeze(0), 22050, format="wav")
# ==============================================================
# TTS stream
def tts_stream_task():
TTS_texts = [
"他本是我莲花池里养大的金鱼,每日浮头听经,修成手段。那一柄九瓣铜锤,乃是一枝未开的菡萏,被他运炼成兵。不知是那一日,海潮泛涨,走到此间。我今早扶栏看花,却不见这厮出拜,掐指巡纹,算着他在此成精,害你师父,故此未及梳妆,运神功,织个竹篮儿擒他。",
]
for text in TTS_texts:
print("=" * 100)
print("tts_stream_task")
print(f"{text=}")
generated_text = ""
for new_text in s2s_inference.run_infer_stream(
message="Convert the text to speech.\n" + text,
mode=None,
do_sample=True,
):
generated_text += new_text
print(new_text, end="")
print("")
audio_segments = find_audio_segments_regex(generated_text)
for audio_idx, audio_segment in enumerate(audio_segments):
audio_tokens = extract_token_ids_as_int(audio_segment)
# print(audio_tokens)
tts_speech = s2s_inference.audio_tokenizer.decode(audio_tokens)
wav_path = os.path.join(output_dir, text[:16] + f"_{audio_idx}.wav")
os.makedirs(os.path.dirname(wav_path), exist_ok=True)
torchaudio.save(wav_path, tts_speech.unsqueeze(0), 22050, format="wav")
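# Entry point: build the speech-to-speech pipeline once, then run each task and benchmark
# in sequence.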
s2s_inference = S2SInference(
model_name_or_path, audio_tokenizer_path, audio_tokenizer_type, flow_path=flow_path
)
text_task()
text_stream_task()
sts_task()
sts_stream_task()
asr_task()
tts_task()
tts_stream_task()
benchmark_sts()
benchmark_llm()