Spaces:

y5shen
/

roboAssist_demo

Runtime error

App Files Files Community

roboAssist_demo / test_audio.py

y5shen

Upload folder using huggingface_hub

81463e4 verified about 2 months ago

raw

history blame

11.8 kB

	import gradio as gr
	import modelscope_studio as mgr
	import librosa
	from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
	from argparse import ArgumentParser
	import requests
	import os
	from django.http import HttpResponse

	# Default model checkpoint name or path; used when --checkpoint-path is not given.
	DEFAULT_CKPT_PATH = 'Qwen/Qwen2-Audio-7B-Instruct'

	def text_to_speech(text2, url='http://127.0.0.1:8000',
	                   output_path="/root/project/Qwen2-Audio/demo/output.mp3",
	                   timeout=120):
	    """Ask the TTS HTTP service to synthesize ``text2`` and save the audio to disk.

	    The text is POSTed as JSON together with ``"text_language": "zh"``; on an
	    HTTP 200 reply the raw response body is written to ``output_path``.

	    Args:
	        text2: Text to synthesize.
	        url: Endpoint of the TTS service.
	        output_path: File that receives the returned audio bytes. It is
	            overwritten on every successful call.
	        timeout: Seconds to wait for the service before giving up, so an
	            unresponsive TTS server cannot block the chat callback forever.

	    Returns:
	        ``output_path`` on success, or ``None`` when the service cannot be
	        reached, times out, or answers with a non-200 status.
	    """
	    payload = {
	        "text": text2,
	        "text_language": "zh",
	    }
	    try:
	        response = requests.post(url, json=payload, timeout=timeout)
	    except requests.RequestException as exc:
	        # Connection refused, timeout, etc.: speech is optional, so report
	        # the failure and let the caller continue with the text reply only.
	        print(f"Error: TTS request failed: {exc}")
	        return None

	    if response.status_code != 200:
	        print(f"错误：请求失败，状态码为 {response.status_code}")
	        return None

	    with open(output_path, "wb") as f:
	        f.write(response.content)
	    return output_path

	def _get_args():
	    """Define the demo's command-line options and parse them.

	    Returns:
	        argparse.Namespace: The parsed options (``checkpoint_path``,
	        ``cpu_only``, ``inbrowser``, ``server_port``, ``server_name``).
	    """
	    cli = ArgumentParser()

	    # Which model checkpoint to load.
	    cli.add_argument(
	        "-c", "--checkpoint-path",
	        type=str,
	        default=DEFAULT_CKPT_PATH,
	        help="Checkpoint name or path, default to %(default)r",
	    )
	    # Force CPU execution.
	    cli.add_argument(
	        "--cpu-only",
	        action="store_true",
	        help="Run demo with CPU only",
	    )
	    # Open the UI in the default browser after launch.
	    cli.add_argument(
	        "--inbrowser",
	        action="store_true",
	        default=False,
	        help="Automatically launch the interface in a new tab on the default browser.",
	    )
	    # Network binding of the web server.
	    cli.add_argument(
	        "--server-port",
	        type=int,
	        default=15110,
	        help="Demo server port.",
	    )
	    cli.add_argument(
	        "--server-name",
	        type=str,
	        default="0.0.0.0",
	        help="Demo server name.",
	    )

	    return cli.parse_args()

	def add_text(chatbot, task_history, input):
	    """Record a submitted user message in the model history and the chat display.

	    Args:
	        chatbot (list): Chat display entries; a ``[user_message, None]`` pair
	            is appended.
	        task_history (list): Conversation history passed to the model; a
	            ``{"role": "user", "content": [...]}`` entry is appended.
	        input: Submitted value exposing ``.text`` and ``.files`` (each file
	            exposing ``.path``).

	    Returns:
	        tuple: ``(chatbot, task_history, None)``; the trailing ``None`` is the
	        new value for the input component.
	    """
	    typed_text = input.text

	    # Uploaded/recorded audio goes first, followed by the typed text (if any).
	    parts = [{'type': 'audio', 'audio_url': upload.path} for upload in input.files]
	    if typed_text:
	        parts.append({'type': 'text', 'text': typed_text})
	    task_history.append({"role": "user", "content": parts})

	    # The display gets the raw text/files; the bot slot is still empty.
	    user_message = {
	        "text": input.text,
	        "files": input.files,
	    }
	    chatbot.append([user_message, None])
	    return chatbot, task_history, None
	# NOTE: The triple-quoted string below is an earlier version of add_file that
	# was disabled by wrapping it in a string literal. It is never executed; the
	# add_file defined after it is the one in use.
	'''
	def add_file(chatbot, task_history, audio_file_path):
	"""
	将音频文件添加到聊天记录中。

	参数:
	chatbot (gr.components.Chatbot): 聊天机器人组件。
	task_history (list): 任务历史记录。
	audio_file_path (str): 音频文件的路径。

	返回:
	tuple: 更新后的聊天机器人界面和任务历史记录。
	"""
	# 确保任务历史记录中的音频条目是正确的格式
	task_history.append({"role": "user", "content": [{"type": "audio", "audio_url": audio_file_path}]})

	# 更新聊天记录，直接使用 audio_file_path 而不是 gr.Audio 组件
	chatbot.append((None, {"type": "audio", "audio_url": audio_file_path}))

	return chatbot, task_history
	'''
	import os

	def add_file(chatbot, task_history, audio_path):
	    """Append an audio file on disk to the model history and the chat display.

	    Args:
	        chatbot (list): Chat display entries.
	        task_history (list): Conversation history passed to the model.
	        audio_path (str): Path of the audio file to add.

	    Returns:
	        tuple: ``(chatbot, task_history)``. Both are returned unchanged (after
	        printing an error) when ``audio_path`` is not an existing file.
	    """
	    if os.path.isfile(audio_path):
	        # History entry: a user turn whose only content is the audio file.
	        audio_turn = {
	            "role": "user",
	            "content": [{"type": "audio", "audio_url": audio_path}],
	        }
	        # Display entry: a labelled message in the user slot carrying the
	        # plain file path; the bot slot stays empty.
	        display_message = {
	            "text": f"[Audio file: {os.path.basename(audio_path)}]",
	            "files": [audio_path],
	        }
	        task_history.append(audio_turn)
	        chatbot.append([display_message, None])
	    else:
	        print(f"Error: The file {audio_path} does not exist.")

	    return chatbot, task_history

	def reset_user_input():
	    """Build the update that clears the user's text input.

	    Uses ``gr.update`` instead of ``gr.Textbox.update``: the per-component
	    ``update`` class methods no longer exist in Gradio 4, whereas
	    ``gr.update`` is available in both Gradio 3 and 4.

	    Returns:
	        An update that sets the target component's value to an empty string.
	    """
	    return gr.update(value='')

	def reset_state(task_history=None):
	    """Return an empty chat display and an empty model history.

	    Args:
	        task_history (list, optional): Current history; ignored. It is
	            optional because the clear button registers this callback with
	            no inputs, and a required parameter would make that click fail
	            with a ``TypeError``.

	    Returns:
	        tuple: ``([], [])`` - fresh, independent lists for the chat display
	        and the task history.
	    """
	    return [], []

	def regenerate(chatbot, task_history):
	    """Discard the latest assistant reply, if any, and generate a new one.

	    Args:
	        chatbot (list): Chat display entries.
	        task_history (list): Conversation history passed to the model.

	    Returns:
	        tuple: ``(chatbot, task_history)`` after regeneration. When the
	        history is empty, nothing is generated and both are returned as-is.
	    """
	    ends_with_reply = bool(task_history) and task_history[-1]['role'] == 'assistant'
	    if ends_with_reply:
	        # Drop the stale reply from the history and from the display.
	        task_history.pop()
	        chatbot.pop()

	    if not task_history:
	        return chatbot, task_history

	    chatbot, task_history = predict(chatbot, task_history)
	    return chatbot, task_history

	def predict(chatbot, task_history):
	    """Generate the assistant's reply to the conversation and show it in the chat.

	    The history is rendered with the processor's chat template, every audio
	    clip referenced in it is loaded at the feature extractor's sampling rate,
	    and the module-level ``model`` generates the reply. The reply text is then
	    sent to ``text_to_speech``; when that yields a file, the file is attached
	    to the same chat entry as the text.

	    The synthesized speech is deliberately NOT added to ``task_history``:
	    recording it there as a user audio turn would feed the assistant's own
	    voice back to the model on the next request and would leave the history
	    ending on a user turn, so ``regenerate`` could never find the assistant
	    reply it is supposed to drop. Text and audio share a single chat entry so
	    that one ``chatbot.pop()`` removes the whole reply.

	    Args:
	        chatbot (list): Chat display entries; one entry is appended.
	        task_history (list): Conversation history; one
	            ``{"role": "assistant", ...}`` entry is appended.

	    Returns:
	        tuple: The updated ``(chatbot, task_history)``.
	    """
	    print(f"{task_history=}")
	    print(f"{chatbot=}")

	    # Render the whole conversation into the model's prompt text.
	    text = processor.apply_chat_template(task_history, add_generation_prompt=True, tokenize=False)

	    # Load every audio clip referenced in the history, in order.
	    sampling_rate = processor.feature_extractor.sampling_rate
	    audios = []
	    for message in task_history:
	        if not isinstance(message["content"], list):
	            continue
	        for ele in message["content"]:
	            if ele["type"] == "audio":
	                audios.append(librosa.load(ele['audio_url'], sr=sampling_rate)[0])

	    if not audios:  # the processor is given None rather than an empty list
	        audios = None
	    print(f"{text=}")
	    print(f"{audios=}")

	    inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
	    # Move all input tensors to wherever the model lives. This avoids
	    # re-parsing the command line on every request and a hard-coded "cuda"
	    # device that fails on machines without a GPU.
	    inputs = inputs.to(model.device)

	    generate_ids = model.generate(**inputs, max_length=256)
	    # Keep only the newly generated tokens (strip the echoed prompt).
	    generate_ids = generate_ids[:, inputs.input_ids.size(1):]

	    response = processor.batch_decode(generate_ids, skip_special_tokens=True)[0]
	    task_history.append({'role': 'assistant', 'content': response})

	    # Speech is best-effort: without an audio file, show the text reply alone.
	    audio_file_path = text_to_speech(response)
	    if audio_file_path:
	        # Same message-dict shape (text + file paths) that add_file builds,
	        # placed in the bot slot instead of the user slot.
	        chatbot.append([None, {"text": response, "files": [audio_file_path]}])
	    else:
	        chatbot.append((None, response))

	    return chatbot, task_history

	def _launch_demo(args):
	    """Build the Gradio web UI for the Qwen2-Audio-Instruct chat and serve it.

	    Args:
	        args (argparse.Namespace): Parsed command-line options; ``inbrowser``,
	            ``server_port`` and ``server_name`` are read here.
	    """
	    with gr.Blocks() as demo:
	        # Page header: logo, title, description and project links.
	        gr.Markdown(
	            """<p align="center"><img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/assets/blog/qwenaudio/qwen2audio_logo.png" style="height: 80px"/><p>""")
	        gr.Markdown("""<center><font size=8>Qwen2-Audio-Instruct Bot</center>""")
	        gr.Markdown(
	            """\
	<center><font size=3>This WebUI is based on Qwen2-Audio-Instruct, developed by Alibaba Cloud. \
	(本WebUI基于Qwen2-Audio-Instruct打造，实现聊天机器人功能。)</center>""")
	        # "\\|" yields the same backslash-pipe text as the former "\|", without
	        # relying on an invalid string escape sequence.
	        gr.Markdown("""\
	<center><font size=4>Qwen2-Audio <a href="https://modelscope.cn/models/qwen/Qwen2-Audio-7B">🤖 </a>
	\\| <a href="https://huggingface.co/Qwen/Qwen2-Audio-7B">🤗</a>&nbsp ｜
	Qwen2-Audio-Instruct <a href="https://modelscope.cn/models/qwen/Qwen2-Audio-7B-Instruct">🤖 </a> \\|
	<a href="https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct">🤗</a>&nbsp ｜
	&nbsp<a href="https://github.com/QwenLM/Qwen2-Audio">Github</a></center>""")

	        # Chat display.
	        chatbot = mgr.Chatbot(label='Qwen2-Audio-7B-Instruct', elem_classes="control-height", height=750)

	        # User input: typed text plus microphone recordings and file uploads.
	        user_input = mgr.MultimodalInput(
	            interactive=True,
	            sources=['microphone', 'upload'],
	            submit_button_props=dict(value="🚀 Submit (发送)"),
	            upload_button_props=dict(value="📁 Upload (上传文件)", show_progress=True),
	        )
	        # Per-session conversation history that is passed to the model.
	        task_history = gr.State([])

	        with gr.Row():
	            empty_bin = gr.Button("🧹 Clear History (清除历史)")
	            regen_btn = gr.Button("🤔️ Regenerate (重试)")

	        # On submit: record the user's message, then generate the reply.
	        user_input.submit(
	            fn=add_text,
	            inputs=[chatbot, task_history, user_input],
	            outputs=[chatbot, task_history, user_input],
	        ).then(
	            predict, [chatbot, task_history], [chatbot, task_history], show_progress=True
	        )
	        # Clear button: reset_state is declared with a task_history parameter,
	        # so the state is passed as its input; registering the callback with
	        # no inputs calls it without arguments.
	        empty_bin.click(reset_state, inputs=[task_history], outputs=[chatbot, task_history],
	                        show_progress=True)
	        # Regenerate button: redo the latest assistant reply.
	        regen_btn.click(regenerate, [chatbot, task_history], [chatbot, task_history], show_progress=True)

	    # Serve the UI over HTTPS using the certificate and key at these fixed paths.
	    demo.queue().launch(
	        share=False,  # no public share link
	        inbrowser=args.inbrowser,
	        server_port=args.server_port,
	        server_name=args.server_name,
	        ssl_certfile="/root/project/cert.pem",
	        ssl_keyfile="/root/project/key.pem",
	        ssl_verify=False,  # skip certificate validation (e.g. a self-signed certificate)
	    )

	if __name__ == "__main__":
	    # Script entry point: parse options, load the model and processor into
	    # module-level names (predict reads `model` and `processor`), then serve.
	    args = _get_args()

	    # Pin the model to the CPU when requested; otherwise let from_pretrained
	    # choose the placement.
	    device_map = "cpu" if args.cpu_only else "auto"

	    model = Qwen2AudioForConditionalGeneration.from_pretrained(
	        args.checkpoint_path,
	        torch_dtype="auto",      # dtype is chosen automatically
	        device_map=device_map,
	        resume_download=True,    # continue interrupted downloads
	    )
	    model = model.eval()

	    # Raise the cap on generated tokens to allow long conversations.
	    model.generation_config.max_new_tokens = 2048
	    print("generation_config", model.generation_config)

	    processor = AutoProcessor.from_pretrained(args.checkpoint_path, resume_download=True)

	    _launch_demo(args)