"""Gradio demo: ask Qwen-Audio-Chat questions about an audio clip."""

import copy
import os
import re
import secrets
import tempfile
from pathlib import Path

import gradio as gr
import torch
from pydub import AudioSegment
from transformers import AutoModelForCausalLM, AutoTokenizer

torch.manual_seed(420)

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-Audio-Chat", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-Audio-Chat", device_map="cuda", trust_remote_code=True
).eval()

# Directory under which audio clips cut out by predict() are written.
# The original referenced this name without ever defining it (NameError).
uploaded_file_dir = os.environ.get("GRADIO_TEMP_DIR") or str(
    Path(tempfile.gettempdir()) / "gradio"
)

# Timestamp markers such as "<|1.25|>" that may appear in a model response.
_TIMESTAMP_RE = re.compile(r"<\|\d{1,2}\.\d+\|>")

# Characters escaped inside fenced code so the Markdown/HTML renderer shows
# them literally. None of the replacement strings contains a character that
# is itself a key, so a single translate pass equals the chained replaces.
_CODE_ESCAPES = str.maketrans(
    {
        "`": "\\`",
        "<": "&lt;",
        ">": "&gt;",
        " ": "&nbsp;",
        "*": "&ast;",
        "_": "&lowbar;",
        "-": "&#45;",
        ".": "&#46;",
        "!": "&#33;",
        "(": "&#40;",
        ")": "&#41;",
        "$": "&#36;",
    }
)


def _as_list(value):
    """Return *value* if it is a list, otherwise a fresh empty list."""
    return value if isinstance(value, list) else []


def _parse_text(text):
    """Convert plain/Markdown text into the HTML snippet shown in the chat.

    Empty lines are dropped. A line containing a triple backtick toggles a
    code block: the opening fence becomes ``<pre><code class="language-X">``
    (X is whatever follows the last backtick) and the closing fence becomes
    ``<br></code></pre>``. Every other line except the first is prefixed
    with ``<br>``; lines inside a code block are additionally HTML-escaped.

    Args:
        text: The raw string to convert.

    Returns:
        The converted lines joined into a single string.
    """
    lines = [line for line in text.split("\n") if line != ""]
    fence_count = 0
    for i, line in enumerate(lines):
        if "```" in line:
            fence_count += 1
            if fence_count % 2 == 1:
                language = line.split("`")[-1]
                lines[i] = f'<pre><code class="language-{language}">'
            else:
                lines[i] = "<br></code></pre>"
        elif i > 0:
            if fence_count % 2 == 1:
                line = line.translate(_CODE_ESCAPES)
            lines[i] = "<br>" + line
    return "".join(lines)


def _export_clips(audio_path, stamps):
    """Cut the spans described by *stamps* out of *audio_path*.

    Args:
        audio_path: Path of the source audio file.
        stamps: Even-length list of ``<|seconds|>`` markers; consecutive
            pairs are treated as (start, end).

    Returns:
        List of paths (str) of the exported clips, one per pair, written to a
        freshly created random sub-directory of ``uploaded_file_dir``.
    """
    seconds = [float(s.replace("<|", "").replace("|>", "")) for s in stamps]
    # Load the source audio; the container format is taken from the extension.
    audio_format = os.path.splitext(audio_path)[-1].replace(".", "")
    audio = AudioSegment.from_file(audio_path, format=audio_format)

    clip_dir = Path(uploaded_file_dir) / secrets.token_hex(20)
    clip_dir.mkdir(exist_ok=True, parents=True)

    clip_paths = []
    # Slice each (start, end) span; pydub indexes in milliseconds.
    for start, end in zip(seconds[::2], seconds[1::2]):
        clip_path = clip_dir / f"tmp{secrets.token_hex(5)}.{audio_format}"
        # Save the clip in the same format as the source file.
        audio[start * 1000: end * 1000].export(clip_path, format=audio_format)
        clip_paths.append(str(clip_path))
    return clip_paths


def predict(_chatbot, task_history, user_input=None):
    """Run one chat turn against the model and record the answer.

    Args:
        _chatbot: Display history, a list of ``(query_html, answer)`` pairs.
            Anything that is not a list is replaced by an empty list.
        task_history: Raw history, a list of 2-tuples ``(query, answer)``
            where an audio query is a tuple/list whose first element is the
            file path. The last text entry is the message sent to the model.
            Its final entry is updated in place with the raw response.
        user_input: Optional text used only for display/logging; when empty
            the query of the last ``task_history`` entry is used instead.

    Returns:
        ``(answer_text, _chatbot)``. ``answer_text`` is the model response
        with timestamp markers stripped, or ``""`` when nothing was run
        (invalid or empty history). Every exit path returns this 2-tuple so
        the result always matches the two declared interface outputs.
    """
    _chatbot = _as_list(_chatbot)
    task_history = _as_list(task_history)
    print("Predict - Start: task_history =", task_history)

    if not all(isinstance(item, tuple) and len(item) == 2 for item in task_history):
        print("Error: task_history should be a list of tuples of length 2.")
        return "", _chatbot

    query = user_input if user_input else (task_history[-1][0] if task_history else "")
    # An audio entry is a tuple, which _parse_text cannot render.
    shown_query = _parse_text(query) if isinstance(query, str) else ""
    print("User: " + shown_query)
    if not task_history:
        return "", _chatbot

    # Fold audio entries into the text turn that follows them.
    history_filter = []
    audio_idx = 1
    pending = ""
    last_audio = None
    for q, a in copy.deepcopy(task_history):
        if isinstance(q, (tuple, list)):
            last_audio = q[0]
            # The model locates audio through the <audio>path</audio> tag.
            pending += f"Audio {audio_idx}: <audio>{q[0]}</audio>" + "\n"
            audio_idx += 1
        else:
            history_filter.append((pending + q, a))
            pending = ""
    if not history_filter:
        return "", _chatbot

    history, message = history_filter[:-1], history_filter[-1][0]
    response, history = model.chat(tokenizer, message, history=history)

    chat_response = response
    clip_paths = []
    stamps = _TIMESTAMP_RE.findall(response)
    if stamps and len(stamps) % 2 == 0 and last_audio:
        chat_response = response.replace("<|", "").replace("|>", "")
        clip_paths = _export_clips(last_audio, stamps)

    # Record the answer whether or not it carried timestamps.
    if _chatbot:
        _chatbot[-1] = (shown_query, chat_response)
    else:
        _chatbot.append((shown_query, chat_response))
    _chatbot.extend((None, (path,)) for path in clip_paths)
    # regenerate() relies on the last history entry holding the answer.
    task_history[-1] = (task_history[-1][0], response)

    print("Predict - End: task_history =", task_history)
    return chat_response, _chatbot


def regenerate(_chatbot, task_history):
    """Discard the last answer and ask the model again.

    Args:
        _chatbot: Display history (list of pairs); modified in place.
        task_history: Raw history (list of 2-tuples); modified in place.

    Returns:
        ``_chatbot`` unchanged when there is nothing to regenerate (empty
        history/chatbot, or the last turn has no answer yet); otherwise the
        result of ``predict``.
        NOTE(review): the two shapes differ as in the original; no caller is
        visible in this file to decide which one is expected.
    """
    task_history = _as_list(task_history)
    print("Regenerate - Start: task_history =", task_history)
    if not task_history or not _chatbot:
        return _chatbot
    last_query, last_answer = task_history[-1]
    if last_answer is None:
        return _chatbot

    task_history[-1] = (last_query, None)
    removed = _chatbot.pop(-1)
    if removed[0] is not None:
        _chatbot.append((removed[0], None))
    elif _chatbot:
        # The removed entry was an attachment row; blank the answer before it.
        _chatbot[-1] = (_chatbot[-1][0], None)
    print("Regenerate - End: task_history =", task_history)
    # The original omitted the third argument, which raised TypeError.
    return predict(_chatbot, task_history, None)


def add_text(history, task_history, text):
    """Append a text query to both the display and the raw history.

    Returns:
        ``(history, task_history)`` after the append.
    """
    history = _as_list(history)
    task_history = _as_list(task_history)
    print("Add Text - Before: task_history =", task_history)
    history.append((_parse_text(text), None))
    task_history.append((text, None))
    print("Add Text - After: task_history =", task_history)
    return history, task_history


def add_file(history, task_history, file):
    """Append an uploaded file (via its ``.name`` path) to both histories.

    Returns:
        ``(history, task_history)`` after the append.
    """
    history = _as_list(history)
    task_history = _as_list(task_history)
    print("Add File - Before: task_history =", task_history)
    history.append(((file.name,), None))
    task_history.append(((file.name,), None))
    print("Add File - After: task_history =", task_history)
    return history, task_history


def add_mic(history, task_history, file):
    """Append a microphone recording to both histories.

    The recording at path *file* is renamed to ``file + '.wav'`` before being
    recorded; nothing happens when *file* is ``None``.

    Returns:
        ``(history, task_history)``.
    """
    task_history = _as_list(task_history)
    print("Add Mic - Before: task_history =", task_history)
    if file is None:
        return history, task_history
    history = _as_list(history)
    file_with_extension = file + ".wav"
    os.rename(file, file_with_extension)
    history.append(((file_with_extension,), None))
    task_history.append(((file_with_extension,), None))
    print("Add Mic - After: task_history =", task_history)
    return history, task_history


def reset_user_input():
    """Return a Gradio update that clears the text input."""
    return gr.update(value="")


def reset_state(task_history):
    """Log the current history and return a new empty one."""
    task_history = _as_list(task_history)
    print("Reset State - Before: task_history =", task_history)
    task_history = []
    print("Reset State - After: task_history =", task_history)
    return []


def _interface_predict(audio, text, state):
    """Adapt the interface inputs (audio, text, state) to ``predict``.

    The interface passes its inputs positionally in the order they are
    declared below, which does not match ``predict``'s parameters, so this
    wrapper builds the histories ``predict`` expects.

    Args:
        audio: File path of the supplied audio, or ``None``.
        text: The text query, possibly empty.
        state: The raw task history kept between calls (``None`` at first).

    Returns:
        ``(answer_text, task_history)`` for the text and state outputs.
    """
    task_history = _as_list(state)
    text = text or ""
    if not audio and not text:
        return "", task_history
    if audio:
        task_history.append(((audio,), None))
    task_history.append((text, None))
    answer, _ = predict([(_parse_text(text), None)], task_history, text)
    return answer, task_history


iface = gr.Interface(
    fn=_interface_predict,
    inputs=[
        # predict() reads the audio from disk, so a path is needed here.
        gr.Audio(label="Audio Input", type="filepath"),
        gr.Textbox(label="Text Query"),
        gr.State(),
    ],
    outputs=[
        "text",
        gr.State(),
    ],
    title="Audio-Text Interaction Model",
    description="This model can process an audio input along with a text query and provide a response.",
    theme="default",
    allow_flagging="never",
)
iface.launch()