"""Gradio web UI for Linly-Talker driven by MuseTalk.

Pipeline: optional speech recognition (ASR) -> LLM answer -> speech
synthesis (edge-tts) -> MuseTalk lip-synced video.

The model objects ``llm``, ``tts``, ``asr`` and ``musetalker`` are module
globals created in the ``__main__`` section at the bottom of this file; the
functions defined here expect them to exist when they are called.
"""
import os
import subprocess

import gradio as gr
from zhconv import convert

from src.cost_time import calculate_time
from configs import *  # this file reads `mode`, `ip` and `port` from here

description = """

Linly 智能对话系统 (Linly-Talker)
[知乎] [bilibili] [GitHub] [个人主页]
Linly-Talker 是一款智能 AI 对话系统,结合了大型语言模型 (LLMs) 与视觉模型,是一种新颖的人工智能交互方式。

"""

# Module-level settings; none of them is read anywhere in this file.
use_ref_video = False
ref_video = None
ref_info = 'pose'
use_idle_mode = False
length_of_audio = 5


@calculate_time
def Asr(audio):
    """Transcribe an audio file and convert the text to Simplified Chinese.

    Args:
        audio: Value passed to ``asr.transcribe``; the UI supplies a file
            path (the Audio component uses ``type="filepath"``).

    Returns:
        The transcription converted with ``convert(..., 'zh-cn')``. If
        anything fails, a fixed hint message is shown via ``gr.Warning`` and
        that same message is returned instead of a transcription.
    """
    try:
        question = asr.transcribe(audio)
        question = convert(question, 'zh-cn')
    except Exception as e:
        print("ASR Error: ", e)
        question = 'Gradio存在一些bug,麦克风模式有时候可能音频还未传入,请重新点击一下语音识别即可'
        gr.Warning(question)
    return question


@calculate_time
def LLM_response(question, voice = 'zh-CN-XiaoxiaoNeural', rate = 0, volume = 0, pitch = 0):
    """Generate an answer with the LLM and synthesise it to speech.

    The ``edge-tts`` command-line tool is tried first; if it cannot be
    started or exits with an error, ``tts.predict`` is used as a fallback.

    Args:
        question: Prompt passed to ``llm.generate``.
        voice: Voice name handed to the synthesiser.
        rate: Speaking-rate setting (only used by the fallback path).
        volume: Volume setting (only used by the fallback path).
        pitch: Pitch setting (only used by the fallback path).

    Returns:
        A tuple ``('answer.wav', 'answer.vtt', answer)``: audio path,
        subtitle path and the generated answer text.
    """
    answer = llm.generate(question)
    print(answer)
    # Pass an argument list (no shell) so quotes, `$` or backticks in the
    # generated answer cannot break or inject into the command line. The
    # `--text=<value>` form keeps an answer that starts with '-' from being
    # parsed as an option.
    edge_tts_cmd = [
        'edge-tts',
        f'--text={answer}',
        '--voice', voice,
        '--write-media', 'answer.wav',
        '--write-subtitles', 'answer.vtt',
    ]
    try:
        # check=True turns a non-zero exit status into an exception so the
        # fallback below is actually reachable.
        subprocess.run(edge_tts_cmd, check=True)
    except (OSError, ValueError, subprocess.CalledProcessError) as e:
        print("edge-tts CLI Error: ", e)
        tts.predict(answer, voice, rate, volume, pitch , 'answer.wav', 'answer.vtt')
    return 'answer.wav', 'answer.vtt', answer


@calculate_time
def Talker_response(text, voice, rate, volume, pitch, source_video, bbox_shift):
    """Answer ``text`` with the LLM and render a MuseTalk video of the reply.

    Args:
        text: User question.
        voice: Requested voice; replaced by ``'zh-CN-XiaoxiaoNeural'`` when
            it is not listed in ``tts.SUPPORTED_VOICE``.
        rate: Forwarded to ``LLM_response``.
        volume: Forwarded to ``LLM_response``.
        pitch: Forwarded to ``LLM_response``.
        source_video: Reference video passed to the MuseTalk inference call.
        bbox_shift: Bounding-box shift passed to the MuseTalk inference call.

    Returns:
        ``(video, subtitle_path)`` when a subtitle path is available,
        otherwise just ``video``. ``LLM_response`` always returns the
        constant ``'answer.vtt'``, so in practice the tuple is returned.
    """
    voice = 'zh-CN-XiaoxiaoNeural' if voice not in tts.SUPPORTED_VOICE else voice
    driven_audio, driven_vtt, _ = LLM_response(text, voice, rate, volume, pitch)
    video = musetalker.inference_noprepare(driven_audio, source_video, bbox_shift)
    if driven_vtt:
        return (video, driven_vtt)
    else:
        return video


def main():
    """Build the Gradio Blocks interface and return it (without launching)."""
    with gr.Blocks(analytics_enabled=False, title = 'Linly-Talker') as inference:
        gr.HTML(description)
        with gr.Row(equal_height=False):
            # Left column: reference video, question input, text examples.
            with gr.Column(variant='panel'):
                with gr.Tabs(elem_id="sadtalker_source_image"):
                    with gr.TabItem('MuseV Video'):
                        gr.Markdown("MuseV: need help? please visit MuseVDemo to generate Video https://huggingface.co/spaces/AnchorFake/MuseVDemo")
                        with gr.Row():
                            source_video = gr.Video(label="Reference Video",sources=['upload'])
                        gr.Markdown("BBox_shift 推荐值下限,在生成初始结果后生成相应的 bbox 范围。如果结果不理想,可以根据该参考值进行调整。\n一般来说,在我们的实验观察中,我们发现正值(向下半部分移动)通常会增加嘴巴的张开度,而负值(向上半部分移动)通常会减少嘴巴的张开度。然而,需要注意的是,这并不是绝对的规则,用户可能需要根据他们的具体需求和期望效果来调整该参数。")
                        with gr.Row():
                            bbox_shift = gr.Number(label="BBox_shift value, px", value=0)
                            bbox_shift_scale = gr.Textbox(label="bbox_shift_scale", value="",interactive=False)
                # Whenever the reference video changes, hand it to
                # musetalker.prepare_material; its outputs update the video
                # and the bbox_shift_scale textbox.
                source_video.change(fn=musetalker.prepare_material, inputs=[source_video, bbox_shift], outputs=[source_video, bbox_shift_scale])

                with gr.Tabs(elem_id="question_audio"):
                    with gr.TabItem('对话'):
                        with gr.Column(variant='panel'):
                            question_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label = '语音对话')
                            input_text = gr.Textbox(label="Input Text", lines=3, info = '文字对话')
                            with gr.Accordion("Advanced Settings", open=False, visible=True) as parameter_article:
                                voice = gr.Dropdown(tts.SUPPORTED_VOICE, value='zh-CN-XiaoxiaoNeural', label="Voice")
                                rate = gr.Slider(minimum=-100, maximum=100, value=0, step=1.0, label='Rate')
                                volume = gr.Slider(minimum=0, maximum=100, value=100, step=1, label='Volume')
                                pitch = gr.Slider(minimum=-100, maximum=100, value=0, step=1, label='Pitch')
                            asr_text = gr.Button('语音识别(语音对话后点击)')
                            asr_text.click(fn=Asr,inputs=[question_audio],outputs=[input_text])

                with gr.Tabs():
                    gr.Markdown("## Text Examples")
                    text_examples = [
                        ['应对压力最有效的方法是什么?'],
                        ['如何进行时间管理?'],
                        ['为什么有些人选择使用纸质地图或寻求方向,而不是依赖GPS设备或智能手机应用程序?'],
                        ['近日,苹果公司起诉高通公司,状告其未按照相关合约进行合作,高通方面尚未回应。这句话中“其”指的是谁?'],
                        ['三年级同学种树80颗,四、五年级种的棵树比三年级种的2倍多14棵,三个年级共种树多少棵?'],
                        ['撰写一篇交响乐音乐会评论,讨论乐团的表演和观众的整体体验。'],
                        ['翻译成中文:Luck is a dividend of sweat. The more you sweat, the luckier you get.'],
                    ]
                    gr.Examples(
                        examples = text_examples,
                        inputs = [input_text],
                    )

            # Right column: generated video, trigger button, video examples.
            with gr.Column(variant='panel'):
                with gr.TabItem("MuseTalk Video"):
                    gen_video = gr.Video(label="Generated video", format="mp4")
                submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary')

                gr.Markdown("## MuseV Video Examples")
                # Each example is [reference video path, bbox_shift value].
                gr.Examples(
                    examples=[
                        ['Musetalk/data/video/yongen_musev.mp4', 5],
                        ['Musetalk/data/video/musk_musev.mp4', 5],
                        ['Musetalk/data/video/monalisa_musev.mp4', 5],
                        ['Musetalk/data/video/sun_musev.mp4', 5],
                        ['Musetalk/data/video/seaside4_musev.mp4', 5],
                        ['Musetalk/data/video/sit_musev.mp4', 5],
                        ['Musetalk/data/video/man_musev.mp4', 5]
                    ],
                    inputs =[source_video, bbox_shift],
                )

            submit.click(
                fn=Talker_response,
                inputs=[input_text, voice, rate, volume, pitch, source_video, bbox_shift],
                outputs=[gen_video]
            )
    return inference


def success_print(text):
    """Print ``text`` wrapped in the ANSI escape sequence ``1;31;42``."""
    print(f"\033[1;31;42m{text}\033[0m")


def error_print(text):
    """Print ``text`` wrapped in the ANSI escape sequence ``1;37;41``."""
    print(f"\033[1;37;41m{text}\033[0m")


if __name__ == "__main__":
    # Alternative LLM back-ends:
    # llm = LLM(mode='offline').init_model('Linly', 'Linly-AI/Chinese-LLaMA-2-7B-hf')
    # llm = LLM(mode='offline').init_model('Gemini', 'gemini-pro', api_key = "your api key")
    # llm = LLM(mode='offline').init_model('Qwen', 'Qwen/Qwen-1_8B-Chat')

    # Each component is loaded best-effort: a failure is reported and
    # start-up continues. NOTE(review): main() reads `tts` and `musetalker`
    # while building the UI, so a failure of either still stops start-up
    # there.
    try:
        from LLM import LLM
        llm = LLM(mode=mode).init_model('Qwen', 'Qwen/Qwen-1_8B-Chat')
    except Exception as e:
        error_print(f"LLM is not ready, error: {e}")
        error_print("如果使用LLM,请先下载有关的LLM模型")

    try:
        from TTS import EdgeTTS
        tts = EdgeTTS()
    except Exception as e:
        error_print(f"EdgeTTS Error: {e}")
        error_print("如果使用EdgeTTS,请先下载EdgeTTS库,测试EdgeTTS是否可用")

    try:
        from ASR import WhisperASR
        asr = WhisperASR('base')
    except Exception as e:
        error_print(f"ASR Error: {e}")
        error_print("如果使用ASR,请先下载ASR相关模型,如Whisper")

    try:
        from TFG import MuseTalk_RealTime
        musetalker = MuseTalk_RealTime()
        # The method must be called; the bare attribute access that was here
        # before did nothing. NOTE(review): assumes init_model() takes no
        # arguments - confirm against TFG.MuseTalk_RealTime.
        musetalker.init_model()
    except Exception as e:
        error_print(f"MuseTalk Error: {e}")
        error_print("如果使用MuseTalk,请先下载MuseTalk相关模型")

    gr.close_all()
    demo = main()
    demo.queue()
    # demo.launch()
    demo.launch(server_name=ip,  # local only: "127.0.0.1"; all interfaces: "0.0.0.0"
                server_port=port,
                # With Gradio 4.0+ the microphone seems to work without an
                # SSL certificate, so these stay disabled.
                # ssl_certfile=ssl_certfile,
                # ssl_keyfile=ssl_keyfile,
                # ssl_verify=False,
                debug=True)