import spaces
import random
import gradio as gr
from css.utils import *


# Custom voice generation
def custom():
    """Build the custom voice generation panel and wire up its events.

    Creates the recording widget, the prompt/synthesis text boxes, the seed
    controls, the generate button and the output player, then binds the
    button click handlers.

    NOTE(review): presumably called inside an active ``gr.Blocks`` context;
    the helpers and globals used below (``target_sr``, ``default_data``,
    ``prompt_sr``, ``cosyvoice``, ``cosyvoice_instruct``, ``use_instruct``,
    ``set_all_random_seed``, ``postprocess``, ``load_wav``,
    ``example_prompt_text``, ``example_tts_text``) are not defined in this
    file and presumably come from ``css.utils`` -- confirm.
    """

    def random_seed():
        """Return a random integer seed in the range [1, 100000000]."""
        return random.randint(1, 100000000)

    @spaces.GPU
    def generate_audio(_recorded_audio, _prompt_input_textbox, _language_radio,
                       _synthetic_input_textbox, _seed):
        """Synthesize speech for the given text using the recorded prompt voice.

        Args:
            _recorded_audio: File path of the recorded prompt audio, or a
                falsy value when nothing was recorded.
            _prompt_input_textbox: Transcript of the recorded prompt audio.
            _language_radio: ``'same'`` or ``'cross'`` (the radio values).
            _synthetic_input_textbox: Text to synthesize.
            _seed: Value forwarded to ``set_all_random_seed``.

        Returns:
            A ``(target_sr, audio_data)`` tuple. When the synthesis text or
            the recording is missing, ``(target_sr, default_data)`` is
            returned after showing a warning.
        """
        import time

        # perf_counter is monotonic, so the printed intervals cannot be
        # skewed by system clock adjustments.
        t1 = time.perf_counter()
        print(_recorded_audio, _prompt_input_textbox, _language_radio, _synthetic_input_textbox, _seed)

        # Treat None and whitespace-only text like the empty string: there is
        # nothing to synthesize in any of these cases.
        if not _synthetic_input_textbox or not _synthetic_input_textbox.strip():
            gr.Warning('The synthesis text is empty, did you forget to input the synthesis text?')
            return (target_sr, default_data)

        # Without a recording there is no file to load; warn instead of
        # letting load_wav fail on a missing path.
        if not _recorded_audio:
            gr.Warning('The recorded audio is empty, did you forget to record the prompt audio?')
            return (target_sr, default_data)

        set_all_random_seed(_seed)
        model = cosyvoice_instruct if use_instruct(_synthetic_input_textbox) else cosyvoice
        prompt_speech_16k = postprocess(load_wav(_recorded_audio, prompt_sr))
        t2 = time.perf_counter()

        # Zero-shot inference needs the prompt transcript; without one (or
        # when cross-language mode is selected) use cross-lingual inference.
        if _language_radio == 'cross' or _prompt_input_textbox == '':
            output = model.inference_cross_lingual(_synthetic_input_textbox, prompt_speech_16k)
        else:
            output = model.inference_zero_shot(_synthetic_input_textbox, _prompt_input_textbox, prompt_speech_16k)
        t3 = time.perf_counter()

        audio_data = postprocess(output['tts_speech']).numpy().flatten()
        t4 = time.perf_counter()

        print(f'load and preprocess time: {t2-t1}s')
        print(f'inference time: {t3-t2}s')
        print(f'postprocess time: {t4-t3}s')
        return (target_sr, audio_data)

    # NOTE(review): the nesting of the layout contexts below was reconstructed
    # from a whitespace-collapsed source -- confirm against the rendered UI.
    with gr.Column():
        with gr.Row():
            with gr.Column(scale=1, min_width=400):
                with gr.Group():
                    recorded_audio = gr.Audio(sources=['microphone'], label="Record Audio File", type='filepath')
                    gr.Text("Please click to record and read the text on the right (Chinese or English) to complete the input", max_lines=1, container=False, interactive=False)
            with gr.Column(scale=10):
                prompt_input_textbox = gr.Textbox(label="Input Text for Recording")
                gr.Examples(
                    label="Example Recording Texts",
                    examples=example_prompt_text,
                    inputs=[prompt_input_textbox])
        with gr.Column():
            language_radio = gr.Radio(choices=[('Same Language', 'same'), ('Cross Language', 'cross')], value='same', label="Input Synthesis Text")
            synthetic_input_textbox = gr.Textbox(show_label=False)
            gr.Examples(
                label="Example Texts",
                examples=example_tts_text,
                inputs=[synthetic_input_textbox])
        with gr.Accordion(label="Random Seed"):
            with gr.Row():
                with gr.Column(scale=1, min_width=180):
                    seed_button = gr.Button(value="\U0001F3B2 Shuffle Randomly", elem_classes="full-height")
                with gr.Column(scale=10):
                    seed = gr.Number(show_label=False, value=0, container=False, elem_classes="full-height")
        with gr.Column():
            generate_button = gr.Button("Generate Audio", variant="primary", size="lg")
        with gr.Column():
            output_audio = gr.Audio(label="Synthesized Audio")

    # The shuffle button writes a fresh random seed into the seed field.
    seed_button.click(fn=random_seed, outputs=[seed])
    generate_button.click(
        fn=generate_audio,
        inputs=[recorded_audio, prompt_input_textbox, language_radio, synthetic_input_textbox, seed],
        outputs=[output_audio])