import os
import tempfile

import torch
import gradio as gr
import edge_tts
import anyio

import se_extractor
from api import BaseSpeakerTTS, ToneColorConverter
from tts_voice import tts_order_voice

ckpt_base_en = 'checkpoints/checkpoints/base_speakers/EN'
ckpt_converter_en = 'checkpoints/checkpoints/converter'

# device = 'cuda:0'
device = 'cpu'

# Load the base speaker TTS model and the tone color converter.
base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base_en}/config.json', device=device)
base_speaker_tts.load_ckpt(f'{ckpt_base_en}/checkpoint.pth')

tone_color_converter = ToneColorConverter(f'{ckpt_converter_en}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter_en}/checkpoint.pth')


def vc_en(text, audio_ref, style_mode):
    # Pick the source speaker embedding, speaker preset, and speed for the requested style.
    if style_mode == "default":
        source_se = torch.load(f'{ckpt_base_en}/en_default_se.pth').to(device)
        speaker, speed = 'default', 1.0
    else:
        source_se = torch.load(f'{ckpt_base_en}/en_style_se.pth').to(device)
        speaker, speed = style_mode, 0.9

    # Extract the tone color embedding of the reference audio.
    reference_speaker = audio_ref
    target_se, audio_name = se_extractor.get_se(
        reference_speaker, tone_color_converter, target_dir='processed', vad=True)

    # Run the base speaker TTS.
    src_path = "tmp.wav"
    base_speaker_tts.tts(text, src_path, speaker=speaker, language='English', speed=speed)

    # Run the tone color converter.
    save_path = "output.wav"
    encode_message = "@MyShell"
    tone_color_converter.convert(
        audio_src_path=src_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=save_path,
        message=encode_message)
    return save_path


language_dict = tts_order_voice

# Tone color embedding of the base audio; used as the source embedding
# when converting the edge-tts output below.
base_speaker = "base_audio.mp3"
source_se, audio_name = se_extractor.get_se(base_speaker, tone_color_converter, vad=True)


async def text_to_speech_edge(text, audio_ref, language_code):
    # Synthesize the text with edge-tts in the selected voice.
    voice = language_dict[language_code]
    communicate = edge_tts.Communicate(text, voice)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
        await communicate.save(tmp_path)

    # Extract the tone color embedding of the reference audio.
    reference_speaker = audio_ref
    target_se, audio_name = se_extractor.get_se(
        reference_speaker, tone_color_converter, target_dir='processed', vad=True)

    # Run the tone color converter on the edge-tts output.
    save_path = "output.wav"
    encode_message = "@MyShell"
    tone_color_converter.convert(
        audio_src_path=tmp_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=save_path,
        message=encode_message)
    return save_path


app = gr.Blocks()

with app:
    gr.Markdown("#