"""Text-to-speech + singing-voice-conversion demo.

Pipeline:
  1. Synthesize Japanese speech with Microsoft Edge TTS.
  2. Resample it to 16 kHz mono (the rate the Svc model expects as input).
  3. Run it through a so-vits-svc model ("Herta") to convert the voice.
  4. Save the converted audio to ``out_audio.wav``.
"""

import asyncio
import io

import edge_tts
import librosa
import soundfile

from inference.infer_tool import Svc

# Text to synthesize and the Edge-TTS voice used for the initial speech.
TEXT = "私はヘルタ。今は忙しいから、リモート人形のオート返答機能に任せる。こんにちは、こんにちは、ごきげんよう、良い日になりますように。それじゃ"
VOICE = "ja-JP-NanamiNeural"
OUTPUT_FILE = "test.mp3"


def main() -> None:
    """Run the TTS -> voice-conversion pipeline and write ``out_audio.wav``."""
    # Step 1: synthesize speech and save it as an mp3.
    asyncio.run(edge_tts.Communicate(TEXT, VOICE).save(OUTPUT_FILE))

    # Step 2: load at 16 kHz mono and re-encode as an in-memory WAV,
    # which is the input format Svc.infer accepts via a file-like object.
    audio, sr = librosa.load(OUTPUT_FILE, sr=16000, mono=True)
    raw_path = io.BytesIO()
    soundfile.write(raw_path, audio, 16000, format="wav")
    raw_path.seek(0)

    print('checkpoint 1')
    # Step 3: load the voice-conversion model on CPU.
    model = Svc("Herta-Svc/G_10000.pth", "Herta-Svc/config.json", device='cpu')
    print('checkpoint 2')

    # auto_predict_f0=True lets the model estimate pitch instead of
    # requiring an explicit f0 curve.
    out_audio, out_sr = model.infer(
        'speaker0',
        0,
        raw_path,
        auto_predict_f0=True,
    )
    print('checkpoint 3')

    # Step 4: save the result. BUGFIX: use the model's actual output rate
    # (out_sr) instead of a hard-coded 44100 — a mismatch would make the
    # saved file play at the wrong speed and pitch.
    soundfile.write('out_audio.wav', out_audio.cpu().numpy(), out_sr)
    print("done")


if __name__ == "__main__":
    main()