File size: 990 Bytes
5a030e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import edge_tts
import asyncio
import librosa
import soundfile
import io

from inference.infer_tool import Svc

# Smoke-test script: synthesize a Japanese sentence with edge-tts, then run
# the result through the Svc voice-conversion model and save the output.

# Sentence that is synthesized to speech before conversion.
TEXT = "私はヘルタ。今は忙しいから、リモート人形のオート返答機能に任せる。こんにちは、こんにちは、ごきげんよう、良い日になりますように。それじゃ"
# Voice name passed to edge_tts.Communicate.
VOICE = "ja-JP-NanamiNeural"
# File the synthesized speech is saved to (and reloaded from).
OUTPUT_FILE = "test.mp3"
# Rate the synthesized speech is resampled to on load; the same value is
# written into the in-memory WAV, so it is defined once here.
INPUT_SAMPLE_RATE = 16000
# Rate written into the header of the converted output file.
# NOTE(review): hard-coded -- confirm it matches the sampling rate the model
# in Herta-Svc/config.json actually produces.
OUTPUT_SAMPLE_RATE = 44100

# Step 1: synthesize TEXT and save it to OUTPUT_FILE.
asyncio.run(edge_tts.Communicate(TEXT, VOICE).save(OUTPUT_FILE))

# Step 2: reload the speech as mono at INPUT_SAMPLE_RATE and re-encode it as
# a WAV held in memory, which is what gets handed to the model below.
audio, sr = librosa.load(OUTPUT_FILE, sr=INPUT_SAMPLE_RATE, mono=True)
raw_path = io.BytesIO()
soundfile.write(raw_path, audio, INPUT_SAMPLE_RATE, format="wav")
raw_path.seek(0)  # rewind so the next reader starts at the first byte
print('checkpoint 1')

# Step 3: load the voice-conversion model on the CPU.
model = Svc("Herta-Svc/G_10000.pth", "Herta-Svc/config.json", device='cpu')
print('checkpoint 2')

# Step 4: run the conversion on the in-memory WAV.
# NOTE(review): 'speaker0' and 0 are presumably the speaker id and the pitch
# shift -- confirm against Svc.infer. out_sr is not used afterwards.
out_audio, out_sr = model.infer('speaker0', 0, raw_path, auto_predict_f0=True)
print('checkpoint 3')

# Step 5: move the result to host memory and write it out as a WAV file.
soundfile.write('out_audio.wav', out_audio.cpu().numpy(), OUTPUT_SAMPLE_RATE)

print("done")