Text-to-Speech
dual_ar
fish-speech-1.5 / handler.py
DOBSON001's picture
Create handler.py
90d679f verified
raw
history blame
794 Bytes
import torch
from fish_speech.models.fish_speech import FishSpeech
from fish_speech.inference import infer
import io
import base64
import soundfile as sf
# Load the pretrained model once, at import time; predict() below passes this
# module-level object to infer() on every request.
model = FishSpeech.from_pretrained('fishaudio/fish-speech-1.5')
def predict(inputs: dict) -> dict:
    """Synthesize audio for a request and return it as base64-encoded WAV.

    The text is read from ``inputs['inputs']`` (default ``'Hello world'``).
    If it contains the ``[singing]`` tag in any ASCII letter case, every
    occurrence of the tag is removed and the remaining text is synthesized
    in ``"singing"`` mode; otherwise ``"speech"`` mode is used. Whitespace
    around a removed tag is left untouched.

    Args:
        inputs: Request payload; only the ``'inputs'`` key is read.

    Returns:
        ``{"audio": <str>}`` where the string is the base64 encoding of the
        WAV bytes written by ``soundfile.write``.
    """
    # Function-scope import: keeps this block usable without touching the
    # file's top-level import list.
    import re

    text = inputs.get('inputs', 'Hello world')

    # Support the [singing] tag. Detection and removal share one
    # case-insensitive pattern so that "[Singing]" / "[SINGING]" are stripped
    # from the text as well as switching the mode (previously the check was
    # case-insensitive but the removal was not, leaking the tag into the
    # synthesized text). re.ASCII limits case folding to ASCII letters, which
    # matches what the former ``text.lower()`` check accepted.
    singing_tag = re.compile(r"\[singing\]", re.IGNORECASE | re.ASCII)
    if singing_tag.search(text):
        mode = "singing"
        text = singing_tag.sub("", text)
    else:
        mode = "speech"

    # Generate the audio.
    audio = infer(model, text, mode=mode)

    # Convert to base64-encoded WAV.
    # NOTE(review): the 24000 Hz sample rate is hard-coded; confirm it matches
    # the rate of the audio returned by infer().
    # Presumably ``audio`` is a torch tensor (it is moved to CPU and converted
    # to a NumPy array before writing) -- TODO confirm against infer().
    buffer = io.BytesIO()
    sf.write(buffer, audio.cpu().numpy(), 24000, format='WAV')
    audio_b64 = base64.b64encode(buffer.getvalue()).decode()
    return {"audio": audio_b64}
def query(payload):
    """Forward *payload* to ``predict`` and return its result unchanged."""
    response = predict(payload)
    return response