Michael Hu
tts is deprecated, use fish speech
933cc7f
raw
history blame
1.49 kB
import time
import yaml
from pathlib import Path
import torch
from fish_audio.sdk import TextToSpeech, Vocoder
from pydub import AudioSegment
# Load config
# The file is resolved relative to this module: <module dir>/../config/tts_config.yaml
config_path = Path(__file__).parent.parent / "config" / "tts_config.yaml"
# Decode explicitly as UTF-8 so non-ASCII values in the YAML are read the same
# way on every platform instead of depending on the locale's default encoding.
with open(config_path, encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Initialize models
# Both models are constructed once, at import time, from the "tts" section of
# the config and are then reused by generate_speech().
tts_model = TextToSpeech(
    model_name=config["tts"]["model"],
    device=config["tts"]["device"],
    precision=config["tts"]["precision"],
)
# The vocoder is created on whatever device the TTS model reports.
vocoder = Vocoder(
    model_name=config["tts"]["vocoder"],
    device=tts_model.device,
)
def generate_speech(text: str, language: str = "zh") -> str:
    """Generate speech from text using Fish Audio SDK.

    The text is wrapped in the template configured for ``language``, turned
    into a mel spectrogram by the module-level ``tts_model``, converted to a
    waveform by the module-level ``vocoder`` and written out as a mono,
    16-bit WAV file.

    Args:
        text: The text to synthesize.
        language: Key into ``config["generation"]["language_mapping"]``
            selecting the template the text is formatted into.

    Returns:
        The path of the written WAV file, of the form
        ``temp/outputs/output_<unix seconds>.wav``.

    Raises:
        KeyError: If ``language`` has no entry in the configured language
            mapping.
    """
    # Format text with language tags
    lang_template = config["generation"]["language_mapping"][language]
    processed_text = lang_template.format(text=text)

    # Generate mel spectrogram
    generation_cfg = config["generation"]
    mel = tts_model.generate(
        text=processed_text,
        temperature=generation_cfg["temperature"],
        top_k=generation_cfg["top_k"],
        max_length=generation_cfg["max_length"],
    )

    # Convert mel to waveform
    waveform = vocoder(mel)

    # Tensor.numpy() only works for CPU tensors that are detached from
    # autograd; the model device comes from config and may be a GPU.
    samples = waveform.detach().cpu()
    if samples.is_floating_point():
        # The segment below is declared as sample_width=2 (16-bit PCM), so
        # floating-point samples must be converted to int16 before their raw
        # bytes are handed over.
        # NOTE(review): assumes float samples are normalized to [-1, 1] --
        # confirm against the vocoder's output range.
        samples = (
            (samples.float().clamp(-1.0, 1.0) * 32767.0).round().to(torch.int16)
        )

    # Create audio segment
    audio = AudioSegment(
        samples.numpy().tobytes(),
        frame_rate=vocoder.sample_rate,
        sample_width=2,
        channels=1,
    )

    # Save output
    # Make sure the target directory exists so export does not fail on a
    # fresh checkout.
    Path("temp/outputs").mkdir(parents=True, exist_ok=True)
    # NOTE(review): the name has one-second resolution, so two calls within
    # the same second write to the same file.
    output_path = f"temp/outputs/output_{int(time.time())}.wav"
    audio.export(output_path, format="wav")
    return output_path