import io

import numpy as np
import requests
import soundfile as sf
import torch
from fastapi import FastAPI, Query
from fastapi.responses import StreamingResponse
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# Sample rate (Hz) written into the WAV header of every response.
SAMPLE_RATE = 16000
# Peak amplitude the waveform is scaled to before smoothing (adjustable).
TARGET_PEAK = 0.1

app = FastAPI(title="SpeechT5 TTS API")

# Load models once at startup so requests do not pay the loading cost.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


def load_speaker_embedding(url: str, timeout: float = 30.0) -> torch.Tensor:
    """Download a raw float32 speaker embedding and return it as a tensor.

    Args:
        url: Location of a ``.bin`` file whose bytes are read as float32
            values.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block start-up forever.

    Returns:
        A float32 tensor with a leading batch dimension of size 1.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        ValueError: If the payload size is not a multiple of 4 bytes.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # ``response.content`` is immutable ``bytes``; copy it into a writable
    # array so the resulting tensor does not alias read-only memory.
    values = np.frombuffer(response.content, dtype=np.float32).copy()
    return torch.from_numpy(values).unsqueeze(0)  # Add batch dimension


# Example: load US female 1
speaker_embeddings = load_speaker_embedding(
    "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin"
)


def smooth_audio(audio: np.ndarray, window_size: int = 3) -> np.ndarray:
    """Apply a simple moving-average filter to ``audio``.

    Args:
        audio: Samples to smooth. Multi-dimensional input is flattened.
        window_size: Number of samples averaged per output sample. Values
            below 2 disable smoothing.

    Returns:
        The smoothed samples, edge-padded back to the input length. The
        input is returned unchanged when ``window_size < 2`` or when there
        are fewer samples than ``window_size``.
    """
    if window_size < 2:
        return audio

    samples = np.asarray(audio)
    # With fewer samples than the window there is no full window to average,
    # and edge-padding an empty result would raise; leave the input as is.
    if samples.size < window_size:
        return audio

    # Accumulate in float64: a float32 running sum over a whole utterance
    # loses precision as the total grows.
    cumsum = np.cumsum(np.insert(samples, 0, 0), dtype=np.float64)
    smoothed = (cumsum[window_size:] - cumsum[:-window_size]) / window_size

    # Floating-point input keeps its dtype; other dtypes yield float64.
    if np.issubdtype(samples.dtype, np.floating):
        smoothed = smoothed.astype(samples.dtype)

    # Pad to original length: the averaging drops ``window_size - 1`` samples.
    pad_left = window_size // 2
    pad_right = window_size - 1 - pad_left
    return np.pad(smoothed, (pad_left, pad_right), mode="edge")


@app.get("/speak")
def speak(text: str = Query(..., description="Text to convert to speech")):
    """
    Convert text to speech using SpeechT5 + HiFi-GAN.
    Returns a WAV audio stream.
    """
    # Prepare input
    inputs = processor(text=text, return_tensors="pt")

    # Generate speech
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )
    audio = speech.numpy().astype(np.float32)

    # --- Normalize ---
    # An empty waveform has no maximum; treat it like silence and skip scaling.
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio = (audio / peak) * TARGET_PEAK

    # --- Smooth ---
    audio = smooth_audio(audio, window_size=3)

    # --- Write WAV as 32-bit float ---
    buf = io.BytesIO()
    sf.write(buf, audio, samplerate=SAMPLE_RATE, format="WAV", subtype="FLOAT")
    buf.seek(0)
    return StreamingResponse(buf, media_type="audio/wav")