from fastapi import FastAPI, Query
from fastapi.responses import StreamingResponse
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import torch
import io
import soundfile as sf
import requests

app = FastAPI(title="SpeechT5 TTS API")

# Load models once at startup
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


# Function to load a speaker embedding from a URL
def load_speaker_embedding(url: str) -> torch.Tensor:
    response = requests.get(url)
    response.raise_for_status()
    # Load the .bin file as a float32 tensor
    embedding = torch.frombuffer(response.content, dtype=torch.float32)
    return embedding.unsqueeze(0)  # Add batch dimension


# Example: load US female 1
speaker_embeddings = load_speaker_embedding(
    "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin"
)


@app.get("/speak")
def speak(text: str = Query(..., description="Text to convert to speech")):
    """
    Convert text to speech using SpeechT5 + HiFi-GAN.
    Returns a WAV audio stream.
    """
    # Prepare input
    inputs = processor(text=text, return_tensors="pt")

    # Generate speech
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Convert to bytes buffer
    buf = io.BytesIO()
    sf.write(buf, speech.numpy(), samplerate=16000, format="WAV")
    buf.seek(0)

    return StreamingResponse(buf, media_type="audio/wav")
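

# A minimal sketch of how to run and exercise this service locally. This assumes the file
# is saved as app.py and that uvicorn is installed; the module name, host, and port are
# placeholders, not part of the API above:
#
#   uvicorn app:app --host 0.0.0.0 --port 8000
#   curl -G "http://localhost:8000/speak" --data-urlencode "text=Hello from SpeechT5" -o hello.wav
#
# Alternatively, start the server directly when the script is executed:
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)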