anuj-exe committed on
Commit
3075963
·
verified ·
1 Parent(s): 55ee0c6

Create otherApp.py

Browse files
Files changed (1) hide show
  1. otherApp.py +49 -0
otherApp.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, Query
2
+ from fastapi.responses import StreamingResponse
3
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
4
+ import torch
5
+ import io
6
+ import soundfile as sf
7
+ import requests
8
+
9
app = FastAPI(title="SpeechT5 TTS API")

# Checkpoint identifiers, named once so the processor and the TTS model
# cannot drift apart.
_TTS_CHECKPOINT = "microsoft/speecht5_tts"
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

# Everything below is loaded a single time when the module is imported, so
# request handlers reuse the same objects instead of reloading per call.
processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)
15
+
16
+
17
+ # Function to load a speaker embedding from a URL
18
def load_speaker_embedding(url: str, timeout: float = 30.0) -> torch.Tensor:
    """Download a raw float32 speaker embedding and return it with a batch axis.

    Args:
        url: Location of a ``.bin`` file whose bytes are packed float32 values.
        timeout: Seconds to wait for the server before giving up. Without a
            bound, a stalled connection would block the caller forever.

    Returns:
        A float32 tensor of shape ``(1, N)``, where ``N`` is the number of
        float32 values in the downloaded payload.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
        ValueError: If the payload is empty or is not a whole number of
            4-byte float32 values.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    payload = response.content
    if not payload or len(payload) % 4 != 0:
        raise ValueError(
            f"Speaker embedding at {url!r} has {len(payload)} bytes; "
            "expected a non-empty multiple of 4 (float32)."
        )

    # torch.frombuffer shares memory with its source. ``bytes`` is immutable,
    # so copy into a writable bytearray to avoid the non-writable-buffer
    # warning and a tensor that aliases read-only memory.
    embedding = torch.frombuffer(bytearray(payload), dtype=torch.float32)
    return embedding.unsqueeze(0)  # Add batch dimension
24
+
25
+
26
# Voice used for every request: the "US female 1" example embedding,
# downloaded once at import time.
_SPEAKER_EMBEDDING_URL = "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin"

speaker_embeddings = load_speaker_embedding(_SPEAKER_EMBEDDING_URL)
30
+
31
+
32
@app.get("/speak")
def speak(text: str = Query(..., description="Text to convert to speech")):
    """
    Convert text to speech using SpeechT5 + HiFi-GAN.
    Returns a WAV audio stream.
    """
    # NOTE: the docstring above is kept verbatim because FastAPI publishes it
    # as the endpoint description.

    # Tokenize the request text; only the token ids are needed downstream.
    token_ids = processor(text=text, return_tensors="pt")["input_ids"]

    # Synthesize audio for the module-level speaker embedding, handing the
    # vocoder to generate_speech.
    waveform = model.generate_speech(token_ids, speaker_embeddings, vocoder=vocoder)

    # Encode as WAV into memory, then rewind so the response streams the
    # buffer from its first byte.
    wav_stream = io.BytesIO()
    sf.write(wav_stream, waveform.numpy(), samplerate=16000, format="WAV")
    wav_stream.seek(0)

    return StreamingResponse(wav_stream, media_type="audio/wav")