anuj-exe committed
Commit 55ee0c6 · verified · 1 Parent(s): 4fab8cf

Update app.py

Files changed (1)
  1. app.py +18 -37
app.py CHANGED
@@ -1,49 +1,30 @@
-from fastapi import FastAPI, Query
-from fastapi.responses import StreamingResponse
-from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-import torch
-import io
-import soundfile as sf
-import requests
-
-app = FastAPI(title="SpeechT5 TTS API")
-
-# Load models once at startup
-processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-
-
-# Function to load a speaker embedding from a URL
-def load_speaker_embedding(url: str) -> torch.Tensor:
-    response = requests.get(url)
-    response.raise_for_status()
-    # Load the .bin file as a float32 tensor
-    embedding = torch.frombuffer(response.content, dtype=torch.float32)
-    return embedding.unsqueeze(0)  # Add batch dimension
-
-
-# Example: load US female 1
-speaker_embeddings = load_speaker_embedding(
-    "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin"
-)
-
+import numpy as np
 
 @app.get("/speak")
 def speak(text: str = Query(..., description="Text to convert to speech")):
-    """
-    Convert text to speech using SpeechT5 + HiFi-GAN.
-    Returns a WAV audio stream.
-    """
     # Prepare input
     inputs = processor(text=text, return_tensors="pt")
 
     # Generate speech
     speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
-
-    # Convert to bytes buffer
+    audio = speech.numpy().astype(np.float32)
+
+    # --- Normalize ---
+    peak = np.max(np.abs(audio))
+    if peak > 0:
+        audio = (audio / peak) * 0.1  # match your NORMALIZATION_LEVEL
+
+    # --- Smooth (moving average) ---
+    window_size = 3  # like SMOOTHING_WINDOW
+    if window_size > 1:
+        cumsum = np.cumsum(np.insert(audio, 0, 0))
+        audio = (cumsum[window_size:] - cumsum[:-window_size]) / window_size
+        # pad to original length
+        audio = np.pad(audio, (window_size//2, window_size-1-window_size//2), mode='edge')
+
+    # Write WAV as 32-bit float
     buf = io.BytesIO()
-    sf.write(buf, speech.numpy(), samplerate=16000, format="WAV")
+    sf.write(buf, audio, samplerate=16000, format="WAV", subtype="FLOAT")
     buf.seek(0)
 
     return StreamingResponse(buf, media_type="audio/wav")
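
For reference, the normalization and smoothing added in this commit can be checked in isolation. The snippet below is a minimal sketch: the 0.1 peak level and window size of 3 come from the diff, while the synthetic sine input is purely an assumption standing in for the SpeechT5 output. It confirms that the edge padding restores the original sample count and that the peak stays at the target level.

import numpy as np

# Synthetic 1-second, 16 kHz sine wave standing in for speech.numpy()
# (illustrative only -- not part of the commit).
sr = 16000
t = np.arange(sr) / sr
audio = 0.8 * np.sin(2 * np.pi * 220.0 * t).astype(np.float32)
n = len(audio)

# Normalize to a 0.1 peak, as in the committed code.
peak = np.max(np.abs(audio))
if peak > 0:
    audio = (audio / peak) * 0.1

# Moving-average smoothing via cumulative sums, window of 3 samples.
window_size = 3
if window_size > 1:
    cumsum = np.cumsum(np.insert(audio, 0, 0))
    audio = (cumsum[window_size:] - cumsum[:-window_size]) / window_size
    # Edge-pad back to the original length.
    audio = np.pad(audio, (window_size // 2, window_size - 1 - window_size // 2), mode="edge")

assert len(audio) == n                       # length preserved by the padding
assert np.max(np.abs(audio)) <= 0.1 + 1e-6   # averaging cannot exceed the normalized peak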
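
The endpoint is called the same way after the change. The following client sketch assumes app.py is served locally (the base URL and port are assumptions, not something the commit specifies); it requests speech for a sentence and writes the returned WAV to disk.

import requests

# Base URL is an assumption (local dev server); adjust to wherever app.py is served.
BASE_URL = "http://127.0.0.1:8000"

resp = requests.get(
    f"{BASE_URL}/speak",
    params={"text": "Hello from SpeechT5."},
    timeout=120,  # the first request can be slow while the models load
)
resp.raise_for_status()
assert resp.headers.get("content-type", "").startswith("audio/wav")

with open("speech.wav", "wb") as f:
    f.write(resp.content)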