anuj-exe committed on
Commit
6114331
·
verified ·
1 Parent(s): 3075963

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -9
app.py CHANGED
@@ -1,7 +1,56 @@
 
 
 
 
 
 
 
1
  import numpy as np
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  @app.get("/speak")
4
  def speak(text: str = Query(..., description="Text to convert to speech")):
 
 
 
 
5
  # Prepare input
6
  inputs = processor(text=text, return_tensors="pt")
7
 
@@ -12,17 +61,12 @@ def speak(text: str = Query(..., description="Text to convert to speech")):
12
  # --- Normalize ---
13
  peak = np.max(np.abs(audio))
14
  if peak > 0:
15
- audio = (audio / peak) * 0.1 # match your NORMALIZATION_LEVEL
16
 
17
- # --- Smooth (moving average) ---
18
- window_size = 3 # like SMOOTHING_WINDOW
19
- if window_size > 1:
20
- cumsum = np.cumsum(np.insert(audio, 0, 0))
21
- audio = (cumsum[window_size:] - cumsum[:-window_size]) / window_size
22
- # pad to original length
23
- audio = np.pad(audio, (window_size//2, window_size-1-window_size//2), mode='edge')
24
 
25
- # Write WAV as 32-bit float
26
  buf = io.BytesIO()
27
  sf.write(buf, audio, samplerate=16000, format="WAV", subtype="FLOAT")
28
  buf.seek(0)
 
1
+ from fastapi import FastAPI, Query
2
+ from fastapi.responses import StreamingResponse
3
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
4
+ import torch
5
+ import io
6
+ import soundfile as sf
7
+ import requests
8
  import numpy as np
9
 
10
+ app = FastAPI(title="SpeechT5 TTS API")
11
+
12
+ # Load models once at startup
13
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
14
+ model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
15
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
16
+
17
+
18
+ # Function to load a speaker embedding from a URL
19
def load_speaker_embedding(url: str, timeout: float = 30.0) -> torch.Tensor:
    """
    Download a raw float32 speaker embedding and return it as a batched tensor.

    Args:
        url: Location of a ``.bin`` file whose bytes are read as float32 values.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the caller forever.

    Returns:
        A tensor of shape ``(1, N)``, where ``N`` is the number of float32
        values contained in the downloaded payload.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
        ValueError: If the payload is empty or its size is not a multiple of
            four bytes (i.e. it cannot be a float32 array).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    payload = response.content
    if not payload or len(payload) % 4 != 0:
        raise ValueError(
            f"Speaker embedding at {url!r} is not a valid float32 buffer "
            f"({len(payload)} bytes)"
        )

    # torch.frombuffer shares memory with its source; `bytes` is read-only, so
    # copy into a writable bytearray to get a tensor that owns mutable storage
    # (and to avoid PyTorch's non-writable-buffer warning).
    embedding = torch.frombuffer(bytearray(payload), dtype=torch.float32)
    return embedding.unsqueeze(0)  # Add batch dimension
25
+
26
+
27
+ # Example: load US female 1
28
+ speaker_embeddings = load_speaker_embedding(
29
+ "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin"
30
+ )
31
+
32
+
33
def smooth_audio(audio: np.ndarray, window_size: int = 3) -> np.ndarray:
    """
    Smooth a signal with a simple moving average.

    Each averaged value is the mean of ``window_size`` consecutive input
    samples. The ends are padded by repeating the first and last averaged
    values so the output has as many samples as the input.

    Args:
        audio: Samples to smooth. Multi-dimensional input is flattened.
        window_size: Number of samples averaged per output value. Values
            below 2 disable smoothing.

    Returns:
        The smoothed samples, in the input's dtype when that dtype is
        floating point. ``audio`` is returned unchanged when ``window_size``
        is below 2 or when there are fewer samples than ``window_size``
        (there is no full window to average, and edge-padding an empty
        result would raise).
    """
    if window_size < 2:
        return audio

    samples = np.asarray(audio)
    if samples.size < window_size:
        return audio

    # Accumulate in at least float64: a float32 running sum over a long clip
    # grows large enough that differences of neighbouring sums lose precision.
    accum_dtype = np.result_type(samples.dtype, np.float64)
    cumsum = np.cumsum(np.insert(samples.astype(accum_dtype), 0, 0))
    smoothed = (cumsum[window_size:] - cumsum[:-window_size]) / window_size

    # Pad back to the original length.
    pad_left = window_size // 2
    pad_right = window_size - 1 - pad_left
    smoothed = np.pad(smoothed, (pad_left, pad_right), mode="edge")

    # Hand back the caller's floating dtype (e.g. float32 audio stays float32).
    if np.issubdtype(samples.dtype, np.inexact):
        smoothed = smoothed.astype(samples.dtype, copy=False)
    return smoothed
46
+
47
+
48
  @app.get("/speak")
49
  def speak(text: str = Query(..., description="Text to convert to speech")):
50
+ """
51
+ Convert text to speech using SpeechT5 + HiFi-GAN.
52
+ Returns a WAV audio stream.
53
+ """
54
  # Prepare input
55
  inputs = processor(text=text, return_tensors="pt")
56
 
 
61
  # --- Normalize ---
62
  peak = np.max(np.abs(audio))
63
  if peak > 0:
64
+ audio = (audio / peak) * 0.1 # Adjustable normalization level
65
 
66
+ # --- Smooth ---
67
+ audio = smooth_audio(audio, window_size=3)
 
 
 
 
 
68
 
69
+ # --- Write WAV as 32-bit float ---
70
  buf = io.BytesIO()
71
  sf.write(buf, audio, samplerate=16000, format="WAV", subtype="FLOAT")
72
  buf.seek(0)