anuj-exe committed on
Commit
9f2ddc0
·
verified ·
1 Parent(s): 966e4cd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -46
app.py CHANGED
@@ -19,19 +19,16 @@ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
19
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
20
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
21
 
22
- # Max tokens allowed by model
23
- MAX_TOKENS = 600
24
-
25
# Map integer speaker id -> speaker x-vector embedding URL.
_XVECTOR_REPO = (
    "https://huggingface.co/datasets/Xenova/"
    "cmu-arctic-xvectors-extracted/resolve/main/"
)
SPEAKER_EMBEDDINGS = {
    0: (
        "https://huggingface.co/datasets/Xenova/"
        "transformers.js-docs/resolve/main/speaker_embeddings.bin"
    ),
    1: _XVECTOR_REPO + "cmu_us_slt_arctic-wav-arctic_a0001.bin",
    2: _XVECTOR_REPO + "cmu_us_clb_arctic-wav-arctic_a0001.bin",
    3: _XVECTOR_REPO + "cmu_us_bdl_arctic-wav-arctic_a0003.bin",
    4: _XVECTOR_REPO + "cmu_us_rms_arctic-wav-arctic_a0003.bin",
    5: _XVECTOR_REPO + "cmu_us_jmk_arctic-wav-arctic_a0002.bin",
    6: _XVECTOR_REPO + "cmu_us_awb_arctic-wav-arctic_b0002.bin",
    7: _XVECTOR_REPO + "cmu_us_ksp_arctic-wav-arctic_a0007.bin",
}
36
 
37
 
@@ -42,7 +39,7 @@ def load_speaker_embedding(url: str) -> torch.Tensor:
42
  return embedding.unsqueeze(0)
43
 
44
 
45
- def smooth_audio(audio: np.ndarray, window_size: int) -> np.ndarray:
46
  smoothed = np.copy(audio)
47
  half = window_size // 2
48
  for i in range(len(audio)):
@@ -57,26 +54,6 @@ def apply_fade_out(audio: np.ndarray, fade_samples: int = 256) -> np.ndarray:
57
  audio[-fade_samples:] *= fade
58
  return audio
59
 
60
-
61
def chunk_text(text: str, processor, max_tokens=None):
    """Split *text* into whitespace-delimited chunks of at most *max_tokens* tokens.

    Words are accumulated greedily; the growing chunk is re-tokenized after
    each word and flushed just before it would exceed the budget.

    Parameters
    ----------
    text : str
        Input text; split on whitespace, so words are never broken apart.
    processor : callable
        Tokenizer invoked as ``processor(s, return_tensors="pt")``; its result
        must support ``["input_ids"].size(1)`` (e.g. a SpeechT5Processor).
    max_tokens : int, optional
        Per-chunk token budget. Defaults to the module-level ``MAX_TOKENS``,
        resolved at call time so later changes to the constant take effect.

    Returns
    -------
    list[str]
        Chunks in original order; ``[]`` for empty or whitespace-only input.
        A single word that by itself exceeds the budget is emitted as its own
        (oversized) chunk rather than as an empty string.
    """
    if max_tokens is None:
        max_tokens = MAX_TOKENS

    words = text.split()
    chunks = []
    current_chunk = []

    for word in words:
        current_chunk.append(word)
        # NOTE: this re-tokenizes the whole growing chunk on every word
        # (O(n^2) tokenizer calls) — acceptable for request-sized text.
        tokens = processor(" ".join(current_chunk), return_tensors="pt")["input_ids"]
        if tokens.size(1) > max_tokens:
            current_chunk.pop()
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            # else: this single word already exceeds max_tokens; carry it
            # into the next chunk instead of emitting "" (bug fix).
            current_chunk = [word]

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
78
-
79
-
80
  @app.get("/speak")
81
  def speak(
82
  text: str = Query(..., description="Text to convert to speech"),
@@ -85,17 +62,9 @@ def speak(
85
  embedding_url = SPEAKER_EMBEDDINGS[speaker]
86
  speaker_embedding = load_speaker_embedding(embedding_url)
87
 
88
- chunks = chunk_text(text, processor, MAX_TOKENS)
89
- audio_list = []
90
-
91
- for chunk in chunks:
92
- inputs = processor(text=chunk, return_tensors="pt")
93
- speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
94
- audio_chunk = speech.numpy().astype(np.float32)
95
- audio_list.append(audio_chunk)
96
-
97
- # Concatenate all chunks
98
- audio = np.concatenate(audio_list)
99
 
100
  # --- Normalize ---
101
  peak = np.max(np.abs(audio))
@@ -103,10 +72,10 @@ def speak(
103
  audio = (audio / peak) * NORMALIZATION_LEVEL
104
 
105
  # --- Smooth audio ---
106
- audio = smooth_audio(audio, SMOOTHING_WINDOW * 2)
107
 
108
- # --- Fade out ---
109
- fade_samples = min(512, len(audio)//10)
110
  audio = apply_fade_out(audio, fade_samples)
111
 
112
  # --- Bit depth ---
 
19
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
20
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
21
 
 
 
 
22
# Map integer speaker id -> speaker embedding URL (trailing comments give
# the voice each embedding produces).
SPEAKER_EMBEDDINGS = {
    0: "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin",  # Normal
    1: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin",  # US female 1
    2: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_clb_arctic-wav-arctic_a0001.bin",  # US female 2
    3: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_bdl_arctic-wav-arctic_a0003.bin",  # US male 1
    4: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_rms_arctic-wav-arctic_a0003.bin",  # US male 2
    5: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_jmk_arctic-wav-arctic_a0002.bin",  # Canadian male
    6: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_awb_arctic-wav-arctic_b0002.bin",  # Scottish male
    7: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_ksp_arctic-wav-arctic_a0007.bin",  # Indian male
}
33
 
34
 
 
39
  return embedding.unsqueeze(0)
40
 
41
 
42
+ def smooth_audio_nodejs(audio: np.ndarray, window_size: int) -> np.ndarray:
43
  smoothed = np.copy(audio)
44
  half = window_size // 2
45
  for i in range(len(audio)):
 
54
  audio[-fade_samples:] *= fade
55
  return audio
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  @app.get("/speak")
58
  def speak(
59
  text: str = Query(..., description="Text to convert to speech"),
 
62
  embedding_url = SPEAKER_EMBEDDINGS[speaker]
63
  speaker_embedding = load_speaker_embedding(embedding_url)
64
 
65
+ inputs = processor(text=text, return_tensors="pt")
66
+ speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
67
+ audio = speech.numpy().astype(np.float32)
 
 
 
 
 
 
 
 
68
 
69
  # --- Normalize ---
70
  peak = np.max(np.abs(audio))
 
72
  audio = (audio / peak) * NORMALIZATION_LEVEL
73
 
74
  # --- Smooth audio ---
75
+ audio = smooth_audio_nodejs(audio, SMOOTHING_WINDOW * 2) # Slightly larger window
76
 
77
+ # --- Fade out to remove clicks at the end ---
78
+ fade_samples = min(512, len(audio)//10) # ~30ms at 16kHz
79
  audio = apply_fade_out(audio, fade_samples)
80
 
81
  # --- Bit depth ---