Simonlob committed on
Commit
eede4e0
·
1 Parent(s): 77bfda1

debug emb cache

Browse files
Files changed (4) hide show
  1. .DS_Store +0 -0
  2. .gitignore +5 -0
  3. app.py +7 -1
  4. util.py +15 -2
.DS_Store DELETED
Binary file (6.15 kB)
 
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ .DS_Store/
2
+ venv/
3
+ env/
4
+ __Pycache__/
5
+ __pycache__/
app.py CHANGED
@@ -59,7 +59,14 @@ def generate_speech_gpu(text, model_choice, mode, speaker_choice, t, top_p, rp):
59
  selected_model = models[model_choice]
60
 
61
  # Get speaker embedding based on mode
 
62
  speaker_emb = speaker_manager.get_speaker_emb(mode, speaker_choice)
 
 
 
 
 
 
63
 
64
  print(f"Generating speech with {model_choice}...")
65
  audio, _ = selected_model(
@@ -114,7 +121,6 @@ with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Ocean()) a
114
  type="numpy",
115
  sources=["upload", "microphone"],
116
  format="wav",
117
- waveform_options={"sample_rate": 16000}
118
  )
119
 
120
  with gr.Row():
 
59
  selected_model = models[model_choice]
60
 
61
  # Get speaker embedding based on mode
62
+ print(f"[generate_speech_gpu] Mode: {mode}, Speaker choice: {speaker_choice}")
63
  speaker_emb = speaker_manager.get_speaker_emb(mode, speaker_choice)
64
+ print(f"[generate_speech_gpu] Speaker emb type: {type(speaker_emb)}")
65
+ if speaker_emb is not None:
66
+ if isinstance(speaker_emb, str):
67
+ print(f"[generate_speech_gpu] Speaker emb is path: {speaker_emb}")
68
+ elif torch.is_tensor(speaker_emb):
69
+ print(f"[generate_speech_gpu] Speaker emb is tensor: shape={speaker_emb.shape}, device={speaker_emb.device}")
70
 
71
  print(f"Generating speech with {model_choice}...")
72
  audio, _ = selected_model(
 
121
  type="numpy",
122
  sources=["upload", "microphone"],
123
  format="wav",
 
124
  )
125
 
126
  with gr.Row():
util.py CHANGED
@@ -130,13 +130,20 @@ class SpeakerManager:
130
  """
131
  if mode == "select":
132
  if speaker_name and speaker_name in self.speaker_map:
133
- return self.speaker_map[speaker_name]
 
 
134
  return None
135
  elif mode == "generate":
 
 
 
 
 
136
  return self.cached_embedding
137
  return None
138
 
139
- def generate_embedding(self, audio_data, sample_rate: int = 16000):
140
  """
141
  Generate speaker embedding from audio data.
142
 
@@ -154,19 +161,25 @@ class SpeakerManager:
154
  """
155
  # Initialize embedder lazily
156
  if self.embedder is None:
 
157
  self.embedder = SpeakerEmbedder()
158
 
159
  # Handle Gradio audio format (sr, audio) tuple
160
  if isinstance(audio_data, tuple):
161
  sample_rate, audio_array = audio_data
 
162
  else:
163
  audio_array = audio_data
 
164
 
165
  # Generate embedding
 
166
  embedding = self.embedder.embed_audio(audio_array, sample_rate=sample_rate)
 
167
 
168
  # Cache the result
169
  self.cached_embedding = embedding
 
170
 
171
  return embedding
172
 
 
130
  """
131
  if mode == "select":
132
  if speaker_name and speaker_name in self.speaker_map:
133
+ path = self.speaker_map[speaker_name]
134
+ print(f"[SpeakerManager] Returning speaker path: {path}")
135
+ return path
136
  return None
137
  elif mode == "generate":
138
+ print(f"[SpeakerManager] Cached embedding: {self.cached_embedding}")
139
+ print(f"[SpeakerManager] Cached embedding type: {type(self.cached_embedding)}")
140
+ if self.cached_embedding is not None:
141
+ print(f"[SpeakerManager] Cached embedding shape: {self.cached_embedding.shape}")
142
+ print(f"[SpeakerManager] Cached embedding device: {self.cached_embedding.device}")
143
  return self.cached_embedding
144
  return None
145
 
146
+ def generate_embedding(self, audio_data, sample_rate: int):
147
  """
148
  Generate speaker embedding from audio data.
149
 
 
161
  """
162
  # Initialize embedder lazily
163
  if self.embedder is None:
164
+ print("[SpeakerManager] Initializing SpeakerEmbedder...")
165
  self.embedder = SpeakerEmbedder()
166
 
167
  # Handle Gradio audio format (sr, audio) tuple
168
  if isinstance(audio_data, tuple):
169
  sample_rate, audio_array = audio_data
170
+ print(f"[SpeakerManager] Audio tuple: sr={sample_rate}, shape={audio_array.shape}")
171
  else:
172
  audio_array = audio_data
173
+ print(f"[SpeakerManager] Audio array shape: {audio_array.shape}")
174
 
175
  # Generate embedding
176
+ print("[SpeakerManager] Generating embedding...")
177
  embedding = self.embedder.embed_audio(audio_array, sample_rate=sample_rate)
178
+ print(f"[SpeakerManager] Generated embedding shape: {embedding.shape}, device: {embedding.device}")
179
 
180
  # Cache the result
181
  self.cached_embedding = embedding
182
+ print(f"[SpeakerManager] Cached embedding (id={id(self.cached_embedding)})")
183
 
184
  return embedding
185