Spaces:
M4xjunior
/
Running on Zero

M4xjunior committed on
Commit
a679a35
·
1 Parent(s): 4894b78
Files changed (3) hide show
  1. app.py +136 -119
  2. logs/sentence_analyzer_2024-12-02.log +12 -0
  3. src/f5_tts/api.py +1 -2
app.py CHANGED
@@ -1,4 +1,7 @@
 
 
1
  import nltk
 
2
  nltk.download('punkt_tab')
3
  from sentence_analyzer import SentenceAnalyzer
4
  import re
@@ -10,29 +13,41 @@ import gradio as gr
10
  import numpy as np
11
  import soundfile as sf
12
  import torchaudio
13
- import torch
14
  from cached_path import cached_path
15
  from transformers import AutoModelForCausalLM, AutoTokenizer
16
- from tqdm import tqdm
17
  try:
18
  import spaces
 
19
  USING_SPACES = True
20
  except ImportError:
21
  USING_SPACES = False
22
 
 
23
  def gpu_decorator(func):
24
  if USING_SPACES:
25
  return spaces.GPU(func)
26
  else:
27
  return func
28
 
29
- # Importando a nova API F5TTS
30
- from f5_tts.api import F5TTS
31
- from f5_tts.infer.utils_infer import preprocess_ref_audio_text
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  import os
34
  from huggingface_hub import hf_hub_download
35
 
 
36
  def load_f5tts():
37
  # Carrega o caminho do repositório e o nome do arquivo das variáveis de ambiente
38
  repo_id = os.getenv("MODEL_REPO_ID", "SWivid/F5-TTS/F5TTS_Base")
@@ -40,88 +55,87 @@ def load_f5tts():
40
  token = os.getenv("HUGGINGFACE_TOKEN")
41
  # Valida se o token está presente
42
  if not token:
43
- raise ValueError("A variável de ambiente 'HUGGINGFACE_TOKEN' não foi definida.")
 
 
44
  # Faz o download do modelo do repositório privado
45
- ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename, use_auth_token=token)
46
-
47
- # Define as configurações do modelo (ajuste se necessário)
48
- F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
49
 
50
- # Retorna a instância da API F5TTS
51
- return F5TTS(
52
- model_type="F5-TTS", # Ajuste o nome do modelo se necessário
53
- ckpt_file=ckpt_path,
54
- vocab_file="/home/user/app/data/Emilia_ZH_EN_pinyin/vocab.txt", # Caminho para o arquivo vocab.txt
55
- device="cuda" if torch.cuda.is_available() else "cpu", # Define o dispositivo
56
- use_ema=True
57
  )
 
58
 
59
- # Carregar modelo F5TTS usando a nova API
60
- F5TTS_ema_model = load_f5tts()
61
 
62
- # Variáveis globais para o cache
63
- last_checkpoint = None
64
- last_device = None
65
- last_ema = None
66
- tts_api = None
67
- training_process = None # Adicione esta linha se necessário para o seu contexto
68
 
69
- # Modificação na classe F5TTS para salvar o áudio em um arquivo temporário
70
- class F5TTS(F5TTS): # Herdando da classe original F5TTS
71
- def infer(
72
- self,
73
- ref_file,
74
- ref_text,
75
- gen_text,
76
- show_info=print,
77
- progress=tqdm,
78
- target_rms=0.1,
79
- cross_fade_duration=0.15,
80
- sway_sampling_coef=-1,
81
- cfg_strength=2,
82
- nfe_step=32,
83
- speed=1.0,
84
- fix_duration=None,
85
- remove_silence=False,
86
- file_wave=None,
87
- file_spect=None,
88
- seed=-1,
89
- ):
90
- if seed == -1:
91
- seed = random.randint(0, sys.maxsize)
92
- seed_everything(seed)
93
- self.seed = seed
94
- wav, sr, spect = infer_process( # Chamando infer_process
95
- ref_file,
96
- ref_text,
97
- gen_text,
98
- self.ema_model,
99
- show_info=show_info,
100
- progress=progress,
101
- target_rms=target_rms,
102
- cross_fade_duration=cross_fade_duration,
103
- nfe_step=nfe_step,
104
- cfg_strength=cfg_strength,
105
- sway_sampling_coef=sway_sampling_coef,
106
- speed=speed,
107
- fix_duration=fix_duration,
108
- device=self.device,
109
- )
110
- if file_wave is not None:
111
- self.export_wav(wav, file_wave, remove_silence)
112
- if file_spect is not None:
113
- self.export_spectrogram(spect, file_spect)
114
 
115
- # Salvar o áudio em um arquivo temporário
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
117
- sf.write(f.name, wav, sr)
118
- audio_file = f.name # Atribuir o caminho do arquivo temporário à variável audio_file
119
-
120
- # Retornar o caminho do arquivo temporário
121
- return audio_file, self.device, str(self.seed)
 
 
 
 
122
 
123
 
124
- with gr.Blocks() as app:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  with gr.Tabs():
126
  with gr.Tab("TTS Básico"):
127
  gr.Markdown("# TTS Básico com F5-TTS")
@@ -129,8 +143,8 @@ with gr.Blocks() as app:
129
  # Entradas básicas
130
  ref_audio_input = gr.Audio(label="Áudio de Referência", type="filepath")
131
  gen_text_input = gr.Textbox(label="Texto para Gerar", lines=10)
 
132
  generate_btn = gr.Button("Sintetizar", variant="primary")
133
-
134
  # Configurações avançadas
135
  gr.Markdown("### Configurações Avançadas")
136
  with gr.Accordion("Expandir Configurações Avançadas", open=False):
@@ -152,6 +166,7 @@ with gr.Blocks() as app:
152
  step=0.1,
153
  info="Ajuste a velocidade do áudio.",
154
  )
 
155
  cross_fade_duration_slider = gr.Slider(
156
  label="Duração do Cross-fade (s)",
157
  minimum=0.0,
@@ -176,7 +191,6 @@ with gr.Blocks() as app:
176
  step=1,
177
  info="Ajuste NFE Step.",
178
  )
179
- seed_input = gr.Number(label="Seed", value=-1, minimum=-1) # Seed na seção avançada
180
 
181
  analyzer = SentenceAnalyzer()
182
 
@@ -190,11 +204,8 @@ with gr.Blocks() as app:
190
  speed_slider,
191
  nfe_slider,
192
  chunk_size_slider,
193
- seed_input, # Passando o seed para process_chunks
194
  ):
195
- # Acessando a instância F5TTS_ema_model diretamente
196
- f5tts_model = F5TTS_ema_model
197
-
198
  # Dividir o texto em sentenças
199
  sentences = analyzer.split_into_sentences(gen_text_input)
200
 
@@ -207,37 +218,39 @@ with gr.Blocks() as app:
207
  # Processar cada chunk
208
  audio_segments = []
209
  for chunk in chunks:
210
- # Usando a função infer correta aqui
211
- audio_file, _, _ = f5tts_model.infer( # Usando f5tts_model.infer
212
- ref_file=ref_audio_input,
213
- ref_text=ref_text_input,
214
- gen_text=chunk,
215
- nfe_step=nfe_slider,
216
- speed=speed_slider,
 
217
  seed=seed_input,
218
- remove_silence=remove_silence,
219
  )
220
- audio_data, _ = torchaudio.load(audio_file)
221
- audio_segments.append(audio_data.squeeze().cpu().numpy())
222
 
223
  # Concatenar os segmentos de áudio gerados
224
  if audio_segments:
225
  final_audio_data = np.concatenate(audio_segments)
226
  return (
227
- (24000, final_audio_data), # Áudio final - assumindo taxa de amostragem de 24000
228
- None, # Espectrograma - não estamos gerando um espectrograma aqui
229
- gr.update(value=ref_text_input), # Nenhuma mudança no Texto de Referência
230
- f5tts_model.seed # Retornando o seed da instância F5TTS_ema_model
 
 
231
  )
232
  else:
233
  gr.Warning("Nenhum áudio gerado.")
234
- return None, None, gr.update(), None # Retornando None para o seed
235
 
236
  # Saídas
237
  gr.Markdown("### Resultados")
238
  audio_output = gr.Audio(label="Áudio Sintetizado")
239
- seed_output = gr.Text(label="Seed usada:") # Saída do Seed
240
-
241
  # Associação do botão `generate_btn` à função `process_chunks`
242
  generate_btn.click(
243
  process_chunks,
@@ -250,31 +263,35 @@ with gr.Blocks() as app:
250
  speed_slider,
251
  nfe_slider,
252
  chunk_size_slider,
253
- seed_input,
254
- ],
255
  outputs=[
256
  audio_output,
257
- ref_text_input,
 
258
  seed_output,
259
  ],
260
  )
261
-
262
- # Código para iniciar a aplicação Gradio
263
- @click.command()
264
- @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
265
- @click.option("--host", "-H", default=None, help="Host to run the app on")
266
- @click.option(
267
- "--share",
268
- "-s",
269
- default=False,
270
- is_flag=True,
271
- help="Share the app via Gradio share link",
 
 
 
 
 
 
 
272
  )
273
- @click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
274
- def main(port, host, share, api):
275
- global app
276
- print("Starting app...")
277
- app.queue(api_open=api).launch(server_name=host, server_port=port, share=share, show_api=api)
278
 
279
  if __name__ == "__main__":
280
  if not USING_SPACES:
 
1
+ import random
2
+ import sys
3
  import nltk
4
+
5
  nltk.download('punkt_tab')
6
  from sentence_analyzer import SentenceAnalyzer
7
  import re
 
13
  import numpy as np
14
  import soundfile as sf
15
  import torchaudio
 
16
  from cached_path import cached_path
17
  from transformers import AutoModelForCausalLM, AutoTokenizer
18
+
19
  try:
20
  import spaces
21
+
22
  USING_SPACES = True
23
  except ImportError:
24
  USING_SPACES = False
25
 
26
+
27
  def gpu_decorator(func):
28
  if USING_SPACES:
29
  return spaces.GPU(func)
30
  else:
31
  return func
32
 
 
 
 
33
 
34
+ from f5_tts.model import DiT, UNetT
35
+ from f5_tts.model.utils import seed_everything
36
+ from f5_tts.infer.utils_infer import (
37
+ load_vocoder,
38
+ load_model,
39
+ preprocess_ref_audio_text,
40
+ infer_process,
41
+ remove_silence_for_generated_wav,
42
+ save_spectrogram,
43
+ )
44
+
45
+ # Carregar vocoder
46
+ vocoder = load_vocoder()
47
  import os
48
  from huggingface_hub import hf_hub_download
49
 
50
+
51
  def load_f5tts():
52
  # Carrega o caminho do repositório e o nome do arquivo das variáveis de ambiente
53
  repo_id = os.getenv("MODEL_REPO_ID", "SWivid/F5-TTS/F5TTS_Base")
 
55
  token = os.getenv("HUGGINGFACE_TOKEN")
56
  # Valida se o token está presente
57
  if not token:
58
+ raise ValueError(
59
+ "A variável de ambiente 'HUGGINGFACE_TOKEN' não foi definida."
60
+ )
61
  # Faz o download do modelo do repositório privado
62
+ ckpt_path = hf_hub_download(
63
+ repo_id=repo_id, filename=filename, use_auth_token=token
64
+ )
 
65
 
66
+ F5TTS_model_cfg = dict(
67
+ dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4
 
 
 
 
 
68
  )
69
+ return load_model(DiT, F5TTS_model_cfg, ckpt_path, use_ema=True)
70
 
 
 
71
 
72
+ # Carregar modelo F5TTS
73
+ F5TTS_ema_model = load_f5tts()
 
 
 
 
74
 
75
+ @gpu_decorator
76
+ def infer(
77
+ ref_audio_orig,
78
+ ref_text,
79
+ gen_text,
80
+ remove_silence,
81
+ cross_fade_duration=0.15,
82
+ speed=1,
83
+ nfe=32,
84
+ show_info=gr.Info,
85
+ seed=-1,
86
+ ):
87
+ if seed == -1:
88
+ seed = random.randint(0, sys.maxsize)
89
+ seed_everything(seed)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
+ print(f"Usando seed: {seed}")
92
+ ref_audio, ref_text = preprocess_ref_audio_text(
93
+ ref_audio_orig, ref_text, show_info=show_info
94
+ )
95
+ ema_model = F5TTS_ema_model
96
+ final_wave, final_sample_rate, combined_spectrogram = infer_process(
97
+ ref_audio,
98
+ ref_text.lower().strip(),
99
+ gen_text.lower().strip(),
100
+ ema_model,
101
+ vocoder,
102
+ cross_fade_duration=cross_fade_duration,
103
+ nfe_step=nfe,
104
+ speed=speed,
105
+ show_info=show_info,
106
+ progress=gr.Progress(),
107
+ )
108
+ # Remover silêncios
109
+ if remove_silence:
110
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
111
+ sf.write(f.name, final_wave, final_sample_rate)
112
+ remove_silence_for_generated_wav(f.name)
113
+ final_wave, _ = torchaudio.load(f.name)
114
+ final_wave = final_wave.squeeze().cpu().numpy()
115
+ # Salvar espectrograma
116
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
117
+ spectrogram_path = tmp_spectrogram.name
118
+ save_spectrogram(combined_spectrogram, spectrogram_path)
119
+ return (final_sample_rate, final_wave), spectrogram_path, ref_text, seed
120
 
121
 
122
+ # Estilos CSS
123
+ custom_css = """
124
+ #sentences-container {
125
+ border: 1px solid #ddd;
126
+ border-radius: 4px;
127
+ padding: 10px;
128
+ margin-bottom: 10px;
129
+ }
130
+ .sentence-box {
131
+ border: 1px solid #eee;
132
+ padding: 5px;
133
+ margin-bottom: 5px;
134
+ border-radius: 4px;
135
+ background-color: #f9f9f9;
136
+ }
137
+ """
138
+ with gr.Blocks(css=custom_css) as app:
139
  with gr.Tabs():
140
  with gr.Tab("TTS Básico"):
141
  gr.Markdown("# TTS Básico com F5-TTS")
 
143
  # Entradas básicas
144
  ref_audio_input = gr.Audio(label="Áudio de Referência", type="filepath")
145
  gen_text_input = gr.Textbox(label="Texto para Gerar", lines=10)
146
+ seed_input = gr.Number(label="Seed (opcional)", value=-1)
147
  generate_btn = gr.Button("Sintetizar", variant="primary")
 
148
  # Configurações avançadas
149
  gr.Markdown("### Configurações Avançadas")
150
  with gr.Accordion("Expandir Configurações Avançadas", open=False):
 
166
  step=0.1,
167
  info="Ajuste a velocidade do áudio.",
168
  )
169
+
170
  cross_fade_duration_slider = gr.Slider(
171
  label="Duração do Cross-fade (s)",
172
  minimum=0.0,
 
191
  step=1,
192
  info="Ajuste NFE Step.",
193
  )
 
194
 
195
  analyzer = SentenceAnalyzer()
196
 
 
204
  speed_slider,
205
  nfe_slider,
206
  chunk_size_slider,
207
+ seed_input,
208
  ):
 
 
 
209
  # Dividir o texto em sentenças
210
  sentences = analyzer.split_into_sentences(gen_text_input)
211
 
 
218
  # Processar cada chunk
219
  audio_segments = []
220
  for chunk in chunks:
221
+ audio_out, spectrogram_path, ref_text_out, seed_used = infer(
222
+ ref_audio_input,
223
+ ref_text_input, # Utiliza o Texto de Referência como está
224
+ chunk, # Processa o chunk atual
225
+ remove_silence,
226
+ cross_fade_duration_slider,
227
+ speed_slider,
228
+ nfe_slider,
229
  seed=seed_input,
 
230
  )
231
+ sr, audio_data = audio_out
232
+ audio_segments.append(audio_data)
233
 
234
  # Concatenar os segmentos de áudio gerados
235
  if audio_segments:
236
  final_audio_data = np.concatenate(audio_segments)
237
  return (
238
+ (sr, final_audio_data), # Áudio final
239
+ spectrogram_path, # Espectrograma
240
+ gr.update(
241
+ value=ref_text_out
242
+ ), # Nenhuma mudança no Texto de Referência
243
+ gr.update(value=seed_used),
244
  )
245
  else:
246
  gr.Warning("Nenhum áudio gerado.")
247
+ return None, None, gr.update(), gr.update()
248
 
249
  # Saídas
250
  gr.Markdown("### Resultados")
251
  audio_output = gr.Audio(label="Áudio Sintetizado")
252
+ spectrogram_output = gr.Image(label="Espectrograma")
253
+ seed_output = gr.Number(label="Seed Usada")
254
  # Associação do botão `generate_btn` à função `process_chunks`
255
  generate_btn.click(
256
  process_chunks,
 
263
  speed_slider,
264
  nfe_slider,
265
  chunk_size_slider,
266
+ seed_input,
267
+ ],
268
  outputs=[
269
  audio_output,
270
+ spectrogram_output,
271
+ ref_text_input, # Atualiza o texto de referência, se necessário
272
  seed_output,
273
  ],
274
  )
275
+
276
+
277
+ @click.command()
278
+ @click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
279
+ @click.option("--host", "-H", default=None, help="Host to run the app on")
280
+ @click.option(
281
+ "--share",
282
+ "-s",
283
+ default=False,
284
+ is_flag=True,
285
+ help="Share the app via Gradio share link",
286
+ )
287
+ @click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
288
+ def main(port, host, share, api):
289
+ global app
290
+ print("Starting app...")
291
+ app.queue(api_open=api).launch(
292
+ server_name=host, server_port=port, share=share, show_api=api
293
  )
294
+
 
 
 
 
295
 
296
  if __name__ == "__main__":
297
  if not USING_SPACES:
logs/sentence_analyzer_2024-12-02.log CHANGED
@@ -76,3 +76,15 @@
76
  2024-12-02 20:47:48,875 - SentenceAnalyzer - DEBUG - Normalized whitespace
77
  2024-12-02 20:47:48,898 - SentenceAnalyzer - DEBUG - Split text into 1 sentences using NLTK
78
  2024-12-02 20:47:48,898 - SentenceAnalyzer - INFO - Split text into 1 sentences after cleanup
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  2024-12-02 20:47:48,875 - SentenceAnalyzer - DEBUG - Normalized whitespace
77
  2024-12-02 20:47:48,898 - SentenceAnalyzer - DEBUG - Split text into 1 sentences using NLTK
78
  2024-12-02 20:47:48,898 - SentenceAnalyzer - INFO - Split text into 1 sentences after cleanup
79
+ 2024-12-02 20:55:23,847 - SentenceAnalyzer - DEBUG - Logger set up successfully
80
+ 2024-12-02 20:55:23,847 - SentenceAnalyzer - INFO - SentenceAnalyzer initialized successfully
81
+ 2024-12-02 20:55:48,213 - SentenceAnalyzer - DEBUG - Starting sentence splitting
82
+ 2024-12-02 20:55:48,213 - SentenceAnalyzer - DEBUG - Normalized text using NFC
83
+ 2024-12-02 20:55:48,214 - SentenceAnalyzer - DEBUG - Removed page numbers and chapter titles
84
+ 2024-12-02 20:55:48,214 - SentenceAnalyzer - DEBUG - Replaced hyphenated line breaks
85
+ 2024-12-02 20:55:48,214 - SentenceAnalyzer - DEBUG - Replaced multiple newlines with a space
86
+ 2024-12-02 20:55:48,214 - SentenceAnalyzer - DEBUG - Normalized whitespace
87
+ 2024-12-02 20:55:48,235 - SentenceAnalyzer - DEBUG - Split text into 1 sentences using NLTK
88
+ 2024-12-02 20:55:48,235 - SentenceAnalyzer - INFO - Split text into 1 sentences after cleanup
89
+ 2024-12-02 21:02:37,760 - SentenceAnalyzer - DEBUG - Logger set up successfully
90
+ 2024-12-02 21:02:37,760 - SentenceAnalyzer - INFO - SentenceAnalyzer initialized successfully
src/f5_tts/api.py CHANGED
@@ -1,5 +1,4 @@
1
- import random
2
- import sys
3
  from importlib.resources import files
4
 
5
  import soundfile as sf
 
1
+
 
2
  from importlib.resources import files
3
 
4
  import soundfile as sf