fix
Browse files- app.py +136 -119
- logs/sentence_analyzer_2024-12-02.log +12 -0
- src/f5_tts/api.py +1 -2
app.py
CHANGED
@@ -1,4 +1,7 @@
|
|
|
|
|
|
1 |
import nltk
|
|
|
2 |
nltk.download('punkt_tab')
|
3 |
from sentence_analyzer import SentenceAnalyzer
|
4 |
import re
|
@@ -10,29 +13,41 @@ import gradio as gr
|
|
10 |
import numpy as np
|
11 |
import soundfile as sf
|
12 |
import torchaudio
|
13 |
-
import torch
|
14 |
from cached_path import cached_path
|
15 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
16 |
-
|
17 |
try:
|
18 |
import spaces
|
|
|
19 |
USING_SPACES = True
|
20 |
except ImportError:
|
21 |
USING_SPACES = False
|
22 |
|
|
|
23 |
def gpu_decorator(func):
|
24 |
if USING_SPACES:
|
25 |
return spaces.GPU(func)
|
26 |
else:
|
27 |
return func
|
28 |
|
29 |
-
# Importando a nova API F5TTS
|
30 |
-
from f5_tts.api import F5TTS
|
31 |
-
from f5_tts.infer.utils_infer import preprocess_ref_audio_text
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
import os
|
34 |
from huggingface_hub import hf_hub_download
|
35 |
|
|
|
36 |
def load_f5tts():
|
37 |
# Carrega o caminho do repositório e o nome do arquivo das variáveis de ambiente
|
38 |
repo_id = os.getenv("MODEL_REPO_ID", "SWivid/F5-TTS/F5TTS_Base")
|
@@ -40,88 +55,87 @@ def load_f5tts():
|
|
40 |
token = os.getenv("HUGGINGFACE_TOKEN")
|
41 |
# Valida se o token está presente
|
42 |
if not token:
|
43 |
-
raise ValueError(
|
|
|
|
|
44 |
# Faz o download do modelo do repositório privado
|
45 |
-
ckpt_path = hf_hub_download(
|
46 |
-
|
47 |
-
|
48 |
-
F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
|
49 |
|
50 |
-
|
51 |
-
|
52 |
-
model_type="F5-TTS", # Ajuste o nome do modelo se necessário
|
53 |
-
ckpt_file=ckpt_path,
|
54 |
-
vocab_file="/home/user/app/data/Emilia_ZH_EN_pinyin/vocab.txt", # Caminho para o arquivo vocab.txt
|
55 |
-
device="cuda" if torch.cuda.is_available() else "cpu", # Define o dispositivo
|
56 |
-
use_ema=True
|
57 |
)
|
|
|
58 |
|
59 |
-
# Carregar modelo F5TTS usando a nova API
|
60 |
-
F5TTS_ema_model = load_f5tts()
|
61 |
|
62 |
-
#
|
63 |
-
|
64 |
-
last_device = None
|
65 |
-
last_ema = None
|
66 |
-
tts_api = None
|
67 |
-
training_process = None # Adicione esta linha se necessário para o seu contexto
|
68 |
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
fix_duration=None,
|
85 |
-
remove_silence=False,
|
86 |
-
file_wave=None,
|
87 |
-
file_spect=None,
|
88 |
-
seed=-1,
|
89 |
-
):
|
90 |
-
if seed == -1:
|
91 |
-
seed = random.randint(0, sys.maxsize)
|
92 |
-
seed_everything(seed)
|
93 |
-
self.seed = seed
|
94 |
-
wav, sr, spect = infer_process( # Chamando infer_process
|
95 |
-
ref_file,
|
96 |
-
ref_text,
|
97 |
-
gen_text,
|
98 |
-
self.ema_model,
|
99 |
-
show_info=show_info,
|
100 |
-
progress=progress,
|
101 |
-
target_rms=target_rms,
|
102 |
-
cross_fade_duration=cross_fade_duration,
|
103 |
-
nfe_step=nfe_step,
|
104 |
-
cfg_strength=cfg_strength,
|
105 |
-
sway_sampling_coef=sway_sampling_coef,
|
106 |
-
speed=speed,
|
107 |
-
fix_duration=fix_duration,
|
108 |
-
device=self.device,
|
109 |
-
)
|
110 |
-
if file_wave is not None:
|
111 |
-
self.export_wav(wav, file_wave, remove_silence)
|
112 |
-
if file_spect is not None:
|
113 |
-
self.export_spectrogram(spect, file_spect)
|
114 |
|
115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
117 |
-
sf.write(f.name,
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
|
|
|
|
|
|
|
|
122 |
|
123 |
|
124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
with gr.Tabs():
|
126 |
with gr.Tab("TTS Básico"):
|
127 |
gr.Markdown("# TTS Básico com F5-TTS")
|
@@ -129,8 +143,8 @@ with gr.Blocks() as app:
|
|
129 |
# Entradas básicas
|
130 |
ref_audio_input = gr.Audio(label="Áudio de Referência", type="filepath")
|
131 |
gen_text_input = gr.Textbox(label="Texto para Gerar", lines=10)
|
|
|
132 |
generate_btn = gr.Button("Sintetizar", variant="primary")
|
133 |
-
|
134 |
# Configurações avançadas
|
135 |
gr.Markdown("### Configurações Avançadas")
|
136 |
with gr.Accordion("Expandir Configurações Avançadas", open=False):
|
@@ -152,6 +166,7 @@ with gr.Blocks() as app:
|
|
152 |
step=0.1,
|
153 |
info="Ajuste a velocidade do áudio.",
|
154 |
)
|
|
|
155 |
cross_fade_duration_slider = gr.Slider(
|
156 |
label="Duração do Cross-fade (s)",
|
157 |
minimum=0.0,
|
@@ -176,7 +191,6 @@ with gr.Blocks() as app:
|
|
176 |
step=1,
|
177 |
info="Ajuste NFE Step.",
|
178 |
)
|
179 |
-
seed_input = gr.Number(label="Seed", value=-1, minimum=-1) # Seed na seção avançada
|
180 |
|
181 |
analyzer = SentenceAnalyzer()
|
182 |
|
@@ -190,11 +204,8 @@ with gr.Blocks() as app:
|
|
190 |
speed_slider,
|
191 |
nfe_slider,
|
192 |
chunk_size_slider,
|
193 |
-
seed_input,
|
194 |
):
|
195 |
-
# Acessando a instância F5TTS_ema_model diretamente
|
196 |
-
f5tts_model = F5TTS_ema_model
|
197 |
-
|
198 |
# Dividir o texto em sentenças
|
199 |
sentences = analyzer.split_into_sentences(gen_text_input)
|
200 |
|
@@ -207,37 +218,39 @@ with gr.Blocks() as app:
|
|
207 |
# Processar cada chunk
|
208 |
audio_segments = []
|
209 |
for chunk in chunks:
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
|
|
217 |
seed=seed_input,
|
218 |
-
remove_silence=remove_silence,
|
219 |
)
|
220 |
-
|
221 |
-
audio_segments.append(audio_data
|
222 |
|
223 |
# Concatenar os segmentos de áudio gerados
|
224 |
if audio_segments:
|
225 |
final_audio_data = np.concatenate(audio_segments)
|
226 |
return (
|
227 |
-
(
|
228 |
-
|
229 |
-
gr.update(
|
230 |
-
|
|
|
|
|
231 |
)
|
232 |
else:
|
233 |
gr.Warning("Nenhum áudio gerado.")
|
234 |
-
return None, None, gr.update(),
|
235 |
|
236 |
# Saídas
|
237 |
gr.Markdown("### Resultados")
|
238 |
audio_output = gr.Audio(label="Áudio Sintetizado")
|
239 |
-
|
240 |
-
|
241 |
# Associação do botão `generate_btn` à função `process_chunks`
|
242 |
generate_btn.click(
|
243 |
process_chunks,
|
@@ -250,31 +263,35 @@ with gr.Blocks() as app:
|
|
250 |
speed_slider,
|
251 |
nfe_slider,
|
252 |
chunk_size_slider,
|
253 |
-
seed_input,
|
254 |
-
],
|
255 |
outputs=[
|
256 |
audio_output,
|
257 |
-
|
|
|
258 |
seed_output,
|
259 |
],
|
260 |
)
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
272 |
)
|
273 |
-
|
274 |
-
def main(port, host, share, api):
|
275 |
-
global app
|
276 |
-
print("Starting app...")
|
277 |
-
app.queue(api_open=api).launch(server_name=host, server_port=port, share=share, show_api=api)
|
278 |
|
279 |
if __name__ == "__main__":
|
280 |
if not USING_SPACES:
|
|
|
1 |
+
import random
|
2 |
+
import sys
|
3 |
import nltk
|
4 |
+
|
5 |
nltk.download('punkt_tab')
|
6 |
from sentence_analyzer import SentenceAnalyzer
|
7 |
import re
|
|
|
13 |
import numpy as np
|
14 |
import soundfile as sf
|
15 |
import torchaudio
|
|
|
16 |
from cached_path import cached_path
|
17 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
18 |
+
|
19 |
try:
|
20 |
import spaces
|
21 |
+
|
22 |
USING_SPACES = True
|
23 |
except ImportError:
|
24 |
USING_SPACES = False
|
25 |
|
26 |
+
|
27 |
def gpu_decorator(func):
|
28 |
if USING_SPACES:
|
29 |
return spaces.GPU(func)
|
30 |
else:
|
31 |
return func
|
32 |
|
|
|
|
|
|
|
33 |
|
34 |
+
from f5_tts.model import DiT, UNetT
|
35 |
+
from f5_tts.model.utils import seed_everything
|
36 |
+
from f5_tts.infer.utils_infer import (
|
37 |
+
load_vocoder,
|
38 |
+
load_model,
|
39 |
+
preprocess_ref_audio_text,
|
40 |
+
infer_process,
|
41 |
+
remove_silence_for_generated_wav,
|
42 |
+
save_spectrogram,
|
43 |
+
)
|
44 |
+
|
45 |
+
# Carregar vocoder
|
46 |
+
vocoder = load_vocoder()
|
47 |
import os
|
48 |
from huggingface_hub import hf_hub_download
|
49 |
|
50 |
+
|
51 |
def load_f5tts():
|
52 |
# Carrega o caminho do repositório e o nome do arquivo das variáveis de ambiente
|
53 |
repo_id = os.getenv("MODEL_REPO_ID", "SWivid/F5-TTS/F5TTS_Base")
|
|
|
55 |
token = os.getenv("HUGGINGFACE_TOKEN")
|
56 |
# Valida se o token está presente
|
57 |
if not token:
|
58 |
+
raise ValueError(
|
59 |
+
"A variável de ambiente 'HUGGINGFACE_TOKEN' não foi definida."
|
60 |
+
)
|
61 |
# Faz o download do modelo do repositório privado
|
62 |
+
ckpt_path = hf_hub_download(
|
63 |
+
repo_id=repo_id, filename=filename, use_auth_token=token
|
64 |
+
)
|
|
|
65 |
|
66 |
+
F5TTS_model_cfg = dict(
|
67 |
+
dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4
|
|
|
|
|
|
|
|
|
|
|
68 |
)
|
69 |
+
return load_model(DiT, F5TTS_model_cfg, ckpt_path, use_ema=True)
|
70 |
|
|
|
|
|
71 |
|
72 |
+
# Carregar modelo F5TTS
|
73 |
+
F5TTS_ema_model = load_f5tts()
|
|
|
|
|
|
|
|
|
74 |
|
75 |
+
@gpu_decorator
|
76 |
+
def infer(
|
77 |
+
ref_audio_orig,
|
78 |
+
ref_text,
|
79 |
+
gen_text,
|
80 |
+
remove_silence,
|
81 |
+
cross_fade_duration=0.15,
|
82 |
+
speed=1,
|
83 |
+
nfe=32,
|
84 |
+
show_info=gr.Info,
|
85 |
+
seed=-1,
|
86 |
+
):
|
87 |
+
if seed == -1:
|
88 |
+
seed = random.randint(0, sys.maxsize)
|
89 |
+
seed_everything(seed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
+
print(f"Usando seed: {seed}")
|
92 |
+
ref_audio, ref_text = preprocess_ref_audio_text(
|
93 |
+
ref_audio_orig, ref_text, show_info=show_info
|
94 |
+
)
|
95 |
+
ema_model = F5TTS_ema_model
|
96 |
+
final_wave, final_sample_rate, combined_spectrogram = infer_process(
|
97 |
+
ref_audio,
|
98 |
+
ref_text.lower().strip(),
|
99 |
+
gen_text.lower().strip(),
|
100 |
+
ema_model,
|
101 |
+
vocoder,
|
102 |
+
cross_fade_duration=cross_fade_duration,
|
103 |
+
nfe_step=nfe,
|
104 |
+
speed=speed,
|
105 |
+
show_info=show_info,
|
106 |
+
progress=gr.Progress(),
|
107 |
+
)
|
108 |
+
# Remover silêncios
|
109 |
+
if remove_silence:
|
110 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
111 |
+
sf.write(f.name, final_wave, final_sample_rate)
|
112 |
+
remove_silence_for_generated_wav(f.name)
|
113 |
+
final_wave, _ = torchaudio.load(f.name)
|
114 |
+
final_wave = final_wave.squeeze().cpu().numpy()
|
115 |
+
# Salvar espectrograma
|
116 |
+
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
|
117 |
+
spectrogram_path = tmp_spectrogram.name
|
118 |
+
save_spectrogram(combined_spectrogram, spectrogram_path)
|
119 |
+
return (final_sample_rate, final_wave), spectrogram_path, ref_text, seed
|
120 |
|
121 |
|
122 |
+
# Estilos CSS
|
123 |
+
custom_css = """
|
124 |
+
#sentences-container {
|
125 |
+
border: 1px solid #ddd;
|
126 |
+
border-radius: 4px;
|
127 |
+
padding: 10px;
|
128 |
+
margin-bottom: 10px;
|
129 |
+
}
|
130 |
+
.sentence-box {
|
131 |
+
border: 1px solid #eee;
|
132 |
+
padding: 5px;
|
133 |
+
margin-bottom: 5px;
|
134 |
+
border-radius: 4px;
|
135 |
+
background-color: #f9f9f9;
|
136 |
+
}
|
137 |
+
"""
|
138 |
+
with gr.Blocks(css=custom_css) as app:
|
139 |
with gr.Tabs():
|
140 |
with gr.Tab("TTS Básico"):
|
141 |
gr.Markdown("# TTS Básico com F5-TTS")
|
|
|
143 |
# Entradas básicas
|
144 |
ref_audio_input = gr.Audio(label="Áudio de Referência", type="filepath")
|
145 |
gen_text_input = gr.Textbox(label="Texto para Gerar", lines=10)
|
146 |
+
seed_input = gr.Number(label="Seed (opcional)", value=-1)
|
147 |
generate_btn = gr.Button("Sintetizar", variant="primary")
|
|
|
148 |
# Configurações avançadas
|
149 |
gr.Markdown("### Configurações Avançadas")
|
150 |
with gr.Accordion("Expandir Configurações Avançadas", open=False):
|
|
|
166 |
step=0.1,
|
167 |
info="Ajuste a velocidade do áudio.",
|
168 |
)
|
169 |
+
|
170 |
cross_fade_duration_slider = gr.Slider(
|
171 |
label="Duração do Cross-fade (s)",
|
172 |
minimum=0.0,
|
|
|
191 |
step=1,
|
192 |
info="Ajuste NFE Step.",
|
193 |
)
|
|
|
194 |
|
195 |
analyzer = SentenceAnalyzer()
|
196 |
|
|
|
204 |
speed_slider,
|
205 |
nfe_slider,
|
206 |
chunk_size_slider,
|
207 |
+
seed_input,
|
208 |
):
|
|
|
|
|
|
|
209 |
# Dividir o texto em sentenças
|
210 |
sentences = analyzer.split_into_sentences(gen_text_input)
|
211 |
|
|
|
218 |
# Processar cada chunk
|
219 |
audio_segments = []
|
220 |
for chunk in chunks:
|
221 |
+
audio_out, spectrogram_path, ref_text_out, seed_used = infer(
|
222 |
+
ref_audio_input,
|
223 |
+
ref_text_input, # Utiliza o Texto de Referência como está
|
224 |
+
chunk, # Processa o chunk atual
|
225 |
+
remove_silence,
|
226 |
+
cross_fade_duration_slider,
|
227 |
+
speed_slider,
|
228 |
+
nfe_slider,
|
229 |
seed=seed_input,
|
|
|
230 |
)
|
231 |
+
sr, audio_data = audio_out
|
232 |
+
audio_segments.append(audio_data)
|
233 |
|
234 |
# Concatenar os segmentos de áudio gerados
|
235 |
if audio_segments:
|
236 |
final_audio_data = np.concatenate(audio_segments)
|
237 |
return (
|
238 |
+
(sr, final_audio_data), # Áudio final
|
239 |
+
spectrogram_path, # Espectrograma
|
240 |
+
gr.update(
|
241 |
+
value=ref_text_out
|
242 |
+
), # Nenhuma mudança no Texto de Referência
|
243 |
+
gr.update(value=seed_used),
|
244 |
)
|
245 |
else:
|
246 |
gr.Warning("Nenhum áudio gerado.")
|
247 |
+
return None, None, gr.update(), gr.update()
|
248 |
|
249 |
# Saídas
|
250 |
gr.Markdown("### Resultados")
|
251 |
audio_output = gr.Audio(label="Áudio Sintetizado")
|
252 |
+
spectrogram_output = gr.Image(label="Espectrograma")
|
253 |
+
seed_output = gr.Number(label="Seed Usada")
|
254 |
# Associação do botão `generate_btn` à função `process_chunks`
|
255 |
generate_btn.click(
|
256 |
process_chunks,
|
|
|
263 |
speed_slider,
|
264 |
nfe_slider,
|
265 |
chunk_size_slider,
|
266 |
+
seed_input,
|
267 |
+
],
|
268 |
outputs=[
|
269 |
audio_output,
|
270 |
+
spectrogram_output,
|
271 |
+
ref_text_input, # Atualiza o texto de referência, se necessário
|
272 |
seed_output,
|
273 |
],
|
274 |
)
|
275 |
+
|
276 |
+
|
277 |
+
@click.command()
|
278 |
+
@click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
|
279 |
+
@click.option("--host", "-H", default=None, help="Host to run the app on")
|
280 |
+
@click.option(
|
281 |
+
"--share",
|
282 |
+
"-s",
|
283 |
+
default=False,
|
284 |
+
is_flag=True,
|
285 |
+
help="Share the app via Gradio share link",
|
286 |
+
)
|
287 |
+
@click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
|
288 |
+
def main(port, host, share, api):
|
289 |
+
global app
|
290 |
+
print("Starting app...")
|
291 |
+
app.queue(api_open=api).launch(
|
292 |
+
server_name=host, server_port=port, share=share, show_api=api
|
293 |
)
|
294 |
+
|
|
|
|
|
|
|
|
|
295 |
|
296 |
if __name__ == "__main__":
|
297 |
if not USING_SPACES:
|
logs/sentence_analyzer_2024-12-02.log
CHANGED
@@ -76,3 +76,15 @@
|
|
76 |
2024-12-02 20:47:48,875 - SentenceAnalyzer - DEBUG - Normalized whitespace
|
77 |
2024-12-02 20:47:48,898 - SentenceAnalyzer - DEBUG - Split text into 1 sentences using NLTK
|
78 |
2024-12-02 20:47:48,898 - SentenceAnalyzer - INFO - Split text into 1 sentences after cleanup
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
2024-12-02 20:47:48,875 - SentenceAnalyzer - DEBUG - Normalized whitespace
|
77 |
2024-12-02 20:47:48,898 - SentenceAnalyzer - DEBUG - Split text into 1 sentences using NLTK
|
78 |
2024-12-02 20:47:48,898 - SentenceAnalyzer - INFO - Split text into 1 sentences after cleanup
|
79 |
+
2024-12-02 20:55:23,847 - SentenceAnalyzer - DEBUG - Logger set up successfully
|
80 |
+
2024-12-02 20:55:23,847 - SentenceAnalyzer - INFO - SentenceAnalyzer initialized successfully
|
81 |
+
2024-12-02 20:55:48,213 - SentenceAnalyzer - DEBUG - Starting sentence splitting
|
82 |
+
2024-12-02 20:55:48,213 - SentenceAnalyzer - DEBUG - Normalized text using NFC
|
83 |
+
2024-12-02 20:55:48,214 - SentenceAnalyzer - DEBUG - Removed page numbers and chapter titles
|
84 |
+
2024-12-02 20:55:48,214 - SentenceAnalyzer - DEBUG - Replaced hyphenated line breaks
|
85 |
+
2024-12-02 20:55:48,214 - SentenceAnalyzer - DEBUG - Replaced multiple newlines with a space
|
86 |
+
2024-12-02 20:55:48,214 - SentenceAnalyzer - DEBUG - Normalized whitespace
|
87 |
+
2024-12-02 20:55:48,235 - SentenceAnalyzer - DEBUG - Split text into 1 sentences using NLTK
|
88 |
+
2024-12-02 20:55:48,235 - SentenceAnalyzer - INFO - Split text into 1 sentences after cleanup
|
89 |
+
2024-12-02 21:02:37,760 - SentenceAnalyzer - DEBUG - Logger set up successfully
|
90 |
+
2024-12-02 21:02:37,760 - SentenceAnalyzer - INFO - SentenceAnalyzer initialized successfully
|
src/f5_tts/api.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
-
|
2 |
-
import sys
|
3 |
from importlib.resources import files
|
4 |
|
5 |
import soundfile as sf
|
|
|
1 |
+
|
|
|
2 |
from importlib.resources import files
|
3 |
|
4 |
import soundfile as sf
|