|
|
|
import nltk

nltk.download('punkt_tab')

from sentence_analyzer import SentenceAnalyzer

import os
import re
import tempfile
from collections import OrderedDict
from importlib.resources import files

import click
import gradio as gr
import numpy as np
import soundfile as sf
import torchaudio
from cached_path import cached_path
from huggingface_hub import hf_hub_download
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
try:
    import spaces

    USING_SPACES = True
except ImportError:
    USING_SPACES = False


def gpu_decorator(func):
    # On Hugging Face Spaces, wrap the function with the ZeroGPU decorator;
    # elsewhere, return it unchanged.
    if USING_SPACES:
        return spaces.GPU(func)
    else:
        return func
|
|
from f5_tts.model import DiT, UNetT
from f5_tts.infer.utils_infer import (
    load_vocoder,
    load_model,
    preprocess_ref_audio_text,
    infer_process,
    remove_silence_for_generated_wav,
    save_spectrogram,
)


# Load the vocoder once at startup; it is reused for every request.
vocoder = load_vocoder()
|
|
def load_f5tts():
    """Download the F5-TTS checkpoint from the Hugging Face Hub and load it."""
    repo_id = os.getenv("MODEL_REPO_ID", "SWivid/F5-TTS")
    filename = os.getenv("MODEL_FILENAME", "F5TTS_Base/model_1200000.safetensors")
    token = os.getenv("HUGGINGFACE_TOKEN")

    if not token:
        raise ValueError("The 'HUGGINGFACE_TOKEN' environment variable is not set.")

    ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename, token=token)

    F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
    return load_model(DiT, F5TTS_model_cfg, ckpt_path, use_ema=True)


F5TTS_ema_model = load_f5tts()
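# Example environment configuration (values below are illustrative placeholders
# and defaults, not requirements):
#   export HUGGINGFACE_TOKEN=hf_xxx                               # required
#   export MODEL_REPO_ID=SWivid/F5-TTS                            # optional override
#   export MODEL_FILENAME=F5TTS_Base/model_1200000.safetensors    # optional override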
|
|
|
@gpu_decorator
def infer(
    ref_audio_orig, ref_text, gen_text, remove_silence, cross_fade_duration=0.15, speed=1, nfe=32, show_info=gr.Info
):
    # Preprocess the reference audio and text (transcribing the audio when no text is given).
    ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)

    ema_model = F5TTS_ema_model

    final_wave, final_sample_rate, combined_spectrogram = infer_process(
        ref_audio,
        ref_text.lower().strip(),
        gen_text.lower().strip(),
        ema_model,
        vocoder,
        cross_fade_duration=cross_fade_duration,
        nfe_step=nfe,
        speed=speed,
        show_info=show_info,
        progress=gr.Progress(),
    )

    # Optionally trim silences from the generated audio.
    if remove_silence:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            sf.write(f.name, final_wave, final_sample_rate)
            remove_silence_for_generated_wav(f.name)
            final_wave, _ = torchaudio.load(f.name)
        final_wave = final_wave.squeeze().cpu().numpy()

    # Save the spectrogram to a temporary PNG for display in the UI.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
        spectrogram_path = tmp_spectrogram.name
    save_spectrogram(combined_spectrogram, spectrogram_path)

    return (final_sample_rate, final_wave), spectrogram_path, ref_text
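# Example of calling infer() directly, outside the Gradio UI (file names here are
# hypothetical):
#   (sr, wave), spec_path, ref_text = infer("ref.wav", "", "Hello there.", remove_silence=False)
#   sf.write("generated.wav", wave, sr)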
|
|
|
|
|
custom_css = """
#sentences-container {
    border: 1px solid #ddd;
    border-radius: 4px;
    padding: 10px;
    margin-bottom: 10px;
}
.sentence-box {
    border: 1px solid #eee;
    padding: 5px;
    margin-bottom: 5px;
    border-radius: 4px;
    background-color: #f9f9f9;
}
"""
|
|
with gr.Blocks(css=custom_css) as app:
    with gr.Tabs():
        with gr.Tab("Basic TTS"):
            gr.Markdown("# Basic TTS with F5-TTS")

            ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
            gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
            generate_btn = gr.Button("Synthesize", variant="primary")

            gr.Markdown("### Advanced Settings")
            with gr.Accordion("Expand Advanced Settings", open=False):
                ref_text_input = gr.Textbox(
                    label="Reference Text",
                    info="Leave blank to automatically transcribe the reference audio. If you enter text here, it overrides the automatic transcription.",
                    lines=2,
                )
                remove_silence = gr.Checkbox(
                    label="Remove Silences",
                    info="The model tends to produce silences, especially in longer audio. We can remove them manually if needed. This also increases generation time.",
                    value=False,
                )
                speed_slider = gr.Slider(
                    label="Speed",
                    minimum=0.3,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    info="Adjust the audio speed.",
                )
                cross_fade_duration_slider = gr.Slider(
                    label="Cross-fade Duration (s)",
                    minimum=0.0,
                    maximum=1.0,
                    value=0.15,
                    step=0.01,
                    info="Set the cross-fade duration between audio clips.",
                )
                chunk_size_slider = gr.Slider(
                    label="Number of Sentences per Chunk",
                    minimum=1,
                    maximum=10,
                    value=1,
                    step=1,
                    info="Set how many sentences are processed in each chunk.",
                )
                nfe_slider = gr.Slider(
                    label="NFE",
                    minimum=16,
                    maximum=64,
                    value=32,
                    step=1,
                    info="Adjust the NFE step count.",
                )
|
|
            # Sentence splitter used to break long input text into chunks.
            analyzer = SentenceAnalyzer()
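            # The click handler below splits the generated text into sentences,
            # groups them into chunks of the selected size (e.g. with a chunk size
            # of 2, ["A.", "B.", "C."] becomes ["A. B.", "C."]), synthesizes each
            # chunk with infer(), and concatenates the resulting audio.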
|
|
|
            @gpu_decorator
            def process_chunks(
                ref_audio_input,
                ref_text_input,
                gen_text_input,
                remove_silence,
                cross_fade_duration_slider,
                speed_slider,
                nfe_slider,
                chunk_size_slider,
            ):
                # Split the text into sentences and group them into chunks.
                sentences = analyzer.split_into_sentences(gen_text_input)

                chunk_size = int(chunk_size_slider)  # slider values may arrive as floats
                chunks = [
                    " ".join(sentences[i : i + chunk_size])
                    for i in range(0, len(sentences), chunk_size)
                ]

                # Synthesize each chunk and collect the audio segments.
                audio_segments = []
                for chunk in chunks:
                    audio_out, spectrogram_path, ref_text_out = infer(
                        ref_audio_input,
                        ref_text_input,
                        chunk,
                        remove_silence,
                        cross_fade_duration_slider,
                        speed_slider,
                        nfe_slider,
                    )
                    sr, audio_data = audio_out
                    audio_segments.append(audio_data)

                # Concatenate the segments into a single waveform.
                if audio_segments:
                    final_audio_data = np.concatenate(audio_segments)
                    return (
                        (sr, final_audio_data),
                        spectrogram_path,
                        gr.update(value=ref_text_out),
                    )
                else:
                    gr.Warning("No audio was generated.")
                    return None, None, gr.update()
|
|
            gr.Markdown("### Results")
            audio_output = gr.Audio(label="Synthesized Audio")
            spectrogram_output = gr.Image(label="Spectrogram")
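            # Wire the button below. Note that ref_text_input is also listed as an
            # output: the transcription produced for the reference audio is written
            # back into the textbox after generation.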
|
|
|
|
|
            generate_btn.click(
                process_chunks,
                inputs=[
                    ref_audio_input,
                    ref_text_input,
                    gen_text_input,
                    remove_silence,
                    cross_fade_duration_slider,
                    speed_slider,
                    nfe_slider,
                    chunk_size_slider,
                ],
                outputs=[
                    audio_output,
                    spectrogram_output,
                    ref_text_input,
                ],
            )
|
|
@click.command()
@click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
@click.option("--host", "-H", default=None, help="Host to run the app on")
@click.option(
    "--share",
    "-s",
    default=False,
    is_flag=True,
    help="Share the app via Gradio share link",
)
@click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
def main(port, host, share, api):
    global app
    print("Starting app...")
    app.queue(api_open=api).launch(server_name=host, server_port=port, share=share, show_api=api)
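# Example invocation (assuming this file is saved as app.py):
#   python app.py --port 7860 --host 0.0.0.0 --share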
|
|
|
if __name__ == "__main__":
    if not USING_SPACES:
        main()
    else:
        app.queue().launch()
|
|
|
|
|
|