import gradio as gr
import torch
import librosa
from pathlib import Path
import tempfile
import torchaudio

from transformers import pipeline
from uuid import uuid4


# Load the MARS5 TTS model and its inference config class from torch hub.
mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)

# Whisper pipeline used to transcribe the voice reference when no transcript is provided.
asr_model = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-medium",
    chunk_length_s=30,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)


def transcribe_file(f: str) -> str:
    # Transcribe the reference audio with Whisper and join the chunked predictions into one string.
    predictions = asr_model(f, return_timestamps=True)["chunks"]
    print(f">>>>> predictions: {predictions}")
    return " ".join([prediction["text"] for prediction in predictions])
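

# Clone the voice from the reference audio and synthesize `text` with MARS5.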
def synthesize(text, audio_file, transcript):
    # Copy the uploaded reference audio to a uniquely named temp file so the
    # original upload is left untouched.
    audio_file = Path(audio_file)
    temp_file = f"{uuid4()}{audio_file.suffix}"

    with open(audio_file, 'rb') as src, open(temp_file, 'wb') as dst:
        dst.write(src.read())

    audio_file = temp_file

    print(f">>>>> synthesizing! audio_file: {audio_file}")
    # Deep cloning needs a transcript of the reference; fall back to Whisper if none was given.
    if not transcript:
        transcript = transcribe_file(audio_file)

    # Load the reference audio at the model's sample rate and convert it to a tensor.
    wav, sr = librosa.load(audio_file, sr=mars5.sr, mono=True)
    wav = torch.from_numpy(wav)

    # Fixed inference settings; the advanced-settings sliders in the UI are not passed through here.
    deep_clone = True
    cfg = config_class(deep_clone=deep_clone, rep_penalty_window=100, top_k=100, temperature=0.7, freq_penalty=3)

    ar_codes, wav_out = mars5.tts(text, wav, transcript, cfg=cfg)

    # Write the synthesized waveform to a temporary .wav file and return its path for Gradio.
    output_path = Path(tempfile.mktemp(suffix=".wav"))
    torchaudio.save(str(output_path), wav_out.unsqueeze(0), mars5.sr)
    return str(output_path)
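

# Default values for the advanced-settings controls exposed in the UI.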
defaults = {
    'temperature': 0.8,
    'top_k': -1,
    'top_p': 0.2,
    'typical_p': 1.0,
    'freq_penalty': 2.6,
    'presence_penalty': 0.4,
    'rep_penalty_window': 100,
    'max_prompt_phones': 360,
    'deep_clone': True,
    'nar_guidance_w': 3
}
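

# Build the Gradio UI: main inputs, advanced settings, and the output audio player.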
with gr.Blocks() as demo:
    gr.Markdown("## MARS5 TTS Demo\nEnter text and upload an audio file to clone the voice and generate synthesized speech using MARS5 TTS.")
    text = gr.Textbox(label="Text to synthesize")
    audio_file = gr.Audio(label="Audio file to clone from", type="filepath")

    generate_btn = gr.Button("Generate Synthesized Audio")

    with gr.Accordion("Advanced Settings", open=False):
        gr.Markdown("Additional inference settings.\n\nWARNING: changing these incorrectly may degrade quality.")
        prompt_text = gr.Textbox(label="Transcript of voice reference")
        temperature = gr.Slider(minimum=0.01, maximum=3, step=0.01, label="temperature", value=defaults['temperature'])
        top_k = gr.Slider(minimum=-1, maximum=2000, step=1, label="top_k", value=defaults['top_k'])
        top_p = gr.Slider(minimum=0.01, maximum=1.0, step=0.01, label="top_p", value=defaults['top_p'])
        typical_p = gr.Slider(minimum=0.01, maximum=1, step=0.01, label="typical_p", value=defaults['typical_p'])
        freq_penalty = gr.Slider(minimum=0, maximum=5, step=0.05, label="freq_penalty", value=defaults['freq_penalty'])
        presence_penalty = gr.Slider(minimum=0, maximum=5, step=0.05, label="presence_penalty", value=defaults['presence_penalty'])
        rep_penalty_window = gr.Slider(minimum=1, maximum=500, step=1, label="rep_penalty_window", value=defaults['rep_penalty_window'])
        nar_guidance_w = gr.Slider(minimum=1, maximum=8, step=0.1, label="nar_guidance_w", value=defaults['nar_guidance_w'])
        meta_n = gr.Slider(minimum=1, maximum=10, step=1, label="meta_n", value=2, interactive=False)
        deep_clone = gr.Checkbox(value=defaults['deep_clone'], label='deep_clone')

    dummy = gr.Number(label='Example number', visible=False)

    output = gr.Audio(label="Synthesized Audio", type="filepath")

    def on_click(text, audio_file, prompt_text):
        # Wrapper passed to the button: logs the inputs, runs synthesis, and returns the output path.
        print(f">>>> transcript: {prompt_text}; audio_file = {audio_file}")
        of = synthesize(text, audio_file, prompt_text)
        print(f">>>> output file: {of}")
        return of

    generate_btn.click(on_click, inputs=[text, audio_file, prompt_text], outputs=[output])


demo.launch(share=False)