import tempfile
from pathlib import Path

import gradio as gr
import librosa
import torch
import torchaudio
from transformers import pipeline
# Load the MARS5 model
mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
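# The hub entry point returns the loaded model (used below via mars5.sr and mars5.tts)
# together with its inference-config class, instantiated later from the UI settings.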
asr_model = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    chunk_length_s=30,
    device=torch.device("cuda:0"),
)
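# Whisper is only a fallback: when the user leaves the reference transcript empty,
# synthesize() below transcribes the uploaded audio automatically.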
def transcribe_file(f: str) -> str:
    predictions = asr_model(f, return_timestamps=True)["chunks"]
    print(f">>>>>. predictions: {predictions}")
    return " ".join([prediction["text"] for prediction in predictions])
# Function to process the text and audio input and generate the synthesized output
def synthesize(text, audio_file, transcript, kwargs_dict):
    print(f">>>>>>> Kwargs dict: {kwargs_dict}")
    # Fall back to Whisper if no reference transcript was supplied
    if not transcript:
        transcript = transcribe_file(audio_file)
    # Load the reference audio at the model's sample rate
    wav, sr = librosa.load(audio_file, sr=mars5.sr, mono=True)
    wav = torch.from_numpy(wav)
    # Define the configuration for the TTS model
    cfg = config_class(**kwargs_dict)
    # Generate the synthesized audio
    ar_codes, wav_out = mars5.tts(text, wav, transcript.strip(), cfg=cfg)
    # Save the synthesized audio to a temporary file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = Path(tmp.name)
    torchaudio.save(str(output_path), wav_out.unsqueeze(0), mars5.sr)
    return str(output_path)
defaults = {
    'temperature': 0.8,
    'top_k': -1,
    'top_p': 0.2,
    'typical_p': 1.0,
    'freq_penalty': 2.6,
    'presence_penalty': 0.4,
    'rep_penalty_window': 100,
    'max_prompt_phones': 360,
    'deep_clone': True,
    'nar_guidance_w': 3
}
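# Example (illustrative only, not executed): the defaults above can be passed straight
# to synthesize(); "reference.wav" is a placeholder path, not a file shipped with this demo.
#
#   out_path = synthesize(
#       "Hello from MARS5.",
#       "reference.wav",
#       "",  # empty transcript -> Whisper transcribes the clip first
#       defaults,
#   )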
with gr.Blocks() as demo:
    link = "https://github.com/Camb-ai/MARS5-TTS"
    gr.Markdown(f"## MARS5 TTS Demo\nEnter text and upload an audio file to clone the voice and generate synthesized speech using **[MARS5-TTS]({link})**.")
    text = gr.Textbox(label="Text to synthesize")
    audio_file = gr.Audio(label="Audio file to clone from", type="filepath")
    generate_btn = gr.Button("Generate Synthesized Audio")
    with gr.Accordion("Advanced Settings", open=False):
gr.Markdown("additional inference settings\nWARNING: changing these incorrectly may degrade quality.")
        prompt_text = gr.Textbox(label="Transcript of voice reference")
        temperature = gr.Slider(minimum=0.01, maximum=3, step=0.01, label="temperature", value=defaults['temperature'])
        top_k = gr.Slider(minimum=-1, maximum=2000, step=1, label="top_k", value=defaults['top_k'])
        top_p = gr.Slider(minimum=0.01, maximum=1.0, step=0.01, label="top_p", value=defaults['top_p'])
        typical_p = gr.Slider(minimum=0.01, maximum=1, step=0.01, label="typical_p", value=defaults['typical_p'])
        freq_penalty = gr.Slider(minimum=0, maximum=5, step=0.05, label="freq_penalty", value=defaults['freq_penalty'])
        presence_penalty = gr.Slider(minimum=0, maximum=5, step=0.05, label="presence_penalty", value=defaults['presence_penalty'])
        rep_penalty_window = gr.Slider(minimum=1, maximum=500, step=1, label="rep_penalty_window", value=defaults['rep_penalty_window'])
        nar_guidance_w = gr.Slider(minimum=1, maximum=8, step=0.1, label="nar_guidance_w", value=defaults['nar_guidance_w'])
        deep_clone = gr.Checkbox(value=defaults['deep_clone'], label='deep_clone')
    output = gr.Audio(label="Synthesized Audio", type="filepath")
    def on_click(
        text,
        audio_file,
        prompt_text,
        temperature,
        top_k,
        top_p,
        typical_p,
        freq_penalty,
        presence_penalty,
        rep_penalty_window,
        nar_guidance_w,
        deep_clone
    ):
        print(f">>>> transcript: {prompt_text}; audio_file = {audio_file}")
        of = synthesize(
            text,
            audio_file,
            prompt_text,
            {
                'temperature': temperature,
                'top_k': top_k,
                'top_p': top_p,
                'typical_p': typical_p,
                'freq_penalty': freq_penalty,
                'presence_penalty': presence_penalty,
                'rep_penalty_window': rep_penalty_window,
                'nar_guidance_w': nar_guidance_w,
                'deep_clone': deep_clone
            }
        )
        print(f">>>> output file: {of}")
        return of
    generate_btn.click(
        on_click,
        inputs=[
            text,
            audio_file,
            prompt_text,
            temperature,
            top_k,
            top_p,
            typical_p,
            freq_penalty,
            presence_penalty,
            rep_penalty_window,
            nar_guidance_w,
            deep_clone
        ],
        outputs=[output]
    )
    # Add examples (renamed from `defaults` to avoid shadowing the config dict above)
    example_defaults = [0.8, -1, 0.2, 1.0, 2.6, 0.4, 100, 3, True]
    examples = [
        ["Can you please go there and figure it out?", "female_speaker_1.flac", "People look, but no one ever finds it.", *example_defaults],
        ["Hey, do you need my help?", "male_speaker_1.flac", "Ask her to bring these things with her from the store.", *example_defaults]
    ]
    gr.Examples(
        examples=examples,
        inputs=[text, audio_file, prompt_text, temperature, top_k, top_p, typical_p, freq_penalty, presence_penalty, rep_penalty_window, nar_guidance_w, deep_clone],
        outputs=[output],
        cache_examples=False,
        fn=on_click
    )
demo.launch(share=False)