import os
import tempfile

import gradio as gr
import huggingface_hub
import soundfile as sf
import torch
from faster_whisper import WhisperModel
from outetts.v0_1.interface import InterfaceGGUF


def download_model():
    """Fetch the OuteTTS GGUF weights file from the HuggingFace Hub.

    Returns:
        Whatever ``huggingface_hub.hf_hub_download`` returns for the
        requested repository file (the location of the downloaded model).
    """
    return huggingface_hub.hf_hub_download(
        repo_id="OuteAI/OuteTTS-0.1-350M-GGUF",
        filename="outetts-0.1-350m.gguf",
    )


def initialize_models():
    """Build the text-to-speech interface and the speech-recognition model.

    Returns:
        A ``(tts_interface, asr_model)`` tuple: an ``InterfaceGGUF`` created
        from the file returned by ``download_model`` and a ``"tiny"``
        ``WhisperModel`` configured for CPU with int8 compute.
    """
    # Options for the GGUF-backed TTS interface.
    gguf_options = {
        "n_ctx": 2048,
        "n_batch": 512,
        "n_threads": 4,
        "verbose": False,
    }
    tts = InterfaceGGUF(download_model(), **gguf_options)

    # Options for the Faster-Whisper model used for auto-transcription.
    whisper_options = {
        "device": "cpu",
        "compute_type": "int8",
        "num_workers": 1,
        "cpu_threads": 1,
    }
    asr = WhisperModel("tiny", **whisper_options)

    return tts, asr


# Build both models once at import time; the request handlers below share them.
try:
    TTS_INTERFACE, ASR_MODEL = initialize_models()
except Exception as init_error:
    # Report the failure on stdout, then let it propagate unchanged.
    print(f"Error initializing models: {str(init_error)}")
    raise


def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
    """Clone the voice in a reference recording and speak new text with it.

    Args:
        audio_path: Path of the reference audio file; falsy when nothing has
            been uploaded.
        reference_text: Transcript of the reference audio. When blank or
            ``None`` the audio is transcribed with ``transcribe_audio``.
            Truncated to 2000 characters before use.
        text_to_speak: Text to synthesise in the cloned voice. Truncated to
            300 characters before use.
        temperature: Forwarded to ``TTS_INTERFACE.generate``.
        repetition_penalty: Forwarded to ``TTS_INTERFACE.generate``.

    Returns:
        ``(wav_path, status_message)`` on success, or ``(None, message)``
        with a message starting with ``"Error"`` on failure. Failures are
        reported through the message rather than raised.
    """
    # Reject incomplete submissions up front. Without the audio guard the
    # "Please upload audio first." text returned by transcribe_audio would be
    # used as the reference transcript, because it does not start with "Error".
    if not audio_path:
        return None, "Error: Please upload a reference audio file first."
    if not (text_to_speak or "").strip():
        return None, "Error: Please enter the text you want the cloned voice to speak."

    try:
        # Fall back to automatic transcription when no transcript was given.
        if not (reference_text or "").strip():
            gr.Info("Transcribing audio...")
            reference_text = transcribe_audio(audio_path)
            if reference_text.startswith("Error"):
                return None, reference_text

        gr.Info(f"Using reference text: {reference_text}")

        # Enforce the length limits advertised in the UI.
        reference_text = reference_text[:2000]
        text_to_speak = text_to_speak[:300]

        speaker = TTS_INTERFACE.create_speaker(audio_path, reference_text)

        output = TTS_INTERFACE.generate(
            text=text_to_speak,
            speaker=speaker,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            # NOTE(review): "max_lenght" (sic) is the keyword this call has
            # always used; presumably it matches the outetts v0.1 signature.
            # Confirm against the library before renaming it.
            max_lenght=1024,
        )

        # Reserve a path that outlives this call, but close our own handle
        # before saving so the file is not held open while it is written.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            output_path = temp_file.name
        output.save(output_path)

        status = (
            "Processing complete!\n"
            f"Reference text: {reference_text[:300]}...\n"
            "(Showing first 300 characters of reference text)"
        )
        return output_path, status

    except Exception as e:
        # Surface any failure in the status box instead of crashing the handler.
        return None, f"Error: {str(e)}"


# Gradio UI: reference audio + transcript + target text in, cloned speech out.
# NOTE(review): the emoji in the heading, button labels and warning note were
# mis-encoded in the previous revision. The warning sign is recoverable from
# its bytes; the microphone and memo glyphs are best guesses - confirm.
with gr.Blocks(title="Voice Cloning with OuteTTS (GGUF)") as demo:
    gr.Markdown("# 🎙️ Voice Cloning with OuteTTS (GGUF)")
    gr.Markdown("""
    This app uses the GGUF version of OuteTTS optimized for CPU performance. Upload a reference audio file,
    provide the text being spoken in that audio (or leave blank for automatic transcription),
    and enter the new text you want to be spoken in the cloned voice.

    Note:
    - For best results, use clear audio with minimal background noise
    - Reference text is limited to 2000 characters
    - Output text is limited to 300 characters
    - Short inputs work best for quality results
    """)

    with gr.Row():
        with gr.Column():
            # Inputs: reference recording, its transcript, and the new text.
            audio_input = gr.Audio(
                label="Upload Reference Audio",
                type="filepath",
                max_length=30,
            )
            with gr.Row():
                transcribe_btn = gr.Button("📝 Transcribe Audio", variant="secondary")

            reference_text = gr.Textbox(
                label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
                placeholder="Click 'Transcribe Audio' or enter the exact text from the reference audio",
                lines=3,
                max_lines=5,
            )
            text_to_speak = gr.Textbox(
                label="Text to Speak (what you want the cloned voice to say, max 300 characters)",
                placeholder="Enter the text you want the cloned voice to speak (keep it short for best results)",
                lines=3,
                max_lines=5,
            )

            with gr.Row():
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=0.5,
                    value=0.1,
                    step=0.05,
                    label="Temperature (keep low for stability)",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=1.3,
                    value=1.1,
                    step=0.05,
                    label="Repetition Penalty",
                )

            submit_btn = gr.Button("🎙️ Generate Voice", variant="primary")

        with gr.Column():
            # Outputs: the generated audio and a status / error message.
            output_audio = gr.Audio(label="Generated Speech")
            output_message = gr.Textbox(label="Status", lines=4)

            gr.Markdown("""
            ⚠️ Note: Initial processing may take a few moments. Please be patient.
            """)

    def _run_transcription(audio_path):
        """Return the Faster-Whisper transcript of ``audio_path``, capped at 2000 characters."""
        segments, _ = ASR_MODEL.transcribe(
            audio_path,
            beam_size=1,
            best_of=1,
            # NOTE(review): temperature=1.0 with best_of=1 presumably means a
            # single sampled decode rather than greedy decoding, so transcripts
            # may differ between runs - confirm this is intended.
            temperature=1.0,
            condition_on_previous_text=False,
            compression_ratio_threshold=2.4,
            log_prob_threshold=-1.0,
            no_speech_threshold=0.6,
        )
        text = " ".join(segment.text for segment in segments).strip()
        return text[:2000]

    def transcribe_audio(audio_path):
        """Transcribe audio using Faster-Whisper tiny.

        Always returns a string: the transcript, "Please upload audio first."
        when ``audio_path`` is falsy, or a message starting with "Error" when
        transcription fails. ``process_audio_file`` relies on that prefix.
        """
        try:
            if not audio_path:
                return "Please upload audio first."
            return _run_transcription(audio_path)
        except Exception as e:
            return f"Error transcribing audio: {str(e)}"

    def _transcribe_into_textbox(audio_path):
        """Handle the Transcribe button: return the transcript or raise ``gr.Error``.

        Problems are raised instead of returned so that a status or error
        message is never written into the reference-text box, where it would
        later be used as the transcript of the reference audio.
        """
        if not audio_path:
            raise gr.Error("Please upload audio first.")
        try:
            return _run_transcription(audio_path)
        except Exception as e:
            raise gr.Error(f"Error transcribing audio: {e}") from e

    transcribe_btn.click(
        fn=_transcribe_into_textbox,
        inputs=[audio_input],
        outputs=[reference_text],
    )

    submit_btn.click(
        fn=process_audio_file,
        inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
        outputs=[output_audio, output_message],
    )

    gr.Markdown("""
    ### Tips for best results:
    1. Use clear, short audio samples (5-15 seconds is ideal)
    2. Keep both reference and output text concise
    3. Use lower temperature (0.1-0.2) for more stable output
    4. Start with short phrases to test the voice
    5. If generation fails, try:
       - Using shorter text
       - Reducing temperature
       - Using clearer audio
       - Simplifying the text
    """)


if __name__ == "__main__":
    # Start the Gradio server only when this file is run as a script.
    demo.launch()