Spaces:
Runtime error
Runtime error
import gradio as gr | |
from outetts.v0_1.interface import InterfaceHF | |
import logging | |
import os | |
# Configure logging to display information in the terminal | |
logging.basicConfig(level=logging.INFO) | |
logger = logging.getLogger(__name__) | |
# Initialize the OuteTTS interface with the Hugging Face model | |
try: | |
logger.info("Initializing OuteTTS InterfaceHF with model 'OuteAI/OuteTTS-0.1-350M'") | |
interface = InterfaceHF("OuteAI/OuteTTS-0.1-350M") | |
logger.info("Model loaded successfully.") | |
except Exception as e: | |
logger.error(f"Failed to load model: {e}") | |
raise e | |
def generate_tts(text, temperature, repetition_penalty, max_length, speaker): | |
""" | |
Generates speech from the input text using the OuteTTS model. | |
Parameters: | |
text (str): The input text for TTS. | |
temperature (float): Sampling temperature. | |
repetition_penalty (float): Repetition penalty. | |
max_length (int): Maximum length of the generated audio tokens. | |
speaker (dict): Speaker configuration for voice cloning. | |
Returns: | |
str: Path to the generated audio file. | |
""" | |
logger.info("Received TTS generation request.") | |
logger.info(f"Parameters - Text: {text}, Temperature: {temperature}, Repetition Penalty: {repetition_penalty}, Max Length: {max_length}, Speaker: {speaker is not None}") | |
try: | |
# Due to a typo in interface.py, use 'max_lenght' instead of 'max_length' | |
output = interface.generate( | |
text=text, | |
temperature=temperature, | |
repetition_penalty=repetition_penalty, | |
max_lenght=max_length, # Pass the parameter with typo | |
speaker=speaker | |
) | |
logger.info("TTS generation complete.") | |
# Save the output to a temporary WAV file | |
output_path = "output.wav" | |
output.save(output_path) | |
logger.info(f"Audio saved to {output_path}") | |
return output_path # Gradio will handle the audio playback | |
except Exception as e: | |
logger.error(f"Error during TTS generation: {e}") | |
return None | |
def create_speaker(audio_file, transcript): | |
""" | |
Creates a custom speaker from a reference audio file and transcript. | |
Parameters: | |
audio_file (file): Path to the reference audio file. | |
transcript (str): The transcript matching the audio. | |
Returns: | |
dict: Speaker configuration. | |
""" | |
logger.info("Received Voice Cloning request.") | |
logger.info(f"Reference Audio: {audio_file.name}, Transcript: {transcript}") | |
try: | |
speaker = interface.create_speaker(audio_file.name, transcript) | |
logger.info("Speaker created successfully.") | |
return speaker | |
except Exception as e: | |
logger.error(f"Error during speaker creation: {e}") | |
return None | |
# Define the Gradio Blocks interface | |
with gr.Blocks() as demo: | |
gr.Markdown("# π€ OuteTTS - Text to Speech Interface") | |
gr.Markdown( | |
""" | |
Generate speech from text using the **OuteTTS-0.1-350M** model. | |
**Key Features:** | |
- Pure language modeling approach to TTS | |
- Voice cloning capabilities | |
- Compatible with LLaMa architecture | |
""" | |
) | |
with gr.Tab("Basic TTS"): | |
with gr.Row(): | |
text_input = gr.Textbox( | |
label="π Text Input", | |
placeholder="Enter the text for TTS generation", | |
lines=3 | |
) | |
with gr.Row(): | |
temperature = gr.Slider( | |
minimum=0.1, | |
maximum=1.0, | |
value=0.1, | |
step=0.01, | |
label="π‘οΈ Temperature" | |
) | |
repetition_penalty = gr.Slider( | |
minimum=0.5, | |
maximum=2.0, | |
value=1.1, | |
step=0.1, | |
label="π Repetition Penalty" | |
) | |
max_length = gr.Slider( | |
minimum=256, | |
maximum=4096, | |
value=1024, | |
step=256, | |
label="π Max Length" | |
) | |
generate_button = gr.Button("π Generate Speech") | |
output_audio = gr.Audio( | |
label="π§ Generated Speech", | |
type="filepath" # Expecting a file path to the audio | |
) | |
# Define the button click event for Basic TTS | |
generate_button.click( | |
fn=generate_tts, | |
inputs=[text_input, temperature, repetition_penalty, max_length, None], | |
outputs=output_audio | |
) | |
with gr.Tab("Voice Cloning"): | |
with gr.Row(): | |
reference_audio = gr.Audio( | |
label="π Reference Audio", | |
type="filepath", | |
source="upload", | |
optional=False | |
) | |
reference_transcript = gr.Textbox( | |
label="π Transcript", | |
placeholder="Enter the transcript matching the reference audio", | |
lines=2 | |
) | |
create_speaker_button = gr.Button("π€ Create Speaker") | |
speaker_info = gr.JSON(label="ποΈ Speaker Configuration") | |
generate_cloned_speech = gr.Textbox( | |
label="π Text Input", | |
placeholder="Enter the text for TTS generation with cloned voice", | |
lines=3 | |
) | |
with gr.Row(): | |
temperature_clone = gr.Slider( | |
minimum=0.1, | |
maximum=1.0, | |
value=0.1, | |
step=0.01, | |
label="π‘οΈ Temperature" | |
) | |
repetition_penalty_clone = gr.Slider( | |
minimum=0.5, | |
maximum=2.0, | |
value=1.1, | |
step=0.1, | |
label="π Repetition Penalty" | |
) | |
max_length_clone = gr.Slider( | |
minimum=256, | |
maximum=4096, | |
value=1024, | |
step=256, | |
label="π Max Length" | |
) | |
generate_cloned_button = gr.Button("π Generate Cloned Speech") | |
output_cloned_audio = gr.Audio( | |
label="π§ Generated Cloned Speech", | |
type="filepath" # Expecting a file path to the audio | |
) | |
# Define the button click event for creating a speaker | |
create_speaker_button.click( | |
fn=create_speaker, | |
inputs=[reference_audio, reference_transcript], | |
outputs=speaker_info | |
) | |
# Define the button click event for generating speech with the cloned voice | |
generate_cloned_button.click( | |
fn=generate_tts, | |
inputs=[generate_cloned_speech, temperature_clone, repetition_penalty_clone, max_length_clone, speaker_info], | |
outputs=output_cloned_audio | |
) | |
gr.Markdown( | |
""" | |
--- | |
**Technical Blog:** [OuteTTS-0.1-350M](https://www.outeai.com/blog/OuteTTS-0.1-350M) | |
**Credits:** | |
- [WavTokenizer](https://github.com/jishengpeng/WavTokenizer) | |
- [CTC Forced Alignment](https://pytorch.org/audio/stable/tutorials/ctc_forced_alignment_api_tutorial.html) | |
""" | |
) | |
# Launch the Gradio app | |
if __name__ == "__main__": | |
demo.launch() | |