# testpublic / app.py
# rwine's picture
# Update app.py
# fe8a53b verified
import subprocess
import sys
import gradio as gr
from inference import Mars5TTS, InferenceConfig
import librosa
import torch
import numpy as np
# Make sure the packages listed in requirements.txt are installed.
# NOTE(review): the third-party imports at the top of this file execute before
# this step, so a missing package would already have failed at import time.
pip_install_cmd = [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]
try:
    subprocess.check_call(pip_install_cmd)
except subprocess.CalledProcessError as install_error:
    # Best effort: report the failure and keep starting up.
    print(f"Failed to install requirements.txt: {install_error}")
else:
    print("Successfully installed requirements.txt")

# Release cached GPU memory before the model is loaded.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load the MARS5 TTS model and the inference settings used for synthesis.
# Both names are module-level globals read by clone_with_prosody.
try:
    mars5 = Mars5TTS.from_pretrained("CAMB-AI/MARS5-TTS")
    config = InferenceConfig(temperature=0.7)
except Exception as load_error:
    # The app cannot work without the model: log the reason, then re-raise.
    print(f"Model loading error: {load_error!s}")
    raise
def clone_with_prosody(text, ref_audio, enhance_prosody=True):
    """Synthesize ``text`` in the voice of ``ref_audio`` and return a WAV path.

    Args:
        text: Text to convert to speech. Must not be empty or whitespace-only.
        ref_audio: Path to the reference recording (a ``str``), which is
            loaded with librosa at 16 kHz, or already-loaded audio data,
            which is forwarded to the model unchanged.
        enhance_prosody: When true, the module-level ``config`` is passed to
            the model; otherwise ``None`` is passed instead.

    Returns:
        The path of the written audio file, ``"output_cloned_audio.wav"``.

    Raises:
        gr.Error: If ``text`` or ``ref_audio`` is missing, or if loading the
            reference audio, synthesis, or saving the result fails. Raising
            (rather than returning the message) keeps an error string from
            being handed to the audio output component as if it were a file.
    """
    # Reject missing inputs up front instead of forwarding them to the model.
    if not text or not str(text).strip():
        raise gr.Error("Please enter text to convert to speech.")
    if ref_audio is None:
        raise gr.Error("Please provide a reference audio clip.")

    # Single source of truth for the rate used both to load the reference
    # and to describe it to the model.
    ref_sr = 16000

    try:
        if isinstance(ref_audio, str):
            audio_data, _ = librosa.load(ref_audio, sr=ref_sr)
        else:
            # NOTE(review): assumes pre-loaded audio is already sampled at
            # ref_sr -- nothing here resamples it; confirm with callers.
            audio_data = ref_audio

        # NOTE(review): keyword names and the ``.save`` call below are kept
        # as written; confirm they match the ``inference.Mars5TTS.tts`` API.
        output_audio = mars5.tts(
            text=text,
            ref_audio=audio_data,
            ref_sr=ref_sr,
            config=config if enhance_prosody else None,
            language="ko",
        )

        # NOTE(review): fixed filename -- every request overwrites the same
        # file in the working directory.
        output_path = "output_cloned_audio.wav"
        output_audio.save(output_path)
    except Exception as e:
        # Surface the failure in the UI with the original message text.
        raise gr.Error(f"Error: {e}") from e

    return output_path
# Input widgets, listed in the positional order clone_with_prosody takes them:
# text, reference audio, prosody toggle.
# NOTE(review): ``source=`` here and ``allow_flagging=`` below are
# version-dependent Gradio keywords (newer releases appear to use ``sources``
# and ``flagging_mode``) -- verify against the pinned Gradio version.
input_components = [
    gr.Textbox(
        label="Text to Convert",
        placeholder="Enter text to convert to speech",
    ),
    gr.Audio(
        label="Reference Audio (Your Voice)",
        type="filepath",
        source="upload",
    ),
    gr.Checkbox(
        label="Enhance Prosody (Intonation/Rhythm)",
        value=True,
    ),
]

# Player for the synthesized result returned by clone_with_prosody.
output_component = gr.Audio(label="Cloned Voice Output")

# Assemble the web UI around clone_with_prosody, with flagging turned off.
interface = gr.Interface(
    fn=clone_with_prosody,
    inputs=input_components,
    outputs=output_component,
    title="MARS5 Voice Cloner with Prosody",
    description="Upload a 3-5 second audio of your voice and enter text to clone your voice with prosody (intonation, rhythm, emotion).",
    allow_flagging="never",
)

# Start serving the app.
interface.launch()