File size: 5,626 Bytes
7712bf9 6711545 7712bf9 0c20337 ab0bdb4 16c7cf3 49c7767 16c7cf3 b5485c0 16c7cf3 0856e34 5e9b992 93c38a4 1fbf0a3 ab0bdb4 16c7cf3 ab0bdb4 16c7cf3 53a7adb 5e9b992 16c7cf3 0c5c249 5e9b992 cbb34e3 16c7cf3 ed1a5ad 5e9b992 ed1a5ad 8e6abd8 ed1a5ad c1e585c 6d77b5b 5e9b992 cbb34e3 5e9b992 689f7db 5e9b992 cbb34e3 5e9b992 689f7db 5e9b992 0c20337 16c7cf3 c1e585c ed1a5ad 5e9b992 ed1a5ad 16c7cf3 0c5c249 5165e58 ed1a5ad 0c20337 cbb34e3 878264e 41260bf 16c7cf3 ab0bdb4 16c7cf3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 |
# NLTK resources used by MeloTTS's English text-normalisation / G2P front end.
import nltk
# NOTE(review): nltk.download('all') presumably already includes the tagger
# and punkt models requested below, making the next two calls redundant —
# confirm against the NLTK data index before trimming.
nltk.download('all')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
import os
import uuid
import time
import torch
import gradio as gr
# Disable numba's on-disk cache before any numba-backed dependency is
# imported (the default cache dir is read-only on Hugging Face Spaces).
# Also set again below alongside the other cache-related variables.
os.environ["NUMBA_DISABLE_CACHE"] = "1"
# import mecab_patch
# import english_patch
#from melo.api import TTS
from MeloTTS.melo.api import TTS
from openvoice import se_extractor
from openvoice.api import ToneColorConverter
#from meloTTS import english
# Hugging Face Spaces sandboxing: /tmp is the only writable location, so
# every cache consumer (torch hub, huggingface hub/transformers, matplotlib,
# XDG-aware tools, numba) is pointed there before any model download runs.
for _name, _value in (
    ("TORCH_HOME", "/tmp/torch"),
    ("HF_HOME", "/tmp/huggingface"),
    ("HF_HUB_CACHE", "/tmp/huggingface"),
    ("TRANSFORMERS_CACHE", "/tmp/huggingface"),
    ("MPLCONFIGDIR", "/tmp"),
    ("XDG_CACHE_HOME", "/tmp"),
    ("XDG_CONFIG_HOME", "/tmp"),
    ("NUMBA_DISABLE_CACHE", "1"),
):
    os.environ[_name] = _value

# Pre-create the cache and Gradio flagging directories.
for _dir in ("/tmp/torch", "/tmp/huggingface", "/tmp/flagged"):
    os.makedirs(_dir, exist_ok=True)

# All synthesized / converted audio files are written here.
output_dir = "/tmp/outputs"
os.makedirs(output_dir, exist_ok=True)
# --- OpenVoice tone-colour converter ------------------------------------
# Run on GPU when torch can see one, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The converter checkpoint directory holds a config plus trained weights.
ckpt_converter = "checkpoint/converter"
tone_color_converter = ToneColorConverter(
    f"{ckpt_converter}/config.json", device=device
)
tone_color_converter.load_ckpt(f"{ckpt_converter}/checkpoint.pth")
def clone_and_speak(text, speaker_wav):
    """Synthesize `text` with the MeloTTS English model, then convert the
    result's tone colour to match the uploaded reference voice.

    Parameters
    ----------
    text : str
        Text to synthesize.
    speaker_wav : str | None
        Path to the reference .wav (Gradio ``filepath`` audio input);
        falsy when the user uploaded nothing.

    Returns
    -------
    str
        Path to the converted .wav on success, or a human-readable error
        message when no reference file was supplied (the Audio output
        component will simply render nothing in that case).
    """
    if not speaker_wav:
        return "Please upload a reference .wav file."

    # Unique, collision-resistant basename for this request's artifacts.
    base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
    tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
    final_output_path = f"{output_dir}/{base_name}_converted.wav"

    # Target tone embedding, extracted from the uploaded reference audio.
    ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=True)

    # English base model; its checkpoint defines the available speakers.
    model = TTS(language="EN", device=device)
    speaker_ids = model.hps.data.spk2id

    # Bug fix: the original looped over every speaker only to keep the LAST
    # entry (and computed an unused default_speaker_id). Select that last
    # speaker directly. The base-speaker SE files on disk are named with
    # lowercased, dash-separated keys.
    speaker_key, speaker_id = list(speaker_ids.items())[-1]
    source_se = torch.load(
        f"checkpoint/base_speakers/ses/{speaker_key.lower().replace('_', '-')}.pth",
        map_location=device,
    )

    # Workaround: when MPS is visible but we run on CPU, hide MPS so
    # downstream libraries do not try to use it.
    if torch.backends.mps.is_available() and device == 'cpu':
        torch.backends.mps.is_available = lambda: False

    # Base synthesis at normal speaking rate.
    model.tts_to_file(text, speaker_id, tmp_melo_path, speed=1.0)

    # Re-tint the base audio with the reference speaker's tone colour.
    tone_color_converter.convert(
        audio_src_path=tmp_melo_path,
        src_se=source_se,
        tgt_se=ref_se,
        output_path=final_output_path,
        message="@HuggingFace",
    )

    # The intermediate Melo output is no longer needed; best-effort cleanup.
    try:
        os.remove(tmp_melo_path)
    except OSError:
        pass

    return final_output_path
# Build the web UI: one textbox + one reference-voice upload in, audio out.
demo = gr.Interface(
    fn=clone_and_speak,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)"),
    ],
    outputs=gr.Audio(label="Synthesized Output"),
    flagging_dir="/tmp/flagged",
    title="Text to Voice using Melo TTS + OpenVoice",
    description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
)
demo.launch()
# iface = gr.Interface(
# fn=clone_with_base_speaker,
# inputs=[
# gr.Textbox(label="Input Text", placeholder="Enter text to synthesize..."),
# gr.Dropdown(choices=base_speaker_choices, label="Select Base Speaker"),
# ],
# outputs=gr.Audio(type="filepath", label="Cloned Voice Output"),
# title="Voice Cloning with OpenVoice Base Speakers",
# description="Choose a base speaker from OpenVoice and enter text to generate voice."
# )
# iface.launch()
# import os
# import time
# import uuid
# import gradio as gr
# from TTS.api import TTS
# from openvoice import se_extractor
# from openvoice.api import ToneColorConverter
# # Import your local english.py logic
# from meloTTS import english
# # Paths
# device = "cuda" if os.system("nvidia-smi") == 0 else "cpu"
# output_dir = "outputs"
# os.makedirs(output_dir, exist_ok=True)
# # Load OpenVoice tone converter
# tone_color_converter = ToneColorConverter(f"{os.getcwd()}/checkpoints", device=device)
# tone_color_converter.load_model()
# def clone_and_speak(text, speaker_wav):
# if not speaker_wav:
# return "Please upload a reference .wav file."
# base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
# tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
# final_output_path = f"{output_dir}/{base_name}_converted.wav"
# # Use English speaker model
# model = TTS(language="EN", device=device)
# speaker_ids = model.hps.data.spk2id
# default_speaker_id = next(iter(speaker_ids.values()))
# # Generate base TTS voice
# model.tts_to_file(text, speaker_id=default_speaker_id, file_path=tmp_melo_path, speed=1.0)
# # Extract style embedding
# ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)
# # Convert tone
# tone_color_converter.convert(
# audio_src_path=tmp_melo_path,
# src_se=ref_se,
# tgt_se=ref_se,
# output_path=final_output_path,
# message="@HuggingFace"
# )
# return final_output_path
# # Gradio Interface
# demo = gr.Interface(
# fn=clone_and_speak,
# inputs=[
# gr.Textbox(label="Text to Synthesize"),
# gr.Audio(label="Reference Voice (WAV)", type="filepath")
# ],
# outputs=gr.Audio(label="Cloned Voice Output"),
# title="Voice Cloner with MeloTTS + OpenVoice"
# )
# if __name__ == "__main__":
# demo.launch()
|