File size: 5,025 Bytes
0c20337 ab0bdb4 16c7cf3 49c7767 16c7cf3 b5485c0 16c7cf3 ab0bdb4 93c38a4 ab0bdb4 16c7cf3 ab0bdb4 16c7cf3 0c5c249 16c7cf3 ed1a5ad 8e6abd8 ed1a5ad 16c7cf3 0c5c249 16c7cf3 ed1a5ad 0c20337 16c7cf3 a0ea8bb ed1a5ad 16c7cf3 0c5c249 5165e58 ed1a5ad 0c20337 16c7cf3 0c5c249 0c20337 16c7cf3 0c20337 16c7cf3 ab0bdb4 16c7cf3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
import os
import uuid
import time
import torch
import gradio as gr
os.environ["NUMBA_DISABLE_CACHE"] = "1"
# import mecab_patch
# import english_patch
#from melo.api import TTS
from TTS.api import TTS
from openvoice.api import ToneColorConverter
from meloTTS import english
# Set temporary cache locations for Hugging Face Spaces.
# NOTE(review): these assignments run after torch / TTS / openvoice were
# imported above; any library that reads its cache variables at import time
# will not see them. Consider moving this section above the imports — confirm.
os.environ.update(
    {
        "TORCH_HOME": "/tmp/torch",
        "HF_HOME": "/tmp/huggingface",
        "HF_HUB_CACHE": "/tmp/huggingface",
        "TRANSFORMERS_CACHE": "/tmp/huggingface",
        "MPLCONFIGDIR": "/tmp",
        "XDG_CACHE_HOME": "/tmp",
        "XDG_CONFIG_HOME": "/tmp",
        # Also set before the imports at the top of the file; kept for safety.
        "NUMBA_DISABLE_CACHE": "1",
    }
)
for _cache_dir in ("/tmp/torch", "/tmp/huggingface", "/tmp/flagged"):
    os.makedirs(_cache_dir, exist_ok=True)

# Output folder for generated audio.
output_dir = "/tmp/outputs"
os.makedirs(output_dir, exist_ok=True)

# Device setting. Resolved *before* the converter is built so the converter
# can be placed on it; the OpenVoice default device is CUDA, which fails on
# CPU-only hosts.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize tone converter from its config and load the trained weights.
ckpt_converter = "checkpoints/converter/config.json"
tone_color_converter = ToneColorConverter(ckpt_converter, device=device)

# OpenVoice ships the converter weights next to the config as checkpoint.pth.
# Without load_ckpt() the converter runs with uninitialised weights.
_converter_weights = os.path.join(os.path.dirname(ckpt_converter), "checkpoint.pth")
if os.path.isfile(_converter_weights):
    tone_color_converter.load_ckpt(_converter_weights)
else:
    # Best effort: keep the app starting as before, but make the problem visible.
    print(
        f"WARNING: converter checkpoint not found at {_converter_weights}; "
        "tone conversion will use uninitialised weights."
    )
# Cache of constructed TTS models keyed by language, so the (expensive) model
# construction happens once per process instead of once per request.
_tts_model_cache = {}


def _get_tts_model(language="EN"):
    """Return the TTS model for *language*, constructing it on first use only."""
    # NOTE(review): the calls made on this model (language=, hps.data.spk2id,
    # tts_to_file(text, speaker_id, path, speed=...)) follow the MeloTTS API;
    # confirm that the `TTS` class imported at the top of the file provides it.
    if language not in _tts_model_cache:
        _tts_model_cache[language] = TTS(language=language, device=device)
    return _tts_model_cache[language]


def clone_and_speak(text, speaker_wav):
    """Synthesize *text* and convert it to the voice of *speaker_wav*.

    Args:
        text: The text to speak.
        speaker_wav: File path of the reference voice recording (as delivered
            by the ``gr.Audio(type="filepath")`` input).

    Returns:
        Path of the converted ``.wav`` file inside ``output_dir``.

    Raises:
        gr.Error: If no reference recording or no text was supplied. The
            output component is ``gr.Audio``, so returning a plain message
            string would be treated as a (non-existent) audio file path.
    """
    if not speaker_wav:
        raise gr.Error("Please upload a reference .wav file.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # Imported lazily, as in the original implementation.
    from openvoice import se_extractor

    # NOTE: a monkey-patch of the English g2p (wrapping int ``word2ph`` entries
    # in lists to avoid a TypeError) was previously tried here and is disabled.

    # Timestamp + random suffix keeps concurrent requests from colliding.
    base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
    tmp_melo_path = os.path.join(output_dir, f"{base_name}_tmp.wav")
    final_output_path = os.path.join(output_dir, f"{base_name}_converted.wav")

    # Use the English speaker model with its first registered speaker.
    model = _get_tts_model("EN")
    speaker_ids = model.hps.data.spk2id
    default_speaker_id = next(iter(speaker_ids.values()))

    try:
        # Generate the base TTS voice.
        model.tts_to_file(text, default_speaker_id, tmp_melo_path, speed=1.0)

        # Target tone colour: embedding of the uploaded reference voice.
        tgt_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)

        # Source tone colour: embedding of the base voice we just generated.
        # The original passed the reference embedding as both source and
        # target, which gives the converter no source-to-target mapping.
        src_se = tone_color_converter.extract_se([tmp_melo_path])

        # Run the tone conversion.
        tone_color_converter.convert(
            audio_src_path=tmp_melo_path,
            src_se=src_se,
            tgt_se=tgt_se,
            output_path=final_output_path,
            message="@HuggingFace",
        )
    finally:
        # The intermediate file is only needed during conversion; remove it so
        # /tmp does not fill up over the lifetime of the process.
        try:
            os.remove(tmp_melo_path)
        except FileNotFoundError:
            pass

    return final_output_path
# Gradio interface: a text box plus a reference recording in, one audio clip out.
_text_input = gr.Textbox(label="Enter Text")
_reference_input = gr.Audio(
    type="filepath",
    label="Upload a Reference Voice (.wav)",
)
_audio_output = gr.Audio(label="Synthesized Output")

demo = gr.Interface(
    fn=clone_and_speak,
    inputs=[_text_input, _reference_input],
    outputs=_audio_output,
    # Flagged samples go to /tmp, like the other writable locations used here.
    flagging_dir="/tmp/flagged",
    title="Text to Voice using Melo TTS + OpenVoice",
    description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
)

# Start the web server when the module is executed.
demo.launch()
# import os
# import time
# import uuid
# import gradio as gr
# from TTS.api import TTS
# from openvoice import se_extractor
# from openvoice.api import ToneColorConverter
# # Import your local english.py logic
# from meloTTS import english
# # Paths
# device = "cuda" if os.system("nvidia-smi") == 0 else "cpu"
# output_dir = "outputs"
# os.makedirs(output_dir, exist_ok=True)
# # Load OpenVoice tone converter
# tone_color_converter = ToneColorConverter(f"{os.getcwd()}/checkpoints", device=device)
# tone_color_converter.load_model()
# def clone_and_speak(text, speaker_wav):
# if not speaker_wav:
# return "Please upload a reference .wav file."
# base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
# tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
# final_output_path = f"{output_dir}/{base_name}_converted.wav"
# # Use English speaker model
# model = TTS(language="EN", device=device)
# speaker_ids = model.hps.data.spk2id
# default_speaker_id = next(iter(speaker_ids.values()))
# # Generate base TTS voice
# model.tts_to_file(text, speaker_id=default_speaker_id, file_path=tmp_melo_path, speed=1.0)
# # Extract style embedding
# ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)
# # Convert tone
# tone_color_converter.convert(
# audio_src_path=tmp_melo_path,
# src_se=ref_se,
# tgt_se=ref_se,
# output_path=final_output_path,
# message="@HuggingFace"
# )
# return final_output_path
# # Gradio Interface
# demo = gr.Interface(
# fn=clone_and_speak,
# inputs=[
# gr.Textbox(label="Text to Synthesize"),
# gr.Audio(label="Reference Voice (WAV)", type="filepath")
# ],
# outputs=gr.Audio(label="Cloned Voice Output"),
# title="Voice Cloner with MeloTTS + OpenVoice"
# )
# if __name__ == "__main__":
# demo.launch()
|