import os
import re
import random
from scipy.io.wavfile import write
from scipy.io.wavfile import read
import numpy as np
import gradio as gr
import yt_dlp
import subprocess
from pydub import AudioSegment
from audio_separator.separator import Separator
from lib.infer import infer_audio
import edge_tts
import tempfile
import anyio
from pathlib import Path
from lib.language_tts import language_dict
import zipfile
import shutil
import urllib.request
import gdown
from argparse import ArgumentParser

main_dir = Path().resolve()
print(main_dir)

os.chdir(main_dir)
models_dir = "models"
audio_separation_dir = main_dir / "audio_input"
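
# Assumed layout: "models" stores downloaded voice models and "audio_input"
# receives the stems written by separate_audio below; both are relative to main_dir.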


def download_audio(url):
    """Download audio from a URL via yt-dlp and return (sample_rate, int16 samples)."""
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'ytdl/%(title)s.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=True)
        file_path = ydl.prepare_filename(info_dict).rsplit('.', 1)[0] + '.wav'
        sample_rate, audio_data = read(file_path)
        audio_array = np.asarray(audio_data, dtype=np.int16)

    return sample_rate, audio_array
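
# Gradio's gr.Audio component accepts a (sample_rate, numpy_array) tuple as an
# output value, so download_audio's return value can be wired straight into it.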


def separate_audio(input_audio, model_voc_inst, model_deecho, model_back_voc):
    output_dir = audio_separation_dir
    separator = Separator(output_dir=output_dir)

    vocals = os.path.join(output_dir, 'Vocals.wav')
    instrumental = os.path.join(output_dir, 'Instrumental.wav')
    vocals_reverb = os.path.join(output_dir, 'Vocals (Reverb).wav')
    vocals_no_reverb = os.path.join(output_dir, 'Vocals (No Reverb).wav')
    lead_vocals = os.path.join(output_dir, 'Lead Vocals.wav')
    backing_vocals = os.path.join(output_dir, 'Backing Vocals.wav')

    # Pass 1: split the mix into instrumental and vocals.
    separator.load_model(model_filename=model_voc_inst)
    voc_inst = separator.separate(input_audio)
    os.rename(os.path.join(output_dir, voc_inst[0]), instrumental)
    os.rename(os.path.join(output_dir, voc_inst[1]), vocals)

    # Pass 2: remove echo and reverb from the vocal stem.
    separator.load_model(model_filename=model_deecho)
    voc_no_reverb = separator.separate(vocals)
    os.rename(os.path.join(output_dir, voc_no_reverb[0]), vocals_no_reverb)
    os.rename(os.path.join(output_dir, voc_no_reverb[1]), vocals_reverb)

    # Pass 3: split the dry vocals into backing and lead vocals.
    separator.load_model(model_filename=model_back_voc)
    backing_voc = separator.separate(vocals_no_reverb)
    os.rename(os.path.join(output_dir, backing_voc[0]), backing_vocals)
    os.rename(os.path.join(output_dir, backing_voc[1]), lead_vocals)

    return instrumental, vocals, vocals_reverb, vocals_no_reverb, lead_vocals, backing_vocals
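
# Note: separator.separate() returns the generated stem filenames; the [0]/[1]
# ordering used above follows the original script and may vary per model, so
# verify it for any model you swap in.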


def process_audio(MODEL_NAME, SOUND_PATH, F0_CHANGE, F0_METHOD, MIN_PITCH, MAX_PITCH, CREPE_HOP_LENGTH, INDEX_RATE,
                  FILTER_RADIUS, RMS_MIX_RATE, PROTECT, SPLIT_INFER, MIN_SILENCE, SILENCE_THRESHOLD, SEEK_STEP,
                  KEEP_SILENCE, FORMANT_SHIFT, QUEFRENCY, TIMBRE, F0_AUTOTUNE, OUTPUT_FORMAT, upload_audio=None):
    # gr.Audio(type='filepath') passes the upload as a path string, so it can
    # be used directly; the original file-object handling (.name/.read) would
    # fail on a string.
    if not SOUND_PATH and upload_audio is not None:
        SOUND_PATH = upload_audio

    if not MODEL_NAME:
        return "Please provide a model name."

    # Ensure the bundled stftpitchshift binary (used by the formant-shift
    # path) is executable.
    os.system("chmod +x stftpitchshift")
    inferred_audio = infer_audio(
        MODEL_NAME,
        SOUND_PATH,
        F0_CHANGE,
        F0_METHOD,
        MIN_PITCH,
        MAX_PITCH,
        CREPE_HOP_LENGTH,
        INDEX_RATE,
        FILTER_RADIUS,
        RMS_MIX_RATE,
        PROTECT,
        SPLIT_INFER,
        MIN_SILENCE,
        SILENCE_THRESHOLD,
        SEEK_STEP,
        KEEP_SILENCE,
        FORMANT_SHIFT,
        QUEFRENCY,
        TIMBRE,
        F0_AUTOTUNE,
        OUTPUT_FORMAT
    )

    return inferred_audio


async def text_to_speech_edge(text, language_code):
    voice = language_dict.get(language_code, "default_voice")
    communicate = edge_tts.Communicate(text, voice)
    # edge-tts streams MP3-encoded audio by default; the .wav suffix is kept
    # from the original code and is cosmetic.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name
        await communicate.save(tmp_path)
    return tmp_path


def extract_zip(extraction_folder, zip_name):
    os.makedirs(extraction_folder)
    with zipfile.ZipFile(zip_name, 'r') as zip_ref:
        zip_ref.extractall(extraction_folder)
    os.remove(zip_name)

    # Locate the model (.pth) and index (.index) files; the size thresholds
    # skip placeholder or truncated files.
    index_filepath, model_filepath = None, None
    for root, dirs, files in os.walk(extraction_folder):
        for name in files:
            if name.endswith('.index') and os.stat(os.path.join(root, name)).st_size > 1024 * 100:
                index_filepath = os.path.join(root, name)

            if name.endswith('.pth') and os.stat(os.path.join(root, name)).st_size > 1024 * 1024 * 40:
                model_filepath = os.path.join(root, name)

    if not model_filepath:
        raise Exception(f'No .pth model file was found in the extracted zip. Please check {extraction_folder}.')

    # Move the model and index files to the top of the extraction folder,
    # then remove any leftover subdirectories.
    os.rename(model_filepath, os.path.join(extraction_folder, os.path.basename(model_filepath)))
    if index_filepath:
        os.rename(index_filepath, os.path.join(extraction_folder, os.path.basename(index_filepath)))

    for filepath in os.listdir(extraction_folder):
        if os.path.isdir(os.path.join(extraction_folder, filepath)):
            shutil.rmtree(os.path.join(extraction_folder, filepath))


def download_online_model(url, dir_name):
    try:
        print(f'[~] Downloading voice model with name {dir_name}...')
        zip_name = url.split('/')[-1]
        extraction_folder = os.path.join(models_dir, dir_name)
        if os.path.exists(extraction_folder):
            raise Exception(f'Voice model directory {dir_name} already exists! Choose a different name for your voice model.')

        if 'pixeldrain.com' in url:
            url = f'https://pixeldrain.com/api/file/{zip_name}'
        if 'drive.google.com' in url:
            zip_name = dir_name + ".zip"
            gdown.download(url, output=zip_name, use_cookies=True, quiet=True, fuzzy=True)
        else:
            urllib.request.urlretrieve(url, zip_name)

        print('[~] Extracting zip file...')
        extract_zip(extraction_folder, zip_name)
        print(f'[+] {dir_name} Model successfully downloaded!')
        # Returned so the Gradio click handler can show a status message.
        return f'[+] {dir_name} Model successfully downloaded!'

    except Exception as e:
        raise Exception(str(e)) from e


if __name__ == '__main__':
    parser = ArgumentParser(description='Generate an AI song in the song_output/id directory.', add_help=True)
    parser.add_argument("--share", action="store_true", dest="share_enabled", default=False, help="Enable sharing")
    parser.add_argument("--listen", action="store_true", default=False, help="Make the UI reachable from your local network.")
    parser.add_argument('--listen-host', type=str, help='The hostname that the server will use.')
    parser.add_argument('--listen-port', type=int, help='The listening port that the server will use.')
    args = parser.parse_args()

    with gr.Blocks(title="Hex RVC", theme=gr.themes.Default(primary_hue="red", secondary_hue="pink")) as app:
        gr.Markdown("# Hex RVC")
        gr.Markdown("Join [AIHub](https://discord.gg/aihub) to get an RVC model!")

        with gr.Tab("Inference"):
            with gr.Row():
                MODEL_NAME = gr.Textbox(label="Model Name", placeholder="Enter model name")
                SOUND_PATH = gr.Textbox(label="Audio Path (Optional)", placeholder="Leave blank to upload audio")
                upload_audio = gr.Audio(label="Upload Audio", type='filepath')

            with gr.Row():
                F0_CHANGE = gr.Number(label="Pitch Change (semitones)", value=0)
                F0_METHOD = gr.Dropdown(choices=["crepe", "harvest", "mangio-crepe", "rmvpe", "rmvpe+", "fcpe", "hybrid[rmvpe+fcpe]"],
                                        label="F0 Method", value="fcpe")

            with gr.Row():
                MIN_PITCH = gr.Textbox(label="Min Pitch", value="50")
                MAX_PITCH = gr.Textbox(label="Max Pitch", value="1100")
                CREPE_HOP_LENGTH = gr.Number(label="Crepe Hop Length", value=120)
                INDEX_RATE = gr.Slider(label="Index Rate", minimum=0, maximum=1, value=0.75)
                FILTER_RADIUS = gr.Number(label="Filter Radius", value=3)
                RMS_MIX_RATE = gr.Slider(label="RMS Mix Rate", minimum=0, maximum=1, value=0.25)
                PROTECT = gr.Slider(label="Protect", minimum=0, maximum=1, value=0.33)

            with gr.Accordion("Hex TTS"):
                input_text = gr.Textbox(lines=5, label="Input Text")
                language = gr.Dropdown(choices=list(language_dict.keys()), label="Language")
                tts_convert = gr.Button("Convert")
                tts_convert.click(fn=text_to_speech_edge, inputs=[input_text, language], outputs=[upload_audio])

            with gr.Accordion("Advanced Settings", open=False):
                SPLIT_INFER = gr.Checkbox(label="Enable Split Inference", value=False)
                MIN_SILENCE = gr.Number(label="Min Silence (ms)", value=500)
                SILENCE_THRESHOLD = gr.Number(label="Silence Threshold (dBFS)", value=-50)
                SEEK_STEP = gr.Slider(label="Seek Step (ms)", minimum=1, maximum=10, value=1)
                KEEP_SILENCE = gr.Number(label="Keep Silence (ms)", value=200)
                FORMANT_SHIFT = gr.Checkbox(label="Enable Formant Shift", value=False)
                QUEFRENCY = gr.Number(label="Quefrency", value=0)
                TIMBRE = gr.Number(label="Timbre", value=1)
                F0_AUTOTUNE = gr.Checkbox(label="Enable F0 Autotune", value=False)
                OUTPUT_FORMAT = gr.Dropdown(choices=["wav", "flac", "mp3"], label="Output Format", value="wav")

            run_button = gr.Button("Run Inference")
            output_audio = gr.Audio(label="Generated Audio", type='filepath')
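
            # The inputs list below must mirror process_audio's parameter
            # order, with upload_audio bound to the trailing keyword argument.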
            run_button.click(
                process_audio,
                inputs=[MODEL_NAME, SOUND_PATH, F0_CHANGE, F0_METHOD, MIN_PITCH, MAX_PITCH, CREPE_HOP_LENGTH, INDEX_RATE,
                        FILTER_RADIUS, RMS_MIX_RATE, PROTECT, SPLIT_INFER, MIN_SILENCE, SILENCE_THRESHOLD, SEEK_STEP,
                        KEEP_SILENCE, FORMANT_SHIFT, QUEFRENCY, TIMBRE, F0_AUTOTUNE, OUTPUT_FORMAT, upload_audio],
                outputs=output_audio
            )

        with gr.Tab("Download RVC Model"):
            url = gr.Textbox(label="Your model URL")
            dirname = gr.Textbox(label="Your Model name")
            button_model = gr.Button("Download model")
            button_model.click(fn=download_online_model, inputs=[url, dirname], outputs=[dirname])

        with gr.Tab("Audio Separation"):
            with gr.Row():
                input_audio = gr.Audio(type="filepath", label="Upload Audio File")

            with gr.Row():
                with gr.Accordion("Separation by Link", open=False):
                    with gr.Row():
                        roformer_link = gr.Textbox(
                            label="Link",
                            placeholder="Paste the link here",
                            interactive=True
                        )
                    with gr.Row():
                        gr.Markdown("You can paste a link to video/audio from many sites; check the complete list [here](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md).")
                    with gr.Row():
                        roformer_download_button = gr.Button(
                            "Download!",
                            variant="primary"
                        )

            roformer_download_button.click(download_audio, [roformer_link], [input_audio])

            with gr.Row():
                model_voc_inst = gr.Textbox(value='model_bs_roformer_ep_317_sdr_12.9755.ckpt', label="Vocal & Instrumental Model", visible=False)
                model_deecho = gr.Textbox(value='UVR-DeEcho-DeReverb.pth', label="DeEcho-DeReverb Model", visible=False)
                model_back_voc = gr.Textbox(value='mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt', label="Backing Vocals Model", visible=False)

            separate_button = gr.Button("Separate Audio")

            with gr.Row():
                instrumental_out = gr.Audio(label="Instrumental")
                vocals_out = gr.Audio(label="Vocals")
                vocals_reverb_out = gr.Audio(label="Vocals (Reverb)")
                vocals_no_reverb_out = gr.Audio(label="Vocals (No Reverb)")
                lead_vocals_out = gr.Audio(label="Lead Vocals")
                backing_vocals_out = gr.Audio(label="Backing Vocals")

            separate_button.click(
                separate_audio,
                inputs=[input_audio, model_voc_inst, model_deecho, model_back_voc],
                outputs=[instrumental_out, vocals_out, vocals_reverb_out, vocals_no_reverb_out, lead_vocals_out, backing_vocals_out]
            )

    # Bind to 0.0.0.0 when --listen is set so the UI is reachable from the
    # local network; otherwise Gradio defaults to localhost.
    app.launch(
        share=args.share_enabled,
        server_name=None if not args.listen else (args.listen_host or '0.0.0.0'),
        server_port=args.listen_port,
    )