import os
import binascii
import warnings

import json
import argparse
import copy

import numpy as np
import matplotlib.pyplot as plt
import torch
import tqdm
import librosa
import librosa.display
import soundfile as sf
import gradio as gr
import pytube as pt

from pytube.exceptions import VideoUnavailable

# The wildcard import is expected to provide `set_up`,
# `Mixing_Style_Transfer_Inference`, and `trim_audio`, the only names from
# inference.style_transfer used in this script.
from inference.style_transfer import *

# Working directory for downloaded and uploaded audio, randomized with a hex
# suffix at import time so repeated launches do not collide on stale files.
yt_video_dir = f"./yt_dir/{binascii.hexlify(os.urandom(8)).decode()}"
os.makedirs(yt_video_dir, exist_ok=True)


def get_audio_from_yt_video_input(yt_link: str, start_point_in_second=0, duration_in_second=30):
    try:
        yt = pt.YouTube(yt_link)
        t = yt.streams.filter(only_audio=True)
        filename_in = os.path.join(yt_video_dir, "input.wav")
        # NOTE: pytube keeps the stream's native codec despite the .wav name;
        # trim_audio is assumed to decode the file regardless of extension.
        t[0].download(filename=filename_in)
    except VideoUnavailable as e:
        warnings.warn(f"Video not found at {yt_link} ({e})")
        # Nothing was downloaded, so skip trimming and clear both outputs.
        return None, None

    # Keep only the requested segment of the downloaded audio.
    trim_audio(target_file_path=filename_in, start_point_in_second=start_point_in_second, duration_in_second=duration_in_second)

    # Returned twice: once for the preview player, once to fill the upload slot.
    return filename_in, filename_in
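

# Reference-track variant of the downloader above; identical flow, but the
# audio is saved as reference.wav (the mixing style target).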
def get_audio_from_yt_video_ref(yt_link: str, start_point_in_second=0, duration_in_second=30):
    try:
        yt = pt.YouTube(yt_link)
        t = yt.streams.filter(only_audio=True)
        filename_ref = os.path.join(yt_video_dir, "reference.wav")
        t[0].download(filename=filename_ref)
    except VideoUnavailable as e:
        warnings.warn(f"Video not found at {yt_link} ({e})")
        return None, None

    trim_audio(target_file_path=filename_ref, start_point_in_second=start_point_in_second, duration_in_second=duration_in_second)

    return filename_ref, filename_ref
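

# End-to-end inference: move the uploads to fixed paths inside the working
# directory, then run source separation followed by stem-wise mixing style
# transfer.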
def inference(file_uploaded_in, file_uploaded_ref):
    # Clear out stems separated during a previous run.
    os.system(f"rm -rf {yt_video_dir}/separated")

    # Move the uploads to fixed paths so the pipeline reads consistent names.
    input_path = os.path.join(yt_video_dir, "input.wav")
    reference_path = os.path.join(yt_video_dir, "reference.wav")
    os.system(f"mv '{file_uploaded_in}' '{input_path}'")
    os.system(f"mv '{file_uploaded_ref}' '{reference_path}'")

    # Perform the music mixing style transfer.
    args = set_up()

    inference_style_transfer = Mixing_Style_Transfer_Inference(args)
    # Pass the moved paths; the original upload paths no longer exist after mv.
    output_wav_path = inference_style_transfer.inference(input_path, reference_path)

    return output_wav_path
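

# Gradio UI: tabbed panels take the input and reference tracks (direct upload
# or YouTube download), and a single button runs the style transfer.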
with gr.Blocks() as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 700px; margin: 0 auto;">
          <div
            style="
              display: inline-flex;
              align-items: center;
              gap: 0.8rem;
              font-size: 1.75rem;
            "
          >
            <h1 style="font-weight: 900; margin-bottom: 7px;">
              Music Mixing Style Transfer
            </h1>
          </div>
        </div>
        """
    )
    gr.Markdown(
        """
        This page is an interactive Hugging Face demo of the paper ["Music Mixing Style Transfer: A Contrastive Learning Approach to Disentangle Audio Effects"](https://huggingface.co/papers/2211.02247) (ICASSP 2023).
        - [Project page](https://jhtonykoo.github.io/MixingStyleTransfer/)
        - [GitHub](https://github.com/jhtonyKoo/music_mixing_style_transfer)
        - [Supplementary material](https://pale-cicada-946.notion.site/Music-Mixing-Style-Transfer-A-Contrastive-Learning-Approach-to-Disentangle-Audio-Effects-Supplemen-e6eccd9a431a4a8fa4fdd5adb2d3f219)
        """
    )
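
    # Each track can be uploaded directly or pulled from a YouTube link; the
    # download callback fills both the preview player and the upload slot.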
    with gr.Group():
        with gr.Column():
            with gr.Blocks():
                with gr.Tab("Input Music"):
                    file_uploaded_in = gr.Audio(label="Input track (mix) to be mixing style transferred", type="filepath")
                with gr.Tab("YouTube URL"):
                    with gr.Row():
                        yt_link_in = gr.Textbox(
                            label="Enter YouTube Link of the Video", autofocus=True, lines=3
                        )
                        yt_in_start_sec = gr.Number(
                            value=0,
                            label="starting point of the song (in seconds)"
                        )
                        yt_in_duration_sec = gr.Number(
                            value=30,
                            label="duration of the song (in seconds)"
                        )
                    yt_btn_in = gr.Button("Download Audio from YouTube Link", size="lg")
                    yt_audio_path_in = gr.Audio(
                        label="Input Audio Extracted from the YouTube Video", interactive=False
                    )
                    yt_btn_in.click(
                        get_audio_from_yt_video_input,
                        inputs=[yt_link_in, yt_in_start_sec, yt_in_duration_sec],
                        outputs=[yt_audio_path_in, file_uploaded_in],
                    )
            with gr.Blocks():
                with gr.Tab("Reference Music"):
                    file_uploaded_ref = gr.Audio(label="Reference track (mix) to copy mixing style", type="filepath")
                with gr.Tab("YouTube URL"):
                    with gr.Row():
                        yt_link_ref = gr.Textbox(
                            label="Enter YouTube Link of the Video", autofocus=True, lines=3
                        )
                        yt_ref_start_sec = gr.Number(
                            value=0,
                            label="starting point of the song (in seconds)"
                        )
                        yt_ref_duration_sec = gr.Number(
                            value=30,
                            label="duration of the song (in seconds)"
                        )
                    yt_btn_ref = gr.Button("Download Audio from YouTube Link", size="lg")
                    yt_audio_path_ref = gr.Audio(
                        label="Reference Audio Extracted from the YouTube Video", interactive=False
                    )
                    yt_btn_ref.click(
                        get_audio_from_yt_video_ref,
                        inputs=[yt_link_ref, yt_ref_start_sec, yt_ref_duration_sec],
                        outputs=[yt_audio_path_ref, file_uploaded_ref],
                    )
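
    # Run the style transfer on the selected tracks and play back the result.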
    with gr.Group():
        gr.HTML(
            """
            <div> <h3> <center> Mixing Style Transfer. Performs stem-wise audio-effects style conversion by first source-separating the input mix. Inference time scales with the duration of the input samples, so please be patient... </center> </h3> </div>
            """
        )
        with gr.Column():
            inference_btn = gr.Button("Run Mixing Style Transfer")
            with gr.Row():
                output_mix = gr.Audio(label="mixing style transferred music track")
            inference_btn.click(
                inference,
                inputs=[file_uploaded_in, file_uploaded_ref],
                outputs=[output_mix],
            )


if __name__ == "__main__":
    # debug=True surfaces per-request errors in the console.
    demo.launch(debug=True)