|
import os |
|
import binascii |
|
import warnings |
|
|
|
import json |
|
import argparse |
|
import copy |
|
|
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import torch |
|
import tqdm |
|
import librosa |
|
import librosa.display |
|
import soundfile as sf |
|
import gradio as gr |
|
import pytube as pt |
|
|
|
from pytube.exceptions import VideoUnavailable |
|
|
|
from inference.style_transfer import * |
|
from inference.mastering_transfer import * |
|
|
|
|
|
# Working directory for audio downloaded from YouTube; created eagerly so the
# download callbacks below can write into it without checking first.
yt_video_dir = "./yt_dir/0"
os.makedirs(yt_video_dir, exist_ok=True)
|
|
|
|
|
def get_audio_from_yt_video_input(yt_link: str, start_point_in_second=0, duration_in_second=30):
    """Download the audio of a YouTube video to serve as the *input* track.

    Args:
        yt_link: URL of the YouTube video.
        start_point_in_second: offset (seconds) at which the excerpt starts.
        duration_in_second: length (seconds) of the excerpt to keep.

    Returns:
        A 2-tuple of the saved file path repeated twice (one value feeds the
        preview ``gr.Audio``, the other the upload component), or
        ``(None, None)`` when the video is unavailable.
    """
    try:
        yt = pt.YouTube(yt_link)
        audio_streams = yt.streams.filter(only_audio=True)
        filename_in = os.path.join(yt_video_dir, "input.wav")
        audio_streams[0].download(filename=filename_in)
    except VideoUnavailable as e:
        warnings.warn(f"Video Not Found at {yt_link} ({e})")
        # Bug fix: the original fell through and called
        # trim_audio(target_file_path=None, ...), which crashes; bail out early
        # so the UI simply shows empty audio components instead.
        return None, None

    # Keep only the requested excerpt (trim_audio comes from
    # inference.style_transfer; presumably it rewrites the file in place —
    # TODO confirm against that module).
    trim_audio(target_file_path=filename_in, start_point_in_second=start_point_in_second, duration_in_second=duration_in_second)

    return filename_in, filename_in
|
|
|
def get_audio_from_yt_video_ref(yt_link: str, start_point_in_second=0, duration_in_second=30):
    """Download the audio of a YouTube video to serve as the *reference* track.

    Args:
        yt_link: URL of the YouTube video.
        start_point_in_second: offset (seconds) at which the excerpt starts.
        duration_in_second: length (seconds) of the excerpt to keep.

    Returns:
        A 2-tuple of the saved file path repeated twice (one value feeds the
        preview ``gr.Audio``, the other the upload component), or
        ``(None, None)`` when the video is unavailable.
    """
    try:
        yt = pt.YouTube(yt_link)
        audio_streams = yt.streams.filter(only_audio=True)
        filename_ref = os.path.join(yt_video_dir, "reference.wav")
        audio_streams[0].download(filename=filename_ref)
    except VideoUnavailable as e:
        warnings.warn(f"Video Not Found at {yt_link} ({e})")
        # Bug fix: the original fell through and called
        # trim_audio(target_file_path=None, ...), which crashes; bail out early
        # so the UI simply shows empty audio components instead.
        return None, None

    # Keep only the requested excerpt (trim_audio comes from
    # inference.style_transfer; presumably it rewrites the file in place —
    # TODO confirm against that module).
    trim_audio(target_file_path=filename_ref, start_point_in_second=start_point_in_second, duration_in_second=duration_in_second)

    return filename_ref, filename_ref
|
|
|
def inference(file_uploaded_in, file_uploaded_ref):
    """Run mixing style transfer and return the path of the rendered mix.

    The two parameters exist to match the Gradio callback signature.
    NOTE(review): they are not forwarded — ``inference`` is invoked with
    ``(None, None)``, so the engine presumably resolves the input/reference
    paths from the ``set_up()`` configuration; verify against
    ``inference.style_transfer`` before changing this.
    """
    config = set_up()
    style_transfer = Mixing_Style_Transfer_Inference(config)
    return style_transfer.inference(None, None)
|
|
|
def inference_mastering(file_uploaded_in, file_uploaded_ref):
    """Run mastering style transfer on the uploaded pair of tracks.

    Args:
        file_uploaded_in: path of the track to be re-mastered.
        file_uploaded_ref: path of the track whose mastering style is copied.

    Returns:
        Path of the mastering-style-transferred audio file.
    """
    config = set_up()
    mastering = Mastering_Style_Transfer_Inference(config)
    return mastering.inference(file_uploaded_in, file_uploaded_ref)
|
|
|
|
|
# Gradio UI: a header, two tabbed sections for selecting the input and
# reference tracks (direct upload or YouTube download), and two action groups —
# one for mixing style transfer, one for mastering style transfer.
with gr.Blocks() as demo:
    # Page title banner.
    gr.HTML(
        """
        <div style="text-align: center; max-width: 700px; margin: 0 auto;">
          <div
            style="
              display: inline-flex;
              align-items: center;
              gap: 0.8rem;
              font-size: 1.75rem;
            "
          >
            <h1 style="font-weight: 900; margin-bottom: 7px;">
              Music Mixing Style Transfer
            </h1>
          </div>
        """
    )
    # Short description with links to the underlying papers.
    gr.Markdown(
        """
        This page is a Hugging Face interactive demo of the paper ["Music Mixing Style Transfer: A Contrastive Learning Approach to Disentangle Audio Effects"](https://huggingface.co/papers/2211.02247) (ICASSP 2023).

        Transfer the input track's mixing style to that of reference's by uploading files or downloading via YouTube links. This demo page also provides mastering style transfer, which can be interpreted as an extended work of [this paper](https://huggingface.co/papers/2202.08520).
        """
    )
    # --- Track selection: input and reference, each via upload or YouTube. ---
    with gr.Group():
        with gr.Column():
            with gr.Blocks():
                with gr.Tab("Input Music"):
                    file_uploaded_in = gr.Audio(label="Input track (mix) to be mixing style transferred", type="filepath")
                with gr.Tab("YouTube url"):
                    with gr.Row():
                        yt_link_in = gr.Textbox(
                            label="Enter YouTube Link of the Video", autofocus=True, lines=3
                        )
                        # Trim offset (seconds) for the downloaded audio.
                        yt_in_start_sec = gr.Number(
                            value=0
                        )
                        # Trim duration (seconds) for the downloaded audio.
                        yt_in_duration_sec = gr.Number(
                            value=30
                        )
                        yt_btn_in = gr.Button("Download Audio from YouTube Link", size="lg")
                        yt_audio_path_in = gr.Audio(
                            label="Input Audio Extracted from the YouTube Video", interactive=False
                        )
                        # The download callback fills both the preview player
                        # and the upload component with the same file path.
                        yt_btn_in.click(
                            get_audio_from_yt_video_input,
                            inputs=[yt_link_in, yt_in_start_sec, yt_in_duration_sec],
                            outputs=[yt_audio_path_in, file_uploaded_in],
                        )
            with gr.Blocks():
                with gr.Tab("Reference Music"):
                    file_uploaded_ref = gr.Audio(label="Reference track (mix) to copy mixing style", type="filepath")
                with gr.Tab("YouTube url"):
                    with gr.Row():
                        yt_link_ref = gr.Textbox(
                            label="Enter YouTube Link of the Video", autofocus=True, lines=3
                        )
                        # Trim offset (seconds) for the downloaded audio.
                        yt_ref_start_sec = gr.Number(
                            value=0
                        )
                        # Trim duration (seconds) for the downloaded audio.
                        yt_ref_duration_sec = gr.Number(
                            value=30
                        )
                        yt_btn_ref = gr.Button("Download Audio from YouTube Link", size="lg")
                        yt_audio_path_ref = gr.Audio(
                            label="Reference Audio Extracted from the YouTube Video", interactive=False
                        )
                        # Same pattern as the input tab: one click fills both
                        # the preview and the upload component.
                        yt_btn_ref.click(
                            get_audio_from_yt_video_ref,
                            inputs=[yt_link_ref, yt_ref_start_sec, yt_ref_duration_sec],
                            outputs=[yt_audio_path_ref, file_uploaded_ref],
                        )

    # --- Action: mixing style transfer. ---
    with gr.Group():
        gr.HTML(
            """
            <div> <h3> <center> Mixing Style Transfer. Perform stem-wise audio-effects style conversion by first source separating the input mix. The inference computation time takes longer as the input samples' duration. so plz be patient... </h3> </div>
            """
        )
        with gr.Column():
            inference_btn = gr.Button("Run Mixing Style Transfer")
            with gr.Row():
                output_mix = gr.Audio(label="mixing style transferred music track")
            inference_btn.click(
                inference,
                inputs=[file_uploaded_in, file_uploaded_ref],
                outputs=[output_mix],
            )

    # --- Action: mastering style transfer. ---
    with gr.Group():
        gr.HTML(
            """
            <div> <h3> <center> Mastering Style Transfer. Perform mastering style transfer using the FXencoder.</h3> </div>
            """
        )
        with gr.Column():
            inference_mastering_btn = gr.Button("Run Mastering Style Transfer")
            with gr.Row():
                output_master = gr.Audio(label="mastering style transferred music track")
            inference_mastering_btn.click(
                inference_mastering,
                inputs=[file_uploaded_in, file_uploaded_ref],
                outputs=[output_master],
            )
|
|
|
|
|
|
|
if __name__ == "__main__":
    # debug=True surfaces tracebacks in the console while developing.
    demo.launch(debug=True)
|
|