Spaces:

jhtonyKoo
/

music_mixing_style_transfer

Running on T4

File size: 7,726 Bytes

6b95f60
77a1faa
 
 
6b95f60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166943a
c1f5b45
166943a
6b95f60
ee77966
e551a22
6b95f60
 
7748d49
6b95f60
 
 
ee4d0cd
aacd8f7
6b95f60
 
aacd8f7
6b95f60
7748d49
 
 
aacd8f7
6b95f60
7748d49
ad763a0
 
 
ee4d0cd
aacd8f7
ad763a0
 
aacd8f7
ad763a0
7748d49
 
 
aacd8f7
6b95f60
 
166943a
 
 
 
 
6b95f60
eb004a9
6b95f60
c1f5b45
 
 
 
 
 
 
eb004a9
c1f5b45
6b95f60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb004a9
 
 
 
 
253ad1f
eb004a9
6b95f60
 
 
 
 
 
 
 
 
 
 
 
14c7484
f7b0315
 
14c7484
 
f7b0315
 
14c7484
7295413
 
6b95f60
 
7295413
ad763a0
14c7484
7295413
6b95f60
 
 
 
 
 
 
 
 
14c7484
f7b0315
 
14c7484
 
f7b0315
 
14c7484
7295413
 
6b95f60
 
7295413
ad763a0
14c7484
7295413
6b95f60
4534858
6b95f60
 
 
14c7484
6b95f60
 
c1f5b45
253ad1f
6b95f60
c1f5b45
4534858
6b95f60
 
c1f5b45
6b95f60
 
c1f5b45
 
 
 
253ad1f
c1f5b45
 
 
253ad1f
c1f5b45
 
 
 
 
 
 
 
6b95f60

import os
import binascii
import warnings

import json
import argparse
import copy

import numpy as np
import matplotlib.pyplot as plt
import torch
import tqdm
import librosa
import librosa.display
import soundfile as sf
import gradio as gr
import pytube as pt

from pytube.exceptions import VideoUnavailable

from inference.style_transfer import *
from inference.mastering_transfer import *


# Working directory for audio downloaded from YouTube; created at import time
# so the download helpers below can write into it.
yt_video_dir = "./yt_dir/0"
os.makedirs(yt_video_dir, exist_ok=True)


def get_audio_from_yt_video_input(yt_link: str, start_point_in_second=0, duration_in_second=30):
    """Download the audio of a YouTube video as the input track and trim it.

    The first audio-only stream is saved as ``input.wav`` inside
    ``yt_video_dir`` and the file is then handed to ``trim_audio`` together
    with the requested start point and duration.

    Args:
        yt_link: URL of the YouTube video.
        start_point_in_second: Start of the excerpt, forwarded to ``trim_audio``.
        duration_in_second: Length of the excerpt, forwarded to ``trim_audio``.

    Returns:
        The saved file path twice (one value per Gradio output component), or
        ``(None, None)`` when the video is unavailable or offers no
        audio-only stream.
    """
    filename_in = os.path.join(yt_video_dir, "input.wav")
    try:
        # .first() yields None instead of raising IndexError on an empty query.
        audio_stream = pt.YouTube(yt_link).streams.filter(only_audio=True).first()
        if audio_stream is None:
            warnings.warn(f"No audio-only stream found at {yt_link}")
            return None, None
        audio_stream.download(filename=filename_in)
    except VideoUnavailable as e:
        warnings.warn(f"Video Not Found at {yt_link} ({e})")
        # Nothing was downloaded: stop here rather than trimming a None path.
        return None, None

    # trim audio length - due to computation time on HuggingFace environment
    trim_audio(
        target_file_path=filename_in,
        start_point_in_second=start_point_in_second,
        duration_in_second=duration_in_second,
    )

    return filename_in, filename_in

def get_audio_from_yt_video_ref(yt_link: str, start_point_in_second=0, duration_in_second=30):
    """Download the audio of a YouTube video as the reference track and trim it.

    The first audio-only stream is saved as ``reference.wav`` inside
    ``yt_video_dir`` and the file is then handed to ``trim_audio`` together
    with the requested start point and duration.

    Args:
        yt_link: URL of the YouTube video.
        start_point_in_second: Start of the excerpt, forwarded to ``trim_audio``.
        duration_in_second: Length of the excerpt, forwarded to ``trim_audio``.

    Returns:
        The saved file path twice (one value per Gradio output component), or
        ``(None, None)`` when the video is unavailable or offers no
        audio-only stream.
    """
    filename_ref = os.path.join(yt_video_dir, "reference.wav")
    try:
        # .first() yields None instead of raising IndexError on an empty query.
        audio_stream = pt.YouTube(yt_link).streams.filter(only_audio=True).first()
        if audio_stream is None:
            warnings.warn(f"No audio-only stream found at {yt_link}")
            return None, None
        audio_stream.download(filename=filename_ref)
    except VideoUnavailable as e:
        warnings.warn(f"Video Not Found at {yt_link} ({e})")
        # Nothing was downloaded: stop here rather than trimming a None path.
        return None, None

    # trim audio length - due to computation time on HuggingFace environment
    trim_audio(
        target_file_path=filename_ref,
        start_point_in_second=start_point_in_second,
        duration_in_second=duration_in_second,
    )

    return filename_ref, filename_ref

def inference(file_uploaded_in, file_uploaded_ref):
    """Run music mixing style transfer and return the model's output.

    Args:
        file_uploaded_in: Path of the input track supplied by the UI.
        file_uploaded_ref: Path of the reference track supplied by the UI.

    Returns:
        Whatever ``Mixing_Style_Transfer_Inference.inference`` returns; the
        UI feeds it to the output audio component.
    """
    # NOTE(review): both arguments are accepted but never used -- the model is
    # invoked with (None, None), unlike `inference_mastering`, which forwards
    # the two paths. Presumably the mixing pipeline locates its audio through
    # the configuration built by set_up(); confirm that tracks uploaded via the
    # UI actually reach it.
    mixing_engine = Mixing_Style_Transfer_Inference(set_up())
    return mixing_engine.inference(None, None)

def inference_mastering(file_uploaded_in, file_uploaded_ref):
    """Run music mastering style transfer on the given input/reference pair.

    Args:
        file_uploaded_in: Path of the input track supplied by the UI.
        file_uploaded_ref: Path of the reference track supplied by the UI.

    Returns:
        Whatever ``Mastering_Style_Transfer_Inference.inference`` returns for
        the two paths; the UI feeds it to the output audio component.
    """
    # A fresh configuration and model wrapper are built on every call.
    mastering_engine = Mastering_Style_Transfer_Inference(set_up())
    return mastering_engine.inference(file_uploaded_in, file_uploaded_ref)


# Gradio UI definition: a title banner and description, input/reference track
# selectors (file upload or YouTube download), and one section per transfer
# mode (mixing, mastering), each with a run button and an output player.
with gr.Blocks() as demo:
    # Page title.
    gr.HTML(
        """
            <div style="text-align: center; max-width: 700px; margin: 0 auto;">
              <div
                style="
                  display: inline-flex;
                  align-items: center;
                  gap: 0.8rem;
                  font-size: 1.75rem;
                "
              >
                <h1 style="font-weight: 900; margin-bottom: 7px;">
                  Music Mixing Style Transfer
                </h1>
              </div>
        """
    )
    # Short description with links to the related papers.
    gr.Markdown(
        """
        This page is a Hugging Face interactive demo of the paper ["Music Mixing Style Transfer: A Contrastive Learning Approach to Disentangle Audio Effects"](https://huggingface.co/papers/2211.02247) (ICASSP 2023).

        Transfer the input track's mixing style to that of reference's by uploading files or downloading via YouTube links. This demo page also provides mastering style transfer, which can be interpreted as an extended work of [this paper](https://huggingface.co/papers/2202.08520).
        """
    )
    with gr.Group():
        with gr.Column():
            # --- Input track: upload tab + YouTube tab ---
            with gr.Blocks():
                with gr.Tab("Input Music"):
                    file_uploaded_in = gr.Audio(label="Input track (mix) to be mixing style transferred", type="filepath")
                with gr.Tab("YouTube url"):
                    with gr.Row():
                        yt_link_in = gr.Textbox(
                            label="Enter YouTube Link of the Video", autofocus=True, lines=3
                        )
                        yt_in_start_sec = gr.Number(
                            value=0,
                            label="starting point of the song (in seconds)"
                        )
                        yt_in_duration_sec = gr.Number(
                            value=30,
                            label="duration of the song (in seconds)"
                        )
                        yt_btn_in = gr.Button("Download Audio from YouTube Link", size="lg")
                    yt_audio_path_in = gr.Audio(
                        label="Input Audio Extracted from the YouTube Video", interactive=False
                    )
                    # The downloaded clip is written to both the preview player
                    # and the input-track component used by the transfer buttons.
                    yt_btn_in.click(
                        get_audio_from_yt_video_input,
                        inputs=[yt_link_in, yt_in_start_sec, yt_in_duration_sec],
                        outputs=[yt_audio_path_in, file_uploaded_in],
                    )
            # --- Reference track: upload tab + YouTube tab ---
            with gr.Blocks():
                with gr.Tab("Reference Music"):
                    file_uploaded_ref = gr.Audio(label="Reference track (mix) to copy mixing style", type="filepath")
                with gr.Tab("YouTube url"):
                    with gr.Row():
                        yt_link_ref = gr.Textbox(
                            label="Enter YouTube Link of the Video", autofocus=True, lines=3
                        )
                        yt_ref_start_sec = gr.Number(
                            value=0,
                            label="starting point of the song (in seconds)"
                        )
                        yt_ref_duration_sec = gr.Number(
                            value=30,
                            label="duration of the song (in seconds)"
                        )
                        yt_btn_ref = gr.Button("Download Audio from YouTube Link", size="lg")
                    yt_audio_path_ref = gr.Audio(
                        label="Reference Audio Extracted from the YouTube Video", interactive=False
                    )
                    # The downloaded clip is written to both the preview player
                    # and the reference-track component used by the transfer buttons.
                    yt_btn_ref.click(
                        get_audio_from_yt_video_ref,
                        inputs=[yt_link_ref, yt_ref_start_sec, yt_ref_duration_sec],
                        outputs=[yt_audio_path_ref, file_uploaded_ref],
                    )
                
    # --- Mixing style transfer section ---
    with gr.Group():
        gr.HTML(
            """
            <div> <h3> <center> Mixing Style Transfer. Perform stem-wise audio-effects style conversion by first source separating the input mix. The inference computation time takes longer as the input samples' duration. so plz be patient...  </h3> </div>
            """
        )
        with gr.Column():
            inference_btn = gr.Button("Run Mixing Style Transfer")
        with gr.Row():
            output_mix = gr.Audio(label="mixing style transferred music track")
            # Both track components are passed to `inference`; its return value
            # populates the output player.
            inference_btn.click(
                inference,
                inputs=[file_uploaded_in, file_uploaded_ref],
                outputs=[output_mix],
            )

    
    # --- Mastering style transfer section ---
    with gr.Group():
        gr.HTML(
            """
            <div> <h3> <center> Mastering Style Transfer. Perform mastering style transfer using the FXencoder.</h3> </div>
            """
        )
        with gr.Column():
            inference_mastering_btn = gr.Button("Run Mastering Style Transfer")
        with gr.Row():
            output_master = gr.Audio(label="mastering style transferred music track")
            # Same input/reference components as the mixing section; the result
            # of `inference_mastering` populates the output player.
            inference_mastering_btn.click(
                inference_mastering,
                inputs=[file_uploaded_in, file_uploaded_ref],
                outputs=[output_master],
            )
            

    
# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(debug=True)