Spaces:

jhtonyKoo
/

ITO-Master

Running

App Files Community

jhtonyKoo committed on Oct 16, 2024

Commit

fea46cb

1 Parent(s): db9ab3b

modify app

Browse files

Files changed (4) hide show

__pycache__/inference.cpython-311.pyc +0 -0
app.py +12 -7
inference.py +17 -19
ito_snow.png +0 -0

__pycache__/inference.cpython-311.pyc CHANGED Viewed

Binary files a/__pycache__/inference.cpython-311.pyc and b/__pycache__/inference.cpython-311.pyc differ

app.py CHANGED Viewed

@@ -155,20 +155,25 @@ def update_ito_output(all_results, selected_step):
 """ APP display """
 with gr.Blocks() as demo:
     gr.Markdown("# ITO-Master: Inference Time Optimization for Mastering Style Transfer")
     gr.Markdown("# Step 1: Mastering Style Transfer")
     with gr.Tab("Upload Audio"):
         with gr.Row():
-            input_audio = gr.Audio(label="Input Audio")
-            reference_audio = gr.Audio(label="Reference Audio")
         process_button = gr.Button("Process Mastering Style Transfer")
         with gr.Row():
             with gr.Column():
-                output_audio = gr.Audio(label="Output Audio", type='numpy')
-                normalized_input = gr.Audio(label="Normalized Input Audio", type='numpy')
             param_output = gr.Textbox(label="Predicted Parameters", lines=5)
         process_button.click(
@@ -182,8 +187,8 @@ with gr.Blocks() as demo:
             input_youtube_url = gr.Textbox(label="Input YouTube URL")
             reference_youtube_url = gr.Textbox(label="Reference YouTube URL")
         with gr.Row():
-            input_audio_yt = gr.Audio(label="Input Audio (Do not put when using YouTube URL)")
-            reference_audio_yt = gr.Audio(label="Reference Audio (Do not put when using YouTube URL)")
         process_button_yt = gr.Button("Process Mastering Style Transfer")
@@ -208,7 +213,7 @@ with gr.Blocks() as demo:
     gr.Markdown("## Step 2: Inference Time Optimization (ITO)")
     with gr.Row():
-        ito_reference_audio = gr.Audio(label="ITO Reference Audio (optional)")
         with gr.Column():
             num_steps = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Number of Steps")
             optimizer = gr.Dropdown(["Adam", "RAdam", "SGD"], value="RAdam", label="Optimizer")

 """ APP display """
 with gr.Blocks() as demo:
     gr.Markdown("# ITO-Master: Inference Time Optimization for Mastering Style Transfer")
+    with gr.Row():
+        gr.Markdown("Demo of Inference Time Optimization (ITO) for Music Mastering Style Transfer. \n"
+                    "The mastering style transfer is performed by a differentiable audio processing model, and the predicted parameters are shown as the output. \n"
+                    "Perform mastering style transfer with an input source audio and a reference mastering style audio. On top of this result, you can perform ITO to optimize the reference embedding $z_{\\text{ref}}$ to further gain control over the output mastering style.")
+        gr.Image("ito_snow.png", width=500)
     gr.Markdown("# Step 1: Mastering Style Transfer")
     with gr.Tab("Upload Audio"):
         with gr.Row():
+            input_audio = gr.Audio(label="Source Audio ($x_{\\text{in}}$)")
+            reference_audio = gr.Audio(label="Reference Style Audio ($x_{\\text{ref}}$)")
         process_button = gr.Button("Process Mastering Style Transfer")
         with gr.Row():
             with gr.Column():
+                output_audio = gr.Audio(label="Output Audio ($y'$)", type='numpy')
+                normalized_input = gr.Audio(label="Normalized Source Audio", type='numpy')
             param_output = gr.Textbox(label="Predicted Parameters", lines=5)
         process_button.click(
             input_youtube_url = gr.Textbox(label="Input YouTube URL")
             reference_youtube_url = gr.Textbox(label="Reference YouTube URL")
         with gr.Row():
+            input_audio_yt = gr.Audio(label="Source Audio (Do not put when using YouTube URL)")
+            reference_audio_yt = gr.Audio(label="Reference Style Audio (Do not put when using YouTube URL)")
         process_button_yt = gr.Button("Process Mastering Style Transfer")
     gr.Markdown("## Step 2: Inference Time Optimization (ITO)")
     with gr.Row():
+        ito_reference_audio = gr.Audio(label="ITO Reference Style Audio (optional)")
         with gr.Column():
             num_steps = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Number of Steps")
             optimizer = gr.Dropdown(["Adam", "RAdam", "SGD"], value="RAdam", label="Optimizer")

inference.py CHANGED Viewed

@@ -179,10 +179,10 @@ class MasteringStyleTransfer:
                 'band2_cutoff_freq': ('High-Mid Band Frequency', 'Hz', 8000, 12000),
                 'band2_q_factor': ('High-Mid Band Q', '', 0.1, 5.0),
                 'band3_gain_db': ('High Band Gain', 'dB', -20, 20),
-                'band3_cutoff_freq': ('High Band Frequency', 'Hz', 12000, 20000),  # Assuming sample_rate is 44100
                 'band3_q_factor': ('High Band Q', '', 0.1, 5.0),
                 'high_shelf_gain_db': ('High Shelf Gain', 'dB', -20, 20),
-                'high_shelf_cutoff_freq': ('High Shelf Cutoff', 'Hz', 4000, 20000),  # Assuming sample_rate is 44100
                 'high_shelf_q_factor': ('High Shelf Q', '', 0.1, 5.0),
             },
             'distortion': {
@@ -194,21 +194,21 @@ class MasteringStyleTransfer:
                 'high_cutoff': ('Mid/High Crossover', 'Hz', 1000, 20000),
                 'parallel_weight_factor': ('Dry/Wet Mix', '%', 0, 100),
                 'low_shelf_comp_thresh': ('Low Band Comp Threshold', 'dB', -60, 0),
-                'low_shelf_comp_ratio': ('Low Band Comp Ratio', ':1', 1, 20),
                 'low_shelf_exp_thresh': ('Low Band Exp Threshold', 'dB', -60, 0),
-                'low_shelf_exp_ratio': ('Low Band Exp Ratio', ':1', 1, 20),
                 'low_shelf_at': ('Low Band Attack Time', 'ms', 5, 100),
                 'low_shelf_rt': ('Low Band Release Time', 'ms', 5, 100),
                 'mid_band_comp_thresh': ('Mid Band Comp Threshold', 'dB', -60, 0),
-                'mid_band_comp_ratio': ('Mid Band Comp Ratio', ':1', 1, 20),
                 'mid_band_exp_thresh': ('Mid Band Exp Threshold', 'dB', -60, 0),
-                'mid_band_exp_ratio': ('Mid Band Exp Ratio', ':1', 1, 20),
                 'mid_band_at': ('Mid Band Attack Time', 'ms', 5, 100),
                 'mid_band_rt': ('Mid Band Release Time', 'ms', 5, 100),
                 'high_shelf_comp_thresh': ('High Band Comp Threshold', 'dB', -60, 0),
-                'high_shelf_comp_ratio': ('High Band Comp Ratio', ':1', 1, 20),
                 'high_shelf_exp_thresh': ('High Band Exp Threshold', 'dB', -60, 0),
-                'high_shelf_exp_ratio': ('High Band Exp Ratio', ':1', 1, 20),
                 'high_shelf_at': ('High Band Attack Time', 'ms', 5, 100),
                 'high_shelf_rt': ('High Band Release Time', 'ms', 5, 100),
             },
@@ -236,20 +236,18 @@ class MasteringStyleTransfer:
                     print(f"fx name: {fx_name}   param_name: {param_name}")
                     if fx_name in param_mapper and param_name in param_mapper[fx_name]:
                         friendly_name, unit, min_val, max_val = param_mapper[fx_name][param_name]
-                        if fx_name == 'IMAGER' and param_name == 'width':
-                            # Convert width to a more intuitive scale
-                            width_percentage = param_value * 200
-                            output.append(f"  {friendly_name}: {width_percentage:.2f}% (Range: 0-200%)")
-                        else:
-                            output.append(f"  {friendly_name}: {param_value:.2f} {unit} (Range: {min_val}-{max_val})")
                     else:
                         output.append(f"  {param_name}: {param_value:.2f}")
             else:
-                if fx_name == 'IMAGER':
-                    width_percentage = fx_params.item() * 200
-                    output.append(f"  Stereo Width: {width_percentage:.2f}% (Range: 0-200%)")
-                else:
-                    output.append(f"  {fx_params.item():.2f}")
         return "\n".join(output)

                 'band2_cutoff_freq': ('High-Mid Band Frequency', 'Hz', 8000, 12000),
                 'band2_q_factor': ('High-Mid Band Q', '', 0.1, 5.0),
                 'band3_gain_db': ('High Band Gain', 'dB', -20, 20),
+                'band3_cutoff_freq': ('High Band Frequency', 'Hz', 12000, 20000),
                 'band3_q_factor': ('High Band Q', '', 0.1, 5.0),
                 'high_shelf_gain_db': ('High Shelf Gain', 'dB', -20, 20),
+                'high_shelf_cutoff_freq': ('High Shelf Cutoff', 'Hz', 4000, 20000),
                 'high_shelf_q_factor': ('High Shelf Q', '', 0.1, 5.0),
             },
             'distortion': {
                 'high_cutoff': ('Mid/High Crossover', 'Hz', 1000, 20000),
                 'parallel_weight_factor': ('Dry/Wet Mix', '%', 0, 100),
                 'low_shelf_comp_thresh': ('Low Band Comp Threshold', 'dB', -60, 0),
+                'low_shelf_comp_ratio': ('Low Band Comp Ratio', ': 1', 1, 20),
                 'low_shelf_exp_thresh': ('Low Band Exp Threshold', 'dB', -60, 0),
+                'low_shelf_exp_ratio': ('Low Band Exp Ratio', ': 1', 1, 20),
                 'low_shelf_at': ('Low Band Attack Time', 'ms', 5, 100),
                 'low_shelf_rt': ('Low Band Release Time', 'ms', 5, 100),
                 'mid_band_comp_thresh': ('Mid Band Comp Threshold', 'dB', -60, 0),
+                'mid_band_comp_ratio': ('Mid Band Comp Ratio', ': 1', 1, 20),
                 'mid_band_exp_thresh': ('Mid Band Exp Threshold', 'dB', -60, 0),
+                'mid_band_exp_ratio': ('Mid Band Exp Ratio', ': 1', 0, 1),
                 'mid_band_at': ('Mid Band Attack Time', 'ms', 5, 100),
                 'mid_band_rt': ('Mid Band Release Time', 'ms', 5, 100),
                 'high_shelf_comp_thresh': ('High Band Comp Threshold', 'dB', -60, 0),
+                'high_shelf_comp_ratio': ('High Band Comp Ratio', ': 1', 1, 20),
                 'high_shelf_exp_thresh': ('High Band Exp Threshold', 'dB', -60, 0),
+                'high_shelf_exp_ratio': ('High Band Exp Ratio', ': 1', 1, 20),
                 'high_shelf_at': ('High Band Attack Time', 'ms', 5, 100),
                 'high_shelf_rt': ('High Band Release Time', 'ms', 5, 100),
             },
                     print(f"fx name: {fx_name}   param_name: {param_name}")
                     if fx_name in param_mapper and param_name in param_mapper[fx_name]:
                         friendly_name, unit, min_val, max_val = param_mapper[fx_name][param_name]
+                        if unit=='%':
+                            param_value = param_value * 100
+                        current_content = f"  {friendly_name}: {param_value:.2f} {unit}"
+                        if param_name=='mid_band_exp_ratio':
+                            current_content += f" (Range: {min_val}-{max_val})"
+                        output.append(current_content)
                     else:
                         output.append(f"  {param_name}: {param_value:.2f}")
             else:
+                # stereo imager
+                width_percentage = fx_params.item() * 200
+                output.append(f"  Stereo Width: {width_percentage:.2f}% (Range: 0-200%)")
         return "\n".join(output)

ito_snow.png ADDED Viewed