Spaces:
Running
Running
modify app
Browse files- app.py +40 -30
- inference.py +72 -23
app.py
CHANGED
@@ -139,11 +139,31 @@ def perform_ito(input_audio, reference_audio, ito_reference_audio, num_steps, op
|
|
139 |
|
140 |
yield (args.sample_rate, current_output), ito_param_output, step, ito_log, pd.DataFrame(loss_values)
|
141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
|
143 |
-
|
144 |
-
|
|
|
145 |
|
|
|
|
|
146 |
with gr.Blocks() as demo:
|
|
|
|
|
147 |
gr.Markdown("# Step 1: Mastering Style Transfer")
|
148 |
|
149 |
with gr.Tab("Upload Audio"):
|
@@ -207,6 +227,7 @@ with gr.Blocks() as demo:
|
|
207 |
with gr.Column():
|
208 |
ito_output_audio = gr.Audio(label="ITO Output Audio")
|
209 |
ito_param_output = gr.Textbox(label="ITO Predicted Parameters", lines=15)
|
|
|
210 |
with gr.Column():
|
211 |
ito_loss_plot = gr.LinePlot(
|
212 |
x="step",
|
@@ -219,39 +240,28 @@ with gr.Blocks() as demo:
|
|
219 |
)
|
220 |
ito_log = gr.Textbox(label="ITO Log", lines=10)
|
221 |
|
222 |
-
|
223 |
-
|
224 |
-
ito_generator = perform_ito(
|
225 |
-
input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights
|
226 |
-
)
|
227 |
-
|
228 |
-
# Initialize variables to store the final results
|
229 |
-
final_audio = None
|
230 |
-
final_params = None
|
231 |
-
final_steps = 0
|
232 |
-
final_log = ""
|
233 |
-
loss_df = None
|
234 |
-
|
235 |
-
# Iterate through the generator to get the final results
|
236 |
-
for audio, params, steps, log, loss_data in ito_generator:
|
237 |
-
final_audio = audio
|
238 |
-
final_params = params
|
239 |
-
final_steps = steps
|
240 |
-
final_log = log
|
241 |
-
loss_df = loss_data
|
242 |
-
|
243 |
-
# Calculate y_min and y_max
|
244 |
-
y_min = loss_df['loss'].min()
|
245 |
-
y_max = loss_df['loss'].max()
|
246 |
-
|
247 |
-
# Return the plot configuration along with the data
|
248 |
-
return final_audio, final_params, final_log, loss_df
|
249 |
|
|
|
|
|
|
|
|
|
250 |
|
251 |
ito_button.click(
|
252 |
run_ito,
|
253 |
inputs=[input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights],
|
254 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
255 |
)
|
256 |
|
257 |
demo.launch()
|
|
|
139 |
|
140 |
yield (args.sample_rate, current_output), ito_param_output, step, ito_log, pd.DataFrame(loss_values)
|
141 |
|
142 |
+
def run_ito(input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights):
|
143 |
+
af_weights = [float(w.strip()) for w in af_weights.split(',')]
|
144 |
+
ito_generator = mastering_transfer.inference_time_optimization(
|
145 |
+
input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights
|
146 |
+
)
|
147 |
+
|
148 |
+
all_results = []
|
149 |
+
for result in ito_generator:
|
150 |
+
all_results.append(result)
|
151 |
+
|
152 |
+
min_loss_step = min(range(len(all_results)), key=lambda i: all_results[i]['loss'])
|
153 |
+
|
154 |
+
loss_df = pd.DataFrame([(r['step'], r['loss']) for r in all_results], columns=['step', 'loss'])
|
155 |
+
|
156 |
+
return all_results, min_loss_step, loss_df
|
157 |
|
158 |
+
def update_ito_output(all_results, selected_step):
|
159 |
+
selected_result = all_results[selected_step]
|
160 |
+
return (args.sample_rate, selected_result['audio']), selected_result['params'], selected_result['log']
|
161 |
|
162 |
+
|
163 |
+
""" APP display """
|
164 |
with gr.Blocks() as demo:
|
165 |
+
gr.Markdown("# ITO-Master: Inference Time Optimization for Mastering Style Transfer")
|
166 |
+
|
167 |
gr.Markdown("# Step 1: Mastering Style Transfer")
|
168 |
|
169 |
with gr.Tab("Upload Audio"):
|
|
|
227 |
with gr.Column():
|
228 |
ito_output_audio = gr.Audio(label="ITO Output Audio")
|
229 |
ito_param_output = gr.Textbox(label="ITO Predicted Parameters", lines=15)
|
230 |
+
ito_step_slider = gr.Slider(minimum=1, maximum=100, step=1, label="ITO Step", interactive=True)
|
231 |
with gr.Column():
|
232 |
ito_loss_plot = gr.LinePlot(
|
233 |
x="step",
|
|
|
240 |
)
|
241 |
ito_log = gr.Textbox(label="ITO Log", lines=10)
|
242 |
|
243 |
+
all_results = gr.State([])
|
244 |
+
min_loss_step = gr.State(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
245 |
|
246 |
+
def on_ito_complete(results, min_step, loss_df):
|
247 |
+
all_results.value = results
|
248 |
+
min_loss_step.value = min_step
|
249 |
+
return loss_df, gr.update(maximum=len(results), value=min_step+1)
|
250 |
|
251 |
ito_button.click(
|
252 |
run_ito,
|
253 |
inputs=[input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights],
|
254 |
+
outputs=[all_results, min_loss_step, ito_loss_plot, ito_step_slider]
|
255 |
+
).then(
|
256 |
+
update_ito_output,
|
257 |
+
inputs=[all_results, ito_step_slider],
|
258 |
+
outputs=[ito_output_audio, ito_param_output, ito_log]
|
259 |
+
)
|
260 |
+
|
261 |
+
ito_step_slider.change(
|
262 |
+
update_ito_output,
|
263 |
+
inputs=[all_results, ito_step_slider],
|
264 |
+
outputs=[ito_output_audio, ito_param_output, ito_log]
|
265 |
)
|
266 |
|
267 |
demo.launch()
|
inference.py
CHANGED
@@ -60,6 +60,68 @@ class MasteringStyleTransfer:
|
|
60 |
predicted_params = self.mastering_converter.get_last_predicted_params()
|
61 |
return output_audio, predicted_params
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
def inference_time_optimization(self, input_tensor, reference_tensor, ito_config, initial_reference_feature):
|
64 |
fit_embedding = torch.nn.Parameter(initial_reference_feature)
|
65 |
optimizer = getattr(torch.optim, ito_config['optimizer'])([fit_embedding], lr=ito_config['learning_rate'])
|
@@ -73,13 +135,7 @@ class MasteringStyleTransfer:
|
|
73 |
|
74 |
min_loss = float('inf')
|
75 |
min_loss_step = 0
|
76 |
-
|
77 |
-
min_loss_params = None
|
78 |
-
min_loss_embedding = None
|
79 |
-
|
80 |
-
loss_history = []
|
81 |
-
divergence_counter = 0
|
82 |
-
ito_log = []
|
83 |
|
84 |
for step in range(ito_config['num_steps']):
|
85 |
optimizer.zero_grad()
|
@@ -90,20 +146,9 @@ class MasteringStyleTransfer:
|
|
90 |
losses = af_loss(output_audio, reference_tensor)
|
91 |
total_loss = sum(losses.values())
|
92 |
|
93 |
-
loss_history.append(total_loss.item())
|
94 |
-
|
95 |
if total_loss < min_loss:
|
96 |
min_loss = total_loss.item()
|
97 |
min_loss_step = step
|
98 |
-
min_loss_output = output_audio.detach()
|
99 |
-
min_loss_params = current_params
|
100 |
-
min_loss_embedding = fit_embedding.detach().clone()
|
101 |
-
|
102 |
-
# Check for divergence
|
103 |
-
if len(loss_history) > 10 and total_loss > loss_history[-11]:
|
104 |
-
divergence_counter += 1
|
105 |
-
else:
|
106 |
-
divergence_counter = 0
|
107 |
|
108 |
# Log top 5 parameter differences
|
109 |
if step == 0:
|
@@ -111,16 +156,20 @@ class MasteringStyleTransfer:
|
|
111 |
top_5_diff = self.get_top_n_diff_string(initial_params, current_params, top_n=5)
|
112 |
log_entry = f"Step {step + 1}\n Loss: {total_loss.item():.4f}\n{top_5_diff}\n"
|
113 |
|
114 |
-
|
115 |
-
|
116 |
-
|
|
|
|
|
|
|
|
|
117 |
|
118 |
total_loss.backward()
|
119 |
optimizer.step()
|
120 |
|
121 |
-
yield
|
122 |
|
123 |
-
return
|
124 |
|
125 |
def preprocess_audio(self, audio, target_sample_rate=44100):
|
126 |
sample_rate, data = audio
|
|
|
60 |
predicted_params = self.mastering_converter.get_last_predicted_params()
|
61 |
return output_audio, predicted_params
|
62 |
|
63 |
+
# def inference_time_optimization(self, input_tensor, reference_tensor, ito_config, initial_reference_feature):
|
64 |
+
# fit_embedding = torch.nn.Parameter(initial_reference_feature)
|
65 |
+
# optimizer = getattr(torch.optim, ito_config['optimizer'])([fit_embedding], lr=ito_config['learning_rate'])
|
66 |
+
|
67 |
+
# af_loss = AudioFeatureLoss(
|
68 |
+
# weights=ito_config['af_weights'],
|
69 |
+
# sample_rate=ito_config['sample_rate'],
|
70 |
+
# stem_separation=False,
|
71 |
+
# use_clap=False
|
72 |
+
# )
|
73 |
+
|
74 |
+
# min_loss = float('inf')
|
75 |
+
# min_loss_step = 0
|
76 |
+
# min_loss_output = None
|
77 |
+
# min_loss_params = None
|
78 |
+
# min_loss_embedding = None
|
79 |
+
|
80 |
+
# loss_history = []
|
81 |
+
# divergence_counter = 0
|
82 |
+
# ito_log = []
|
83 |
+
|
84 |
+
# for step in range(ito_config['num_steps']):
|
85 |
+
# optimizer.zero_grad()
|
86 |
+
|
87 |
+
# output_audio = self.mastering_converter(input_tensor, fit_embedding)
|
88 |
+
# current_params = self.mastering_converter.get_last_predicted_params()
|
89 |
+
|
90 |
+
# losses = af_loss(output_audio, reference_tensor)
|
91 |
+
# total_loss = sum(losses.values())
|
92 |
+
|
93 |
+
# loss_history.append(total_loss.item())
|
94 |
+
|
95 |
+
# if total_loss < min_loss:
|
96 |
+
# min_loss = total_loss.item()
|
97 |
+
# min_loss_step = step
|
98 |
+
# min_loss_output = output_audio.detach()
|
99 |
+
# min_loss_params = current_params
|
100 |
+
# min_loss_embedding = fit_embedding.detach().clone()
|
101 |
+
|
102 |
+
# # Check for divergence
|
103 |
+
# if len(loss_history) > 10 and total_loss > loss_history[-11]:
|
104 |
+
# divergence_counter += 1
|
105 |
+
# else:
|
106 |
+
# divergence_counter = 0
|
107 |
+
|
108 |
+
# # Log top 5 parameter differences
|
109 |
+
# if step == 0:
|
110 |
+
# initial_params = current_params
|
111 |
+
# top_5_diff = self.get_top_n_diff_string(initial_params, current_params, top_n=5)
|
112 |
+
# log_entry = f"Step {step + 1}\n Loss: {total_loss.item():.4f}\n{top_5_diff}\n"
|
113 |
+
|
114 |
+
# if divergence_counter >= 10:
|
115 |
+
# print(f"Optimization stopped early due to divergence at step {step}")
|
116 |
+
# break
|
117 |
+
|
118 |
+
# total_loss.backward()
|
119 |
+
# optimizer.step()
|
120 |
+
|
121 |
+
# yield log_entry, output_audio.detach(), current_params, step + 1, total_loss.item()
|
122 |
+
|
123 |
+
# return min_loss_output, min_loss_params, min_loss_embedding, min_loss_step + 1
|
124 |
+
|
125 |
def inference_time_optimization(self, input_tensor, reference_tensor, ito_config, initial_reference_feature):
|
126 |
fit_embedding = torch.nn.Parameter(initial_reference_feature)
|
127 |
optimizer = getattr(torch.optim, ito_config['optimizer'])([fit_embedding], lr=ito_config['learning_rate'])
|
|
|
135 |
|
136 |
min_loss = float('inf')
|
137 |
min_loss_step = 0
|
138 |
+
all_results = []
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
|
140 |
for step in range(ito_config['num_steps']):
|
141 |
optimizer.zero_grad()
|
|
|
146 |
losses = af_loss(output_audio, reference_tensor)
|
147 |
total_loss = sum(losses.values())
|
148 |
|
|
|
|
|
149 |
if total_loss < min_loss:
|
150 |
min_loss = total_loss.item()
|
151 |
min_loss_step = step
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
|
153 |
# Log top 5 parameter differences
|
154 |
if step == 0:
|
|
|
156 |
top_5_diff = self.get_top_n_diff_string(initial_params, current_params, top_n=5)
|
157 |
log_entry = f"Step {step + 1}\n Loss: {total_loss.item():.4f}\n{top_5_diff}\n"
|
158 |
|
159 |
+
all_results.append({
|
160 |
+
'step': step + 1,
|
161 |
+
'loss': total_loss.item(),
|
162 |
+
'audio': output_audio.detach(),
|
163 |
+
'params': current_params,
|
164 |
+
'log': log_entry
|
165 |
+
})
|
166 |
|
167 |
total_loss.backward()
|
168 |
optimizer.step()
|
169 |
|
170 |
+
yield all_results[-1]
|
171 |
|
172 |
+
return all_results, min_loss_step
|
173 |
|
174 |
def preprocess_audio(self, audio, target_sample_rate=44100):
|
175 |
sample_rate, data = audio
|