jhtonyKoo commited on
Commit
6bbce1b
·
1 Parent(s): 71be77a

modify app

Browse files
app.py CHANGED
@@ -64,8 +64,8 @@ def process_audio_with_youtube(input_audio, input_youtube_url, reference_audio,
64
  return process_audio(input_audio, reference_audio)
65
 
66
  def process_audio(input_audio, reference_audio):
67
- output_audio, predicted_params, sr = mastering_transfer.process_audio(
68
- input_audio, reference_audio, reference_audio
69
  )
70
 
71
  param_output = mastering_transfer.get_param_output_string(predicted_params)
@@ -88,7 +88,7 @@ def process_audio(input_audio, reference_audio):
88
  # Denormalize the audio to int16
89
  output_audio = denormalize_audio(output_audio, dtype=np.int16)
90
 
91
- return (sr, output_audio), param_output
92
 
93
  def perform_ito(input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights):
94
  if ito_reference_audio is None:
@@ -182,13 +182,15 @@ with gr.Blocks() as demo:
182
  process_button = gr.Button("Process Mastering Style Transfer")
183
 
184
  with gr.Row():
185
- output_audio = gr.Audio(label="Output Audio", type='numpy')
 
 
186
  param_output = gr.Textbox(label="Predicted Parameters", lines=5)
187
 
188
  process_button.click(
189
  process_audio,
190
  inputs=[input_audio, reference_audio],
191
- outputs=[output_audio, param_output]
192
  )
193
 
194
  with gr.Tab("YouTube Audio"):
@@ -252,7 +254,7 @@ with gr.Blocks() as demo:
252
 
253
  ito_button.click(
254
  perform_ito,
255
- inputs=[input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights],
256
  outputs=[ito_output_audio, ito_param_output, ito_step_slider, ito_log, ito_loss_plot, all_results]
257
  ).then(
258
  update_ito_output,
 
64
  return process_audio(input_audio, reference_audio)
65
 
66
  def process_audio(input_audio, reference_audio):
67
+ output_audio, predicted_params, sr, normalized_input = mastering_transfer.process_audio(
68
+ input_audio, reference_audio
69
  )
70
 
71
  param_output = mastering_transfer.get_param_output_string(predicted_params)
 
88
  # Denormalize the audio to int16
89
  output_audio = denormalize_audio(output_audio, dtype=np.int16)
90
 
91
+ return (sr, output_audio), param_output, (sr, normalized_input)
92
 
93
  def perform_ito(input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights):
94
  if ito_reference_audio is None:
 
182
  process_button = gr.Button("Process Mastering Style Transfer")
183
 
184
  with gr.Row():
185
+ with gr.Column():
186
+ output_audio = gr.Audio(label="Output Audio", type='numpy')
187
+ normalized_input = gr.Audio(label="Normalized Input Audio", type='numpy')
188
  param_output = gr.Textbox(label="Predicted Parameters", lines=5)
189
 
190
  process_button.click(
191
  process_audio,
192
  inputs=[input_audio, reference_audio],
193
+ outputs=[output_audio, param_output, normalized_input]
194
  )
195
 
196
  with gr.Tab("YouTube Audio"):
 
254
 
255
  ito_button.click(
256
  perform_ito,
257
+ inputs=[normalized_input, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights],
258
  outputs=[ito_output_audio, ito_param_output, ito_step_slider, ito_log, ito_loss_plot, all_results]
259
  ).then(
260
  update_ito_output,
inference.py CHANGED
@@ -30,6 +30,11 @@ class MasteringStyleTransfer:
30
  self.effects_encoder = self.load_effects_encoder()
31
  self.mastering_converter = self.load_mastering_converter()
32
 
 
 
 
 
 
33
  def load_effects_encoder(self):
34
  effects_encoder = Effects_Encoder(self.args.cfg_enc)
35
  reload_weights(effects_encoder, self.args.encoder_path, self.device)
@@ -60,68 +65,6 @@ class MasteringStyleTransfer:
60
  predicted_params = self.mastering_converter.get_last_predicted_params()
61
  return output_audio, predicted_params
62
 
63
- # def inference_time_optimization(self, input_tensor, reference_tensor, ito_config, initial_reference_feature):
64
- # fit_embedding = torch.nn.Parameter(initial_reference_feature)
65
- # optimizer = getattr(torch.optim, ito_config['optimizer'])([fit_embedding], lr=ito_config['learning_rate'])
66
-
67
- # af_loss = AudioFeatureLoss(
68
- # weights=ito_config['af_weights'],
69
- # sample_rate=ito_config['sample_rate'],
70
- # stem_separation=False,
71
- # use_clap=False
72
- # )
73
-
74
- # min_loss = float('inf')
75
- # min_loss_step = 0
76
- # min_loss_output = None
77
- # min_loss_params = None
78
- # min_loss_embedding = None
79
-
80
- # loss_history = []
81
- # divergence_counter = 0
82
- # ito_log = []
83
-
84
- # for step in range(ito_config['num_steps']):
85
- # optimizer.zero_grad()
86
-
87
- # output_audio = self.mastering_converter(input_tensor, fit_embedding)
88
- # current_params = self.mastering_converter.get_last_predicted_params()
89
-
90
- # losses = af_loss(output_audio, reference_tensor)
91
- # total_loss = sum(losses.values())
92
-
93
- # loss_history.append(total_loss.item())
94
-
95
- # if total_loss < min_loss:
96
- # min_loss = total_loss.item()
97
- # min_loss_step = step
98
- # min_loss_output = output_audio.detach()
99
- # min_loss_params = current_params
100
- # min_loss_embedding = fit_embedding.detach().clone()
101
-
102
- # # Check for divergence
103
- # if len(loss_history) > 10 and total_loss > loss_history[-11]:
104
- # divergence_counter += 1
105
- # else:
106
- # divergence_counter = 0
107
-
108
- # # Log top 5 parameter differences
109
- # if step == 0:
110
- # initial_params = current_params
111
- # top_5_diff = self.get_top_n_diff_string(initial_params, current_params, top_n=5)
112
- # log_entry = f"Step {step + 1}\n Loss: {total_loss.item():.4f}\n{top_5_diff}\n"
113
-
114
- # if divergence_counter >= 10:
115
- # print(f"Optimization stopped early due to divergence at step {step}")
116
- # break
117
-
118
- # total_loss.backward()
119
- # optimizer.step()
120
-
121
- # yield log_entry, output_audio.detach(), current_params, step + 1, total_loss.item()
122
-
123
- # return min_loss_output, min_loss_params, min_loss_embedding, min_loss_step + 1
124
-
125
  def inference_time_optimization(self, input_tensor, reference_tensor, ito_config, initial_reference_feature):
126
  fit_embedding = torch.nn.Parameter(initial_reference_feature)
127
  optimizer = getattr(torch.optim, ito_config['optimizer'])([fit_embedding], lr=ito_config['learning_rate'])
@@ -167,11 +110,9 @@ class MasteringStyleTransfer:
167
  total_loss.backward()
168
  optimizer.step()
169
 
170
- # yield all_results[-1]
171
-
172
  return all_results, min_loss_step
173
 
174
- def preprocess_audio(self, audio, target_sample_rate=44100):
175
  sample_rate, data = audio
176
 
177
  # Normalize audio to -1 to 1 range
@@ -195,62 +136,119 @@ class MasteringStyleTransfer:
195
  else:
196
  raise ValueError(f"Unsupported audio shape: {data.shape}")
197
 
198
- # Convert to torch tensor
199
- data_tensor = torch.FloatTensor(data).unsqueeze(0)
200
-
201
  # Resample if necessary
202
  if sample_rate != target_sample_rate:
203
- data_tensor = julius.resample_frac(data_tensor, sample_rate, target_sample_rate)
 
 
 
 
 
 
 
204
 
205
  return data_tensor.to(self.device)
206
 
207
- def process_audio(self, input_audio, reference_audio, ito_reference_audio):
208
- input_tensor = self.preprocess_audio(input_audio, self.args.sample_rate)
209
  reference_tensor = self.preprocess_audio(reference_audio, self.args.sample_rate)
210
- ito_reference_tensor = self.preprocess_audio(ito_reference_audio, self.args.sample_rate)
211
 
212
  reference_feature = self.get_reference_embedding(reference_tensor)
213
 
214
  output_audio, predicted_params = self.mastering_style_transfer(input_tensor, reference_feature)
215
 
216
- return output_audio, predicted_params, self.args.sample_rate
217
-
218
- def print_predicted_params(self, predicted_params):
219
- if predicted_params is None:
220
- print("No predicted parameters available.")
221
- return
222
-
223
- print("Predicted Parameters:")
224
- for fx_name, fx_params in predicted_params.items():
225
- print(f"\n{fx_name.upper()}:")
226
- if isinstance(fx_params, dict):
227
- for param_name, param_value in fx_params.items():
228
- if isinstance(param_value, torch.Tensor):
229
- param_value = param_value.detach().cpu().numpy()
230
- print(f" {param_name}: {param_value}")
231
- elif isinstance(fx_params, torch.Tensor):
232
- param_value = fx_params.detach().cpu().numpy()
233
- print(f" {param_value}")
234
- else:
235
- print(f" {fx_params}")
236
 
237
  def get_param_output_string(self, params):
238
  if params is None:
239
  return "No parameters available"
240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  output = []
242
  for fx_name, fx_params in params.items():
243
- output.append(f"{fx_name.upper()}:")
244
  if isinstance(fx_params, dict):
245
  for param_name, param_value in fx_params.items():
246
  if isinstance(param_value, torch.Tensor):
247
  param_value = param_value.item()
248
- output.append(f" {param_name}: {param_value:.2f}")
249
- elif isinstance(fx_params, torch.Tensor):
250
- output.append(f" {fx_params.item():.2f}")
 
 
 
 
 
 
 
 
251
  else:
252
- output.append(f" {fx_params:.2f}")
253
-
 
 
 
 
254
  return "\n".join(output)
255
 
256
  def get_top_n_diff_string(self, initial_params, ito_params, top_n=5):
 
30
  self.effects_encoder = self.load_effects_encoder()
31
  self.mastering_converter = self.load_mastering_converter()
32
 
33
+ self.fx_normalizer = Audio_Effects_Normalizer(precomputed_feature_path=args.fx_norm_feature_path, \
34
+ STEMS=['mixture'], \
35
+ EFFECTS=['eq', 'imager', 'loudness'], \
36
+ audio_extension=args.audio_extension)
37
+
38
  def load_effects_encoder(self):
39
  effects_encoder = Effects_Encoder(self.args.cfg_enc)
40
  reload_weights(effects_encoder, self.args.encoder_path, self.device)
 
65
  predicted_params = self.mastering_converter.get_last_predicted_params()
66
  return output_audio, predicted_params
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  def inference_time_optimization(self, input_tensor, reference_tensor, ito_config, initial_reference_feature):
69
  fit_embedding = torch.nn.Parameter(initial_reference_feature)
70
  optimizer = getattr(torch.optim, ito_config['optimizer'])([fit_embedding], lr=ito_config['learning_rate'])
 
110
  total_loss.backward()
111
  optimizer.step()
112
 
 
 
113
  return all_results, min_loss_step
114
 
115
+ def preprocess_audio(self, audio, target_sample_rate=44100, is_input=False):
116
  sample_rate, data = audio
117
 
118
  # Normalize audio to -1 to 1 range
 
136
  else:
137
  raise ValueError(f"Unsupported audio shape: {data.shape}")
138
 
 
 
 
139
  # Resample if necessary
140
  if sample_rate != target_sample_rate:
141
+ data = julius.resample_frac(torch.from_numpy(data), sample_rate, target_sample_rate).numpy()
142
+
143
+ # Apply fx normalization for input audio during mastering style transfer
144
+ if is_input:
145
+ data = self.fx_normalizer.normalize_audio(data, 'mixture')
146
+
147
+ # Convert to torch tensor
148
+ data_tensor = torch.FloatTensor(data).unsqueeze(0)
149
 
150
  return data_tensor.to(self.device)
151
 
152
def process_audio(self, input_audio, reference_audio):
    """
    Run mastering style transfer on `input_audio` using `reference_audio` as the style target.

    Args:
        input_audio: Audio to be processed; forwarded to `self.preprocess_audio`
            with `is_input=True`.
        reference_audio: Audio providing the target style; forwarded to
            `self.preprocess_audio` with the default `is_input`.

    Returns:
        tuple: `(output_audio, predicted_params, sample_rate, input_tensor)` where
        `output_audio` and `predicted_params` come from `self.mastering_style_transfer`,
        `sample_rate` is `self.args.sample_rate`, and `input_tensor` is the
        preprocessed input that was fed to the converter.
    """
    target_rate = self.args.sample_rate

    # Input is preprocessed first, then the reference (same order as the calls below).
    normalized_input = self.preprocess_audio(input_audio, target_rate, is_input=True)
    reference = self.preprocess_audio(reference_audio, target_rate)

    style_embedding = self.get_reference_embedding(reference)
    mastered, params = self.mastering_style_transfer(normalized_input, style_embedding)

    return mastered, params, self.args.sample_rate, normalized_input
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
  def get_param_output_string(self, params):
163
  if params is None:
164
  return "No parameters available"
165
 
166
+ param_mapper = {
167
+ 'EQ': {
168
+ 'low_shelf_gain_db': ('Low Shelf Gain', 'dB', -20, 20),
169
+ 'low_shelf_cutoff_freq': ('Low Shelf Cutoff', 'Hz', 20, 2000),
170
+ 'low_shelf_q_factor': ('Low Shelf Q', '', 0.1, 5.0),
171
+ 'band0_gain_db': ('Low-Mid Band Gain', 'dB', -20, 20),
172
+ 'band0_cutoff_freq': ('Low-Mid Band Frequency', 'Hz', 80, 2000),
173
+ 'band0_q_factor': ('Low-Mid Band Q', '', 0.1, 5.0),
174
+ 'band1_gain_db': ('Mid Band Gain', 'dB', -20, 20),
175
+ 'band1_cutoff_freq': ('Mid Band Frequency', 'Hz', 2000, 8000),
176
+ 'band1_q_factor': ('Mid Band Q', '', 0.1, 5.0),
177
+ 'band2_gain_db': ('High-Mid Band Gain', 'dB', -20, 20),
178
+ 'band2_cutoff_freq': ('High-Mid Band Frequency', 'Hz', 8000, 12000),
179
+ 'band2_q_factor': ('High-Mid Band Q', '', 0.1, 5.0),
180
+ 'band3_gain_db': ('High Band Gain', 'dB', -20, 20),
181
+ 'band3_cutoff_freq': ('High Band Frequency', 'Hz', 12000, 20000), # Assuming sample_rate is 44100
182
+ 'band3_q_factor': ('High Band Q', '', 0.1, 5.0),
183
+ 'high_shelf_gain_db': ('High Shelf Gain', 'dB', -20, 20),
184
+ 'high_shelf_cutoff_freq': ('High Shelf Cutoff', 'Hz', 4000, 20000), # Assuming sample_rate is 44100
185
+ 'high_shelf_q_factor': ('High Shelf Q', '', 0.1, 5.0),
186
+ },
187
+ 'DISTORTION': {
188
+ 'drive_db': ('Drive', 'dB', 0, 8),
189
+ 'parallel_weight_factor': ('Dry/Wet Mix', '%', 0, 100),
190
+ },
191
+ 'MULTIBAND_COMP': {
192
+ 'low_cutoff': ('Low/Mid Crossover', 'Hz', 20, 1000),
193
+ 'high_cutoff': ('Mid/High Crossover', 'Hz', 1000, 20000),
194
+ 'parallel_weight_factor': ('Dry/Wet Mix', '%', 0, 100),
195
+ 'low_shelf_comp_thresh': ('Low Band Comp Threshold', 'dB', -60, 0),
196
+ 'low_shelf_comp_ratio': ('Low Band Comp Ratio', ':1', 1, 20),
197
+ 'low_shelf_exp_thresh': ('Low Band Exp Threshold', 'dB', -60, 0),
198
+ 'low_shelf_exp_ratio': ('Low Band Exp Ratio', ':1', 1, 20),
199
+ 'low_shelf_at': ('Low Band Attack Time', 'ms', 5, 100),
200
+ 'low_shelf_rt': ('Low Band Release Time', 'ms', 5, 100),
201
+ 'mid_band_comp_thresh': ('Mid Band Comp Threshold', 'dB', -60, 0),
202
+ 'mid_band_comp_ratio': ('Mid Band Comp Ratio', ':1', 1, 20),
203
+ 'mid_band_exp_thresh': ('Mid Band Exp Threshold', 'dB', -60, 0),
204
+ 'mid_band_exp_ratio': ('Mid Band Exp Ratio', ':1', 1, 20),
205
+ 'mid_band_at': ('Mid Band Attack Time', 'ms', 5, 100),
206
+ 'mid_band_rt': ('Mid Band Release Time', 'ms', 5, 100),
207
+ 'high_shelf_comp_thresh': ('High Band Comp Threshold', 'dB', -60, 0),
208
+ 'high_shelf_comp_ratio': ('High Band Comp Ratio', ':1', 1, 20),
209
+ 'high_shelf_exp_thresh': ('High Band Exp Threshold', 'dB', -60, 0),
210
+ 'high_shelf_exp_ratio': ('High Band Exp Ratio', ':1', 1, 20),
211
+ 'high_shelf_at': ('High Band Attack Time', 'ms', 5, 100),
212
+ 'high_shelf_rt': ('High Band Release Time', 'ms', 5, 100),
213
+ },
214
+ 'GAIN': {
215
+ 'gain_db': ('Output Gain', 'dB', -24, 24),
216
+ },
217
+ 'IMAGER': {
218
+ 'width': ('Stereo Width', '', 0, 1),
219
+ },
220
+ 'LIMITER': {
221
+ 'threshold': ('Threshold', 'dB', -60, 0),
222
+ 'at': ('Attack Time', 'ms', 5, 100),
223
+ 'rt': ('Release Time', 'ms', 5, 100),
224
+ },
225
+ }
226
+
227
  output = []
228
  for fx_name, fx_params in params.items():
229
+ output.append(f"{fx_name}:")
230
  if isinstance(fx_params, dict):
231
  for param_name, param_value in fx_params.items():
232
  if isinstance(param_value, torch.Tensor):
233
  param_value = param_value.item()
234
+
235
+ if fx_name in param_mapper and param_name in param_mapper[fx_name]:
236
+ friendly_name, unit, min_val, max_val = param_mapper[fx_name][param_name]
237
+ if fx_name == 'IMAGER' and param_name == 'width':
238
+ # Convert width to a more intuitive scale
239
+ width_percentage = param_value * 200
240
+ output.append(f" {friendly_name}: {width_percentage:.2f}% (Range: 0-200%)")
241
+ else:
242
+ output.append(f" {friendly_name}: {param_value:.2f} {unit} (Range: {min_val}-{max_val})")
243
+ else:
244
+ output.append(f" {param_name}: {param_value:.2f}")
245
  else:
246
+ if fx_name == 'IMAGER':
247
+ width_percentage = fx_params.item() * 200
248
+ output.append(f" Stereo Width: {width_percentage:.2f}% (Range: 0-200%)")
249
+ else:
250
+ output.append(f" {fx_params.item():.2f}")
251
+
252
  return "\n".join(output)
253
 
254
  def get_top_n_diff_string(self, initial_params, ito_params, top_n=5):
modules/common_audioeffects.py ADDED
@@ -0,0 +1,1537 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio effects for data augmentation.
3
+
4
+ Several audio effects can be combined into an augmentation chain.
5
+
6
+ Important note: We assume that the parallelization during training is done using
7
+ multi-processing and not multi-threading. Hence, we do not need the
8
+ `@sox.sox_context()` decorators as discussed in this
9
+ [thread](https://github.com/pseeth/soxbindings/issues/4).
10
+
11
+ AI Music Technology Group, Sony Group Corporation
12
+ AI Speech and Sound Group, Sony Europe
13
+
14
+
15
+ This implementation originally belongs to Sony Group Corporation,
16
+ which has been introduced in the work "Automatic music mixing with deep learning and out-of-domain data".
17
+ Original repo link: https://github.com/sony/FxNorm-automix
18
+ This work modifies a few implementations from the original repo to suit the task.
19
+ """
20
+
21
+ from itertools import permutations
22
+ import logging
23
+ import numpy as np
24
+ import pymixconsole as pymc
25
+ from pymixconsole.parameter import Parameter
26
+ from pymixconsole.parameter_list import ParameterList
27
+ from pymixconsole.processor import Processor
28
+ from random import shuffle
29
+ from scipy.signal import oaconvolve
30
+ import soxbindings as sox
31
+ from typing import List, Optional, Tuple, Union
32
+ from numba import jit
33
+
34
+ # prevent pysox from logging warnings regarding non-optimal timestretch factors
35
+ logging.getLogger('sox').setLevel(logging.ERROR)
36
+
37
+
38
+ # Monkey-Patch `Processor` for convenience
39
+ # (a) Allow `None` as blocksize if processor can work on variable-length audio
40
def new_init(self, name, parameters, block_size, sample_rate, dtype='float32'):
    """
    Store the processor configuration on `self`.

    Args:
        self: Object being initialised.
        name (str): Name of processor.
        parameters (parameter_list): Parameters for this processor.
        block_size (int): Size of blocks for blockwise processing.
            May be `None` when the full audio can be processed at once.
        sample_rate (int): Sample rate of input audio. Use `None` if the effect
            does not depend on it.
        dtype (str): Data type of samples.
    """
    # Assigned in the same order as the signature; no validation is performed.
    settings = (
        ('name', name),
        ('parameters', parameters),
        ('block_size', block_size),
        ('sample_rate', sample_rate),
        ('dtype', dtype),
    )
    for attribute, value in settings:
        setattr(self, attribute, value)
58
+
59
+
60
+ # (b) make code simpler
61
def new_update(self, parameter_name):
    """
    No-op hook called after a parameter value has changed.

    Args:
        self: Reference to object.
        parameter_name (str): Name of the parameter whose value has changed.
    """
    # Intentionally does nothing.
    return None
70
+
71
+
72
+ # (c) representation for nice print
73
def new_repr(self):
    """
    Create human-readable representation.

    Args:
        self: Reference to object; its `name` and `parameters` attributes are shown.

    Returns:
        str: `Processor(name=..., parameters=...)` with balanced parentheses.
    """
    # The closing parenthesis was previously missing from the returned string.
    return f'Processor(name={self.name!r}, parameters={self.parameters!r})'
84
+
85
+
86
# Install the replacement methods defined above on the `Processor` class itself.
setattr(Processor, '__init__', new_init)
setattr(Processor, '__repr__', new_repr)
setattr(Processor, 'update', new_update)
89
+
90
+
91
class AugmentationChain:
    """Basic audio Fx chain which is used for data augmentation."""

    def __init__(self,
                 fxs: Optional[List[Tuple[Union['Processor', 'AugmentationChain'], float, bool]]] = None,
                 shuffle: Optional[bool] = False,
                 parallel: Optional[bool] = False,
                 parallel_weight_factor=None,
                 randomize_param_value=True):
        """
        Create augmentation chain from the list `fxs`.

        Args:
            fxs (list of tuples): First tuple element is an instance of `pymc.processor` or `AugmentationChain` that
                we want to use for data augmentation. Second element gives probability that effect should be applied.
                Third element defines, whether the processed signal is normalized by the RMS of the input.
                `None` (the default) creates an empty chain.
            shuffle (bool): If `True` then order of Fx are changed whenever chain is applied.
            parallel (bool): If `True`, the unprocessed input is mixed back into the processed output.
            parallel_weight_factor (float or None): Weight of the unprocessed input in the parallel mix.
                `None` draws a random weight in [0, 0.5) on every call.
            randomize_param_value (bool): If `True`, `randomize()` is called on each processor before it is
                applied; otherwise `update(None)` is called.
        """
        # A fresh list per instance: `__call__` shuffles `self.fxs` in place, so a shared
        # default list would leak its ordering (and contents) between chains.
        self.fxs = [] if fxs is None else fxs
        self.shuffle = shuffle
        self.parallel = parallel
        self.parallel_weight_factor = parallel_weight_factor
        self.randomize_param_value = randomize_param_value

    def apply_processor(self, x, processor: 'Processor', rms_normalize):
        """
        Pass audio in `x` through `processor` and output the respective processed audio.

        Args:
            x (Numpy array): Input audio of shape `n_samples` x `n_channels`.
            processor (Processor): Audio effect that we want to apply.
            rms_normalize (bool): If `True`, the processed signal is normalized by the RMS of the signal.

        Returns:
            Numpy array: Processed audio of shape `n_samples` x `n_channels` (same size as `x`)
        """
        n_samples_input = x.shape[0]

        if processor.block_size is None:
            y = processor.process(x)
        else:
            # make sure that n_samples is a multiple of `processor.block_size`
            remainder = x.shape[0] % processor.block_size
            if remainder != 0:
                n_pad = processor.block_size - remainder
                # NumPy's mirror-padding mode is called 'reflect'; 'reflective' is rejected.
                x = np.pad(x, ((0, n_pad), (0, 0)), mode='reflect')

            y = np.zeros_like(x)
            for idx in range(0, x.shape[0], processor.block_size):
                y[idx:idx+processor.block_size, :] = processor.process(x[idx:idx+processor.block_size, :])

        if rms_normalize:
            # normalize output energy such that it is the same as the input energy
            scale = np.sqrt(np.mean(np.square(x)) / np.maximum(1e-7, np.mean(np.square(y))))
            y *= scale

        # return audio of same length as x
        return y[:n_samples_input, :]

    def apply_same_processor(self, x_list, processor: 'Processor', rms_normalize):
        """
        Apply `processor` to every track in `x_list`, replacing the entries in place.

        Args:
            x_list (list of Numpy array): Tracks of shape `n_samples` x `n_channels`.
            processor (Processor): Audio effect that we want to apply.
            rms_normalize (bool): Forwarded to `apply_processor`.

        Returns:
            list of Numpy array: The same list object, now holding the processed tracks.
        """
        for i, x in enumerate(x_list):
            x_list[i] = self.apply_processor(x, processor, rms_normalize)

        return x_list

    def __call__(self, x_list):
        """
        Apply the same augmentation chain to audio tracks in list `x_list`.

        Args:
            x_list (list of Numpy array) : List of audio samples of shape `n_samples` x `n_channels`.

        Returns:
            y_list (list of Numpy array) : List of processed audio of same shape as `x_list` where the same effects have been applied.
        """
        # randomly shuffle effect order if `self.shuffle` is True
        if self.shuffle:
            shuffle(self.fxs)

        # apply effects with probabilities given in `self.fxs`
        y_list = x_list.copy()
        for fx, p, rms_normalize in self.fxs:
            if np.random.rand() < p:
                if isinstance(fx, Processor):
                    # randomize all effect parameters (also calls `update()` for each processor)
                    if self.randomize_param_value:
                        fx.randomize()
                    else:
                        fx.update(None)

                    # apply processor
                    y_list = self.apply_same_processor(y_list, fx, rms_normalize)
                else:
                    y_list = fx(y_list)

        if self.parallel:
            # weighting factor of input signal in the range of (0.0 ~ 0.5)
            # Compare against None so that an explicit weight of 0.0 is honoured
            # instead of being replaced by a random draw.
            if self.parallel_weight_factor is not None:
                weight_in = self.parallel_weight_factor
            else:
                weight_in = np.random.rand() / 2.
            for i, y in enumerate(y_list):
                y_list[i] = weight_in*x_list[i] + (1-weight_in)*y

        return y_list

    def __repr__(self):
        """
        Human-readable representation.

        Returns:
            string representation of object.
        """
        return f'AugmentationChain(fxs={self.fxs!r}, shuffle={self.shuffle!r})'
202
+
203
+
204
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% DISTORTION %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
205
def hard_clip(x, threshold_dB, drive):
    """
    Hard clip distortion: amplify by `drive` dB, then clip symmetrically at `threshold_dB`.

    Args:
        x: input audio
        threshold_dB: clipping threshold, converted from dB to a linear limit
        drive: gain applied before clipping, converted from dB to a linear factor

    Returns:
        (Numpy array): distorted audio
    """
    gain = np.power(10., drive / 20.).astype(np.float32)
    limit = 10. ** (threshold_dB / 20.)
    driven = x * gain
    return np.clip(driven, -limit, limit)
220
+
221
+
222
def overdrive(x, drive, colour, sample_rate):
    """
    Overdrive distortion implemented with the sox `overdrive` effect.

    Args:
        x: input audio
        drive: Controls the amount of distortion (dB).
        colour: Controls the amount of even harmonic content in the output(dB)
        sample_rate: sampling rate

    Returns:
        (Numpy array): distorted audio as float32
    """
    headroom = 0.9
    peak = np.max(np.abs(x))

    # Scale loud input down to the headroom level before the effect runs.
    rescaled = peak > headroom
    if rescaled:
        x = x * (headroom / peak)

    transformer = sox.Transformer()
    transformer.overdrive(gain_db=drive, colour=colour)
    y = transformer.build_array(input_array=x, sample_rate_in=sample_rate).astype(np.float32)

    if rescaled:
        y *= peak / headroom  # rescale output to original scale
    return y
249
+
250
+
251
def hyperbolic_tangent(x, drive):
    """
    Hyperbolic Tanh distortion: `tanh(2 * x * gain)` with `gain` derived from `drive` in dB.

    Args:
        x: input audio
        drive: drive, converted from dB to a linear factor

    Returns:
        (Numpy array): distorted audio
    """
    gain = np.power(10., drive / 20.).astype(np.float32)
    doubled = 2. * x
    return np.tanh(doubled * gain)
264
+
265
+
266
def soft_sine(x, drive):
    """
    Soft sine distortion: drive the signal, clip it to [-pi/4, pi/4] and apply `sin(2 * y)`.

    Args:
        x: input audio
        drive: drive, converted from dB to a linear factor

    Returns:
        (Numpy array): distorted audio
    """
    gain = np.power(10., drive / 20.).astype(np.float32)
    quarter_pi = np.pi / 4.0
    bounded = np.clip(x * gain, -quarter_pi, quarter_pi)
    return np.sin(2. * bounded)
280
+
281
+
282
def bit_crusher(x, bits):
    """
    Bit crusher distortion: round the signal onto a grid with step `1 / 2**bits`.

    Args:
        x: input audio
        bits: bits; the grid has `2 ** bits` steps per unit amplitude

    Returns:
        (Numpy array): distorted audio
    """
    steps = 2 ** bits
    return np.rint(x * steps) / steps
294
+
295
+
296
class Distortion(Processor):
    """
    Distortion processor.

    Processor parameters:
        mode (str): One of the five supported modes: hard_clip, overdrive, soft_sine, tanh, bit_crusher.
            Each mode reads a different subset of the parameters below.
        threshold (float): clipping threshold in dB (-20 to 0), used by hard_clip
        drive (float): drive in dB (0 to 20), used by hard_clip, overdrive, soft_sine and tanh
        colour (float): colour (0 to 100), used by overdrive
        bits (int): bits (8 to 12), used by bit_crusher
    """

    def __init__(self, sample_rate, name='Distortion', parameters=None):
        """
        Initialize processor.

        Args:
            sample_rate (int): sample rate.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor. When omitted
                (or falsy), the default parameter set is created.
        """
        # Forward `parameters` so that a caller-supplied list is actually stored;
        # previously `None` was passed and the supplied list was dropped.
        super().__init__(name, parameters, block_size=None, sample_rate=sample_rate)
        if not parameters:
            self.parameters = ParameterList()
            self.parameters.add(Parameter('mode', 'hard_clip', 'string',
                                          options=['hard_clip',
                                                   'overdrive',
                                                   'soft_sine',
                                                   'tanh',
                                                   'bit_crusher']))
            self.parameters.add(Parameter('threshold', 0.0, 'float',
                                          units='dB', maximum=0.0, minimum=-20.0))
            self.parameters.add(Parameter('drive', 0.0, 'float',
                                          units='dB', maximum=20.0, minimum=0.0))
            self.parameters.add(Parameter('colour', 20.0, 'float',
                                          maximum=100.0, minimum=0.0))
            self.parameters.add(Parameter('bits', 12, 'int',
                                          maximum=12, minimum=8))

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): distorted audio of size `n_samples x n_channels`.

        Raises:
            ValueError: If the `mode` parameter is not one of the supported modes.
        """
        mode = self.parameters.mode.value
        if mode == 'hard_clip':
            y = hard_clip(x, self.parameters.threshold.value, self.parameters.drive.value)
        elif mode == 'overdrive':
            y = overdrive(x, self.parameters.drive.value,
                          self.parameters.colour.value, self.sample_rate)
        elif mode == 'soft_sine':
            y = soft_sine(x, self.parameters.drive.value)
        elif mode == 'tanh':
            y = hyperbolic_tangent(x, self.parameters.drive.value)
        elif mode == 'bit_crusher':
            y = bit_crusher(x, self.parameters.bits.value)
        else:
            # Fail with a clear message instead of an unbound-variable error below.
            raise ValueError(f'Unsupported distortion mode: {mode!r}')

        # If the output has low amplitude (some distortion settings can "crush" down the amplitude),
        # then it is normalised to the input's amplitude
        x_max = np.max(np.abs(x)) + 1e-8
        o_max = np.max(np.abs(y)) + 1e-8
        if x_max > o_max:
            y = y*(x_max/o_max)

        return y
367
+
368
+
369
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% EQUALISER %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
370
class Equaliser(Processor):
    """
    Five band parametric equaliser (two shelves and three central bands).

    All gains are set in dB values and range from `MIN_GAIN` dB to `MAX_GAIN` dB.
    This processor is implemented as a cascade of biquad IIR filters, one per entry
    in `bands`, that are implemented using the cookbook formulae from RBJ.

    Processor parameters:
        low_shelf_gain (float), low_shelf_freq (float)
        first_band_gain (float), first_band_freq (float), first_band_q (float)
        second_band_gain (float), second_band_freq (float), second_band_q (float)
        third_band_gain (float), third_band_freq (float), third_band_q (float)
        high_shelf_gain (float), high_shelf_freq (float)

    original from https://github.com/csteinmetz1/pymixconsole/blob/master/pymixconsole/processors/equaliser.py
    """

    def __init__(self, n_channels,
                 sample_rate,
                 gain_range=(-15.0, 15.0),
                 q_range=(0.1, 2.0),
                 bands=None,
                 hard_clip=False,
                 name='Equaliser', parameters=None):
        """
        Initialize processor.

        Args:
            n_channels (int): Number of audio channels.
            sample_rate (int): Sample rate of audio.
            gain_range (tuple of floats): minimum and maximum gain that can be used.
            q_range (tuple of floats): minimum and maximum q value.
            bands (list of str): Bands for which a filter is created. `None` selects all five
                bands (`low_shelf`, `first_band`, `second_band`, `third_band`, `high_shelf`).
            hard_clip (bool): Whether we clip to [-1, 1.] after processing.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name, parameters=parameters, block_size=None, sample_rate=sample_rate)

        self.n_channels = n_channels

        MIN_GAIN, MAX_GAIN = gain_range
        MIN_Q, MAX_Q = q_range

        if bands is None:
            # build a fresh list per instance so that instances never share (and mutate) one default
            bands = ['low_shelf', 'first_band', 'second_band', 'third_band', 'high_shelf']

        if not parameters:
            self.parameters = ParameterList()
            # low shelf parameters -------
            self.parameters.add(Parameter('low_shelf_gain', 0.0, 'float', minimum=MIN_GAIN, maximum=MAX_GAIN))
            self.parameters.add(Parameter('low_shelf_freq', 80.0, 'float', minimum=30.0, maximum=200.0))
            # first band parameters ------
            self.parameters.add(Parameter('first_band_gain', 0.0, 'float', minimum=MIN_GAIN, maximum=MAX_GAIN))
            self.parameters.add(Parameter('first_band_freq', 400.0, 'float', minimum=200.0, maximum=1000.0))
            self.parameters.add(Parameter('first_band_q', 0.7, 'float', minimum=MIN_Q, maximum=MAX_Q))
            # second band parameters -----
            self.parameters.add(Parameter('second_band_gain', 0.0, 'float', minimum=MIN_GAIN, maximum=MAX_GAIN))
            self.parameters.add(Parameter('second_band_freq', 2000.0, 'float', minimum=1000.0, maximum=3000.0))
            self.parameters.add(Parameter('second_band_q', 0.7, 'float', minimum=MIN_Q, maximum=MAX_Q))
            # third band parameters ------
            self.parameters.add(Parameter('third_band_gain', 0.0, 'float', minimum=MIN_GAIN, maximum=MAX_GAIN))
            self.parameters.add(Parameter('third_band_freq', 4000.0, 'float', minimum=3000.0, maximum=8000.0))
            self.parameters.add(Parameter('third_band_q', 0.7, 'float', minimum=MIN_Q, maximum=MAX_Q))
            # high shelf parameters ------
            self.parameters.add(Parameter('high_shelf_gain', 0.0, 'float', minimum=MIN_GAIN, maximum=MAX_GAIN))
            self.parameters.add(Parameter('high_shelf_freq', 8000.0, 'float', minimum=5000.0, maximum=10000.0))

        self.bands = bands
        self.filters = self.setup_filters()
        self.hard_clip = hard_clip

    def setup_filters(self):
        """
        Create IIR filters.

        One filter is created for every band in `self.bands`, using the current
        gain/frequency (and, for the central bands, Q) parameter values.

        Returns:
            IIR filters (dict mapping band name to filter).
        """
        filters = {}

        for band in self.bands:

            G = getattr(self.parameters, band + '_gain').value
            fc = getattr(self.parameters, band + '_freq').value
            rate = self.sample_rate

            if band in ['low_shelf', 'high_shelf']:
                # shelves have no Q parameter; a fixed value is used instead
                Q = 0.707
                filter_type = band
            else:
                Q = getattr(self.parameters, band + '_q').value
                filter_type = 'peaking'

            filters[band] = pymc.components.iirfilter.IIRfilter(G, Q, fc, rate, filter_type, n_channels=self.n_channels)

        return filters

    def update_filter(self, band):
        """
        Update filters.

        Copies the current parameter values of `band` onto its filter object.

        Args:
            band (str): Band that should be updated.
        """
        self.filters[band].G = getattr(self.parameters, band + '_gain').value
        self.filters[band].fc = getattr(self.parameters, band + '_freq').value
        self.filters[band].rate = self.sample_rate

        if band in ['first_band', 'second_band', 'third_band']:
            self.filters[band].Q = getattr(self.parameters, band + '_q').value

    def update(self, parameter_name=None):
        """
        Update processor after randomization of parameters.

        Args:
            parameter_name (str): Parameter whose value has changed. If `None`, all bands are updated.
        """
        if parameter_name is not None:
            # e.g. 'first_band_gain' -> 'first_band'
            bands = ['_'.join(parameter_name.split('_')[:2])]
        else:
            bands = self.bands

        for band in bands:
            # The parameter list always holds all five bands, but a filter only exists for the
            # bands requested at construction time; there is nothing to refresh for the others.
            if band in self.filters:
                self.update_filter(band)

        for _band, iirfilter in self.filters.items():
            iirfilter.reset_state()

    def reset_state(self):
        """Reset state."""
        for _band, iirfilter in self.filters.items():
            iirfilter.reset_state()

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): equalized audio of size `n_samples x n_channels`.
        """
        for _band, iirfilter in self.filters.items():
            # every filter starts from a cleared state on each call
            iirfilter.reset_state()
            x = iirfilter.apply_filter(x)

        if self.hard_clip:
            x = np.clip(x, -1.0, 1.0)

        # make sure that we have float32 as IIR filtering returns float64
        x = x.astype(np.float32)

        # make sure that we have two dimensions (if `n_channels == 1`)
        if x.ndim == 1:
            x = x[:, np.newaxis]

        return x
526
+
527
+
528
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% COMPRESSOR %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
529
+ @jit(nopython=True)
530
def compressor_process(x, threshold, attack_time, release_time, ratio, makeup_gain, sample_rate, yL_prev):
    """
    Apply compressor.

    For `ratio > 1` levels above the threshold are reduced (compression); for `ratio < 1`
    levels below the threshold are reduced (expansion); for `ratio == 1` the static curve
    is the identity, so only `makeup_gain` is applied.

    Args:
        x (Numpy array): audio data (one channel).
        threshold: threshold in dB.
        attack_time: attack_time in ms.
        release_time: release_time in ms.
        ratio: ratio.
        makeup_gain: makeup_gain in dB.
        sample_rate: sample rate.
        yL_prev: internal state of the envelop gain. NOTE: this value is overwritten with 0
            before processing, so every call starts from a zero envelope.

    Returns:
        compressed audio and the final envelope state.
    """
    M = x.shape[0]
    x_g = np.zeros(M)  # input level in dB
    x_l = np.zeros(M)  # gain reduction requested by the static curve (dB)
    y_g = np.zeros(M)  # output level of the static curve in dB
    y_l = np.zeros(M)  # smoothed gain reduction (dB)
    c = np.zeros(M)    # linear gain applied to every sample
    yL_prev = 0.

    alpha_attack = np.exp(-1/(0.001 * sample_rate * attack_time))
    alpha_release = np.exp(-1/(0.001 * sample_rate * release_time))

    for i in np.arange(M):
        # level detection; very small samples are floored at -120 dB to avoid log10(0)
        if np.abs(x[i]) < 0.000001:
            x_g[i] = -120.0
        else:
            x_g[i] = 20 * np.log10(np.abs(x[i]))

        # static gain computer
        if ratio > 1:
            if x_g[i] >= threshold:
                y_g[i] = threshold + (x_g[i] - threshold) / ratio
            else:
                y_g[i] = x_g[i]
        elif ratio < 1:
            if x_g[i] <= threshold:
                y_g[i] = threshold + (x_g[i] - threshold) / (1/ratio)
            else:
                y_g[i] = x_g[i]
        else:
            # ratio == 1: no level change. Without this branch `y_g` would stay at 0 dB and
            # the (negative) input level itself would be treated as gain reduction.
            y_g[i] = x_g[i]

        x_l[i] = x_g[i] - y_g[i]

        # one-pole smoothing with separate attack and release coefficients
        if x_l[i] > yL_prev:
            y_l[i] = alpha_attack * yL_prev + (1 - alpha_attack) * x_l[i]
        else:
            y_l[i] = alpha_release * yL_prev + (1 - alpha_release) * x_l[i]

        c[i] = np.power(10.0, (makeup_gain - y_l[i]) / 20.0)
        yL_prev = y_l[i]

    y = x * c

    return y, yL_prev
588
+
589
+
590
class Compressor(Processor):
    """
    Single band stereo dynamic range compressor.

    Every channel is compressed on its own by `compressor_process`.

    Processor parameters:
        threshold (float): threshold in dB.
        attack_time (float): attack time in ms.
        release_time (float): release time in ms.
        ratio (float): compression ratio.

    There is no makeup gain parameter; `process` always passes a makeup gain of 0.
    """

    def __init__(self, sample_rate, name='Compressor', parameters=None):
        """
        Initialize processor.

        Args:
            sample_rate (int): Sample rate of input audio.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)

        if not parameters:
            self.parameters = ParameterList()
            default_parameters = (
                Parameter('threshold', -20.0, 'float', units='dB', minimum=-80.0, maximum=-5.0),
                Parameter('attack_time', 2.0, 'float', units='ms', minimum=1., maximum=20.0),
                Parameter('release_time', 100.0, 'float', units='ms', minimum=50.0, maximum=500.0),
                Parameter('ratio', 4.0, 'float', minimum=4., maximum=40.0),
                # the makeup_gain parameter is intentionally not part of this Compressor
            )
            for parameter in default_parameters:
                self.parameters.add(parameter)

        # per-channel envelope state (for block-wise processing), created lazily in `process`
        self.yL_prev = None

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): compressed audio of size `n_samples x n_channels`.
        """
        n_channels = x.shape[1]
        if self.yL_prev is None:
            self.yL_prev = [0.] * n_channels

        params = self.parameters
        if params.threshold.value == 0.0 and params.ratio.value == 1.0:
            # neutral setting: the input is handed back untouched
            return x

        compressed = np.zeros_like(x)
        for ch in range(n_channels):
            compressed[:, ch], self.yL_prev[ch] = compressor_process(
                x[:, ch],
                params.threshold.value,
                params.attack_time.value,
                params.release_time.value,
                params.ratio.value,
                0.0,  # makeup_gain = 0
                self.sample_rate,
                self.yL_prev[ch],
            )

        return compressed

    def update(self, parameter_name=None):
        """
        Update processor after randomization of parameters.

        Drops the stored envelope state so that the next `process` call starts fresh.

        Args:
            parameter_name (str): Parameter whose value has changed.
        """
        self.yL_prev = None
662
+
663
+
664
+ # %%%%%%%%%%%%%%%%%%%%%%%%%% CONVOLUTIONAL REVERB %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
665
class ConvolutionalReverb(Processor):
    """
    Convolutional Reverb.

    Processor parameters:
        index (int): Selects one group of impulse responses, i.e. `impulse_responses[index]`.
        index_ir (int): Selects one impulse response inside the chosen group
            (taken modulo the size of the group).
        wet (float): Weight of the reverberated signal.
        dry (float): Weight of the unprocessed signal.
        decay (float): Values below 1 apply a fade out to the impulse response.
        pre_delay (int): Value in ms. Its absolute value shifts the position at which
            the wet signal is cut out of the convolution result.
    """

    def __init__(self, impulse_responses, sample_rate, name='ConvolutionalReverb', parameters=None):
        """
        Initialize processor.

        Args:
            impulse_responses (list): List with impulse responses created by `common_dataprocessing.create_dataset`
            sample_rate (int): Sample rate that we should assume (used for fade-out computation)
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.

        Raises:
            ValueError: if no impulse responses are provided.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)

        if impulse_responses is None:
            raise ValueError('List of impulse responses must be provided for ConvolutionalReverb processor.')
        self.impulse_responses = impulse_responses

        if not parameters:
            self.parameters = ParameterList()
            # size of the largest group of impulse responses
            self.max_ir_num = len(max(impulse_responses, key=len))
            default_parameters = (
                Parameter('index', 0, 'int', minimum=0, maximum=len(impulse_responses)),
                Parameter('index_ir', 0, 'int', minimum=0, maximum=self.max_ir_num),
                Parameter('wet', 1.0, 'float', minimum=1.0, maximum=1.0),
                Parameter('dry', 0.0, 'float', minimum=0.0, maximum=0.0),
                Parameter('decay', 1.0, 'float', minimum=1.0, maximum=1.0),
                Parameter('pre_delay', 0, 'int', units='ms', minimum=0, maximum=0),
            )
            for parameter in default_parameters:
                self.parameters.add(parameter)

    def update(self, parameter_name=None):
        """
        Update processor after randomization of parameters.

        Loads a copy of the selected impulse response into `self.h` and, if `decay < 1`,
        fades it out and truncates it.

        Args:
            parameter_name (str): Parameter whose value has changed.
        """
        # we sample IR with a uniform random distribution according to RT60 values
        ir_group = self.impulse_responses[self.parameters.index.value]
        ir_entry = ir_group[self.parameters.index_ir.value % len(ir_group)]
        # the 'impulse_response' entry is a callable that returns the actual array
        self.h = np.copy(ir_entry['impulse_response']())

        # fade out the impulse based on the decay setting (starting from peak value)
        decay = self.parameters.decay.value
        if decay < 1.:
            n_taps = self.h.shape[0]
            idx_peak = np.argmax(np.max(np.abs(self.h), axis=1), axis=0)
            fstart = np.minimum(n_taps, idx_peak + int(decay * (n_taps - idx_peak)))
            fstop = np.minimum(n_taps, fstart + int(0.020*self.sample_rate))  # constant 20 ms fade out
            flen = fstop - fstart

            fade = np.arange(1, flen+1, dtype=self.dtype)/flen
            fade = np.power(0.1, fade * 5)
            self.h[fstart:fstop, :] *= fade[:, np.newaxis]
            self.h = self.h[:fstop]

    def process(self, x):
        """
        Process audio.

        Note that `self.h` is permanently reshaped when its channel count does not
        match the channel count of `x`.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): reverbed audio of size `n_samples x n_channels`.
        """
        # reshape IR to the correct size
        n_channels = x.shape[1]
        if self.h.shape[1] == 1 and n_channels > 1:
            # repeat mono IR for multi-channel input
            self.h = np.hstack([self.h for _ in range(n_channels)])
        if self.h.shape[1] > 1 and n_channels == 1:
            # randomly choose one IR channel
            chosen_channel = np.random.randint(self.h.shape[1])
            self.h = self.h[:, chosen_channel, np.newaxis]

        if self.parameters.wet.value == 0.0:
            return x

        # perform convolution to get wet signal
        wet_signal = oaconvolve(x, self.h, mode='full', axes=0)

        # cut out wet signal (compensating for the delay that the IR is introducing + predelay)
        start = np.argmax(np.max(np.abs(self.h), axis=1), axis=0)
        start += int(0.001 * np.abs(self.parameters.pre_delay.value) * self.sample_rate)
        start = np.clip(start, 0, self.h.shape[0]-1)

        wet_signal = wet_signal[start:start+x.shape[0], :]

        # return weighted sum of dry and wet signal
        return self.parameters.dry.value * x + self.parameters.wet.value * wet_signal
765
+
766
+
767
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%% HAAS EFFECT %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
768
def haas_process(x, delay, feedback, wet_channel):
    """
    Add Haas effect to audio.

    A circularly shifted (`np.roll`) and scaled copy of the selected channel is added
    onto that channel; the other channel is left as it is. Any other value of
    `wet_channel` yields an unmodified copy of the input.

    Args:
        x (Numpy array): input audio.
        delay: Delay that we apply to one of the channels (in samples).
        feedback: Feedback value.
        wet_channel: Which channel we process (`left` or `right`).

    Returns:
        (Numpy array): Audio with Haas effect.
    """
    out = np.copy(x)
    for channel, label in enumerate(('left', 'right')):
        if wet_channel == label:
            out[:, channel] += feedback * np.roll(x[:, channel], delay)
            break

    return out
788
+
789
+
790
class Haas(Processor):
    """
    Haas Effect Processor.

    Adds a delayed, scaled copy of one channel (chosen by `wet_channel`) onto that channel.

    Processor parameters:
        delay (int): delay in samples.
        feedback (float): scaling of the delayed copy.
        wet_channel (string): `left` or `right`.
    """

    def __init__(self, sample_rate, delay_range=(-0.040, 0.040), name='Haas', parameters=None):
        """
        Initialize processor.

        Args:
            sample_rate (int): Sample rate of input audio.
            delay_range (tuple of floats): minimum/maximum delay for Haas effect.
                It is multiplied by `sample_rate` to obtain the range in samples.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)

        if not parameters:
            shortest, longest = delay_range[0], delay_range[1]
            self.parameters = ParameterList()
            # the default delay is the upper end of the range
            self.parameters.add(Parameter('delay', int(longest * sample_rate), 'int', units='samples',
                                          minimum=int(shortest * sample_rate),
                                          maximum=int(longest * sample_rate)))
            self.parameters.add(Parameter('feedback', 0.35, 'float', minimum=0.33, maximum=0.66))
            self.parameters.add(Parameter('wet_channel', 'left', 'string', options=['left', 'right']))

    def process(self, x):
        """
        Process audio.

        Monaural input is duplicated to two channels first.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): audio with Haas effect of size `n_samples x n_channels`.
        """
        assert x.shape[1] in (1, 2), 'Haas effect only works with monaural or stereo audio.'

        stereo = np.repeat(x, 2, axis=1) if x.shape[1] < 2 else x

        params = self.parameters
        return haas_process(stereo, params.delay.value, params.feedback.value, params.wet_channel.value)

    def update(self, parameter_name=None):
        """
        Update processor after randomization of parameters.

        Args:
            parameter_name (str): Parameter whose value has changed.
        """
        self.reset_state()

    def reset_state(self):
        """Reset state."""
        self.read_idx = 0
        self.write_idx = self.parameters.delay.value
        self.buffer = np.zeros((65536, 2))
857
+
858
+
859
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% PANNER %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
860
class Panner(Processor):
    """
    Simple stereo panner.

    If input is mono, output is stereo.
    Original edited from https://github.com/csteinmetz1/pymixconsole/blob/master/pymixconsole/processors/panner.py

    Processor parameters:
        pan (float): pan position in [0, 1] (0 is left, 1 is right).
        pan_law (string): `-4.5dB`, `linear` or `constant_power`.
    """

    def __init__(self, name='Panner', parameters=None):
        """
        Initialize processor.

        Args:
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        # default processor class constructor
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=None)

        if not parameters:
            self.parameters = ParameterList()
            self.parameters.add(Parameter('pan', 0.5, 'float', minimum=0., maximum=1.))
            self.parameters.add(Parameter('pan_law', '-4.5dB', 'string',
                                          options=['-4.5dB', 'linear', 'constant_power']))

        # setup the coefficients based on the current params
        self.update()

    def _calculate_pan_coefficents(self):
        """
        Calculate panning coefficients from the chosen pan law.

        Based on the set pan law determine the gain value
        to apply for the left and right channel to achieve panning effect.
        This operates on the assumption that the input channel is mono.
        The output data will be stereo at the moment, but could be expanded
        to a higher channel count format.
        The panning value is in the range [0, 1], where
        0 means the signal is panned completely to the left, and
        1 means the signal is panned completely to the right.

        Raises:
            ValueError: `self.parameters.pan_law` is not supported.
        """
        self.gains = np.zeros(2, dtype=self.dtype)

        # first scale the linear [0, 1] to [0, pi/2]
        theta = self.parameters.pan.value * (np.pi/2)

        if self.parameters.pan_law.value == 'linear':
            self.gains[0] = ((np.pi/2) - theta) * (2/np.pi)
            self.gains[1] = theta * (2/np.pi)
        elif self.parameters.pan_law.value == 'constant_power':
            self.gains[0] = np.cos(theta)
            self.gains[1] = np.sin(theta)
        elif self.parameters.pan_law.value == '-4.5dB':
            # geometric mean of the linear and the constant power law
            self.gains[0] = np.sqrt(((np.pi/2) - theta) * (2/np.pi) * np.cos(theta))
            self.gains[1] = np.sqrt(theta * (2/np.pi) * np.sin(theta))
        else:
            raise ValueError(f'Invalid pan_law {self.parameters.pan_law.value}.')

    def process(self, x):
        """
        Process audio.

        Monaural input is duplicated to two channels; each channel is then
        multiplied with its pan gain.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): panned audio of size `n_samples x n_channels`.
        """
        assert x.shape[1] == 1 or x.shape[1] == 2, 'Panner only works with monaural or stereo audio.'

        if x.shape[1] < 2:
            x = np.repeat(x, 2, axis=1)

        return x * self.gains

    def update(self, parameter_name=None):
        """
        Update processor after randomization of parameters.

        Args:
            parameter_name (str): Parameter whose value has changed.
        """
        self._calculate_pan_coefficents()

    def reset_state(self):
        """Reset state."""
        # This processor is constructed with `block_size=None`, which `np.empty` cannot use as a
        # dimension, so the scratch buffer is only allocated when a block size is actually set.
        # NOTE(review): assumes the `Processor` base class keeps `block_size` as passed - confirm.
        block_size = getattr(self, 'block_size', None)
        if block_size is not None:
            self._output_buffer = np.empty([block_size, 2])
        self.update()
953
+
954
+
955
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% STEREO IMAGER %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
956
class MidSideImager(Processor):
    """
    Stereo imager that rebalances the mid and side components of a stereo signal.

    Processor parameters:
        bal (float): values of 0.0~1.0 make the signal more centered,
            values of 1.0~2.0 make the signal wider.
    """

    def __init__(self, name='IMAGER', parameters=None):
        """
        Initialize processor.

        Args:
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name, parameters=parameters, block_size=None, sample_rate=None)

        if not parameters:
            self.parameters = ParameterList()
            # values of 0.0~1.0 indicate making the signal more centered while 1.0~2.0 means making the signal wider
            balance = Parameter("bal", 0.0, "float", processor=self, minimum=0.0, maximum=2.0)
            self.parameters.add(balance)

    def process(self, data):
        """
        Process audio.

        The side channel is scaled according to `bal`, and the mid channel is scaled such
        that the summed mid/side energy stays (approximately) the same.

        Note: the stereo imager won't work if the input signal is a mono signal (left == right).
        To apply it to a mono signal, first stereoize it, e.g. with the Haas effect.

        Args:
            data (Numpy array): input audio of shape `[signal length, 2]`.

        Returns:
            (Numpy array): imaged audio of shape `[signal length, 2]`.
        """
        # to mid-side channels
        mid, side = self.lr_to_ms(data[:, 0], data[:, 1])

        # energies of both components
        mid_energy = np.sum(mid**2)
        side_energy = np.sum(side**2)
        total_energy = mid_energy + side_energy

        # largest side gain: (almost) all energy is moved to the side channel
        max_side_multiplier = np.sqrt(total_energy / (side_energy + 1e-3))

        # compute current multiply factor
        balance = round(self.parameters.bal.value, 3)
        if balance <= 1.:
            side_gain = balance
        else:
            side_gain = max_side_multiplier * (balance-1)

        # the mid channel receives whatever energy the rescaled side channel leaves over
        scaled_side = side * side_gain
        remaining_mid_energy = total_energy - side_energy * (side_gain ** 2)
        mid_gain = np.sqrt(remaining_mid_energy / (mid_energy + 1e-3))
        scaled_mid = mid * mid_gain

        # convert back to left-right channels
        left, right = self.ms_to_lr(scaled_mid, scaled_side)
        return np.stack([left, right], 1)

    def lr_to_ms(self, left, right):
        """Convert a left-right channeled signal to a mid-side signal."""
        return left + right, left - right

    def ms_to_lr(self, mid, side):
        """Convert a mid-side channeled signal to a left-right signal."""
        return (mid + side) / 2, (mid - side) / 2

    def update(self, parameter_name=None):
        """
        Update processor after randomization of parameters (nothing to recompute).

        Args:
            parameter_name (str): Parameter whose value has changed.

        Returns:
            The given `parameter_name`.
        """
        return parameter_name
1008
+
1009
+
1010
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% GAIN %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1011
class Gain(Processor):
    """
    Gain Processor.

    Applies a gain in dB and can also invert the polarity.

    Processor parameters:
        gain (float): Gain that should be applied (dB scale).
        invert (bool): If True, then we also invert the waveform.
    """

    def __init__(self, name='Gain', parameters=None):
        """
        Initialize processor.

        Args:
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name, parameters=parameters, block_size=None, sample_rate=None)

        if not parameters:
            self.parameters = ParameterList()
            # the gain range used to be -12.0 dB .. 6.0 dB
            gain_parameter = Parameter('gain', 1.0, 'float', units='dB', minimum=-6.0, maximum=9.0)
            invert_parameter = Parameter('invert', False, 'bool')
            self.parameters.add(gain_parameter)
            self.parameters.add(invert_parameter)

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): gain-augmented audio of size `n_samples x n_channels`.
        """
        # dB -> linear amplitude factor
        linear_gain = 10 ** (self.parameters.gain.value / 20.)
        factor = -linear_gain if self.parameters.invert.value else linear_gain
        return factor * x
1052
+
1053
+
1054
+ # %%%%%%%%%%%%%%%%%%%%%%% SIMPLE CHANNEL SWAP %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1055
class SwapChannels(Processor):
    """
    Swap channels in multi-channel audio.

    Processor parameters:
        index (int) Selects the permutation that we are using.
            Please note that "no permutation" is one of the permutations in `self.permutations` at index `0`.
    """

    def __init__(self, n_channels, name='SwapChannels', parameters=None):
        """
        Initialize processor.

        Args:
            n_channels (int): Number of channels in audio that we want to process.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=None)

        # all possible channel orders; the identity order comes first
        channel_ids = range(n_channels)
        self.permutations = tuple(permutations(channel_ids, n_channels))

        if not parameters:
            n_permutations = len(self.permutations)
            self.parameters = ParameterList()
            self.parameters.add(Parameter('index', 0, 'int', minimum=0, maximum=n_permutations))

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): channel-swapped audio of size `n_samples x n_channels`.
        """
        channel_order = self.permutations[self.parameters.index.value]
        return x[:, channel_order]
1092
+
1093
+
1094
+ # %%%%%%%%%%%%%%%%%%%%%%% Monauralize %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1095
class Monauralize(Processor):
    """
    Monauralizes audio (i.e., removes spatial information).

    Every channel of the output is a copy of the seed channel.

    Process parameters:
        seed_channel (int): channel that we use for overwriting the others.
    """

    def __init__(self, n_channels, name='Monauralize', parameters=None):
        """
        Initialize processor.

        Args:
            n_channels (int): Number of channels in audio that we want to process.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=None)

        if not parameters:
            seed_parameter = Parameter('seed_channel', 0, 'int', minimum=0, maximum=n_channels)
            self.parameters = ParameterList()
            self.parameters.add(seed_parameter)

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): monauralized audio of size `n_samples x n_channels`.
        """
        # keep the seed channel as a column, then repeat it once per input channel
        seed = x[:, [self.parameters.seed_channel.value]]
        n_channels = x.shape[1]
        return np.tile(seed, (1, n_channels))
1129
+
1130
+
1131
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% PITCH SHIFT %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1132
class PitchShift(Processor):
    """
    Simple pitch shifter using SoX and soxbindings (https://github.com/pseeth/soxbindings).

    Processor parameters:
        steps (float): Pitch shift as positive/negative semitones
        quick (bool): If True, this effect will run faster but with lower sound quality.
    """

    def __init__(self, sample_rate, fix_length=True, name='PitchShift', parameters=None):
        """
        Initialize processor.

        Args:
            sample_rate (int): Sample rate of input audio.
            fix_length (bool): If True, then output has same length as input.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)

        if not parameters:
            self.parameters = ParameterList()
            for parameter in (Parameter('steps', 0.0, 'float', minimum=-6., maximum=6.),
                              Parameter('quick', False, 'bool')):
                self.parameters.add(parameter)

        self.fix_length = fix_length
        self.clips = False

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): pitch-shifted audio of size `n_samples x n_channels`.
        """
        steps = self.parameters.steps.value
        if steps == 0.0:
            # nothing to shift
            y = x
        else:
            # inputs with a peak above 0.9 are scaled down before SoX and scaled back afterwards
            peak = np.max(np.abs(x))
            rescale = peak > 0.9
            if rescale:
                x = x * (0.9 / peak)

            tfm = sox.Transformer()
            tfm.pitch(steps, quick=bool(self.parameters.quick.value))
            y = tfm.build_array(input_array=x, sample_rate_in=self.sample_rate).astype(np.float32)

            if rescale:
                y *= peak / 0.9  # rescale output to original scale

        if self.fix_length:
            n_samples_input = x.shape[0]
            surplus = y.shape[0] - n_samples_input
            if surplus > 0:
                # output is longer: keep the centered part
                first = surplus // 2
                y = y[first:first + n_samples_input]
            elif surplus < 0:
                # output is shorter: zero-pad on both sides
                n_pad = -surplus
                y = np.pad(y, ((n_pad//2, n_pad - n_pad//2), (0, 0)))

        return y
1200
+
1201
+
1202
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% TIME STRETCH %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1203
class TimeStretch(Processor):
    """
    Simple time stretcher using SoX and soxbindings (https://github.com/pseeth/soxbindings).

    Processor parameters:
        factor (float): Time stretch factor.
        quick (bool): If True, this effect will run faster but with lower sound quality.
        stretch_type (str): Algorithm used for stretching (`tempo` or `stretch`).
        audio_type (str): Sets which time segments are most optimal when finding
            the best overlapping points for time stretching.
    """

    def __init__(self, sample_rate, fix_length=True, name='TimeStretch', parameters=None):
        """
        Initialize processor.

        Args:
            sample_rate (int): Sample rate of input audio.
            fix_length (bool): If True, then output has same length as input.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)

        if not parameters:
            self.parameters = ParameterList()
            for parameter in (Parameter('factor', 1.0, 'float', minimum=1/1.33, maximum=1.33),
                              Parameter('quick', False, 'bool'),
                              Parameter('stretch_type', 'tempo', 'string', options=['tempo', 'stretch']),
                              Parameter('audio_type', 'l', 'string', options=['m', 's', 'l'])):
                self.parameters.add(parameter)

        self.fix_length = fix_length

    def process(self, x):
        """
        Process audio.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): time-stretched audio of size `n_samples x n_channels`.
        """
        factor = self.parameters.factor.value
        if factor == 1.0:
            # nothing to stretch
            y = x
        else:
            # inputs with a peak above 0.9 are scaled down before SoX and scaled back afterwards
            peak = np.max(np.abs(x))
            rescale = peak > 0.9
            if rescale:
                x = x * (0.9 / peak)

            tfm = sox.Transformer()
            stretch_type = self.parameters.stretch_type.value
            if stretch_type == 'stretch':
                tfm.stretch(factor)
            elif stretch_type == 'tempo':
                tfm.tempo(factor,
                          audio_type=self.parameters.audio_type.value,
                          quick=bool(self.parameters.quick.value))
            y = tfm.build_array(input_array=x, sample_rate_in=self.sample_rate).astype(np.float32)

            if rescale:
                y *= peak / 0.9  # rescale output to original scale

        if self.fix_length:
            n_samples_input = x.shape[0]
            surplus = y.shape[0] - n_samples_input
            if surplus > 0:
                # output is longer: keep the centered part
                first = surplus // 2
                y = y[first:first + n_samples_input]
            elif surplus < 0:
                # output is shorter: zero-pad on both sides
                n_pad = -surplus
                y = np.pad(y, ((n_pad//2, n_pad - n_pad//2), (0, 0)))

        return y
1280
+
1281
+
1282
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% PLAYBACK SPEED %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1283
class PlaybackSpeed(Processor):
    """
    Playback speed effect using SoX and soxbindings (https://github.com/pseeth/soxbindings).

    Processor parameters:
        factor (float): Playback speed factor.
    """

    def __init__(self, sample_rate, fix_length=True, name='PlaybackSpeed', parameters=None):
        """
        Set up the playback-speed processor.

        Args:
            sample_rate (int): Sample rate of input audio.
            fix_length (bool): If True, then output has same length as input.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor. When
                omitted (or falsy), a default `factor` parameter is created.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)

        if not parameters:
            self.parameters = ParameterList()
            factor_parameter = Parameter('factor', 1.0, 'float', minimum=1./1.33, maximum=1.33)
            self.parameters.add(factor_parameter)

        self.fix_length = fix_length

    def process(self, x):
        """
        Change the playback speed of the input with SoX.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): resampled audio of size `n_samples x n_channels`.
        """
        factor = self.parameters.factor.value

        # A factor of exactly 1.0 is a no-op: return the input untouched.
        if factor == 1.0:
            return x

        # Keep the peak at or below 0.9 while SoX works on the signal.
        peak = np.max(np.abs(x))
        rescaled = peak > 0.9
        if rescaled:
            x = x * (0.9 / peak)

        tfm = sox.Transformer()
        tfm.speed(factor)
        y = tfm.build_array(input_array=x, sample_rate_in=self.sample_rate).astype(np.float32)

        if rescaled:
            y *= peak / 0.9  # rescale output to original scale

        if not self.fix_length:
            return y

        # Centre-crop or centre-pad so the output has as many samples as the input.
        n_in, n_out = x.shape[0], y.shape[0]
        if n_in < n_out:
            first = (n_out - n_in) // 2
            y = y[first:first + n_in]
        elif n_in > n_out:
            missing = n_in - n_out
            y = np.pad(y, ((missing//2, missing - missing//2), (0, 0)))

        return y
1348
+
1349
+
1350
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% BEND %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1351
class Bend(Processor):
    """
    Simple bend effect using SoX and soxbindings (https://github.com/pseeth/soxbindings).

    The audio is split into `n_bends` random, non-overlapping time segments and
    each segment is pitch-bent by a random amount drawn from `pitch_range`.

    Processor parameters:
        n_bends (int): Number of segments or intervals to pitch shift
    """

    def __init__(self, sample_rate, pitch_range=(-600, 600), fix_length=True, name='Bend', parameters=None):
        """
        Initialize processor.

        Args:
            sample_rate (int): Sample rate of input audio.
            pitch_range (tuple of ints): min and max pitch bending ranges in cents
            fix_length (bool): Accepted for signature parity with the other
                SoX-based processors; it is not used by this class.
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor. When
                omitted (or falsy), a default `n_bends` parameter is created.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate)

        if not parameters:
            self.parameters = ParameterList()
            self.parameters.add(Parameter('n_bends', 2, 'int', minimum=2, maximum=10))
        self.pitch_range_min, self.pitch_range_max = pitch_range

    def process(self, x):
        """
        Apply random pitch bends to the input.

        Args:
            x (Numpy array): input audio of size `n_samples x n_channels`.

        Returns:
            (Numpy array): pitch-bended audio of size `n_samples x n_channels`.
            If no usable bend segment remains after sampling, the input is
            returned unchanged.
        """
        n_bends = self.parameters.n_bends.value
        max_length = x.shape[0] / self.sample_rate

        # Generates random non-overlapping segments
        delta = 1. / self.sample_rate
        boundaries = np.sort(delta + np.random.rand(n_bends-1) * (max_length - delta))

        start, end = np.zeros(n_bends), np.zeros(n_bends)
        start[0] = delta
        for i, b in enumerate(boundaries):
            end[i] = b
            start[i+1] = b
        end[-1] = max_length

        # randomly sample pitch-shifts in cents
        cents = np.random.randint(self.pitch_range_min, self.pitch_range_max+1, n_bends)

        # remove segment if cent value is zero or start == end (as SoX does not allow such values)
        idx_keep = np.logical_and(cents != 0, start != end)
        n_bends = int(np.sum(idx_keep))
        start, end, cents = start[idx_keep], end[idx_keep], cents[idx_keep]

        # SoX needs at least one bend segment; with nothing left to bend the
        # effect is a no-op, so hand the input back instead of failing.
        if n_bends == 0:
            return x

        # Keep the peak at or below 0.9 while SoX works on the signal.
        scale = np.max(np.abs(x))
        clips = scale > 0.9
        if clips:
            x = x * (0.9 / scale)

        tfm = sox.Transformer()
        tfm.bend(n_bends=n_bends, start_times=list(start), end_times=list(end), cents=list(cents))
        y = tfm.build_array(input_array=x, sample_rate_in=self.sample_rate).astype(np.float32)

        if clips:
            y *= scale / 0.9  # rescale output to original scale

        return y
1423
+
1424
+
1425
+
1426
+
1427
+
1428
+ # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ALGORITHMIC REVERB %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1429
class AlgorithmicReverb(Processor):
    """
    Stereo algorithmic reverb built from parallel feedback comb filters
    followed by a chain of allpass filters, one bank per channel.

    Processor parameters:
        room_size (float): Passed to every comb and allpass filter.
        damping (float): Passed to every comb filter.
        dry_mix (float): Gain of the unprocessed signal in the output.
        wet_mix (float): Gain of the reverberated signal in the output.
        width (float): Controls how much of each wet channel leaks into the other.
    """

    # Filter delay lengths for the left channel (coefficients optimized for
    # fs=44.1kHz). The right channel uses the same lengths plus `stereospread`.
    _ALLPASS_DELAYS = (556, 441, 341, 225)
    _COMB_DELAYS = (1116, 1188, 1277, 1356, 1422, 1491, 1557, 1617)

    def __init__(self, name="algoreverb", parameters=None, sample_rate=44100, **kwargs):
        """
        Initialize processor.

        Args:
            name (str): Name of processor.
            parameters (parameter_list): Parameters for this processor. When
                omitted (or falsy), a default parameter set is created.
            sample_rate (int): Sample rate of input audio.
            **kwargs: Forwarded to the `Processor` base class.
        """
        super().__init__(name=name, parameters=parameters, block_size=None, sample_rate=sample_rate, **kwargs)

        if not parameters:
            self.parameters = ParameterList()
            self.parameters.add(Parameter("room_size", 0.5, "float", minimum=0.05, maximum=0.85))
            self.parameters.add(Parameter("damping", 0.1, "float", minimum=0.0, maximum=1.0))
            self.parameters.add(Parameter("dry_mix", 0.9, "float", minimum=0.0, maximum=1.0))
            self.parameters.add(Parameter("wet_mix", 0.1, "float", minimum=0.0, maximum=1.0))
            self.parameters.add(Parameter("width", 0.7, "float", minimum=0.0, maximum=1.0))

        # Tuning
        self.stereospread = 23
        self.scalegain = 0.2

    def process(self, data):
        """
        Apply the reverb.

        Args:
            data (Numpy array): input audio, either 1-D (mono) or
                `n_samples x n_channels`. Only the first channel (or the first
                two, when there are exactly two) is used.

        Returns:
            (Numpy array): stereo output of size `n_samples x 2`.
        """
        if data.ndim >= 2:
            dataL = data[:, 0]
            # anything that is not exactly two channels is treated as mono
            dataR = data[:, 1] if data.shape[1] == 2 else data[:, 0]
        else:
            dataL = data
            dataR = data

        output = np.zeros((data.shape[0], 2))

        xL, xR = self.process_filters(dataL.copy(), dataR.copy())

        wet1_g = self.parameters.wet_mix.value * ((self.parameters.width.value/2) + 0.5)
        wet2_g = self.parameters.wet_mix.value * ((1-self.parameters.width.value)/2)
        dry_g = self.parameters.dry_mix.value

        output[:, 0] = (wet1_g * xL) + (wet2_g * xR) + (dry_g * dataL)
        output[:, 1] = (wet1_g * xR) + (wet2_g * xL) + (dry_g * dataR)

        return output

    def process_filters(self, dataL, dataR):
        """
        Run both channels through their comb and allpass filter banks.

        Args:
            dataL (Numpy array): left-channel samples.
            dataR (Numpy array): right-channel samples.

        Returns:
            (tuple of Numpy arrays): wet left and wet right signals.
        """
        return self._run_channel('L', dataL), self._run_channel('R', dataR)

    def _run_channel(self, side, data):
        """Sum all parallel comb outputs for `side` ('L' or 'R'), then apply the allpass chain."""
        wet = None
        for idx in range(1, len(self._COMB_DELAYS) + 1):
            comb = getattr(self, f'comb{side}{idx}')
            comb_out = comb.process(data.copy() * self.scalegain)
            # Accumulate every comb. Previously the fifth comb was assigned
            # with `=`, which discarded the contribution of combs 1-4.
            wet = comb_out if wet is None else wet + comb_out

        for idx in range(1, len(self._ALLPASS_DELAYS) + 1):
            wet = getattr(self, f'allpass{side}{idx}').process(wet)

        return wet

    def update(self, parameter_name):
        """
        (Re)create all comb and allpass filters from the current parameters.

        Args:
            parameter_name: name of the changed parameter; not used, every
                filter is rebuilt regardless.
        """
        rs = self.parameters.room_size.value
        dp = self.parameters.damping.value
        ss = self.stereospread

        # initialize allpass and feedback comb-filters
        # (with coefficients optimized for fs=44.1kHz)
        # The right channel always uses the left delay plus the stereo spread;
        # the fourth right allpass previously used 255 instead of 225.
        for idx, delay in enumerate(self._ALLPASS_DELAYS, start=1):
            setattr(self, f'allpassL{idx}', pymc.components.allpass.Allpass(delay, rs, self.block_size))
            setattr(self, f'allpassR{idx}', pymc.components.allpass.Allpass(delay+ss, rs, self.block_size))

        for idx, delay in enumerate(self._COMB_DELAYS, start=1):
            setattr(self, f'combL{idx}', pymc.components.comb.Comb(delay, dp, rs, self.block_size))
            setattr(self, f'combR{idx}', pymc.components.comb.Comb(delay+ss, dp, rs, self.block_size))
1537
+
modules/common_miscellaneous.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Common miscellaneous functions.
3
+
4
+ AI Music Technology Group, Sony Group Corporation
5
+ AI Speech and Sound Group, Sony Europe
6
+
7
+ This implementation originally belongs to Sony Group Corporation,
8
+ which has been introduced in the work "Automatic music mixing with deep learning and out-of-domain data".
9
+ Original repo link: https://github.com/sony/FxNorm-automix
10
+ """
11
+ import os
12
+ import psutil
13
+ import sys
14
+ import numpy as np
15
+ import librosa
16
+ import torch
17
+ import math
18
+
19
+
20
def uprint(s):
    """
    Print `s` and immediately flush the standard streams.

    stderr is flushed as well so that a log file capturing both stays in sync.

    Args:
        s: string to print
    """
    print(s)
    for stream in (sys.stdout, sys.stderr):
        stream.flush()
32
+
33
+
34
def recursive_getattr(obj, attr):
    """
    Resolve a dotted attribute path (e.g., `fc1.weight`) starting from `obj`.

    Args:
        obj: object
        attr: attribute to get; components are separated by `.`

    Returns:
        object
    """
    current = obj
    names = attr.split('.')
    for name in names:
        current = getattr(current, name)
    return current
48
+
49
+
50
def compute_stft(samples, hop_length, fft_size, stft_window):
    """
    Compute the per-channel STFT of `samples` with window `stft_window`.

    Frames have size `fft_size` and are shifted by `hop_length`; no centering
    (and therefore no padding) is applied.

    Args:
        samples: num samples x channels
        hop_length: window shift in samples
        fft_size: FFT size which is also the window size
        stft_window: STFT analysis window

    Returns:
        stft: frames x channels x freqbins
    """
    channel_count = samples.shape[1]
    frame_count = 1+int((samples.shape[0] - fft_size)/hop_length)
    bin_count = fft_size//2+1
    stft = np.empty((frame_count, channel_count, bin_count), dtype=np.complex64)

    # convert into f_contiguous (such that [:,n] slicing is c_contiguous)
    samples = np.asfortranarray(samples)

    stft_options = dict(n_fft=fft_size,
                        hop_length=hop_length,
                        window=stft_window,
                        center=False)
    for channel in range(channel_count):
        # librosa returns `freqbins x frames`; transpose to `frames x freqbins`
        spectrum = librosa.stft(samples[:, channel], **stft_options)
        stft[:, channel, :] = spectrum.transpose()
    return stft
78
+
79
+
80
def compute_istft(stft, hop_length, stft_window):
    """
    Invert a per-channel STFT back to time-domain samples.

    Args:
        stft: frames x channels x freqbins
        hop_length: window shift in samples
        stft_window: STFT synthesis window

    Returns:
        samples: num samples x channels
    """
    channels = []
    for channel in range(stft.shape[1]):
        # librosa expects `freqbins x frames`
        spectrum = stft[:, channel, :].transpose()
        channels.append(librosa.istft(spectrum,
                                      hop_length=hop_length,
                                      window=stft_window,
                                      center=False))

    # one column per channel; a single channel still yields a 2d array
    return np.column_stack(channels)
105
+
106
+
107
def get_size(obj):
    """
    Recursively estimate the size of an object (in bytes).

    Containers that are followed: dicts (keys and values),
    `functools.partial` objects (their keyword arguments), lists and tuples.
    Everything else is measured with a plain `sys.getsizeof`.

    Args:
        obj: object

    Returns:
        size of object
    """
    import functools

    if isinstance(obj, dict):
        children = [*obj.values(), *obj.keys()]
    elif isinstance(obj, functools.partial):
        children = [*obj.keywords.values(), *obj.keywords.keys()]
    elif isinstance(obj, (list, tuple)):
        children = obj
    else:
        children = ()

    return sys.getsizeof(obj) + sum(get_size(child) for child in children)
132
+
133
+
134
def get_process_memory():
    """
    Report the memory used by the current process, in GBytes.

    Returns:
        memory used by the process
    """
    this_process = psutil.Process(os.getpid())
    used_bytes = this_process.memory_info()[0]
    return used_bytes / (2 ** 30)
142
+
143
+
144
def check_complete_convolution(input_size, kernel_size, stride=1,
                               padding=0, dilation=1, note=''):
    """
    Check whether the convolution is complete.

    A Conv1d is "complete" when no time steps are left over, i.e. when the
    output-length formula evaluates to a whole number. The result is printed
    (prefixed with `note`) and also returned.

    Args:
        input_size: size of input
        kernel_size: size of kernel
        stride: stride
        padding: padding
        dilation: dilation
        note: string for additional notes

    Returns:
        bool: True if no time steps are left over.
    """
    # true division always yields a float, so `.is_integer()` is available
    is_complete = ((input_size + 2*padding - dilation * (kernel_size - 1) - 1)
                   / stride + 1).is_integer()
    uprint(f'{note} {is_complete}')
    return is_complete
162
+
163
+
164
def pad_to_shape(x: torch.Tensor, y: int) -> torch.Tensor:
    """
    Right-pad or right-trim the last dimension of `x` to length `y`.

    Args:
        x: Tensor to be padded.
        y: Size to pad/trim x last dimension to

    Returns:
        `x` with its last dimension padded (or trimmed) to size `y`.
    """
    # a negative amount makes `pad` trim from the right instead
    right_padding = y - x.shape[-1]
    return torch.nn.functional.pad(x, [0, right_padding])
178
+
179
+
180
def valid_length(input_size, kernel_size, stride=1, padding=0, dilation=1):
    """
    Return the nearest valid upper length so that a 1DConv leaves no time steps over.

    For all layers, size of the (input - kernel_size) % stride = 0.
    Here valid means that there is no left over frame neglected and discarded.

    Args:
        input_size: size of input
        kernel_size: size of kernel
        stride: stride
        padding: padding
        dilation: dilation

    Returns:
        valid length for convolution
    """
    kernel_span = dilation * (kernel_size - 1)

    # number of output steps when the last, partial step is rounded up ...
    n_steps = math.ceil((input_size + 2*padding - kernel_span - 1)/stride) + 1
    # ... and the input length that produces exactly that many steps
    required = (n_steps - 1) * stride - 2*padding + kernel_span + 1

    return int(required)
201
+
202
+
203
def td_length_from_fd(fd_length: int, fft_size: int, fft_hop: int) -> int:
    """
    Return the length in time domain, given the length in frequency domain.

    This is the number of time-domain samples needed for an STFT with
    parameters `fft_size` and `fft_hop` to produce `fd_length` frames.
    No padding is assumed.

    Args:
        fd_length: length in frequency domain
        fft_size: size of FFT
        fft_hop: hop length

    Returns:
        length in time domain
    """
    # the first frame needs `fft_size` samples, every further frame one hop more
    extra_frames = fd_length - 1
    return extra_frames * fft_hop + fft_size
modules/data_normalization.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Implementation of the 'audio effects chain normalization'
3
+ """
4
+ import numpy as np
5
+ import scipy
6
+ import soundfile as sf
7
+ import pyloudnorm
8
+
9
+ from glob import glob
10
+ import os
11
+ import sys
12
+ currentdir = os.path.dirname(os.path.realpath(__file__))
13
+ sys.path.append(currentdir)
14
+ from utils_data_normalization import *
15
+ from normalization_imager import *
16
+
17
+
18
+ '''
19
+ Audio Effects Chain Normalization
20
+ process: normalizes input stems according to given precomputed features
21
+ '''
22
class Audio_Effects_Normalizer:
    """
    Audio effects chain normalization.

    Normalizes input stems towards (pre)computed mean audio-effect features,
    one effect at a time in the order given by `EFFECTS`.
    """

    def __init__(self, precomputed_feature_path=None,
                 STEMS=['drums', 'bass', 'other', 'vocals'],
                 EFFECTS=['eq', 'compression', 'imager', 'loudness'],
                 audio_extension='wav'):
        """
        Args:
            precomputed_feature_path: path of a `.npy` file holding the mean
                feature dictionary. It is loaded if the file exists and is
                also the target used by `compute_mean(save_feat=True)`.
            STEMS: stems to be normalized.
            EFFECTS: effects to be normalized; order matters.
            audio_extension: file extension used when searching for stems.
        """
        # Copy the lists so the shared default objects (or the caller's
        # lists) are never aliased by the instance.
        self.STEMS = list(STEMS)      # Stems to be normalized
        self.EFFECTS = list(EFFECTS)  # Effects to be normalized, order matters
        self.audio_extension = audio_extension
        self.precomputed_feature_path = precomputed_feature_path

        # Audio settings
        self.SR = 44100
        self.SUBTYPE = 'PCM_16'

        # General Settings
        self.FFT_SIZE = 2**16
        self.HOP_LENGTH = self.FFT_SIZE//4

        # Loudness
        self.NTAPS = 1001
        self.LUFS = -30
        self.MIN_DB = -40  # Min amplitude to apply EQ matching

        # Compressor
        self.COMP_USE_EXPANDER = False
        self.COMP_PEAK_NORM = -10.0
        self.COMP_TRUE_PEAK = False
        self.COMP_PERCENTILE = 75  # features_mean (v1) was done with 25
        self.COMP_MIN_TH = -40
        self.COMP_MAX_RATIO = 20
        other_preset = {'attack': 15.0, 'release': 666.0, 'ratio': 4, 'n_mels': 128}
        comp_presets = {
            'vocals': {'attack': 7.5, 'release': 400.0, 'ratio': 4, 'n_mels': 128},
            'drums': {'attack': 10.0, 'release': 180.0, 'ratio': 6, 'n_mels': 128},
            'bass': {'attack': 10.0, 'release': 500.0, 'ratio': 5, 'n_mels': 16},
            'other': other_preset,
            'mixture': other_preset,
        }
        # stems without a preset get an empty settings dict
        self.comp_settings = {key: dict(comp_presets.get(key, {})) for key in self.STEMS}

        if precomputed_feature_path is not None and os.path.isfile(precomputed_feature_path):
            # Load Pre-computed Audio Effects Features
            # NOTE: the file stores a pickled dict; unpickling can execute
            # arbitrary code, so only load feature files from trusted sources.
            features_mean = np.load(precomputed_feature_path, allow_pickle=True)[()]
            self.features_mean = self.smooth_feature(features_mean)

    # compute audio effects' mean feature values
    def compute_mean(self, base_dir_path, save_feat=True, single_file=False):
        """
        Compute the mean feature of every effect for every stem.

        Args:
            base_dir_path: directory searched recursively for
                `<stem>.<audio_extension>` files, or, if `single_file` is
                True, the single target file used for every stem.
            save_feat: if True, save the result to `precomputed_feature_path`.
            single_file: see `base_dir_path`.

        Returns:
            the smoothed mean features, also stored in `self.features_mean`.
        """
        audio_path_dict = {}
        for cur_stem in self.STEMS:
            if single_file:
                # if single_file=True, base_dir_path = the target file path
                audio_path_dict[cur_stem] = [base_dir_path]
            else:
                pattern = os.path.join(base_dir_path, "**", f"{cur_stem}.{self.audio_extension}")
                audio_path_dict[cur_stem] = glob(pattern, recursive=True)

        features_mean = {effect: {key: [] for key in self.STEMS} for effect in self.EFFECTS}

        for effect in self.EFFECTS:
            print(f'{effect} ...')
            for key in self.STEMS:
                print(f'{key} ...')
                cur_paths = audio_path_dict[key]

                features_ = []
                for cur_i, cur_audio_path in enumerate(cur_paths):
                    print(f'getting {effect} features for {key}- stem {cur_i} of {len(cur_paths)-1} {cur_audio_path}')
                    features_.append(self.get_norm_feature(cur_audio_path, cur_i, effect, key))

                print(effect, key, len(features_))
                # get_norm_feature returns None for silent / all-zero audio;
                # those entries carry no feature and must not enter the mean.
                valid_features = [feat for feat in features_ if feat is not None]
                s = np.mean(np.asarray(valid_features), axis=0)
                features_mean[effect][key] = s

                if effect == 'eq':
                    assert len(s)==1+self.FFT_SIZE//2, len(s)
                elif effect == 'compression':
                    assert len(s)==2, len(s)
                elif effect == 'panning':
                    assert len(s)==1+self.FFT_SIZE//2, len(s)
                elif effect == 'loudness':
                    assert len(s)==1, len(s)

                if effect == 'eq':
                    if key in ['other', 'vocals', 'mixture']:
                        f = 401
                    else:
                        f = 151
                    features_mean[effect][key] = scipy.signal.savgol_filter(features_mean[effect][key],
                                                                            f, 1, mode='mirror')
                elif effect == 'panning':
                    features_mean[effect][key] = scipy.signal.savgol_filter(features_mean[effect][key],
                                                                            501, 1, mode='mirror')
        if save_feat:
            np.save(self.precomputed_feature_path, features_mean)
        self.features_mean = self.smooth_feature(features_mean)
        print('---feature mean computation completed---')

        return self.features_mean

    def get_norm_feature(self, path, i, effect, stem):
        """
        Extract the feature of one effect from one audio signal.

        Args:
            path: path of an audio file, or the audio samples themselves.
            i: index of the file; not used.
            effect: one of 'loudness', 'eq', 'panning', 'compression', 'imager'.
            stem: stem name, used to look up compressor settings.

        Returns:
            the feature for `effect`, or None if the audio is all zeros,
            below `MIN_DB`, or `effect` is not recognized.
        """
        if isinstance(path, str):
            audio, fs = sf.read(path)
            assert(fs == self.SR)
        else:
            audio = path

        if not np.any(audio):
            print(f'{path} is only zeros...')
            return None

        audio = np.pad(audio, ((self.FFT_SIZE, self.FFT_SIZE), (0, 0)), mode='constant')

        max_db = amp_to_db(np.max(np.abs(audio)))
        if not (max_db > self.MIN_DB):
            print(f'{path} is silence...')
            return None

        if effect == 'loudness':
            meter = pyloudnorm.Meter(self.SR)
            loudness = meter.integrated_loudness(audio)
            return [loudness]

        elif effect == 'eq':
            audio = lufs_normalize(audio, self.SR, self.LUFS, log=False)
            audio_spec = compute_stft(audio,
                                      self.HOP_LENGTH,
                                      self.FFT_SIZE,
                                      np.sqrt(np.hanning(self.FFT_SIZE+1)[:-1]))
            audio_spec = np.abs(audio_spec)
            audio_spec_avg = np.mean(audio_spec, axis=(0, 1))
            return audio_spec_avg

        elif effect == 'panning':
            phi = get_SPS(audio,
                          n_fft=self.FFT_SIZE,
                          hop_length=self.HOP_LENGTH,
                          smooth=False,
                          frames=False)
            return(phi[1])

        elif effect == 'compression':
            x = pyloudnorm.normalize.peak(audio, self.COMP_PEAK_NORM)
            # may be None; passed through unchanged
            return get_mean_peak(x,
                                 sr=self.SR,
                                 true_peak=self.COMP_TRUE_PEAK,
                                 percentile=self.COMP_PERCENTILE,
                                 n_mels=self.comp_settings[stem]['n_mels'])

        elif effect == 'imager':
            mid, side = lr_to_ms(audio[:, 0], audio[:, 1])
            return print_balance(mid, side, verbose=False)

        return None

    # normalize current audio input with the order of designed audio FX
    def normalize_audio(self, audio, src):
        """
        Apply every effect normalization in `EFFECTS`, in order.

        Args:
            audio: input samples (two-dimensional array).
            src: stem name; must be one of `STEMS`.

        Returns:
            the normalized audio.
        """
        assert src in self.STEMS

        normalized_audio = audio
        for cur_effect in self.EFFECTS:
            normalized_audio = self.normalize_audio_per_effect(normalized_audio, src=src, effect=cur_effect)

        return normalized_audio

    # normalize current audio input with current targeted audio FX
    def normalize_audio_per_effect(self, audio, src, effect):
        """
        Normalize `audio` with respect to a single effect.

        Args:
            audio: input samples (two-dimensional array); a single channel
                is duplicated to two channels.
            src: stem name used to look up target features and settings.
            effect: one of 'eq', 'compression', 'loudness', 'imager'.

        Returns:
            the normalized audio with the same number of samples as the
            input; unchanged if its peak is not above `MIN_DB`.
        """
        audio = audio.astype(dtype=np.float32)
        # pad both ends; the padding is cropped away again before returning
        audio_track = np.pad(audio, ((self.FFT_SIZE, self.FFT_SIZE), (0, 0)), mode='constant')

        assert len(audio_track.shape) == 2  # Always expects two dimensions

        if audio_track.shape[1] == 1:  # Converts mono to stereo with repeated channels
            audio_track = np.repeat(audio_track, 2, axis=-1)

        output_audio = audio_track.copy()

        max_db = amp_to_db(np.max(np.abs(output_audio)))
        if max_db > self.MIN_DB:

            if effect == 'eq':
                # normalize each channel
                for ch in range(audio_track.shape[1]):
                    audio_eq_matched = get_eq_matching(output_audio[:, ch],
                                                       self.features_mean[effect][src],
                                                       sr=self.SR,
                                                       n_fft=self.FFT_SIZE,
                                                       hop_length=self.HOP_LENGTH,
                                                       min_db=self.MIN_DB,
                                                       ntaps=self.NTAPS,
                                                       lufs=self.LUFS)
                    np.copyto(output_audio[:, ch], audio_eq_matched)

            elif effect == 'compression':
                assert(len(self.features_mean[effect][src])==2)
                # normalize each channel
                for ch in range(audio_track.shape[1]):
                    try:
                        audio_comp_matched = get_comp_matching(output_audio[:, ch],
                                                               self.features_mean[effect][src][0],
                                                               self.features_mean[effect][src][1],
                                                               self.comp_settings[src]['ratio'],
                                                               self.comp_settings[src]['attack'],
                                                               self.comp_settings[src]['release'],
                                                               sr=self.SR,
                                                               min_db=self.MIN_DB,
                                                               min_th=self.COMP_MIN_TH,
                                                               comp_peak_norm=self.COMP_PEAK_NORM,
                                                               max_ratio=self.COMP_MAX_RATIO,
                                                               n_mels=self.comp_settings[src]['n_mels'],
                                                               true_peak=self.COMP_TRUE_PEAK,
                                                               percentile=self.COMP_PERCENTILE,
                                                               expander=self.COMP_USE_EXPANDER)

                        np.copyto(output_audio[:, ch], audio_comp_matched[:, 0])
                    except Exception:
                        # Best effort: if matching fails, stop and leave the
                        # remaining channel(s) uncompressed. Only `Exception`
                        # is caught so KeyboardInterrupt/SystemExit propagate.
                        break

            elif effect == 'loudness':
                output_audio = lufs_normalize(output_audio, self.SR, self.features_mean[effect][src], log=False)

            elif effect == 'imager':
                # threshold of applying Haas effects
                mono_threshold = 0.99 if src=='bass' else 0.975
                audio_imager_matched = normalize_imager(output_audio,
                                                        target_side_mid_bal=self.features_mean[effect][src][0],
                                                        mono_threshold=mono_threshold,
                                                        sr=self.SR)

                np.copyto(output_audio, audio_imager_matched)

        output_audio = output_audio[self.FFT_SIZE:self.FFT_SIZE+audio.shape[0]]

        return output_audio

    def smooth_feature(self, feature_dict_):
        """
        Smooth the 'eq' and 'panning' features with a Savitzky-Golay filter.

        The dictionary is modified in place and also returned.

        Args:
            feature_dict_: features indexed as `[effect][stem]`.

        Returns:
            `feature_dict_`.
        """
        for effect in self.EFFECTS:
            for key in self.STEMS:
                if effect == 'eq':
                    if key in ['other', 'vocals', 'mixture']:
                        f = 401
                    else:
                        f = 151
                    feature_dict_[effect][key] = scipy.signal.savgol_filter(feature_dict_[effect][key],
                                                                            f, 1, mode='mirror')
                elif effect == 'panning':
                    feature_dict_[effect][key] = scipy.signal.savgol_filter(feature_dict_[effect][key],
                                                                            501, 1, mode='mirror')
        return feature_dict_

    # compute "normalization" based on a single sample
    def feature_matching(self, src_aud_path, ref_aud_path):
        """
        Normalize one audio file towards the features of one reference file.

        Args:
            src_aud_path: path of the audio file to normalize.
            ref_aud_path: path of the reference audio file.

        Returns:
            the normalized audio. The source is normalized as the 'mixture'
            stem, so 'mixture' must be part of `STEMS`.
        """
        # compute mean features from reference audio
        mean_feature = self.compute_mean(ref_aud_path, save_feat=False, single_file=True)
        print(mean_feature)

        src_aud, sr = sf.read(src_aud_path)
        normalized_audio = self.normalize_audio(src_aud, 'mixture')

        return normalized_audio
324
+
325
+
326
+
327
def lufs_normalize(x, sr, lufs, log=True):
    """
    Normalize the integrated loudness of `x` to `lufs`.

    After the loudness gain is applied, the signal is scaled down if its peak
    would exceed 1.0.

    Args:
        x: audio samples.
        sr: sample rate of `x`.
        lufs: target loudness.
        log: if True, print loudness and peak before and after normalization.

    Returns:
        the loudness-normalized audio.
    """
    # measure the loudness first
    meter = pyloudnorm.Meter(sr)  # create BS.1770 meter
    # the small offset keeps the measurement finite for all-zero input
    loudness = meter.integrated_loudness(x+1e-10)
    if log:
        print("original loudness: ", loudness," max value: ", np.max(np.abs(x)))

    loudness_normalized_audio = pyloudnorm.normalize.loudness(x, loudness, lufs)

    # only ever attenuate: the divisor is at least 1.0
    maxabs_amp = np.maximum(1.0, 1e-6 + np.max(np.abs(loudness_normalized_audio)))
    loudness_normalized_audio /= maxabs_amp

    if log:
        loudness = meter.integrated_loudness(loudness_normalized_audio)
        print("new loudness: ", loudness," max value: ", np.max(np.abs(loudness_normalized_audio)))

    # callers assign the result, so the normalized signal must be returned
    return loudness_normalized_audio
modules/fx_utils.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
3
+
4
+ import numpy as np
5
+ import scipy
6
+ import math
7
+ import librosa
8
+ import librosa.display
9
+ import fnmatch
10
+ import os
11
+ from functools import partial
12
+ import pyloudnorm
13
+ from scipy.signal import lfilter
14
+ from sklearn.metrics import mean_absolute_error, mean_squared_error
15
+ from sklearn.metrics.pairwise import paired_distances
16
+
17
+
18
+ import matplotlib.pyplot as plt
19
+
20
def db(x):
    """Return the RMS level of signal ``x`` in decibels (20*log10 of the RMS)."""
    mean_power = np.mean(np.square(x))
    rms = np.sqrt(mean_power)
    return 20*np.log10(rms)
23
+
24
def melspectrogram(y, mirror_pad=False):
    """Compute a log-compressed mel spectrogram of a mono signal.

    Keyword arguments:
    y -- input audio as a 1-D numpy array
    mirror_pad -- when True, prepend and append half an analysis window of
                  time-reversed signal before the STFT

    Returns freq x time (128 mel bands, float32), computed as
    log10(1 + 10 * mel magnitude).

    Assumes the input sampling rate is 22050Hz
    """

    # Extract mel.
    fftsize = 1024
    window = 1024
    hop = 512
    melBin = 128
    sr = 22050

    # mirror pad signal
    # first embedding centered on time 0
    # last embedding centered on end of signal
    if mirror_pad:
        # The original referenced an undefined `half_frame_length_sec`
        # (NameError). Half an analysis window is used as the pad length.
        half_frame = window // 2
        y = np.concatenate((y[:half_frame][::-1], y, y[-half_frame:][::-1]))

    S = librosa.stft(y, n_fft=fftsize, hop_length=hop, win_length=window)
    X = np.abs(S)
    # `sr` must be passed by keyword: positional use is rejected by librosa >= 0.10
    mel_basis = librosa.filters.mel(sr=sr, n_fft=fftsize, n_mels=melBin)
    mel_S = np.dot(mel_basis, X)

    # value log compression
    mel_S = np.log10(1+10*mel_S)
    mel_S = mel_S.astype(np.float32)

    return mel_S
63
+
64
+
65
def getFilesPath(directory, extension):
    """Recursively list files under ``directory`` whose name matches ``extension``.

    ``extension`` is an fnmatch-style pattern applied to the bare file name
    (e.g. ``'*.wav'``). Returns the matching paths as a sorted list.
    """
    matches = [os.path.join(root, fname)
               for root, _subdirs, fnames in os.walk(directory)
               for fname in fnames
               if fnmatch.fnmatch(fname, extension)]
    matches.sort()
    return matches
75
+
76
+
77
+
78
def getRandomTrim(x, length, pad=0, start=None):
    """Return a segment of ``length + pad`` samples taken from ``x``.

    * If ``x`` is not longer than the requested size it is tiled (doubled
      repeatedly) until it is, and the head of the tiled signal is returned.
    * Otherwise a segment starting at ``start`` (random when None) is cut; a
      segment that would run past the end wraps around to the beginning.

    Args:
        x: array whose first axis is time.
        length: number of samples wanted (before ``pad``).
        pad: extra samples added to ``length``.
        start: optional start index; drawn uniformly when None.

    Raises:
        ValueError: if ``x`` has no samples (it cannot be tiled).
    """
    length = length+pad
    n_samples = x.shape[0]
    if n_samples == 0:
        raise ValueError("getRandomTrim: cannot trim an empty signal")

    if n_samples <= length:
        x_ = x
        # The original tested `x` (which never grows) here and looped forever.
        while x_.shape[0] <= length:
            x_ = np.concatenate((x_, x_))
    else:
        if start is None:
            start = np.random.randint(0, n_samples-length, size=None)
        end = length+start
        if end > n_samples:
            # wrap around: tail of x followed by its head
            x_ = np.concatenate((x[start:], x[:length-n_samples]))
        else:
            x_ = x[start:end]

    return x_[:length]
96
+
97
def fadeIn(x, length=128):
    """Apply a raised-cosine fade-in over the first ``length`` samples of ``x``.

    The envelope is the rising half of a symmetric Hann window of size
    ``2 * length`` followed by ones. Time is axis 0; for multi-channel input
    the same envelope is applied to every channel.

    Uses ``scipy.signal.windows.hann`` because the ``scipy.signal.hann``
    alias was removed in SciPy 1.13.
    """
    ramp = scipy.signal.windows.hann(length*2, sym=True)[:length]
    envelope = np.append(ramp, np.ones(int(x.shape[0]-length)))
    if x.ndim > 1:
        # broadcast along the trailing (channel) axes
        envelope = envelope.reshape((-1,) + (1,) * (x.ndim - 1))

    return x*envelope
105
+
106
def fadeOut(x, length=128):
    """Apply a raised-cosine fade-out over the last ``length`` samples of ``x``.

    The envelope is ones followed by the falling half of a symmetric Hann
    window of size ``2 * length``. Time is axis 0; for multi-channel input
    the same envelope is applied to every channel.

    Uses ``scipy.signal.windows.hann`` because the ``scipy.signal.hann``
    alias was removed in SciPy 1.13.
    """
    ramp = scipy.signal.windows.hann(length*2, sym=True)[length:length*2]
    envelope = np.append(np.ones(int(x.shape[0]-length)), ramp)
    if x.ndim > 1:
        # broadcast along the trailing (channel) axes
        envelope = envelope.reshape((-1,) + (1,) * (x.ndim - 1))

    return x*envelope
114
+
115
+
116
def plotTimeFreq(audio, sr, n_fft=512, hop_length=128, ylabels=None):
    """Plot waveform and log-frequency spectrogram for each signal in ``audio``.

    Draws into the current matplotlib figure as an ``n x 2`` grid: waveform on
    the left, dB spectrogram on the right. All waveforms share one amplitude
    range and all spectrograms share one colour range. Stereo signals
    (samples x 2) are mixed down to mono first.

    Args:
        audio: sequence of numpy arrays.
        sr: sampling rate in Hz.
        n_fft, hop_length: STFT parameters.
        ylabels: optional sequence of y-axis labels, one per signal.
    """
    n = len(audio)
    if n == 0:
        # nothing to draw (the original raised on the empty max/min)
        return
    # plt.figure(figsize=(14, 4*n))
    colors = list(plt.cm.viridis(np.linspace(0,1,n)))

    # librosa 0.10 removed `waveplot` in favour of `waveshow`; support both.
    draw_wave = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot

    mono = []
    X_db = []
    for x in audio:
        if x.ndim == 2 and x.shape[-1] == 2:
            x = librosa.to_mono(x.T)
        mono.append(x)
        X_db.append(librosa.amplitude_to_db(abs(librosa.stft(x, n_fft=n_fft, hop_length=hop_length))))

    vmax = max(np.max(spec) for spec in X_db)
    vmin = min(np.min(spec) for spec in X_db)
    tmax = max(np.max(np.abs(x)) for x in mono)

    for i, x in enumerate(mono):
        plt.subplot(n, 2, 2*i+1)
        draw_wave(x, sr=sr, color=colors[i])
        if ylabels:
            plt.ylabel(ylabels[i])

        plt.ylim(-tmax,tmax)
        plt.subplot(n, 2, 2*i+2)
        librosa.display.specshow(X_db[i], sr=sr, x_axis='time', y_axis='log',
                                 hop_length=hop_length, cmap='GnBu', vmax=vmax, vmin=vmin)
        # plt.colorbar(format='%+2.0f dB')
156
+
157
+
158
+
159
+
160
+
161
+
162
+
163
+
164
def slicing(x, win_length, hop_length, center = True, windowing = False, pad = 0):
    """Cut a 1-D signal into overlapping frames.

    Args:
        x: 1-D signal.
        win_length: frame length in samples.
        hop_length: hop between consecutive frames.
        center: zero-pad the signal so that frames are centred.
        windowing: multiply every frame by a periodic Hann window.
        pad: extra padding included in the centring amounts.

    Returns:
        float32 array of shape (n_frames, win_length).
    """
    # Pad the time series so that frames are centered
    if center:
        pad_left = int((win_length-hop_length+pad)//2)
        pad_right = int((win_length+hop_length+pad)//2)
        x = np.pad(x, ((pad_left, pad_right),), mode='constant')

    # Window the time series.
    y_frames = librosa.util.frame(x, frame_length=win_length, hop_length=hop_length)
    if windowing:
        # scipy.signal.hann was removed in SciPy 1.13; use the windows namespace
        window = scipy.signal.windows.hann(win_length, sym=False)
    else:
        window = 1.0

    # frames are columns of y_frames; apply the window to all of them at once
    return np.float32(y_frames.T * window)
180
+
181
+
182
def overlap(x, x_len, win_length, hop_length, windowing = True, rate = 1):
    """Overlap-add frames back into a 1-D signal.

    Args:
        x: frames, shape (n_frames, win_length).
        x_len: length in samples of the signal the frames were cut from;
            ``x_len / hop_length`` frames are added.
        win_length: frame length.
        hop_length: hop between frames.
        windowing: weight each frame with a periodic Hann window and scale the
            result by ``rate * hop_length / win_length``; when False frames
            are summed unweighted and the result is not scaled.
        rate: extra gain, only used when ``windowing`` is True.

    Returns:
        float32 signal with ``win_length // 2`` samples trimmed at each end.
    """
    x = x.reshape(x.shape[0],x.shape[1]).T
    if windowing:
        # scipy.signal.hann was removed in SciPy 1.13; use the windows namespace
        window = scipy.signal.windows.hann(win_length, sym=False)
        rate = rate*hop_length/win_length
    else:
        window = 1
        rate = 1
    n_frames = x_len / hop_length
    expected_signal_len = int(win_length + hop_length * (n_frames))
    y = np.zeros(expected_signal_len)
    for i in range(int(n_frames)):
        sample = i * hop_length
        y[sample:(sample + win_length)] += x[:, i]*window
    y = y[int(win_length // 2):-int(win_length // 2)]
    return np.float32(y*rate)
199
+
200
+
201
+
202
+
203
+
204
+
205
+
206
def highpassFiltering(x_list, f0, sr):
    """Zero-phase high-pass every signal in ``x_list``.

    A 4th-order Butterworth high-pass with cutoff ``f0`` Hz (sampling rate
    ``sr``) is applied forwards and backwards (``filtfilt``). Returns a list
    of filtered copies in the same order.
    """
    nyquist = sr/2
    b1, a1 = scipy.signal.butter(4, f0/nyquist, 'highpass')
    return [scipy.signal.filtfilt(b1, a1, sig).copy(order='F') for sig in x_list]
214
+
215
def lineartodB(x):
    """Convert a linear amplitude to decibels (20*log10)."""
    log_amplitude = np.log10(x)
    return 20*log_amplitude
217
def dBtoLinear(x):
    """Convert decibels to a linear amplitude (10 ** (x / 20))."""
    exponent = x/20
    return np.power(10, exponent)
219
+
220
def lufs_normalize(x, sr, lufs, log=True):
    """Normalize ``x`` to an integrated loudness of ``lufs`` and return it.

    The loudness is measured with a pyloudnorm BS.1770 meter, the signal is
    gain-adjusted to the target and, if its peak would exceed 1.0, scaled
    back down. With ``log=True`` the loudness and peak are printed before and
    after.
    """
    bs1770_meter = pyloudnorm.Meter(sr)

    # tiny offset keeps an all-zero input from producing -inf loudness
    measured = bs1770_meter.integrated_loudness(x+1e-10)
    if log:
        print("original loudness: ", measured," max value: ", np.max(np.abs(x)))

    normalized = pyloudnorm.normalize.loudness(x, measured, lufs)

    # only attenuate: the divisor is never smaller than 1.0
    peak = np.max(np.abs(normalized))
    normalized /= np.maximum(1.0, 1e-6 + peak)

    # re-measured unconditionally, exactly as before
    measured = bs1770_meter.integrated_loudness(normalized)
    if log:
        print("new loudness: ", measured," max value: ", np.max(np.abs(normalized)))

    return normalized
239
+
240
+ import soxbindings as sox
241
+
242
def lufs_normalize_compand(x, sr, lufs):
    """Compand ``x`` with sox, then normalize it to ``lufs`` integrated loudness.

    A sox ``compand`` stage (1 ms attack, 10 ms decay, 1 dB soft knee, transfer
    points (-70,-70), (-0.1,-20), (0,0)) is applied first. The float32 result
    is then loudness-normalized with pyloudnorm and scaled down if its peak
    would exceed 1.0. Loudness and peak are printed before and after.
    """
    compander = sox.Transformer()
    compander.compand(attack_time = 0.001,
                      decay_time = 0.01,
                      soft_knee_db = 1.0,
                      tf_points = [(-70, -70), (-0.1, -20), (0, 0)])
    x = compander.build_array(input_array=x, sample_rate_in=sr).astype(np.float32)

    bs1770_meter = pyloudnorm.Meter(sr)
    measured = bs1770_meter.integrated_loudness(x)
    print("original loudness: ", measured," max value: ", np.max(np.abs(x)))

    normalized = pyloudnorm.normalize.loudness(x, measured, lufs)

    # only attenuate: the divisor is never smaller than 1.0
    peak = np.max(np.abs(normalized))
    normalized /= np.maximum(1.0, 1e-6 + peak)

    measured = bs1770_meter.integrated_loudness(normalized)
    print("new loudness: ", measured," max value: ", np.max(np.abs(normalized)))

    return normalized
266
+
267
+
268
+
269
+
270
+
271
def getDistances(x,y):
    """Return several distance measures between ``x`` and ``y``.

    The result maps 'mae', 'mse', 'euclidean', 'manhattan' and 'cosine' to the
    corresponding value rounded to 5 decimals. The last three are the mean of
    the row-wise paired distances.
    """
    raw = {
        'mae': mean_absolute_error(x, y),
        'mse': mean_squared_error(x, y),
        'euclidean': np.mean(paired_distances(x, y, metric='euclidean')),
        'manhattan': np.mean(paired_distances(x, y, metric='manhattan')),
        'cosine': np.mean(paired_distances(x, y, metric='cosine')),
    }
    return {name: round(value, 5) for name, value in raw.items()}
287
+
288
def getMFCC(x, sr, mels=128, mfcc=13, mean_norm=False):
    """Compute MFCCs of ``x`` from a dB-scaled power mel spectrogram.

    Args:
        x: audio signal.
        sr: sampling rate in Hz.
        mels: number of mel bands.
        mfcc: number of coefficients to keep.
        mean_norm: subtract the mean taken along axis 0 when True.
    """
    power_mel = librosa.feature.melspectrogram(y=x, sr=sr, S=None,
                                               n_fft=1024, hop_length=256,
                                               n_mels=mels, power=2.0)
    log_mel = librosa.power_to_db(power_mel, ref=np.max)
    coeffs = librosa.feature.mfcc(S=log_mel, sr=sr, n_mfcc=mfcc)
    if mean_norm:
        coeffs -= (np.mean(coeffs, axis=0))
    return coeffs
298
+
299
+
300
def getMSE_MFCC(y_true, y_pred, sr, mels=128, mfcc=13, mean_norm=False):
    """Compare two signals in the MFCC domain.

    ``y_pred`` is first rescaled so that its mean absolute amplitude matches
    that of ``y_true``. Both signals are then converted with ``getMFCC`` and
    the ``getDistances`` dictionary of the two MFCC matrices is returned.
    """
    level_ratio = np.mean(np.abs(y_true))/np.mean(np.abs(y_pred))
    y_pred = level_ratio*y_pred

    mfcc_true = getMFCC(y_true, sr, mels=mels, mfcc=mfcc, mean_norm=mean_norm)
    mfcc_pred = getMFCC(y_pred, sr, mels=mels, mfcc=mfcc, mean_norm=mean_norm)

    return getDistances(mfcc_true[:,:], mfcc_pred[:,:])
modules/normalization_imager.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Implementation of the normalization process of stereo-imaging and panning effects
3
+ """
4
+ import numpy as np
5
+ import sys
6
+ import os
7
+
8
+ currentdir = os.path.dirname(os.path.realpath(__file__))
9
+ sys.path.append(currentdir)
10
+ from common_audioeffects import AugmentationChain, Haas
11
+
12
+
13
+ '''
14
+ ### normalization algorithm for stereo imaging and panning effects ###
15
+ process :
16
+ 1. inputs 2-channeled audio
17
+ 2. apply Haas effects if the input audio is almost mono
18
+ 3. normalize mid-side channels according to target precomputed feature value
19
+ 4. normalize left-right channels 50-50
20
+ 5. normalize mid-side channels again
21
+ '''
22
def normalize_imager(data, \
                     target_side_mid_bal=0.9, \
                     mono_threshold=0.95, \
                     sr=44100, \
                     eps=1e-04, \
                     verbose=False):
    """Normalize stereo imaging and panning of a 2-channel signal.

    Steps: (1) convert to mid/side; (2) if the mid channel holds more than
    ``mono_threshold`` of the energy, apply a Haas effect and recompute
    mid/side; (3) rebalance mid/side via ``process_balance`` with
    ``target_side_mid_bal``; (4) rebalance left/right to 50-50; (5) rebalance
    mid/side once more.

    Args:
        data: array indexed as ``data[:, 0]`` (left) and ``data[:, 1]`` (right).
        target_side_mid_bal: ``tgt_e1_bal`` passed to ``process_balance`` for
            the mid/side pair.
        mono_threshold: mid-energy share above which the Haas effect is applied.
        sr: sampling rate handed to the Haas effect.
        eps: stabilizer forwarded to ``process_balance``.
        verbose: print energy balances after every stage.

    Returns:
        The processed signal, left and right stacked on axis 1.
    """
    def _report(first_pair, second_pair):
        # same three prints the original issued after every stage
        if verbose:
            print_balance(*first_pair)
            print_balance(*second_pair)
            print()

    # to mid-side channels
    mid, side = lr_to_ms(data[:,0], data[:,1])
    _report((data[:,0], data[:,1]), (mid, side))

    # apply mid-side weights according to energy
    mid_e, side_e = np.sum(mid**2), np.sum(side**2)
    total_e = mid_e + side_e
    # apply haas effect to almost-mono signal
    if mid_e/total_e > mono_threshold:
        aug_chain = AugmentationChain(fxs=[(Haas(sample_rate=sr), 1, True)])
        data = aug_chain([data])[0]
        mid, side = lr_to_ms(data[:,0], data[:,1])
    _report((data[:,0], data[:,1]), (mid, side))

    # normalize mid-side channels (stereo imaging)
    new_mid, new_side = process_balance(mid, side, tgt_e1_bal=target_side_mid_bal, eps=eps)
    left, right = ms_to_lr(new_mid, new_side)
    _report((new_mid, new_side), (left, right))

    # normalize panning to have the balance of left-right channels 50-50
    left, right = process_balance(left, right, tgt_e1_bal=0.5, eps=eps)
    mid, side = lr_to_ms(left, right)
    _report((mid, side), (left, right))

    # normalize again mid-side channels (stereo imaging)
    new_mid, new_side = process_balance(mid, side, tgt_e1_bal=target_side_mid_bal, eps=eps)
    left, right = ms_to_lr(new_mid, new_side)
    _report((new_mid, new_side), (left, right))

    return np.stack([left, right], 1)
81
+
82
+
83
+ # balance out 2 input data's energy according to given balance
84
+ # tgt_e1_bal range = [0.0, 1.0]
85
+ # tgt_e2_bal = 1.0 - tgt_e1_bal
86
def process_balance(data_1, data_2, tgt_e1_bal=0.5, eps=1e-04):
    """Rescale two signals so the first holds ``tgt_e1_bal`` of their energy.

    The first signal is scaled to carry ``tgt_e1_bal * total_energy``; the
    second is scaled to carry what is left of the original total.

    NOTE(review): ``eps`` only stabilizes the first division; the second uses
    a fixed 1e-3. Kept as-is so numerical results do not change.

    Returns:
        (new_data_1, new_data_2)
    """
    e_1 = np.sum(data_1**2)
    e_2 = np.sum(data_2**2)
    total_e = e_1 + e_2

    tgt_1_gain = np.sqrt(tgt_e1_bal * total_e / (e_1 + eps))
    # energy of the first signal after scaling, and the energy still to assign
    new_e_1 = e_1 * (tgt_1_gain ** 2)
    remaining_e = total_e - new_e_1
    tgt_2_gain = np.sqrt(remaining_e / (e_2 + 1e-3))

    return data_1 * tgt_1_gain, data_2 * tgt_2_gain
100
+
101
+
102
+ # left-right channeled signal to mid-side signal
103
def lr_to_ms(left, right):
    """Convert left/right channels to (mid, side) = (left + right, left - right)."""
    return left + right, left - right
107
+
108
+
109
+ # mid-side channeled signal to left-right signal
110
def ms_to_lr(mid, side):
    """Convert mid/side back to (left, right) = ((mid + side)/2, (mid - side)/2)."""
    return (mid + side) / 2, (mid - side) / 2
114
+
115
+
116
+ # print energy balance of 2 inputs
117
def print_balance(data_1, data_2, verbose=True):
    """Return (and optionally print) the energy share of each of two signals.

    When ``verbose`` is True, prints the total energy followed by both shares.
    Returns ``(share_1, share_2)``, each being that signal's energy divided by
    the summed energy.
    """
    e_1 = np.sum(data_1**2)
    e_2 = np.sum(data_2**2)
    total_e = e_1 + e_2
    if verbose:
        print(total_e, e_1/total_e, e_2/total_e)
    share_1 = e_1/total_e
    share_2 = e_2/total_e
    return share_1, share_2
123
+
modules/utils_data_normalization.py ADDED
@@ -0,0 +1,992 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import sys
4
+ import time
5
+ import numpy as np
6
+ import scipy
7
+ import librosa
8
+ import pyloudnorm as pyln
9
+
10
+ sys.setrecursionlimit(int(1e6))
11
+
12
+ import sklearn
13
+
14
+ currentdir = os.path.dirname(os.path.realpath(__file__))
15
+ sys.path.append(currentdir)
16
+ from common_miscellaneous import compute_stft, compute_istft
17
+ from common_audioeffects import Panner, Compressor, AugmentationChain, ConvolutionalReverb, Equaliser, AlgorithmicReverb
18
+ import fx_utils
19
+
20
+ import soundfile as sf
21
+ import aubio
22
+
23
+ import time
24
+
25
+ import warnings
26
+
27
+ import torch
28
+ import torchaudio.functional as F
29
+
30
+ # Functions
31
+
32
def print_dict(dict_):
    """Print every top-level key, followed by the tab-indented items found under it."""
    for outer_key in dict_:
        print(outer_key)
        nested = dict_[outer_key]
        for inner_key in nested:
            print('\t', inner_key)
37
+
38
def amp_to_db(x):
    """Convert linear amplitude to dB; a 1e-30 offset keeps log10(0) finite."""
    floor = 1e-30
    return 20*np.log10(x + floor)
40
+
41
def db_to_amp(x):
    """Convert dB to linear amplitude (10 ** (x / 20))."""
    exponent = x/20
    return 10**exponent
43
+
44
def get_running_stats(x, features, N=20):
    """Running mean and std (window ``N``) for the first ``len(features)`` columns of ``x``.

    Returns ``(mean, std)`` as arrays with one row per feature, each row being
    the output of ``running_mean_std`` for that column.
    """
    per_feature = [running_mean_std(x[:, col], N) for col in range(len(features))]
    mean = np.asarray([stats[0] for stats in per_feature])
    std = np.asarray([stats[1] for stats in per_feature])

    return mean, std
55
+
56
def running_mean_std(x, N):
    """Sliding-window mean and standard deviation of a 1-D sequence.

    Both are computed from cumulative sums over windows of ``N`` consecutive
    samples, giving ``len(x) - N + 1`` values each. RuntimeWarnings raised by
    the arithmetic (e.g. sqrt of a slightly negative variance) are silenced.
    """
    window = float(N)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        # leading zero so that csum[i + N] - csum[i] is the sum of x[i:i + N]
        csum = np.cumsum(np.insert(x, 0, 0))
        csum_sq = np.cumsum(np.insert(x**2, 0, 0))

        mean = (csum[N:] - csum[:-N]) / window
        mean_sq = (csum_sq[N:] - csum_sq[:-N]) / N
        # var = E[x^2] - E[x]^2
        std = np.sqrt(mean_sq - (mean * mean))

    return mean, std
67
+
68
def get_eq_matching(audio_t, ref_spec, sr=44100, n_fft=65536, hop_length=16384,
                    min_db=-50, ntaps=101, lufs=-30):
    """Match the average spectrum of ``audio_t`` to ``ref_spec`` with an FIR EQ.

    If the peak level is not above ``min_db`` a copy of the input is returned
    unchanged. Otherwise the signal is loudness-normalized to ``lufs``, its
    time-averaged STFT magnitude is compared to ``ref_spec``, and a linear-phase
    FIR filter with ``ntaps`` taps realizing the square root of the magnitude
    ratio is applied with ``filtfilt`` (forward + backward, hence the root).

    Args:
        audio_t: mono signal (1-D array).
        ref_spec: reference magnitude spectrum, one value per frequency bin.
        sr: sampling rate in Hz.
        n_fft, hop_length: STFT parameters.
        min_db: peak level (dB) below which the signal is treated as silent.
        ntaps: FIR length.
        lufs: loudness the signal is normalized to before analysis.

    Returns:
        The equalized signal, or the unmodified copy for near-silent input.
    """
    audio_t = np.copy(audio_t)
    max_db = amp_to_db(np.max(np.abs(audio_t)))
    if not max_db > min_db:
        return audio_t

    audio_t = fx_utils.lufs_normalize(audio_t, sr, lufs, log=False)
    audio_D = compute_stft(np.expand_dims(audio_t, 1),
                           hop_length,
                           n_fft,
                           np.sqrt(np.hanning(n_fft+1)[:-1]))
    audio_D = np.abs(audio_D)
    audio_D_avg = np.mean(audio_D, axis=0)[0]

    # frequency grid with one point per bin of the reference spectrum
    n = ref_spec.shape[0]
    kk = np.arange(n)
    T = n/sr
    frq = kk/T
    frq /= 2

    # dB difference -> linear gain; square root because filtfilt applies it twice
    diff_eq = amp_to_db(ref_spec)-amp_to_db(audio_D_avg)
    diff_eq = db_to_amp(diff_eq)
    diff_eq = np.sqrt(diff_eq)

    # `nyq=None` used to be passed here; that keyword no longer exists in
    # recent SciPy and the default (frequencies normalized to Nyquist == 1)
    # is what the call relied on anyway.
    diff_filter = scipy.signal.firwin2(ntaps,
                                       frq/np.max(frq),
                                       diff_eq,
                                       nfreqs=None, window='hamming',
                                       antisymmetric=False)

    return scipy.signal.filtfilt(diff_filter, 1, audio_t,
                                 axis=-1, padtype='odd', padlen=None,
                                 method='pad', irlen=None)
110
+
111
def get_eq_matching_gpu(audio_t, ref_spec, sr=44100, n_fft=65536, hop_length=16384,
                        min_db=-50, ntaps=101, lufs=-30):
    """Torch/torchaudio variant of ``get_eq_matching`` for 2-channel audio.

    Designs the same ``ntaps``-tap FIR matching filter as the CPU version and
    applies it with ``torchaudio.functional.filtfilt``. Timing and shape
    diagnostics are printed along the way.

    Changes with respect to the previous version:
    * the denominator is now ``[1, 0, ..., 0]``. It used to be all ones, which
      turned the FIR into a different, recursive filter than the one the CPU
      version applies with ``a = 1``;
    * ``nyq=None`` is no longer passed to ``firwin2`` (keyword removed in
      recent SciPy);
    * CUDA is used when available, otherwise the filter runs on the CPU
      instead of raising;
    * the filter is designed once and used for both channels. The two designs
      were computed from identical inputs.

    Args:
        audio_t: audio, samples x channels -- presumably 2 channels, since two
            filters are stacked; confirm against callers.
        ref_spec: reference magnitude spectrum, one value per frequency bin.
        sr, n_fft, hop_length, min_db, ntaps, lufs: as in ``get_eq_matching``.

    Returns:
        numpy array, samples x channels; an unmodified copy of the input if its
        peak level is not above ``min_db``.
    """
    audio_t = np.copy(audio_t)
    max_db = amp_to_db(np.max(np.abs(audio_t)))
    if not max_db > min_db:
        return audio_t

    start_time = time.time()

    audio_t = fx_utils.lufs_normalize(audio_t, sr, lufs, log=False)
    audio_D = compute_stft(audio_t,
                           hop_length,
                           n_fft,
                           np.sqrt(np.hanning(n_fft+1)[:-1]))
    audio_D = np.abs(audio_D)
    # only the first channel's average spectrum drives the filter design
    audio_D_avg = np.mean(audio_D, axis=0)[0]

    # frequency grid with one point per bin of the reference spectrum
    n = ref_spec.shape[0]
    kk = np.arange(n)
    T = n/sr
    frq = kk/T
    frq /= 2

    # dB difference -> linear gain; square root because filtfilt applies it twice
    diff_eq = amp_to_db(ref_spec)-amp_to_db(audio_D_avg)
    diff_eq = db_to_amp(diff_eq)
    diff_eq = np.sqrt(diff_eq)

    fir = scipy.signal.firwin2(ntaps,
                               frq/np.max(frq),
                               diff_eq,
                               nfreqs=None, window='hamming',
                               antisymmetric=False)
    # same filter for left and right
    diff_filter = np.stack((fir, fir), axis=0)

    print(f"\t\tall previous: {time.time()-start_time}")

    start_time = time.time()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    audio_t = torch.from_numpy(audio_t.transpose()).float().to(device)
    diff_filter = torch.from_numpy(diff_filter).float().to(device)
    # FIR filter: a = [1, 0, 0, ...], zero-padded to the size of b
    denom_coef = torch.zeros(diff_filter.size(), device=device)
    denom_coef[:, 0] = 1.0
    print(f'input to gpu - audio shape: {audio_t.shape}')
    audio_t = F.filtfilt(waveform=audio_t, a_coeffs=denom_coef, b_coeffs=diff_filter, clamp=False)
    audio_t = audio_t.transpose(1, 0)
    print(audio_t.shape)
    print('filtered')
    print(f"\t\tgpu filtfilt: {time.time()-start_time}")
    print(torch.mean(audio_t))
    # the intermediate steps below exist only to time each stage separately
    output = audio_t.detach()
    print(f"\t\t1gpu filtfilt: {time.time()-start_time}")
    output = audio_t.cpu()
    print(f"\t\t2gpu filtfilt: {time.time()-start_time}")
    output = audio_t.detach().cpu().numpy()
    print(f"\t\t3gpu filtfilt: {time.time()-start_time}")

    return output
194
+
195
def get_SPS(x, n_fft=2048, hop_length=1024, smooth=False, frames=False):
    """Compute the stereo panning spectrum (SPS) of a 2-channel signal.

    From the STFT magnitudes of channel 0 and channel 1 this derives the
    similarity measure ``phi`` and the signed panning spectrum
    ``SPS = (1 - phi) * sign(phi_l - phi_r)``.

    Args:
        x: audio passed to ``compute_stft``; channels 0 and 1 are read from
            axis 1 of its output.
        n_fft, hop_length: STFT parameters.
        smooth: Savitzky-Golay smooth (window 501, order 1) the averaged curves.
        frames: accepted but not used inside this function.

    Returns:
        (SPS_mean, phi_mean, SPS, phi); the ``*_mean`` values are averages
        over axis 0 of the full maps.
    """
    x = np.copy(x)
    eps = 1e-20

    analysis_window = np.sqrt(np.hanning(n_fft+1)[:-1])
    spec = compute_stft(x, hop_length, n_fft, analysis_window)

    mag_l = np.abs(spec[:, 0, :] + eps)
    mag_r = np.abs(spec[:, 1, :] + eps)

    cross_lr = np.abs(mag_l*np.conj(mag_r))
    cross_rl = np.abs(mag_r*np.conj(mag_l))
    pow_l = np.abs(mag_l)**2
    pow_r = np.abs(mag_r)**2

    phi = 2 * (cross_lr)/(pow_l+pow_r)

    # the sign of (phi_l - phi_r) gives each bin its side
    delta = cross_lr/(pow_l) - cross_rl/(pow_r)
    SPS = (1-phi)*np.sign(delta)

    def _average(tf_map):
        # mean over axis 0, optionally smoothed
        curve = np.mean(tf_map, axis=0)
        if smooth:
            curve = scipy.signal.savgol_filter(curve, 501, 1, mode='mirror')
        return curve

    phi_mean = _average(phi)
    SPS_mean = _average(SPS)

    return SPS_mean, phi_mean, SPS, phi
226
+
227
+
228
def get_mean_side(sps, freqs=(50, 2500), sr=44100, n_fft=2048):
    """Return the dominant sign of a panning spectrum within a frequency band.

    The sign of every bin between ``freqs[0]`` and ``freqs[1]`` Hz is averaged
    and the sign of that average is returned: 1.0, -1.0, or 0.0 when both
    sides are perfectly balanced.

    The previous ``mean / abs(mean)`` form returned NaN (with a RuntimeWarning)
    in the balanced case; callers in this module treat NaN and 0 identically
    (neither ``> 0`` nor ``< 0``).

    Args:
        sps: 1-D panning spectrum, one value per frequency bin.
        freqs: (low, high) band edges in Hz.
        sr: sampling rate in Hz.
        n_fft: FFT size used to map Hz to bin indices.
    """
    # the small offset makes exact zeros count as positive
    sign = np.sign(sps + 1e-10)

    f1 = int(np.floor(freqs[0]*n_fft/sr))
    f2 = int(np.floor(freqs[1]*n_fft/sr))

    return np.sign(np.mean(sign[f1:f2]))
242
+
243
def get_panning_param_values(phi, side):
    """Turn a similarity curve ``phi`` into a pan position and channel gains.

    ``g = clip(phi, 0, 1) / 2``. The pan position is ``1 - g`` when
    ``side > 0``, ``g`` when ``side < 0`` and 0.5 otherwise.

    Returns:
        (p, [g_l, g_r]) with ``g_l = 1 - p`` and ``g_r = p``.
    """
    half_phi = (np.clip(phi+1e-30, 0, 1))/2

    p = np.zeros_like(phi)
    if side > 0:
        p[...] = 1 - half_phi
    elif side < 0:
        p[...] = half_phi
    else:
        # centred (also reached when side is NaN)
        p[...] = 0.5

    return p, [1-p, p]
264
+
265
def get_panning_matching(audio, ref_phi,
                         sr=44100, n_fft=2048, hop_length=1024,
                         min_db_f=-10, max_freq_pan=16000, frames=True):
    """Match the panning of a stereo signal to a reference similarity curve.

    The signal is zero-padded by ``n_fft`` on both ends, pre-panned with a
    linear-law ``Panner`` (0.7 if the measured side is positive, else 0.3),
    and its left/right STFT magnitudes are multiplied by the ratio between the
    reference gains (from ``ref_phi``) and the measured gains. Phase is kept.

    Fix with respect to the previous version: the "no processing above
    ``max_freq_pan``" rule is now applied along the frequency axis. With
    ``frames=True`` the gain arrays are (frames, bins) and the old
    ``g[f1:] = 1`` sliced frames instead of bins.
    NOTE(review): this changes the output for ``frames=True``; confirm no
    stored statistics or trained model rely on the old behaviour.

    Args:
        audio: stereo signal, samples x 2.
        ref_phi: reference similarity curve, one value per frequency bin.
        sr: sampling rate in Hz.
        n_fft, hop_length: STFT parameters.
        min_db_f: frames whose mono magnitude peak is below this level (dB)
            are left untouched (``frames=True`` only).
        max_freq_pan: frequency in Hz above which gains are forced to 1.
        frames: compute gains per STFT frame (True) or from the time-averaged
            similarity (False).

    Returns:
        The processed signal trimmed back to the length of ``audio``.
    """
    eps = 1e-20
    window = np.sqrt(np.hanning(n_fft+1)[:-1])
    audio = np.copy(audio)
    audio_t = np.pad(audio, ((n_fft, n_fft), (0, 0)), mode='constant')

    sps_mean_, _, _, _ = get_SPS(audio_t, n_fft=n_fft, hop_length=hop_length, smooth=True)

    side = get_mean_side(sps_mean_, sr=sr, n_fft=n_fft)

    alpha = 0.7 if side > 0 else 0.3

    processor = Panner()
    processor.parameters.pan.value = alpha
    processor.parameters.pan_law.value = 'linear'
    processor.update()
    audio_t_ = processor.process(audio_t)

    _, phi_mean, _, phi_frames = get_SPS(audio_t_, n_fft=n_fft,
                                         hop_length=hop_length,
                                         smooth=True, frames=frames)

    # reference gains do not depend on the frame: compute them once
    _, g_ref = get_panning_param_values(ref_phi, side)

    if frames:
        g_in = [get_panning_param_values(phi_, side)[1] for phi_ in phi_frames]
        # (1, 2, bins) / (frames, 2, bins)
        ratio = np.asarray(g_ref)[np.newaxis] / (np.asarray(g_in)+eps)
        g_l = ratio[:,0,:]
        g_r = ratio[:,1,:]
    else:
        _, g_i = get_panning_param_values(phi_mean, side)
        ratio = (np.asarray(g_ref)/np.asarray(g_i))
        g_l = ratio[0]
        g_r = ratio[1]

    audio_new_D = compute_stft(audio_t_,
                               hop_length,
                               n_fft,
                               window)

    audio_new_D_mono = np.abs(audio_new_D[:, 0, :] + audio_new_D[:, 1, :])

    audio_new_D_phase = np.angle(audio_new_D)
    audio_new_D = np.abs(audio_new_D)

    audio_new_D_l = audio_new_D[:, 0, :]
    audio_new_D_r = audio_new_D[:, 1, :]

    if frames:
        # leave quiet frames untouched
        for i, frame in enumerate(audio_new_D_mono):
            max_db = amp_to_db(np.max(np.abs(frame)))
            if max_db < min_db_f:
                g_r[i] = 1.0
                g_l[i] = 1.0

    # no panning correction above max_freq_pan; the last axis is frequency
    f1 = int(np.floor(max_freq_pan*n_fft/sr))
    g_l[..., f1:] = 1.0
    g_r[..., f1:] = 1.0

    audio_new_D_l = audio_new_D_l*g_l
    audio_new_D_r = audio_new_D_r*g_r

    # back to (frames, channels, bins)
    audio_new_D_ = np.stack((audio_new_D_l, audio_new_D_r), axis=1)

    # re-attach the original phase
    audio_new_D_ = audio_new_D_ * (np.cos(audio_new_D_phase) + np.sin(audio_new_D_phase)*1j)

    audio_new_t = compute_istft(audio_new_D_,
                                hop_length,
                                window)

    # drop the padding added at the top
    audio_new_t = audio_new_t[n_fft:n_fft+audio.shape[0]]

    return audio_new_t
367
+
368
+
369
+
370
def get_mean_peak(audio, sr=44100, true_peak=False, n_mels=128, percentile=75):
    """Mean and std (in dB) of the loudest onset peaks, averaged over channels.

    For each channel, onsets are detected with aubio ('hfc'), the largest
    absolute sample between consecutive onsets is taken as that onset's peak,
    and only peaks above the given ``percentile`` are kept (all peaks if none
    exceed it).

    Fixes with respect to the previous version:
    * with ``true_peak=True`` the sample rate, window and hop were multiplied
      by 4 again for every channel; they are now scaled once;
    * ``librosa.resample`` is called with keyword rates (positional rates are
      rejected by librosa >= 0.10);
    * the percentile threshold is computed once per channel.

    Args:
        audio: samples x channels.
        sr: sampling rate in Hz.
        true_peak: oversample by 4 before measuring.
        n_mels: accepted but not used inside this function.
        percentile: percentile of the peak values used as threshold.

    Returns:
        ``[mean_peak_db, mean_std_db]``, or None as soon as one channel has no
        detected onset -- callers must handle that case.
    """
    window_size = 2**10  # FFT size
    hop_size = window_size
    analysis_sr = sr
    if true_peak:
        analysis_sr = 4*sr
        window_size = 4*window_size
        hop_size = 4*hop_size

    peak = []
    std = []
    for ch in range(audio.shape[-1]):
        x = np.ascontiguousarray(audio[:, ch])

        if true_peak:
            x = librosa.resample(x, orig_sr=sr, target_sr=analysis_sr)

        onset_func = aubio.onset('hfc', buf_size=window_size, hop_size=hop_size, samplerate=analysis_sr)

        frames = np.float32(librosa.util.frame(x, frame_length=window_size, hop_length=hop_size))

        onset_times = []
        for frame in frames.T:
            if onset_func(frame):
                onset_times.append(onset_func.get_last())

        # position of the largest sample between consecutive onsets
        samples = []
        if onset_times:
            for begin, end in zip(onset_times[:-1], onset_times[1:]):
                samples.append(begin+np.argmax(np.abs(x[begin:end])))
            samples.append(onset_times[-1]+np.argmax(np.abs(x[onset_times[-1]:])))

        p_value = [amp_to_db(np.abs(x[p])) for p in samples]
        if not p_value:
            return None

        threshold = np.percentile(p_value, percentile)
        p_value_ = [p for p in p_value if p > threshold]
        # fall back to all peaks when none lies strictly above the threshold
        kept = p_value_ if p_value_ else p_value
        peak.append(np.mean(kept))
        std.append(np.std(kept))

    return [np.mean(peak), np.mean(std)]
425
+
426
def compress(processor, audio, sr, th, ratio, attack, release):
    """Run ``audio`` through ``processor`` with the given dynamics settings.

    Sets threshold, ratio, attack time and release time on
    ``processor.parameters``, calls ``processor.update()`` and returns
    ``processor.process(audio)``. If the result reaches or exceeds full scale
    it is hard-clipped to [-1, 1]. ``sr`` is accepted but not used here.
    """
    settings = processor.parameters
    settings.threshold.value = th
    settings.ratio.value = ratio
    settings.attack_time.value = attack
    settings.release_time.value = release
    processor.update()

    processed = processor.process(audio)
    if np.max(np.abs(processed)) < 1.0:
        return processed
    return np.clip(processed, -1.0, 1.0)
442
+
443
def _search_dynamics(processor, x, sr, ratios, ths, attack, release,
                     invert_ratio, accept, peak_kwargs):
    """Try ratio/threshold pairs until ``accept(mean_peak)`` holds; return the last output."""
    y = x
    for rt in ratios:
        for th in ths:
            y = compress(processor, x, sr, th, 1/rt if invert_ratio else rt, attack, release)
            stats = get_mean_peak(y, sr, **peak_kwargs)
            # stats is None when no onset was found: not measurable, keep searching
            if stats is not None and accept(stats[0]):
                return y
    return y


def get_comp_matching(audio,
                      ref_peak, ref_std,
                      ratio, attack, release, sr=44100,
                      min_db=-50, comp_peak_norm=-10.0,
                      min_th=-40, max_ratio=20, n_mels=128,
                      true_peak=False, percentile=75, expander=True):
    """Compress or expand ``audio`` until its mean onset peak matches a reference.

    The signal is peak-normalized to ``comp_peak_norm`` dB and its mean peak is
    measured with ``get_mean_peak``. If it already lies within
    ``ref_peak +/- ref_std`` the normalized signal is returned. If it is too
    high, a compressor is swept over ratios ``ratio..max_ratio`` and thresholds
    ``-10..min_th`` until the peak drops below the upper bound. If it is too
    low and ``expander`` is True, the same processor is swept with inverted
    ratios until the peak rises above the lower bound. The last processed
    signal is returned when no setting satisfies the bound.

    Robustness changes: a None result from ``get_mean_peak`` (no onsets) no
    longer crashes the tuple unpacking, and the ``np.linspace`` sample counts
    are cast to int so float arguments are accepted.

    Args:
        audio: mono (1-D) or samples x channels signal.
        ref_peak, ref_std: reference mean peak (dB) and its tolerance.
        ratio: starting compression ratio.
        attack, release: time constants forwarded to the processor.
        sr: sampling rate in Hz.
        min_db: peak level (dB) below which the input is returned unprocessed.
        comp_peak_norm: peak level (dB) the signal is normalized to first.
        min_th: lowest threshold (dB) tried.
        max_ratio: highest ratio tried.
        n_mels, true_peak, percentile: forwarded to ``get_mean_peak``.
        expander: allow upward expansion when the peak is too low.

    Returns:
        The processed (or untouched) signal, always at least 2-D.
    """
    x = audio.copy()

    if x.ndim < 2:
        x = np.expand_dims(x, 1)

    max_db = amp_to_db(np.max(np.abs(x)))
    if not max_db > min_db:
        return x

    x = pyln.normalize.peak(x, comp_peak_norm)

    peak_kwargs = dict(n_mels=n_mels, true_peak=true_peak, percentile=percentile)
    stats = get_mean_peak(x, sr, **peak_kwargs)
    if stats is None:
        # no onsets detected: nothing to match against
        return x
    peak = stats[0]

    lower = ref_peak - ref_std
    upper = ref_peak + ref_std

    if peak > lower and peak < upper:
        return x

    n_ratios = int(max_ratio-ratio+1)

    # Downward compress
    if peak > lower:
        processor = Compressor(sample_rate=sr)
        ratios = np.linspace(ratio, max_ratio, n_ratios)
        ths = np.linspace(-1-9, min_th, int(2*np.abs(min_th)-1-18))
        return _search_dynamics(processor, x, sr, ratios, ths, attack, release,
                                False, lambda pk: pk < upper, peak_kwargs)

    # Upward expand
    if peak < upper:
        if not expander:
            return x
        processor = Compressor(sample_rate=sr)
        ratios = np.linspace(ratio, max_ratio, n_ratios)
        ths = np.linspace(-1, min_th, int(2*np.abs(min_th)-1))[::-1]
        return _search_dynamics(processor, x, sr, ratios, ths, attack, release,
                                True, lambda pk: pk > lower, peak_kwargs)

    # reached only when the comparisons above all fail (e.g. NaN peak); the
    # original fell through and returned None here
    return x
516
+
517
+
518
+
519
+ # REVERB
520
+
521
+
522
def get_reverb_send(audio, eq_parameters, rv_parameters, impulse_responses=None,
                    eq_prob=1.0, rv_prob=1.0, parallel=True, shuffle=False, sr=44100, bands=['low_shelf', 'high_shelf']):
    """Build an EQ -> reverb send chain and run ``audio`` through it.

    Fix: the application probabilities were crossed (the equaliser received
    ``rv_prob`` and the reverb ``eq_prob``). Each effect now gets its own
    probability; with the defaults (both 1.0) nothing changes.

    Args:
        audio: mono (1-D) or samples x channels signal.
        eq_parameters: parameter set for the ``Equaliser``; its
            ``low_shelf_gain.value`` fixes both ends of the gain range.
        rv_parameters: parameter set for the reverb.
        impulse_responses: when truthy a ``ConvolutionalReverb`` is used,
            otherwise an ``AlgorithmicReverb``.
        eq_prob, rv_prob: probability of applying the EQ / the reverb.
        parallel, shuffle: forwarded to ``AugmentationChain``.
        sr: sampling rate in Hz.
        bands: EQ bands to enable (the default list is not modified here).

    Returns:
        Whatever the ``AugmentationChain`` call returns for the input.
    """
    x = audio.copy()

    if x.ndim < 2:
        x = np.expand_dims(x, 1)

    channels = x.shape[-1]
    eq_gain = eq_parameters.low_shelf_gain.value

    eq = Equaliser(n_channels=channels,
                   sample_rate=sr,
                   gain_range=(eq_gain, eq_gain),
                   bands=bands,
                   hard_clip=False,
                   name='Equaliser', parameters=eq_parameters)
    eq.randomize()

    if impulse_responses:
        reverb = ConvolutionalReverb(impulse_responses=impulse_responses,
                                     sample_rate=sr,
                                     parameters=rv_parameters)
    else:
        reverb = AlgorithmicReverb(sample_rate=sr,
                                   parameters=rv_parameters)

    reverb.randomize()

    fxchain = AugmentationChain([
                                (eq, eq_prob, False),
                                (reverb, rv_prob, False)
                                ],
                                shuffle=shuffle, parallel=parallel)

    return fxchain(x)
564
+
565
+
566
+
567
+ # FUNCTIONS TO COMPUTE FEATURES
568
+
569
def compute_loudness_features(args_):
    """Compare integrated loudness and peak level of an output against a target.

    Args:
        args_: indexable with at least four entries, read as
            ``(audio_out, audio_tar, idx, sr)``. ``idx`` is read but not used.

    Returns:
        dict with keys ``'d_lufs'`` and ``'d_peak'``, each a one-element list
        holding the mean absolute percentage error between the target value
        and the output value.
    """
    out_sig, tar_sig, _idx, rate = (args_[k] for k in range(4))

    def _peak_db(signal):
        # absolute sample peak expressed as 20*log10
        return 20.0 * np.log10(np.max(np.abs(signal)))

    tar_peak_db = _peak_db(tar_sig)
    out_peak_db = _peak_db(out_sig)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        meter = pyln.Meter(rate)  # create BS.1770 meter
        tar_lufs = meter.integrated_loudness(tar_sig)
        out_lufs = meter.integrated_loudness(out_sig)

    mape = sklearn.metrics.mean_absolute_percentage_error
    return {
        'd_lufs': [mape([tar_lufs], [out_lufs])],
        'd_peak': [mape([tar_peak_db], [out_peak_db])],
    }
594
+
595
def compute_spectral_features(args_):
    """Score how closely the spectral descriptors of an output follow a target.

    Args:
        args_: indexable with at least seven entries, read as
            ``(audio_out, audio_tar, idx, sr, fft_size, hop_length, channels)``.
            ``idx`` only appears in the assertion messages.

    Returns:
        dict mapping ``'centroid_mean'``, ``'bandwidth_mean'``,
        ``'contrast_l_mean'``, ``'contrast_m_mean'``, ``'contrast_h_mean'``,
        ``'rolloff_mean'``, ``'flatness_mean'`` and ``'mape_mean'`` to
        one-element lists. Each of the first seven is the channel average of a
        mean absolute percentage error between running means of the target and
        output feature; ``'mape_mean'`` is the average of those seven numbers.

    Raises:
        AssertionError: when a running mean of centroid, bandwidth, contrast
            or rolloff contains NaN.
    """
    (audio_out_, audio_tar_, idx, sr,
     fft_size, hop_length, channels) = (args_[k] for k in range(7))

    audio_out_ = pyln.normalize.peak(audio_out_, -1.0)
    audio_tar_ = pyln.normalize.peak(audio_tar_, -1.0)

    def _magnitude(signal):
        # STFT with a square-root Hann window, axes reordered to [1, -1, 0],
        # then magnitude.
        spec = compute_stft(signal,
                            hop_length,
                            fft_size,
                            np.sqrt(np.hanning(fft_size+1)[:-1]))
        return np.abs(np.transpose(spec, axes=[1, -1, 0]))

    spec_out_ = _magnitude(audio_out_)
    spec_tar_ = _magnitude(audio_tar_)

    mape = sklearn.metrics.mean_absolute_percentage_error
    stft_kw = dict(n_fft=fft_size, hop_length=hop_length)
    eps = 1e-0
    N = 40

    # per-channel MAPE values, keyed by the name used in the returned dict
    scores = {'centroid_mean': [],
              'bandwidth_mean': [],
              'contrast_l_mean': [],
              'contrast_m_mean': [],
              'contrast_h_mean': [],
              'rolloff_mean': [],
              'flatness_mean': []}

    for ch in range(channels):
        tar = spec_tar_[ch]
        out = spec_out_[ch]

        tar_sc = librosa.feature.spectral_centroid(y=None, sr=sr, S=tar, **stft_kw)
        out_sc = librosa.feature.spectral_centroid(y=None, sr=sr, S=out, **stft_kw)

        tar_bw = librosa.feature.spectral_bandwidth(y=None, sr=sr, S=tar,
                                                    centroid=tar_sc, norm=True, p=2,
                                                    **stft_kw)
        out_bw = librosa.feature.spectral_bandwidth(y=None, sr=sr, S=out,
                                                    centroid=out_sc, norm=True, p=2,
                                                    **stft_kw)

        # l = 0-250, m = 1-2-3 = 250 - 2000, h = 2000 - SR/2
        tar_ct = librosa.feature.spectral_contrast(y=None, sr=sr, S=tar,
                                                   fmin=250.0, n_bands=4,
                                                   quantile=0.02, linear=False,
                                                   **stft_kw)
        out_ct = librosa.feature.spectral_contrast(y=None, sr=sr, S=out,
                                                   fmin=250.0, n_bands=4,
                                                   quantile=0.02, linear=False,
                                                   **stft_kw)

        tar_ro = librosa.feature.spectral_rolloff(y=None, sr=sr, S=tar,
                                                  roll_percent=0.85, **stft_kw)
        out_ro = librosa.feature.spectral_rolloff(y=None, sr=sr, S=out,
                                                  roll_percent=0.85, **stft_kw)

        tar_ft = librosa.feature.spectral_flatness(y=None, S=tar,
                                                   amin=1e-10, power=2.0, **stft_kw)
        out_ft = librosa.feature.spectral_flatness(y=None, S=out,
                                                   amin=1e-10, power=2.0, **stft_kw)

        # Running statistics; only the means are used further down.
        mean_sc_tar, _ = get_running_stats(tar_sc.T+eps, [0], N=N)
        mean_sc_out, _ = get_running_stats(out_sc.T+eps, [0], N=N)
        assert not np.isnan(mean_sc_tar).any(), f'NAN values mean_sc_tar {idx}'
        assert not np.isnan(mean_sc_out).any(), f'NAN values mean_sc_out {idx}'

        mean_bw_tar, _ = get_running_stats(tar_bw.T+eps, [0], N=N)
        mean_bw_out, _ = get_running_stats(out_bw.T+eps, [0], N=N)
        assert not np.isnan(mean_bw_tar).any(), f'NAN values tar mean bw {idx}'
        assert not np.isnan(mean_bw_out).any(), f'NAN values out mean bw {idx}'

        mean_ct_tar, _ = get_running_stats(tar_ct.T, list(range(tar_ct.shape[0])), N=N)
        mean_ct_out, _ = get_running_stats(out_ct.T, list(range(out_ct.shape[0])), N=N)
        assert not np.isnan(mean_ct_tar).any(), f'NAN values tar mean ct {idx}'
        assert not np.isnan(mean_ct_out).any(), f'NAN values out mean ct {idx}'

        mean_ro_tar, _ = get_running_stats(tar_ro.T+eps, [0], N=N)
        mean_ro_out, _ = get_running_stats(out_ro.T+eps, [0], N=N)
        assert not np.isnan(mean_ro_tar).any(), f'NAN values tar mean ro {idx}'
        assert not np.isnan(mean_ro_out).any(), f'NAN values out mean ro {idx}'

        # NOTE: flatness is the only feature computed with N=800 instead of
        # the shared N; the original author remarked that the result gets very
        # high depending on this value.
        mean_ft_tar, _ = get_running_stats(tar_ft.T, [0], N=800)
        mean_ft_out, _ = get_running_stats(out_ft.T, [0], N=800)

        scores['centroid_mean'].append(mape(mean_sc_tar[0], mean_sc_out[0]))
        scores['bandwidth_mean'].append(mape(mean_bw_tar[0], mean_bw_out[0]))
        # contrast: row 0 -> low, rows 1..3 averaged -> mid, last row -> high
        scores['contrast_l_mean'].append(mape(mean_ct_tar[0], mean_ct_out[0]))
        scores['contrast_m_mean'].append(mape(np.mean(mean_ct_tar[1:4], axis=0),
                                              np.mean(mean_ct_out[1:4], axis=0)))
        scores['contrast_h_mean'].append(mape(mean_ct_tar[-1], mean_ct_out[-1]))
        scores['rolloff_mean'].append(mape(mean_ro_tar[0], mean_ro_out[0]))
        scores['flatness_mean'].append(mape(mean_ft_tar[0], mean_ft_out[0]))

    # Average every descriptor across channels, then average the descriptors.
    spectral_ = {key: [np.mean(values)] for key, values in scores.items()}
    spectral_['mape_mean'] = [np.mean([entry[0] for entry in spectral_.values()])]

    return spectral_
766
+
767
+ # PANNING
768
def get_panning_rms_frame(sps_frame, freqs=[0,22050], sr=44100, n_fft=2048):
    """Return the RMS of one frame restricted to a frequency range.

    Args:
        sps_frame: 1-D array for a single frame, indexed by FFT bin.
        freqs: two-element ``[low, high]`` range; each bound is converted to a
            bin index with ``floor(f * n_fft / sr)``.
        sr: sample rate used for the conversion.
        n_fft: FFT size used for the conversion.

    Returns:
        Square root of the mean squared value over bins ``[low_bin, high_bin)``.

    Raises:
        ZeroDivisionError: when both bounds map to the same bin.
    """
    low_hz, high_hz = freqs[0], freqs[1]

    first_bin = int(np.floor(low_hz*n_fft/sr))
    last_bin = int(np.floor(high_hz*n_fft/sr))

    inv_width = 1/(last_bin-first_bin)
    band = sps_frame[first_bin:last_bin]

    return np.sqrt(inv_width * np.sum(band**2))
779
def get_panning_rms(sps, freqs=[[0, 22050]], sr=44100, n_fft=2048):
    """Compute the band-limited RMS for every frame and every frequency range.

    Args:
        sps: iterable of frames, each passed to ``get_panning_rms_frame``.
        freqs: list of ``[low, high]`` ranges.
        sr: sample rate forwarded to ``get_panning_rms_frame``.
        n_fft: FFT size forwarded to ``get_panning_rms_frame``.

    Returns:
        Array built from one row per frame and one column per entry of ``freqs``.
    """
    per_frame = [
        [get_panning_rms_frame(frame, freqs=band, sr=sr, n_fft=n_fft)
         for band in freqs]
        for frame in sps
    ]
    return np.asarray(per_frame)
790
+
791
+
792
+
793
def compute_panning_features(args_):
    """Score how closely the panning descriptors of an output follow a target.

    Args:
        args_: indexable with at least six entries, read as
            ``(audio_out, audio_tar, idx, sr, fft_size, hop_length)``.
            ``idx`` is read but not used.

    Returns:
        dict with ``'P_t_mean'``, ``'P_l_mean'``, ``'P_m_mean'``,
        ``'P_h_mean'`` (MAPE between running means for the full range and the
        0-250, 250-2500 and 2500-sr/2 ranges) and ``'mape_mean'`` (their
        average); every value is a one-element list.
    """
    audio_out_, audio_tar_, _idx, sr, fft_size, hop_length = (args_[k] for k in range(6))

    audio_out_ = pyln.normalize.peak(audio_out_, -1.0)
    audio_tar_ = pyln.normalize.peak(audio_tar_, -1.0)

    freqs = [[0, sr//2], [0, 250], [250, 2500], [2500, sr//2]]

    def _band_rms(signal):
        # only the third value returned by get_SPS is used
        frames = get_SPS(signal, n_fft=fft_size,
                         hop_length=hop_length,
                         smooth=True, frames=True)[2]
        return frames

    sps_frames_tar = _band_rms(audio_tar_)
    sps_frames_out = _band_rms(audio_out_)

    p_rms_tar = get_panning_rms(sps_frames_tar, freqs=freqs, sr=sr, n_fft=fft_size)
    p_rms_out = get_panning_rms(sps_frames_out, freqs=freqs, sr=sr, n_fft=fft_size)

    # to avoid num instability, deletes frames with zero rms from target
    # (frames are selected by the first column of the target only, and the
    # same frame indices are removed from the output)
    if np.min(p_rms_tar) == 0.0:
        id_zeros = np.where(p_rms_tar.T[0] == 0)
        band_ids = range(len(freqs))
        kept_tar = [np.delete(p_rms_tar.T[i], id_zeros) for i in band_ids]
        kept_out = [np.delete(p_rms_out.T[i], id_zeros) for i in band_ids]
        p_rms_tar = np.asarray(kept_tar).T
        p_rms_out = np.asarray(kept_out).T

    N = 40

    mean_tar, _ = get_running_stats(p_rms_tar, freqs, N=N)
    mean_out, _ = get_running_stats(p_rms_out, freqs, N=N)

    mape = sklearn.metrics.mean_absolute_percentage_error
    panning_ = {}
    for row, key in enumerate(('P_t_mean', 'P_l_mean', 'P_m_mean', 'P_h_mean')):
        panning_[key] = [mape(mean_tar[row], mean_out[row])]

    panning_['mape_mean'] = [np.mean(list(panning_.values()))]

    return panning_
860
+
861
+ # DYNAMIC
862
+
863
def get_rms_dynamic_crest(x, frame_length, hop_length):
    """Compute frame-wise RMS level, dynamic spread and crest values.

    Args:
        x: 2-D array indexed as ``x[:, ch]``; the last axis is iterated as
            channels.
        frame_length: frame size passed to ``librosa.util.frame``.
        hop_length: hop size passed to ``librosa.util.frame``.

    Returns:
        Tuple ``(rms, dynamic_spread, crest)``. Each is averaged over channels
        and has a leading axis of length one added.
    """
    rms, dynamic_spread, crest = [], [], []

    for ch in range(x.shape[-1]):
        frames = librosa.util.frame(x[:, ch], frame_length=frame_length, hop_length=hop_length)
        ch_rms, ch_spread, ch_crest = [], [], []

        for frame in frames.T:
            # frame level in dB
            level = amp_to_db(np.sqrt(np.sum(frame**2)/frame_length))
            # average per-sample dB distance from the frame level
            spread = np.sum(amp_to_db(np.abs(frame)) - level)/frame_length
            # peak dB divided by the frame level in dB
            peak_ratio = amp_to_db(np.max(np.abs(frame)))/level

            ch_rms.append(level)
            ch_spread.append(spread)
            ch_crest.append(peak_ratio)

        rms.append(ch_rms)
        dynamic_spread.append(ch_spread)
        crest.append(ch_crest)

    def _channel_average(values):
        # mean over the channel axis, then restore a leading axis of size one
        return np.expand_dims(np.mean(np.asarray(values), axis=0), axis=0)

    return (_channel_average(rms),
            _channel_average(dynamic_spread),
            _channel_average(crest))
898
+
899
def lowpassFiltering(x, f0, sr):
    """Low-pass every channel of ``x`` with a 4th-order Butterworth filter.

    Args:
        x: 2-D array indexed as ``x[:, ch]``; the last axis is iterated as
            channels.
        f0: cutoff frequency, normalised internally by ``sr/2``.
        sr: sample rate.

    Returns:
        Array with the same ``(samples, channels)`` layout as ``x`` holding the
        forward-backward (``filtfilt``) filtered channels.
    """
    numerator, denominator = scipy.signal.butter(4, f0/(sr/2), 'lowpass')
    filtered = [
        scipy.signal.filtfilt(numerator, denominator, x[:, ch]).copy(order='F')
        for ch in range(x.shape[-1])
    ]
    return np.asarray(filtered).T
907
+
908
+
909
def get_low_freq_weighting(x, sr, n_fft, hop_length, f0 = 1000):
    """Ratio between the low-passed and the full magnitude spectrogram of ``x``.

    Args:
        x: audio array accepted by ``lowpassFiltering`` and ``compute_stft``.
        sr: sample rate used for the low-pass filter.
        n_fft: FFT size passed to ``compute_stft``.
        hop_length: hop size passed to ``compute_stft``.
        f0: low-pass cutoff frequency.

    Returns:
        The element-wise ratio ``|X_low| / (|X| + 1e-5)`` summed over axis 1,
        averaged over axis 0, with a leading axis of size one added
        (presumably frequency bins and channels respectively -- confirm
        against the layout returned by ``compute_stft``).
    """
    def _magnitude(signal):
        # STFT with a square-root Hann window, axes reordered to [1, -1, 0]
        spec = compute_stft(signal,
                            hop_length,
                            n_fft,
                            np.sqrt(np.hanning(n_fft+1)[:-1]))
        return np.abs(np.transpose(spec, axes=[1, -1, 0]))

    X_low = _magnitude(lowpassFiltering(x, f0, sr))
    X = _magnitude(x)

    eps = 1e-5  # keeps the division finite where the full spectrum is zero
    ratio = X_low/(X+eps)
    collapsed = np.mean(np.sum(ratio, axis=1), axis=0)

    return np.expand_dims(collapsed, axis=0)
933
+
934
def compute_dynamic_features(args_):
    """Score how closely the dynamics descriptors of an output follow a target.

    Args:
        args_: indexable with at least six entries, read as
            ``(audio_out, audio_tar, idx, sr, fft_size, hop_length)``.
            ``idx`` is read but not used.

    Returns:
        dict of one-element lists: ``'rms_mean'``, ``'dyn_mean'`` and
        ``'crest_mean'`` (MAPE between running means), ``'l_ratio_mean_mape'``
        and ``'l_ratio_mean_l2'`` (MAPE and mean squared error of the
        low-frequency ratio), and ``'mape_mean'`` (average of the rms, dyn and
        crest scores only).
    """
    audio_out_, audio_tar_, _idx, sr, fft_size, hop_length = (args_[k] for k in range(6))

    audio_out_ = pyln.normalize.peak(audio_out_, -1.0)
    audio_tar_ = pyln.normalize.peak(audio_tar_, -1.0)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)

        rms_tar, dyn_tar, crest_tar = get_rms_dynamic_crest(audio_tar_, fft_size, hop_length)
        rms_out, dyn_out, crest_out = get_rms_dynamic_crest(audio_out_, fft_size, hop_length)

        low_ratio_tar = get_low_freq_weighting(audio_tar_, sr, fft_size, hop_length, f0=1000)
        low_ratio_out = get_low_freq_weighting(audio_out_, sr, fft_size, hop_length, f0=1000)

    N = 40

    # rms and dynamic spread are negated and shifted by one before the statistics
    rms_tar = (-1*rms_tar) + 1.0
    rms_out = (-1*rms_out) + 1.0
    dyn_tar = (-1*dyn_tar) + 1.0
    dyn_out = (-1*dyn_out) + 1.0

    def _running_mean(feature):
        # only the mean returned by get_running_stats is used
        return get_running_stats(feature.T, [0], N=N)[0]

    mean_rms_tar = _running_mean(rms_tar)
    mean_rms_out = _running_mean(rms_out)
    mean_dyn_tar = _running_mean(dyn_tar)
    mean_dyn_out = _running_mean(dyn_out)
    mean_crest_tar = _running_mean(crest_tar)
    mean_crest_out = _running_mean(crest_out)
    mean_low_ratio_tar = _running_mean(low_ratio_tar)
    mean_low_ratio_out = _running_mean(low_ratio_out)

    mape = sklearn.metrics.mean_absolute_percentage_error
    dynamic_ = {
        'rms_mean': [mape(mean_rms_tar, mean_rms_out)],
        'dyn_mean': [mape(mean_dyn_tar, mean_dyn_out)],
        'crest_mean': [mape(mean_crest_tar, mean_crest_out)],
        'l_ratio_mean_mape': [mape(mean_low_ratio_tar, mean_low_ratio_out)],
        'l_ratio_mean_l2': [sklearn.metrics.mean_squared_error(mean_low_ratio_tar, mean_low_ratio_out)],
    }

    # the summary deliberately covers rms, dyn and crest only
    dynamic_['mape_mean'] = [np.mean([dynamic_['rms_mean'],
                                      dynamic_['dyn_mean'],
                                      dynamic_['crest_mean'],
                                      ])]

    return dynamic_
992
+