Spaces:
Running
Running
modify fx norm
Browse files- app.py +22 -39
- inference.py +0 -2
app.py
CHANGED
@@ -63,31 +63,34 @@ def process_audio_with_youtube(input_audio, input_youtube_url, reference_audio,
|
|
63 |
|
64 |
return process_audio(input_audio, reference_audio)
|
65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
def process_audio(input_audio, reference_audio):
|
67 |
output_audio, predicted_params, sr, normalized_input = mastering_transfer.process_audio(
|
68 |
input_audio, reference_audio
|
69 |
)
|
70 |
|
71 |
param_output = mastering_transfer.get_param_output_string(predicted_params)
|
72 |
-
|
73 |
-
# Convert output_audio to numpy array if it's a tensor
|
74 |
-
if isinstance(output_audio, torch.Tensor):
|
75 |
-
output_audio = output_audio.cpu().numpy()
|
76 |
-
|
77 |
-
if output_audio.ndim == 1:
|
78 |
-
output_audio = output_audio.reshape(-1, 1)
|
79 |
-
elif output_audio.ndim > 2:
|
80 |
-
output_audio = output_audio.squeeze()
|
81 |
-
|
82 |
-
# Ensure the audio is in the correct shape (samples, channels)
|
83 |
-
if output_audio.shape[1] > output_audio.shape[0]:
|
84 |
-
output_audio = output_audio.transpose(1,0)
|
85 |
|
|
|
|
|
|
|
86 |
# Normalize output audio
|
87 |
-
output_audio = loudness_normalize(output_audio, sr)
|
88 |
# Denormalize the audio to int16
|
89 |
output_audio = denormalize_audio(output_audio, dtype=np.int16)
|
90 |
-
normalized_input = denormalize_audio(normalized_input, dtype=np.int16)
|
91 |
|
92 |
return (sr, output_audio), param_output, (sr, normalized_input)
|
93 |
|
@@ -125,18 +128,8 @@ def perform_ito(input_audio, reference_audio, ito_reference_audio, num_steps, op
|
|
125 |
current_output = last_result['audio']
|
126 |
ito_param_output = mastering_transfer.get_param_output_string(last_result['params'])
|
127 |
|
128 |
-
# Convert
|
129 |
-
|
130 |
-
current_output = current_output.cpu().numpy()
|
131 |
-
|
132 |
-
if current_output.ndim == 1:
|
133 |
-
current_output = current_output.reshape(-1, 1)
|
134 |
-
elif current_output.ndim > 2:
|
135 |
-
current_output = current_output.squeeze()
|
136 |
-
# Ensure the audio is in the correct shape (samples, channels)
|
137 |
-
if current_output.shape[1] > current_output.shape[0]:
|
138 |
-
current_output = current_output.transpose(1,0)
|
139 |
-
|
140 |
# Loudness normalize output audio
|
141 |
current_output = loudness_normalize(current_output, args.sample_rate)
|
142 |
# Denormalize the audio to int16
|
@@ -149,18 +142,8 @@ def update_ito_output(all_results, selected_step):
|
|
149 |
current_output = selected_result['audio']
|
150 |
ito_param_output = mastering_transfer.get_param_output_string(selected_result['params'])
|
151 |
|
152 |
-
# Convert
|
153 |
-
|
154 |
-
current_output = current_output.cpu().numpy()
|
155 |
-
|
156 |
-
if current_output.ndim == 1:
|
157 |
-
current_output = current_output.reshape(-1, 1)
|
158 |
-
elif current_output.ndim > 2:
|
159 |
-
current_output = current_output.squeeze()
|
160 |
-
# Ensure the audio is in the correct shape (samples, channels)
|
161 |
-
if current_output.shape[1] > current_output.shape[0]:
|
162 |
-
current_output = current_output.transpose(1,0)
|
163 |
-
|
164 |
# Loudness normalize output audio
|
165 |
current_output = loudness_normalize(current_output, args.sample_rate)
|
166 |
# Denormalize the audio to int16
|
|
|
63 |
|
64 |
return process_audio(input_audio, reference_audio)
|
65 |
|
66 |
+
def to_numpy_audio(audio):
    """Convert audio to a 2-D NumPy array laid out as (samples, channels).

    Args:
        audio: A ``torch.Tensor`` or an object exposing the NumPy array
            interface (``ndim``, ``shape``, ``reshape``, ``squeeze``,
            ``transpose``).

    Returns:
        A 2-D array. 1-D (or scalar) input becomes a single column. Input
        with more than two dimensions has its size-1 axes squeezed away
        first. A 2-D array whose second axis is longer than its first is
        transposed, on the assumption that the longer axis holds samples.

    Raises:
        ValueError: If the input still has more than two dimensions after
            size-1 axes are removed, so no (samples, channels) layout can
            be derived from it.
    """
    if isinstance(audio, torch.Tensor):
        # detach() first: Tensor.numpy() refuses tensors that require grad.
        audio = audio.detach().cpu().numpy()

    # Drop singleton axes (e.g. a leading batch axis of size 1) before
    # deciding the layout; the result may be anything from 0-D upwards.
    if audio.ndim > 2:
        audio = audio.squeeze()

    if audio.ndim < 2:
        # Mono (or fully squeezed) signal: make it a single-channel column
        # so the shape[1] lookup below is always valid.
        audio = audio.reshape(-1, 1)
    elif audio.ndim > 2:
        raise ValueError(
            f"Cannot convert audio with shape {audio.shape} to (samples, channels)"
        )

    # Ensure the audio is in the correct shape (samples, channels).
    if audio.shape[1] > audio.shape[0]:
        audio = audio.transpose(1, 0)
    return audio
|
79 |
+
|
80 |
def process_audio(input_audio, reference_audio):
    """Run mastering style transfer and package the results for display.

    Calls ``mastering_transfer.process_audio`` on the input/reference pair,
    renders the predicted parameters as a string, converts both the
    processed output and the normalized input with ``to_numpy_audio``, then
    loudness-normalizes the output and denormalizes it to int16.

    Returns:
        A 3-tuple ``((sr, output_audio), param_output, (sr, normalized_input))``.
        Only the output is converted to int16; the normalized input is
        returned as produced by ``to_numpy_audio``.
    """
    mastered, params, sr, dry_input = mastering_transfer.process_audio(
        input_audio, reference_audio
    )
    param_output = mastering_transfer.get_param_output_string(params)

    # Convert to numpy audio
    mastered = to_numpy_audio(mastered)
    dry_input = to_numpy_audio(dry_input)

    # Normalize output audio, then denormalize it to int16
    mastered_int16 = denormalize_audio(
        loudness_normalize(mastered, sr), dtype=np.int16
    )

    return (sr, mastered_int16), param_output, (sr, dry_input)
|
96 |
|
|
|
128 |
current_output = last_result['audio']
|
129 |
ito_param_output = mastering_transfer.get_param_output_string(last_result['params'])
|
130 |
|
131 |
+
# Convert to numpy audio
|
132 |
+
current_output = to_numpy_audio(current_output)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
# Loudness normalize output audio
|
134 |
current_output = loudness_normalize(current_output, args.sample_rate)
|
135 |
# Denormalize the audio to int16
|
|
|
142 |
current_output = selected_result['audio']
|
143 |
ito_param_output = mastering_transfer.get_param_output_string(selected_result['params'])
|
144 |
|
145 |
+
# Convert to numpy audio
|
146 |
+
current_output = to_numpy_audio(current_output)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
# Loudness normalize output audio
|
148 |
current_output = loudness_normalize(current_output, args.sample_rate)
|
149 |
# Denormalize the audio to int16
|
inference.py
CHANGED
@@ -153,8 +153,6 @@ class MasteringStyleTransfer:
|
|
153 |
def process_audio(self, input_audio, reference_audio):
|
154 |
input_tensor = self.preprocess_audio(input_audio, self.args.sample_rate, normalize=True)
|
155 |
reference_tensor = self.preprocess_audio(reference_audio, self.args.sample_rate)
|
156 |
-
print(f"input_tensor: {input_tensor.shape}")
|
157 |
-
print(f"reference_tensor: {reference_tensor.shape}")
|
158 |
|
159 |
reference_feature = self.get_reference_embedding(reference_tensor)
|
160 |
|
|
|
153 |
def process_audio(self, input_audio, reference_audio):
|
154 |
input_tensor = self.preprocess_audio(input_audio, self.args.sample_rate, normalize=True)
|
155 |
reference_tensor = self.preprocess_audio(reference_audio, self.args.sample_rate)
|
|
|
|
|
156 |
|
157 |
reference_feature = self.get_reference_embedding(reference_tensor)
|
158 |
|