normalize in remix

Files changed:
- audiodiffusion/__init__.py (+1 -1)
- notebooks/test_model.ipynb (+25 -32)

audiodiffusion/__init__.py
CHANGED
@@ -92,7 +92,7 @@ class AudioDiffusion:
         images = noise = torch.randn(
             (1, self.ddpm.unet.in_channels, self.ddpm.unet.sample_size,
              self.ddpm.unet.sample_size),
-            generator=generator
+            generator=generator
         )
 
         if audio_file is not None or raw_audio is not None:
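For context on the generator keyword above: the notebook's remix loop below creates a single torch.Generator, records a seed, and re-seeds it before every slice, so torch.randn draws identical noise for each continuation. A minimal sketch of that mechanism (the tensor shape is a stand-in for the UNet in_channels/sample_size shape in the hunk above):

import torch

generator = torch.Generator()
seed = generator.seed()  # pick and record an arbitrary seed

# Re-seeding the generator before each draw reproduces the same noise tensor.
generator.manual_seed(seed)
noise_a = torch.randn((1, 1, 256, 256), generator=generator)
generator.manual_seed(seed)
noise_b = torch.randn((1, 1, 256, 256), generator=generator)

assert torch.equal(noise_a, noise_b)  # identical draws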
notebooks/test_model.ipynb
CHANGED
@@ -87,6 +87,16 @@
     "audio_diffusion = AudioDiffusion(model_id=model_id)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6e16ed0e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mel = Mel(x_res=256, y_res=256)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "011fb5a1",
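The cell added here constructs the Mel spectrogram helper once, near the top of the notebook, so the remix and training-set cells further down can share one instance. As a rough sketch of how its pieces fit together, using only the Mel attributes that appear elsewhere in this diff (the import path and the input file name are assumptions):

from audiodiffusion.mel import Mel  # import path is an assumption

# Mel converts audio to y_res x x_res mel-spectrogram images and back.
mel = Mel(x_res=256, y_res=256)
mel.load_audio("example.mp3")  # hypothetical input file
sample_rate = mel.get_sample_rate()

# One 256-pixel-wide spectrogram covers x_res * hop_length audio samples.
slice_size = mel.x_res * mel.hop_length
print(f"{len(mel.audio) / sample_rate:.1f} s of audio, "
      f"{slice_size / sample_rate:.1f} s per slice")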
@@ -171,7 +181,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "
+   "id": "c3b05163",
    "metadata": {},
    "source": [
     "### Generate continuations (\"out-painting\")"
@@ -180,7 +190,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "
+   "id": "4add9643",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -230,7 +240,7 @@
     "    from google.colab import files\n",
     "    audio_file = list(files.upload().keys())[0]\n",
     "except:\n",
-    "    audio_file = \"/home/teticio/Music/
+    "    audio_file = \"/home/teticio/Music/liked/El Michels Affair - Glaciers Of Ice.mp3\""
    ]
   },
   {
@@ -244,43 +254,44 @@
    "source": [
     "start_step = 500 #@param {type:\"slider\", min:0, max:1000, step:10}\n",
     "overlap_secs = 2 #@param {type:\"integer\"}\n",
-    "mel = Mel(x_res=256, y_res=256)\n",
     "mel.load_audio(audio_file)\n",
     "overlap_samples = overlap_secs * mel.get_sample_rate()\n",
-    "slice_size = 
+    "slice_size = mel.x_res * mel.hop_length\n",
     "stride = slice_size - overlap_samples\n",
     "generator = torch.Generator()\n",
     "seed = generator.seed()\n",
     "track = np.array([])\n",
     "for sample in range(len(mel.audio) // stride):\n",
     "    generator.manual_seed(seed)\n",
-    "    audio = mel.audio[sample * stride:sample * stride + slice_size]\n",
+    "    audio = np.array(mel.audio[sample * stride:sample * stride + slice_size])\n",
+    "    display(Audio(audio, rate=sample_rate))\n",
     "    if len(track) > 0:\n",
-    "        
+    "        # Normalize and re-insert generated audio\n",
+    "        audio[:overlap_samples] = audio2[-overlap_samples:] * np.max(\n",
+    "            audio[:overlap_samples]) / np.max(audio2[-overlap_samples:])\n",
     "    _, (sample_rate,\n",
     "        audio2) = audio_diffusion.generate_spectrogram_and_audio_from_audio(\n",
     "            raw_audio=audio,\n",
     "            start_step=start_step,\n",
     "            generator=generator,\n",
-    "            mask_start_secs=
-    "    display(Audio(audio
-    "    display(Audio(audio2, rate=sample_rate))\n",
+    "            mask_start_secs=overlap_secs if len(track) > 0 else 0)\n",
+    "    display(Audio(audio2, rate=sample_rate))\n",
     "    track = np.concatenate([track, audio2[overlap_samples:]])"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "
+   "id": "6e54802a",
    "metadata": {},
    "outputs": [],
    "source": [
-    "
+    "Audio(track, rate=sample_rate)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "
+   "id": "2147bddb",
    "metadata": {},
    "source": [
     "### Fill the gap (\"in-painting\")"
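The normalization the commit title refers to is the pair of + lines inside the if len(track) > 0: branch above: before each new slice is generated, the tail of the previously generated audio (audio2) is rescaled so its peak matches the head of the incoming raw slice, then written over that head. A standalone sketch of that peak-matching step on hypothetical arrays (lengths and sample rate invented for illustration):

import numpy as np

rng = np.random.default_rng(0)
sample_rate = 22050                     # hypothetical rate
overlap_samples = 2 * sample_rate       # 2-second overlap, as in the notebook
audio = rng.uniform(-0.5, 0.5, 65536)   # incoming raw slice (quieter)
audio2 = rng.uniform(-1.0, 1.0, 65536)  # previously generated slice (louder)

# Rescale the generated tail so its peak matches the raw head it replaces,
# then re-insert it so the next generation is conditioned on matched levels.
audio[:overlap_samples] = audio2[-overlap_samples:] * np.max(
    audio[:overlap_samples]) / np.max(audio2[-overlap_samples:])

Matching np.max peaks is a rough gain alignment rather than RMS normalization, but it is enough to keep the volume continuous across the seam between slices.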
@@ -289,7 +300,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "
+   "id": "c9de4e17",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -313,16 +324,6 @@
     "### Compare results with random sample from training set"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f028a3c8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "mel = Mel(x_res=256, y_res=256)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -354,14 +355,6 @@
     "audio = mel.image_to_audio(image)\n",
     "Audio(data=audio, rate=mel.get_sample_rate())"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d32afb5e",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
"audio_diffusion = AudioDiffusion(model_id=model_id)"
|
88 |
]
|
89 |
},
|
90 |
+
{
|
91 |
+
"cell_type": "code",
|
92 |
+
"execution_count": null,
|
93 |
+
"id": "6e16ed0e",
|
94 |
+
"metadata": {},
|
95 |
+
"outputs": [],
|
96 |
+
"source": [
|
97 |
+
"mel = Mel(x_res=256, y_res=256)"
|
98 |
+
]
|
99 |
+
},
|
100 |
{
|
101 |
"cell_type": "markdown",
|
102 |
"id": "011fb5a1",
|
|
|
181 |
},
|
182 |
{
|
183 |
"cell_type": "markdown",
|
184 |
+
"id": "c3b05163",
|
185 |
"metadata": {},
|
186 |
"source": [
|
187 |
"### Generate continuations (\"out-painting\")"
|
|
|
190 |
{
|
191 |
"cell_type": "code",
|
192 |
"execution_count": null,
|
193 |
+
"id": "4add9643",
|
194 |
"metadata": {},
|
195 |
"outputs": [],
|
196 |
"source": [
|
|
|
240 |
" from google.colab import files\n",
|
241 |
" audio_file = list(files.upload().keys())[0]\n",
|
242 |
"except:\n",
|
243 |
+
" audio_file = \"/home/teticio/Music/liked/El Michels Affair - Glaciers Of Ice.mp3\""
|
244 |
]
|
245 |
},
|
246 |
{
|
|
|
254 |
"source": [
|
255 |
"start_step = 500 #@param {type:\"slider\", min:0, max:1000, step:10}\n",
|
256 |
"overlap_secs = 2 #@param {type:\"integer\"}\n",
|
|
|
257 |
"mel.load_audio(audio_file)\n",
|
258 |
"overlap_samples = overlap_secs * mel.get_sample_rate()\n",
|
259 |
+
"slice_size = mel.x_res * mel.hop_length\n",
|
260 |
"stride = slice_size - overlap_samples\n",
|
261 |
"generator = torch.Generator()\n",
|
262 |
"seed = generator.seed()\n",
|
263 |
"track = np.array([])\n",
|
264 |
"for sample in range(len(mel.audio) // stride):\n",
|
265 |
" generator.manual_seed(seed)\n",
|
266 |
+
" audio = np.array(mel.audio[sample * stride:sample * stride + slice_size])\n",
|
267 |
+
" display(Audio(audio, rate=sample_rate))\n",
|
268 |
" if len(track) > 0:\n",
|
269 |
+
" # Normalize and re-insert generated audio\n",
|
270 |
+
" audio[:overlap_samples] = audio2[-overlap_samples:] * np.max(\n",
|
271 |
+
" audio[:overlap_samples]) / np.max(audio2[-overlap_samples:])\n",
|
272 |
" _, (sample_rate,\n",
|
273 |
" audio2) = audio_diffusion.generate_spectrogram_and_audio_from_audio(\n",
|
274 |
" raw_audio=audio,\n",
|
275 |
" start_step=start_step,\n",
|
276 |
" generator=generator,\n",
|
277 |
+
" mask_start_secs=overlap_secs if len(track) > 0 else 0)\n",
|
278 |
+
" display(Audio(audio2http://localhost:8889/notebooks/huggingface/audio-diffusion/notebooks/test_model.ipynb#, rate=sample_rate))\n",
|
|
|
279 |
" track = np.concatenate([track, audio2[overlap_samples:]])"
|
280 |
]
|
281 |
},
|
282 |
{
|
283 |
"cell_type": "code",
|
284 |
"execution_count": null,
|
285 |
+
"id": "6e54802a",
|
286 |
"metadata": {},
|
287 |
"outputs": [],
|
288 |
"source": [
|
289 |
+
"Audio(track, rate=sample_rate)"
|
290 |
]
|
291 |
},
|
292 |
{
|
293 |
"cell_type": "markdown",
|
294 |
+
"id": "2147bddb",
|
295 |
"metadata": {},
|
296 |
"source": [
|
297 |
"### Fill the gap (\"in-painting\")"
|
|
|
300 |
{
|
301 |
"cell_type": "code",
|
302 |
"execution_count": null,
|
303 |
+
"id": "c9de4e17",
|
304 |
"metadata": {},
|
305 |
"outputs": [],
|
306 |
"source": [
|
|
|
324 |
"### Compare results with random sample from training set"
|
325 |
]
|
326 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
327 |
{
|
328 |
"cell_type": "code",
|
329 |
"execution_count": null,
|
|
|
355 |
"audio = mel.image_to_audio(image)\n",
|
356 |
"Audio(data=audio, rate=mel.get_sample_rate())"
|
357 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
358 |
}
|
359 |
],
|
360 |
"metadata": {
|