teticio committed
Commit b7c9dfd
1 Parent(s): 0ff9228

normalize in remix

audiodiffusion/__init__.py CHANGED
@@ -92,7 +92,7 @@ class AudioDiffusion:
         images = noise = torch.randn(
             (1, self.ddpm.unet.in_channels, self.ddpm.unet.sample_size,
              self.ddpm.unet.sample_size),
-            generator=generator,
+            generator=generator
         )
 
         if audio_file is not None or raw_audio is not None:
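The `generator` passed to `torch.randn` here is what makes the remix loop in the notebook below deterministic: re-seeding the same `torch.Generator` before each call reproduces the identical noise tensor. A minimal sketch of that behaviour (the `(1, 1, 256, 256)` shape is an assumption standing in for `in_channels` and `sample_size`):

```python
import torch

generator = torch.Generator()
seed = generator.seed()  # capture a seed to reuse for every slice

generator.manual_seed(seed)
noise_a = torch.randn((1, 1, 256, 256), generator=generator)

generator.manual_seed(seed)
noise_b = torch.randn((1, 1, 256, 256), generator=generator)

# Identical noise on every re-seeded call.
assert torch.equal(noise_a, noise_b)
```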
notebooks/test_model.ipynb CHANGED
@@ -87,6 +87,16 @@
     "audio_diffusion = AudioDiffusion(model_id=model_id)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6e16ed0e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mel = Mel(x_res=256, y_res=256)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "011fb5a1",
@@ -171,7 +181,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "97da7c6d",
+   "id": "c3b05163",
    "metadata": {},
    "source": [
     "### Generate continuations (\"out-painting\")"
@@ -180,7 +190,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "4581936c",
+   "id": "4add9643",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -230,7 +240,7 @@
     "    from google.colab import files\n",
     "    audio_file = list(files.upload().keys())[0]\n",
     "except:\n",
-    "    audio_file = \"/home/teticio/Music/Music/Sven Väth/In the Mix_ The Sound of the Sixteenth S/14 Eclipse.m4a\""
+    "    audio_file = \"/home/teticio/Music/liked/El Michels Affair - Glaciers Of Ice.mp3\""
    ]
   },
   {
@@ -244,43 +254,44 @@
    "source": [
     "start_step = 500 #@param {type:\"slider\", min:0, max:1000, step:10}\n",
     "overlap_secs = 2 #@param {type:\"integer\"}\n",
-    "mel = Mel(x_res=256, y_res=256)\n",
     "mel.load_audio(audio_file)\n",
     "overlap_samples = overlap_secs * mel.get_sample_rate()\n",
-    "slice_size = audio_diffusion.mel.x_res * audio_diffusion.mel.hop_length\n",
+    "slice_size = mel.x_res * mel.hop_length\n",
     "stride = slice_size - overlap_samples\n",
     "generator = torch.Generator()\n",
     "seed = generator.seed()\n",
     "track = np.array([])\n",
     "for sample in range(len(mel.audio) // stride):\n",
     "    generator.manual_seed(seed)\n",
-    "    audio = mel.audio[sample * stride:sample * stride + slice_size]\n",
+    "    audio = np.array(mel.audio[sample * stride:sample * stride + slice_size])\n",
+    "    display(Audio(audio, rate=sample_rate))\n",
     "    if len(track) > 0:\n",
-    "        audio[:overlap_samples] = audio2[-overlap_samples:]\n",
+    "        # Normalize and re-insert generated audio\n",
+    "        audio[:overlap_samples] = audio2[-overlap_samples:] * np.max(\n",
+    "            audio[:overlap_samples]) / np.max(audio2[-overlap_samples:])\n",
     "    _, (sample_rate,\n",
     "        audio2) = audio_diffusion.generate_spectrogram_and_audio_from_audio(\n",
     "            raw_audio=audio,\n",
     "            start_step=start_step,\n",
     "            generator=generator,\n",
-    "            mask_start_secs=1 if len(track) > 0 else 0)\n",
-    "    display(Audio(audio, rate=sample_rate))\n",
-    "    display(Audio(audio2, rate=sample_rate))\n",
+    "            mask_start_secs=overlap_secs if len(track) > 0 else 0)\n",
+    "    display(Audio(audio2, rate=sample_rate))\n",
     "    track = np.concatenate([track, audio2[overlap_samples:]])"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "90457786",
+   "id": "6e54802a",
    "metadata": {},
    "outputs": [],
    "source": [
-    "display(Audio(track, rate=sample_rate))"
+    "Audio(track, rate=sample_rate)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "d9910e82",
+   "id": "2147bddb",
    "metadata": {},
    "source": [
     "### Fill the gap (\"in-painting\")"
@@ -289,7 +300,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "fd3eb365",
+   "id": "c9de4e17",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -313,16 +324,6 @@
     "### Compare results with random sample from training set"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f028a3c8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "mel = Mel(x_res=256, y_res=256)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -354,14 +355,6 @@
     "audio = mel.image_to_audio(image)\n",
     "Audio(data=audio, rate=mel.get_sample_rate())"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d32afb5e",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {