teticio committed
Commit 9b96285
1 Parent(s): 3ae9402

use new models for now

notebooks/audio_diffusion_pipeline.ipynb CHANGED
@@ -46,7 +46,7 @@
 "from datasets import load_dataset\n",
 "from IPython.display import Audio\n",
 "from librosa.beat import beat_track\n",
-"from diffusers import DiffusionPipeline, Mel"
+"from diffusers import DiffusionPipeline"
 ]
 },
 {
@@ -56,8 +56,6 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"mel = Mel()\n",
-"sample_rate = mel.get_sample_rate()\n",
 "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
 "generator = torch.Generator(device=device)"
 ]
@@ -91,7 +89,7 @@
 "\n",
 "#@markdown teticio/audio-diffusion-instrumental-hiphop-256 - trained on instrumental hiphop\n",
 "\n",
-"model_id = \"teticio/audio-diffusion-256\" #@param [\"teticio/audio-diffusion-256\", \"teticio/audio-diffusion-breaks-256\", \"audio-diffusion-instrumenal-hiphop-256\", \"teticio/audio-diffusion-ddim-256\"]"
+"model_id = \"teticio/audio-diffusion-256-new\" #@param [\"teticio/audio-diffusion-256\", \"teticio/audio-diffusion-breaks-256\", \"audio-diffusion-instrumenal-hiphop-256\", \"teticio/audio-diffusion-ddim-256\"]"
 ]
 },
 {
@@ -101,7 +99,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"audio_diffusion = DiffusionPipeline.from_pretrained(model_id).to(device)"
+"audio_diffusion = DiffusionPipeline.from_pretrained(model_id).to(device)\n",
+"mel = audio_diffusion.mel\n",
+"sample_rate = mel.get_sample_rate()"
 ]
 },
 {
@@ -150,7 +150,7 @@
 " seed = generator.seed()\n",
 " print(f'Seed = {seed}')\n",
 " generator.manual_seed(seed)\n",
-" output = audio_diffusion(mel=mel, generator=generator)\n",
+" output = audio_diffusion(generator=generator)\n",
 " image = output.images[0]\n",
 " audio = output.audios[0, 0]\n",
 " display(image)\n",
@@ -187,7 +187,7 @@
 "source": [
 "seed = 2391504374279719 #@param {type:\"integer\"}\n",
 "generator.manual_seed(seed)\n",
-"output = audio_diffusion(mel=mel, generator=generator)\n",
+"output = audio_diffusion(generator=generator)\n",
 "image = output.images[0]\n",
 "audio = output.audios[0, 0]\n",
 "display(image)\n",
@@ -206,7 +206,7 @@
 "start_step = 500 #@param {type:\"slider\", min:0, max:1000, step:10}\n",
 "track = loop_it(audio, sample_rate, loops=1)\n",
 "for variation in range(12):\n",
-" output = audio_diffusion(mel=mel, raw_audio=audio, start_step=start_step)\n",
+" output = audio_diffusion(raw_audio=audio, start_step=start_step)\n",
 " image2 = output.images[0]\n",
 " audio2 = output.audios[0, 0]\n",
 " display(image2)\n",
@@ -235,8 +235,7 @@
 "overlap_samples = overlap_secs * sample_rate\n",
 "track = audio\n",
 "for variation in range(12):\n",
-" output = audio_diffusion(mel=mel,\n",
-" raw_audio=audio[-overlap_samples:],\n",
+" output = audio_diffusion(raw_audio=audio[-overlap_samples:],\n",
 " start_step=start_step,\n",
 " mask_start_secs=overlap_secs)\n",
 " image2 = output.images[0]\n",
@@ -306,8 +305,7 @@
 " # Normalize and re-insert generated audio\n",
 " audio[:overlap_samples] = audio2[-overlap_samples:] * np.max(\n",
 " audio[:overlap_samples]) / np.max(audio2[-overlap_samples:])\n",
-" output = audio_diffusion(mel=mel,\n",
-" raw_audio=audio,\n",
+" output = audio_diffusion(raw_audio=audio,\n",
 " start_step=start_step,\n",
 " generator=generator,\n",
 " mask_start_secs=overlap_secs * not_first)\n",
@@ -334,8 +332,7 @@
 "source": [
 "sample = 3 #@param {type:\"integer\"}\n",
 "raw_audio = track_audio[sample * stride:sample * stride + slice_size]\n",
-"output = audio_diffusion(mel=mel,\n",
-" raw_audio=raw_audio,\n",
+"output = audio_diffusion(raw_audio=raw_audio,\n",
 " mask_start_secs=1,\n",
 " mask_end_secs=1,\n",
 " step_generator=torch.Generator(device=device))\n",
@@ -359,7 +356,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"audio_diffusion = DiffusionPipeline.from_pretrained('teticio/audio-diffusion-ddim-256').to(device)"
+"audio_diffusion = DiffusionPipeline.from_pretrained('teticio/audio-diffusion-ddim-256-new').to(device)\n",
+"mel = audio_diffusion.mel\n",
+"sample_rate = mel.get_sample_rate()"
 ]
 },
 {
@@ -381,7 +380,7 @@
 " seed = generator.seed()\n",
 " print(f'Seed = {seed}')\n",
 " generator.manual_seed(seed)\n",
-" output = audio_diffusion(mel=mel, generator=generator)\n",
+" output = audio_diffusion(generator=generator)\n",
 " image = output.images[0]\n",
 " audio = output.audios[0, 0]\n",
 " display(image)\n",
@@ -410,7 +409,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"output = audio_diffusion(mel=mel, steps=1000, generator=generator, eta=1)\n",
+"output = audio_diffusion(steps=1000, generator=generator, eta=1)\n",
 "image = output.images[0]\n",
 "audio = output.audios[0, 0]\n",
 "display(image)\n",
@@ -509,7 +508,6 @@
 "source": [
 "alpha = 0.5 #@param {type:\"slider\", min:0, max:1, step:0.1}\n",
 "output = audio_diffusion(\n",
-" mel=mel,\n",
 " noise=audio_diffusion.slerp(noise, noise2, alpha),\n",
 " generator=generator)\n",
 "audio = output.audios[0, 0]\n",
@@ -534,7 +532,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"model_id = \"teticio/latent-audio-diffusion-ddim-256\" #@param [\"teticio/latent-audio-diffusion-256\", \"teticio/latent-audio-diffusion-ddim-256\"]"
+"model_id = \"teticio/latent-audio-diffusion-ddim-256-new\" #@param [\"teticio/latent-audio-diffusion-256\", \"teticio/latent-audio-diffusion-ddim-256\"]"
 ]
 },
 {
@@ -544,7 +542,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"audio_diffusion = DiffusionPipeline.from_pretrained(model_id).to(device)"
+"audio_diffusion = DiffusionPipeline.from_pretrained(model_id).to(device)\n",
+"mel = audio_diffusion.mel\n",
+"sample_rate = mel.get_sample_rate()"
 ]
 },
 {
@@ -556,7 +556,7 @@
 "source": [
 "seed = 3412253600050855 #@param {type:\"integer\"}\n",
 "generator.manual_seed(seed)\n",
-"output = audio_diffusion(mel=mel, generator=generator)\n",
+"output = audio_diffusion(generator=generator)\n",
 "image = output.images[0]\n",
 "audio = output.audios[0, 0]\n",
 "display(image)\n",
@@ -572,7 +572,7 @@
 "source": [
 "seed2 = 7016114633369557 #@param {type:\"integer\"}\n",
 "generator.manual_seed(seed2)\n",
-"output = audio_diffusion(mel=mel, generator=generator)\n",
+"output = audio_diffusion(generator=generator)\n",
 "image2 = output.images[0]\n",
 "audio2 = output.audios[0, 0]\n",
 "display(image2)\n",
@@ -628,7 +628,6 @@
 "source": [
 "alpha = 0.5 #@param {type:\"slider\", min:0, max:1, step:0.1}\n",
 "output = audio_diffusion(\n",
-" mel=mel,\n",
 " noise=audio_diffusion.slerp(latents, latents2, alpha),\n",
 " generator=generator)\n",
 "audio3 = output.audios[0, 0]\n",