Hugo Flores Garcia committed
Commit 5a343f4
1 Parent(s): f4c9665

the refactor begins

Dockerfile DELETED
@@ -1,39 +0,0 @@
- FROM us.gcr.io/lyrebird-research/research-image/audio
-
- COPY requirements.txt requirements.txt
- ARG GITHUB_TOKEN
- RUN echo machine github.com login ${GITHUB_TOKEN} > ~/.netrc
-
- COPY env/alias.sh /alias.sh
- COPY env/entry_script.sh /entry_script.sh
- RUN cat /alias.sh >> ~/.zshrc
-
- # USER researcher
- RUN pip install Cython
- RUN pip install madmom
- RUN pip install --upgrade -r requirements.txt
- RUN pip install --upgrade tensorflow
- RUN pip install --upgrade librosa
- RUN pip install --upgrade numba
- RUN pip install protobuf==3.20
- ENV PYTHONPATH "$PYTHONPATH:/u/home/src"
- ENV NUMBA_CACHE_DIR=/tmp/
-
- USER root
- RUN wget https://github.com/jgm/pandoc/releases/download/2.18/pandoc-2.18-1-amd64.deb
- RUN dpkg -i pandoc-2.18-1-amd64.deb
- RUN apt-get update && apt-get install task-spooler
-
- RUN head -n -1 /entry_script.sh > /entry_script_jupyter.sh
- RUN head -n -1 /entry_script.sh > /entry_script_tensorboard.sh
- RUN head -n -1 /entry_script.sh > /entry_script_gradio.sh
-
- RUN echo \
-     'su -p ${USER} -c "source ~/.zshrc && jupyter lab --ip=0.0.0.0"' >> \
-     /entry_script_jupyter.sh
- RUN echo \
-     'su -p ${USER} -c "source ~/.zshrc && tensorboard --logdir=$TENSORBOARD_PATH --samples_per_plugin audio=500 --bind_all"' >> \
-     /entry_script_tensorboard.sh
- RUN echo \
-     'su -p ${USER} -c "source ~/.zshrc && python app.py --args.load=conf/app.yml"' >> \
-     /entry_script_gradio.sh

README.md CHANGED
@@ -2,27 +2,6 @@
 
  This repository contains recipes for training generative music models on top of the Lyrebird Audio Codec.
 
- ## Install hooks
-
- First install the pre-commit util:
-
- https://pre-commit.com/#install
-
- pip install pre-commit # with pip
- brew install pre-commit # on Mac
-
- Then install the git hooks
-
- pre-commit install
- # check .pre-commit-config.yaml for details of hooks
-
- Upon `git commit`, the pre-commit hooks will be run automatically on the stage files (i.e. added by `git add`)
-
- **N.B. By default, pre-commit checks only run on staged files**
-
- If you need to run it on all files:
-
- pre-commit run --all-files
 
  ## Development
  ### Setting everything up
 
conf/{vampnet-c2f.yml → c2f.yml} RENAMED
File without changes
conf/interface/interface-c2f-exp.yml DELETED
@@ -1,5 +0,0 @@
- Interface.coarse_ckpt: /runs/c2f-exp-03.22.23/ckpt/random/epoch=400/vampnet/weights.pth
- Interface.coarse2fine_ckpt: runs/c2f-exp-03.22.23/ckpt/random/epoch=400/vampnet/weights.pth
- Interface.codec_ckpt: /runs/codec-ckpt/codec.pth
- Interface.coarse_chunk_size_s: 5
- Interface.coarse2fine_chunk_size_s: 3

conf/interface/{interface-jazzpop.yml → jazzpop.yml} RENAMED
File without changes
conf/interface/{interface-maestro.yml → maestro.yml} RENAMED
File without changes
conf/interface/{interface-spotdl.yml → spotdl.yml} RENAMED
File without changes
conf/lora/birds.yml ADDED
@@ -0,0 +1,10 @@
+ $include:
+ - conf/lora/lora.yml
+
+ fine_tune: True
+
+ train/AudioLoader.sources:
+ - /media/CHONK/hugo/spotdl/subsets/birds
+
+ val/AudioLoader.sources:
+ - /media/CHONK/hugo/spotdl/subsets/birds
conf/lora/birdss.yml ADDED
@@ -0,0 +1,12 @@
+ $include:
+ - conf/lora/lora.yml
+
+ fine_tune: True
+
+ train/AudioLoader.sources:
+ - /media/CHONK/hugo/spotdl/subsets/birds
+ - /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/
+
+ val/AudioLoader.sources:
+ - /media/CHONK/hugo/spotdl/subsets/birds
+ - /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/
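The LoRA fine-tune configs added above all follow one pattern: pull the defaults in from conf/lora/lora.yml via $include, set fine_tune, and point the train/val AudioLoader sources at a folder or file. A minimal Python sketch of the include-then-override semantics assumed here (illustration only, not the project's actual config loader; resolve_config is a hypothetical helper):

    import yaml

    def resolve_config(path: str) -> dict:
        # Load a YAML config, resolving $include files first so local keys win.
        with open(path) as f:
            cfg = yaml.safe_load(f) or {}
        merged = {}
        for base in cfg.pop("$include", []):
            merged.update(resolve_config(base))  # included defaults first
        merged.update(cfg)                       # the including file overrides
        return merged

    # e.g. resolve_config("conf/lora/birds.yml") would yield lora.yml's defaults
    # with fine_tune and the AudioLoader sources overridden.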
conf/lora/constructions.yml CHANGED
@@ -4,7 +4,7 @@ $include:
  fine_tune: True
 
  train/AudioLoader.sources:
- - /media/CHONK/hugo/spotdl/subsets/constructions
+ - /media/CHONK/hugo/spotdl/subsets/constructions/third.mp3
 
  val/AudioLoader.sources:
- - /media/CHONK/hugo/spotdl/subsets/constructions
+ - /media/CHONK/hugo/spotdl/subsets/constructions/third.mp3
conf/lora/lora-is-this-charlie-parker.yml CHANGED
@@ -4,7 +4,7 @@ $include:
  fine_tune: True
 
  train/AudioLoader.sources:
- - /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/
+ - /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/Charlie Parker - Donna Lee.mp3
 
  val/AudioLoader.sources:
- - /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/
+ - /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/Charlie Parker - Donna Lee.mp3
conf/lora/lora.yml CHANGED
@@ -8,7 +8,7 @@ train/AudioDataset.n_examples: 10000000
  val/AudioDataset.n_examples: 10
 
 
- NoamScheduler.warmup: 250
+ NoamScheduler.warmup: 400
 
  epoch_length: 100
  save_audio_epochs: 2
conf/lora/underworld.yml ADDED
@@ -0,0 +1,10 @@
+ $include:
+ - conf/lora/lora.yml
+
+ fine_tune: True
+
+ train/AudioLoader.sources:
+ - /media/CHONK/hugo/spotdl/subsets/underworld.mp3
+
+ val/AudioLoader.sources:
+ - /media/CHONK/hugo/spotdl/subsets/underworld.mp3
conf/vampnet-groovemidi.yml DELETED
@@ -1,54 +0,0 @@
- $include:
- - conf/vampnet.yml
-
- VampNet.embedding_dim: 512
- VampNet.n_layers: 12
- VampNet.n_heads: 8
-
- AudioDataset.duration: 12.0
-
- train/AudioDataset.n_examples: 10000000
- train/AudioLoader.sources:
- # drummer 1 sessions 1, 2, and 3
- - /data/e-gmd-v1.0.0/drummer1/session1
- - /data/e-gmd-v1.0.0/drummer1/session2
- - /data/e-gmd-v1.0.0/drummer1/session3
- # drummer 3 sessions 1 and 2
- - /data/e-gmd-v1.0.0/drummer3/session1
- - /data/e-gmd-v1.0.0/drummer3/session2
- # drummer 4 session 1
- - /data/e-gmd-v1.0.0/drummer4/session1
- # drummer 5 sessions 1 and 2
- - /data/e-gmd-v1.0.0/drummer5/session1
- - /data/e-gmd-v1.0.0/drummer5/session2
- # drummer 6 session 1, 2, and 3
- - /data/e-gmd-v1.0.0/drummer6/session1
- - /data/e-gmd-v1.0.0/drummer6/session2
- - /data/e-gmd-v1.0.0/drummer6/session3
- # drummer 7 session 1, 2 and 3
- - /data/e-gmd-v1.0.0/drummer7/session1
- - /data/e-gmd-v1.0.0/drummer7/session2
- - /data/e-gmd-v1.0.0/drummer7/session3
- # drummer 8 session 1
- - /data/e-gmd-v1.0.0/drummer8/session1
- # drummer 9 session 1
- - /data/e-gmd-v1.0.0/drummer9/session1
- # drummer 10 session 1
- - /data/e-gmd-v1.0.0/drummer10/session1
-
-
- val/AudioDataset.n_examples: 500
- val/AudioLoader.sources:
- # drummer 1 eval session
- - /data/e-gmd-v1.0.0/drummer1/eval_session
- # drummer 5 eval session
- - /data/e-gmd-v1.0.0/drummer5/eval_session
- # drummer 7 eval session
- - /data/e-gmd-v1.0.0/drummer7/eval_session
-
-
-
- test/AudioDataset.n_examples: 1000
- test/AudioLoader.sources:
- # drummer 8 eval session
- - /data/e-gmd-v1.0.0/drummer8/eval_session

conf/vampnet-maestro.yml DELETED
@@ -1,21 +0,0 @@
- $include:
- - conf/vampnet.yml
-
- VampNet.embedding_dim: 512
- VampNet.n_layers: 12
- VampNet.n_heads: 8
-
- AudioDataset.duration: 12.0
-
- train/AudioDataset.n_examples: 10000000
- train/AudioLoader.sources:
- - /data/maestro-reorg/train
-
- val/AudioDataset.n_examples: 500
- val/AudioLoader.sources:
- - /data/maestro-reorg/val
-
-
- test/AudioDataset.n_examples: 1000
- test/AudioLoader.sources:
- - /data/maestro-reorg/test

demo.py CHANGED
@@ -62,6 +62,7 @@ def load_random_audio():
 def ez_vamp(
     input_audio, init_temp, final_temp,
     mask_periodic_amt, mask_periodic_width, num_steps,
+    stretch_factor,
 ):
     print(input_audio)
     sig = at.AudioSignal(input_audio)
@@ -74,7 +75,8 @@ def ez_vamp(
         prefix_dur_s=0.0,
         suffix_dur_s=0.0,
         num_vamps=1,
-        downsample_factor=mask_periodic_amt,
+        downsample_factor=mask_periodic_amt,
+        stretch_factor=stretch_factor,
         periodic_width=mask_periodic_width,
         periodic_dropout=0.0,
         periodic_width_dropout=0.0,
@@ -105,7 +107,7 @@ def vamp(
     num_vamps, mode, use_beats, num_steps, snap_to_beats,
     beat_unmask_drop, mask_periodic_width,
     mask_periodic_dropout, mask_periodic_width_dropout,
-    n_conditioning_codebooks, use_coarse2fine
+    n_conditioning_codebooks, use_coarse2fine, stretch_factor,
 ):
     # try:
     print(input_audio)
@@ -146,6 +148,7 @@ def vamp(
         suffix_dur_s=suffix_s,
         num_vamps=num_vamps,
         downsample_factor=mask_periodic_amt,
+        stretch_factor=stretch_factor,
         periodic_width=mask_periodic_width,
         periodic_dropout=mask_periodic_dropout,
         periodic_width_dropout=mask_periodic_width_dropout,
@@ -158,7 +161,7 @@
 
     if use_coarse2fine:
         zv = interface.coarse_to_fine(zv)
-    # mask = interface.to_signal(mask_z).cpu()
+    mask = interface.to_signal(mask_z).cpu()
 
     sig = interface.to_signal(zv).cpu()
     print("done")
@@ -166,9 +169,9 @@
     out_dir = OUT_DIR / str(uuid.uuid4())
     out_dir.mkdir()
     sig.write(out_dir / "output.wav")
-    # mask.write(out_dir / "mask.wav")
-    # return sig.path_to_file, mask.path_to_file
-    return sig.path_to_file, None
+    mask.write(out_dir / "mask.wav")
+    return sig.path_to_file, mask.path_to_file
+    # return sig.path_to_file, mask_z
     # except Exception as e:
     #     raise gr.Error(f"failed with error: {e}")
 
@@ -180,7 +183,7 @@ def save_vamp(
     mask_up_chk, up_factor,
     num_vamps, mode, output_audio, notes, use_beats, num_steps, snap_to_beats,
     beat_unmask_drop, mask_periodic_width, mask_periodic_dropout, mask_periodic_width_dropout,
-    n_conditioning_codebooks, use_coarse2fine
+    n_conditioning_codebooks, use_coarse2fine, stretch_factor
 ):
     out_dir = OUT_DIR / "saved" / str(uuid.uuid4())
     out_dir.mkdir(parents=True, exist_ok=True)
@@ -215,6 +218,7 @@ def save_vamp(
         "mask_periodic_width_dropout": mask_periodic_width_dropout,
         "n_conditioning_codebooks": n_conditioning_codebooks,
         "use_coarse2fine": use_coarse2fine,
+        "stretch_factor": stretch_factor,
     }
 
     # save with yaml
@@ -333,6 +337,14 @@ with gr.Blocks() as demo:
             precision=0,
         )
 
+        stretch_factor = gr.Slider(
+            label="time stretch factor",
+            minimum=0,
+            maximum=64,
+            step=1,
+            value=1,
+        )
+
         mask_periodic_amt = gr.Slider(
             label="periodic hint (0.0 means no hint, 2 - lots of hints, 8 - a couple of hints, 16 - occasional hint, 32 - very occasional hint, etc)",
             minimum=0,
@@ -501,7 +513,7 @@ with gr.Blocks() as demo:
            num_vamps, mode, use_beats, num_steps, snap_to_beats,
            beat_unmask_drop, mask_periodic_width,
            mask_periodic_dropout, mask_periodic_width_dropout,
-           n_conditioning_codebooks, use_coarse2fine
+           n_conditioning_codebooks, use_coarse2fine, stretch_factor
         ],
         outputs=[output_audio, audio_mask],
         api_name="vamp"
@@ -520,7 +532,7 @@
            notes_text, use_beats, num_steps, snap_to_beats,
            beat_unmask_drop, mask_periodic_width,
            mask_periodic_dropout, mask_periodic_width_dropout,
-           n_conditioning_codebooks, use_coarse2fine
+           n_conditioning_codebooks, use_coarse2fine, stretch_factor
         ],
         outputs=[thank_you, download_file]
     )
@@ -529,7 +541,7 @@
     ez_vamp_button.click(
         fn=ez_vamp,
         inputs=[input_audio, init_temp, final_temp, mask_periodic_amt,
-            mask_periodic_width, num_steps ],
+            mask_periodic_width, num_steps, stretch_factor ],
         outputs=[output_audio],
         api_name="ez_vamp"
    )
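The demo.py changes above are one repeated pattern: add a gr.Slider for the new stretch_factor control, append stretch_factor to each function signature, and thread it through the inputs list of every .click() call. A stripped-down sketch of that wiring pattern (the process function is a hypothetical stand-in, not the real vamp logic):

    import gradio as gr

    def process(audio_path, stretch_factor):
        # stand-in for the real vamp call; just logs the control value
        print(f"stretch_factor={stretch_factor}")
        return audio_path

    with gr.Blocks() as demo:
        input_audio = gr.Audio(type="filepath")
        stretch_factor = gr.Slider(label="time stretch factor",
                                   minimum=0, maximum=64, step=1, value=1)
        output_audio = gr.Audio(type="filepath")
        run = gr.Button("run")
        # a new control has to appear both in the signature and in the inputs list
        run.click(fn=process,
                  inputs=[input_audio, stretch_factor],
                  outputs=[output_audio])

    demo.launch()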
docker-compose.yml DELETED
@@ -1,92 +0,0 @@
-
- version: "3.5"
- services:
-   tensorrt:
-     build:
-       context: .
-       dockerfile: ./deployment_build/dockerfile
-       args:
-         GITHUB_TOKEN: ${GITHUB_TOKEN}
-     profiles:
-       - tensorrt
-     volumes:
-       - ./:/u/home/src
-       - ~/.config/gcloud:/root/.config/gcloud
-     deploy:
-       resources:
-         limits:
-           # match production limits
-           cpus: '7'
-           memory: 25000M
-         reservations:
-           devices:
-             - driver: nvidia
-               count: 1
-               capabilities: [gpu]
-     working_dir: /u/home/src
-     entrypoint:
-       - python
-       - -m
-       - wav2wav.converter
-   base:
-     build:
-       context: .
-       dockerfile: ./Dockerfile
-       args:
-         GITHUB_TOKEN: ${GITHUB_TOKEN}
-     volumes:
-       - .:/u/home/src
-       - ~/.wav2wav:/u/home/.wav2wav
-       - ${PATH_TO_DATA}:/data
-       - ${PATH_TO_RUNS}:/runs
-       - ~/.config/gcloud:/u/home/.config/gcloud
-       - ~/.zsh_history:/u/home/.zsh_history
-     environment:
-       - GITHUB_TOKEN
-       - DISCOURSE_API_USERNAME
-       - DISCOURSE_SERVER
-       - DISCOURSE_API_KEY
-       - HOST_USER_ID
-       - HOST_USER_GID
-       - JUPYTER_TOKEN
-       - PATH_TO_DATA=/data
-       - PATH_TO_RUNS=/runs
-       - TENSORBOARD_PATH
-       - MPLCONFIGDIR=/u/home/.mplconfig
-     shm_size: 32G
-     working_dir: /u/home/src
-     deploy:
-       resources:
-         reservations:
-           devices:
-             - driver: nvidia
-               capabilities: [gpu]
-   dev:
-     extends: base
-     profiles:
-       - interactive
-     stdin_open: true
-     tty: true
-     ports:
-       - 7860:7860
-   jupyter:
-     extends: base
-     ports:
-       - ${JUPYTER_PORT}:8888
-     entrypoint:
-       - /bin/bash
-       - /entry_script_jupyter.sh
-   tensorboard:
-     extends: base
-     ports:
-       - ${TENSORBOARD_PORT}:6006
-     entrypoint:
-       - /bin/bash
-       - /entry_script_tensorboard.sh
-   gradio:
-     extends: base
-     ports:
-       - 7860:7860
-     entrypoint:
-       - /bin/bash
-       - /entry_script_gradio.sh

requirements.txt DELETED
@@ -1,31 +0,0 @@
- argbind>=0.3.1
- pytorch-ignite
- rich
- audiotools @ git+https://github.com/descriptinc/lyrebird-audiotools.git@hf/backup-info
- lac @ git+https://github.com/descriptinc/lyrebird-audio-codec.git@hf/vampnet-temp
- torch==1.13.1
- torchaudio==0.13.1
- tqdm
- tensorboard
- google-cloud-logging==2.2.0
- pytest
- pytest-cov
- pynvml
- psutil
- pandas
- onnx
- onnx-simplifier
- seaborn
- jupyterlab
- jupyterlab-link-share
- pandas
- watchdog
- pesq
- tabulate
- torchmetrics
- codebraid==0.5.0
- jupyter-client==6.1.12
- tensorboardX
- gradio
- einops
- frechet_audio_distance

setup.py CHANGED
@@ -32,12 +32,13 @@ setup(
         "rich",
         "audiotools @ git+https://github.com/hugofloresgarcia/audiotools.git",
         "lac @ git+https://github.com/hugofloresgarcia/lac.git",
-        "wavebeat @ git+https://github.com/hugofloresgarcia/wavebeat.git",
+        # "wavebeat @ git+https://github.com/hugofloresgarcia/wavebeat.git",
         "torch==2.0",
         "tqdm",
         "tensorboard",
         "google-cloud-logging==2.2.0",
         "einops",
-        "frechet_audio_distance"
+        # "frechet_audio_distance",
+        "gradio"
     ],
 )
vampnet/interface.py CHANGED
@@ -249,6 +249,7 @@ class Interface(torch.nn.Module):
         suffix_dur_s: float = 0.0,
         num_vamps: int = 1,
         downsample_factor: int = None,
+        stretch_factor: int = None,
         periodic_width: int = 1,
         periodic_dropout=0.0,
         periodic_width_dropout=0.0,
@@ -269,11 +270,33 @@ class Interface(torch.nn.Module):
         n_prefix = self.s2t(prefix_dur_s)
         n_suffix = self.s2t(suffix_dur_s)
 
+
+        # hmm, should be a better way to do this? think we just need a mask builder class
+        add_random_periodic_offset = True
+
+        if stretch_factor is not None and stretch_factor > 1:
+            print(f"stretching by {stretch_factor}")
+            assert stretch_factor >= 1, "stretch factor must be >= 1"
+            cz = cz.repeat_interleave(stretch_factor, dim=-1)
+
+            # the downsample factor is now relative to the stretched sequence
+            assert downsample_factor is None or downsample_factor <= 2, "downsample_factor must be None when stretch_factor is not None"
+
+            downsample_factor = stretch_factor
+            add_random_periodic_offset = False
+
+            assert n_prefix == 0 and n_suffix == 0, "prefix and suffix must be 0 when stretch_factor is not None"
+            assert ext_mask is None, "ext_mask must be None when stretch_factor is not None"
+
+            # trim cz to the original length
+            cz = cz[:, :, :c_seq_len]
+
+
         assert cz.shape[-1] <= self.s2t(self.coarse.chunk_size_s), f"the sequence of tokens provided must match the one specified in the coarse chunk size, but got {cz.shape[-1]} and {self.s2t(self.coarse.chunk_size_s)}"
         assert n_prefix + n_suffix < c_seq_len, "prefix and suffix must be smaller than the chunk size"
 
         if swap_prefix_suffix:
-            # swap the prefix and suffix regions in c_z
+            # swap the prefix and suffix
             assert n_prefix == n_suffix, "prefix and suffix must be the same size for now"
             cz[:, :, :n_prefix], cz[:, :, c_seq_len-n_suffix:] = cz[:, :, c_seq_len-n_suffix:], cz[:, :, :n_prefix].clone()
 
@@ -295,7 +318,7 @@ class Interface(torch.nn.Module):
             downsample_factor=downsample_factor,
             periodic_width=periodic_width,
             periodic_dropout=periodic_dropout,
-            add_random_periodic_offset=True,
+            add_random_periodic_offset=add_random_periodic_offset,
             periodic_width_dropout=periodic_width_dropout,
             mask=cz_mask,
             ext_mask=ext_mask,
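The stretch_factor branch added above repeats every coarse timestep stretch_factor times along the time axis, trims back to the chunk length, and then leans on the periodic mask (downsample_factor set to stretch_factor, random offset disabled), which appears intended to keep one timestep per repeated group as a hint while the rest are regenerated. A standalone sketch of just the tensor step, with toy shapes:

    import torch

    # fake coarse codes: (batch, n_codebooks, time)
    cz = torch.arange(6).reshape(1, 1, 6)
    stretch_factor = 2
    c_seq_len = cz.shape[-1]

    # repeat every timestep stretch_factor times along the time axis...
    stretched = cz.repeat_interleave(stretch_factor, dim=-1)
    # tensor([[[0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5]]])

    # ...then trim back to the model's chunk length, as the diff does
    stretched = stretched[:, :, :c_seq_len]
    # tensor([[[0, 0, 1, 1, 2, 2]]])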
vampnet/modules/base.py CHANGED
@@ -71,7 +71,7 @@ class VampBase(at.ml.BaseModel):
             probs[i, :, -n:] = 0.0
 
         # if we have a downsample factor, set the mask prob to 0
-        if downsample_factor is not None:
+        if downsample_factor is not None and downsample_factor > 0:
             if not isinstance(downsample_factor, torch.Tensor):
                 downsample_factor = scalar_to_batch_tensor(downsample_factor, x.shape[0])
             for i, factor in enumerate(downsample_factor):
@@ -200,7 +200,6 @@ class VampBase(at.ml.BaseModel):
         # find where the mask token is and replace it with silence in the audio
         for tstep in range(z.shape[-1]):
             if torch.any(z[:, :, tstep] == self.mask_token):
-                print("mask token found at step", tstep)
                 sample_idx_0 = tstep * codec.hop_length
                 sample_idx_1 = sample_idx_0 + codec.hop_length
                 signal.samples[:, :, sample_idx_0:sample_idx_1] = 0.0
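For context on the downsample_factor > 0 guard above: per the comment in the hunk, a downsample factor zeroes the masking probability at every factor-th timestep so those tokens survive as periodic hints, and the added check keeps a factor of 0 (the "no hint" slider value in the demo) from entering that branch. A rough standalone sketch of the idea, not the VampBase implementation:

    import torch

    batch, seq_len = 2, 12
    probs = torch.ones(batch, seq_len)   # per-timestep probability of being masked
    downsample_factor = 4

    if downsample_factor is not None and downsample_factor > 0:
        for i in range(batch):
            # keep (never mask) one timestep out of every `downsample_factor`
            probs[i, ::downsample_factor] = 0.0

    print(probs[0])
    # tensor([0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1.])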