Hugo Flores Garcia committed
Commit 5a343f4
1 Parent(s): f4c9665

the refactor begins

Dockerfile DELETED
@@ -1,39 +0,0 @@
- FROM us.gcr.io/lyrebird-research/research-image/audio
-
- COPY requirements.txt requirements.txt
- ARG GITHUB_TOKEN
- RUN echo machine github.com login ${GITHUB_TOKEN} > ~/.netrc
-
- COPY env/alias.sh /alias.sh
- COPY env/entry_script.sh /entry_script.sh
- RUN cat /alias.sh >> ~/.zshrc
-
- # USER researcher
- RUN pip install Cython
- RUN pip install madmom
- RUN pip install --upgrade -r requirements.txt
- RUN pip install --upgrade tensorflow
- RUN pip install --upgrade librosa
- RUN pip install --upgrade numba
- RUN pip install protobuf==3.20
- ENV PYTHONPATH "$PYTHONPATH:/u/home/src"
- ENV NUMBA_CACHE_DIR=/tmp/
-
- USER root
- RUN wget https://github.com/jgm/pandoc/releases/download/2.18/pandoc-2.18-1-amd64.deb
- RUN dpkg -i pandoc-2.18-1-amd64.deb
- RUN apt-get update && apt-get install task-spooler
-
- RUN head -n -1 /entry_script.sh > /entry_script_jupyter.sh
- RUN head -n -1 /entry_script.sh > /entry_script_tensorboard.sh
- RUN head -n -1 /entry_script.sh > /entry_script_gradio.sh
-
- RUN echo \
-     'su -p ${USER} -c "source ~/.zshrc && jupyter lab --ip=0.0.0.0"' >> \
-     /entry_script_jupyter.sh
- RUN echo \
-     'su -p ${USER} -c "source ~/.zshrc && tensorboard --logdir=$TENSORBOARD_PATH --samples_per_plugin audio=500 --bind_all"' >> \
-     /entry_script_tensorboard.sh
- RUN echo \
-     'su -p ${USER} -c "source ~/.zshrc && python app.py --args.load=conf/app.yml"' >> \
-     /entry_script_gradio.sh

README.md CHANGED
@@ -2,27 +2,6 @@
 
  This repository contains recipes for training generative music models on top of the Lyrebird Audio Codec.
 
- ## Install hooks
-
- First install the pre-commit util:
-
- https://pre-commit.com/#install
-
- pip install pre-commit # with pip
- brew install pre-commit # on Mac
-
- Then install the git hooks
-
- pre-commit install
- # check .pre-commit-config.yaml for details of hooks
-
- Upon `git commit`, the pre-commit hooks will be run automatically on the stage files (i.e. added by `git add`)
-
- **N.B. By default, pre-commit checks only run on staged files**
-
- If you need to run it on all files:
-
- pre-commit run --all-files
 
  ## Development
  ### Setting everything up
 
conf/{vampnet-c2f.yml → c2f.yml} RENAMED
File without changes
conf/interface/interface-c2f-exp.yml DELETED
@@ -1,5 +0,0 @@
- Interface.coarse_ckpt: /runs/c2f-exp-03.22.23/ckpt/random/epoch=400/vampnet/weights.pth
- Interface.coarse2fine_ckpt: runs/c2f-exp-03.22.23/ckpt/random/epoch=400/vampnet/weights.pth
- Interface.codec_ckpt: /runs/codec-ckpt/codec.pth
- Interface.coarse_chunk_size_s: 5
- Interface.coarse2fine_chunk_size_s: 3

conf/interface/{interface-jazzpop.yml → jazzpop.yml} RENAMED
File without changes
conf/interface/{interface-maestro.yml → maestro.yml} RENAMED
File without changes
conf/interface/{interface-spotdl.yml → spotdl.yml} RENAMED
File without changes
conf/lora/birds.yml ADDED
@@ -0,0 +1,10 @@
+ $include:
+ - conf/lora/lora.yml
+
+ fine_tune: True
+
+ train/AudioLoader.sources:
+ - /media/CHONK/hugo/spotdl/subsets/birds
+
+ val/AudioLoader.sources:
+ - /media/CHONK/hugo/spotdl/subsets/birds
conf/lora/birdss.yml ADDED
@@ -0,0 +1,12 @@
+ $include:
+ - conf/lora/lora.yml
+
+ fine_tune: True
+
+ train/AudioLoader.sources:
+ - /media/CHONK/hugo/spotdl/subsets/birds
+ - /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/
+
+ val/AudioLoader.sources:
+ - /media/CHONK/hugo/spotdl/subsets/birds
+ - /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/
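The LoRA fine-tune configs added above all follow one pattern: pull the defaults in from conf/lora/lora.yml via $include, set fine_tune, and point the train/val AudioLoader sources at a folder or file. A minimal Python sketch of the include-then-override semantics assumed here (illustration only, not the project's actual config loader; resolve_config is a hypothetical helper):

    import yaml

    def resolve_config(path: str) -> dict:
        # Load a YAML config, resolving $include files first so local keys win.
        with open(path) as f:
            cfg = yaml.safe_load(f) or {}
        merged = {}
        for base in cfg.pop("$include", []):
            merged.update(resolve_config(base))  # included defaults first
        merged.update(cfg)                       # the including file overrides
        return merged

    # e.g. resolve_config("conf/lora/birds.yml") would yield lora.yml's defaults
    # with fine_tune and the AudioLoader sources overridden.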
conf/lora/constructions.yml CHANGED
@@ -4,7 +4,7 @@ $include:
  fine_tune: True
 
  train/AudioLoader.sources:
- - /media/CHONK/hugo/spotdl/subsets/constructions
+ - /media/CHONK/hugo/spotdl/subsets/constructions/third.mp3
 
  val/AudioLoader.sources:
- - /media/CHONK/hugo/spotdl/subsets/constructions
+ - /media/CHONK/hugo/spotdl/subsets/constructions/third.mp3
conf/lora/lora-is-this-charlie-parker.yml CHANGED
@@ -4,7 +4,7 @@ $include:
  fine_tune: True
 
  train/AudioLoader.sources:
- - /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/
+ - /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/Charlie Parker - Donna Lee.mp3
 
  val/AudioLoader.sources:
- - /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/
+ - /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/Charlie Parker - Donna Lee.mp3
conf/lora/lora.yml CHANGED
@@ -8,7 +8,7 @@ train/AudioDataset.n_examples: 10000000
  val/AudioDataset.n_examples: 10
 
 
- NoamScheduler.warmup: 250
+ NoamScheduler.warmup: 400
 
  epoch_length: 100
  save_audio_epochs: 2
conf/lora/underworld.yml ADDED
@@ -0,0 +1,10 @@
+ $include:
+ - conf/lora/lora.yml
+
+ fine_tune: True
+
+ train/AudioLoader.sources:
+ - /media/CHONK/hugo/spotdl/subsets/underworld.mp3
+
+ val/AudioLoader.sources:
+ - /media/CHONK/hugo/spotdl/subsets/underworld.mp3
conf/vampnet-groovemidi.yml DELETED
@@ -1,54 +0,0 @@
- $include:
- - conf/vampnet.yml
-
- VampNet.embedding_dim: 512
- VampNet.n_layers: 12
- VampNet.n_heads: 8
-
- AudioDataset.duration: 12.0
-
- train/AudioDataset.n_examples: 10000000
- train/AudioLoader.sources:
- # drummer 1 sessions 1, 2, and 3
- - /data/e-gmd-v1.0.0/drummer1/session1
- - /data/e-gmd-v1.0.0/drummer1/session2
- - /data/e-gmd-v1.0.0/drummer1/session3
- # drummer 3 sessions 1 and 2
- - /data/e-gmd-v1.0.0/drummer3/session1
- - /data/e-gmd-v1.0.0/drummer3/session2
- # drummer 4 session 1
- - /data/e-gmd-v1.0.0/drummer4/session1
- # drummer 5 sessions 1 and 2
- - /data/e-gmd-v1.0.0/drummer5/session1
- - /data/e-gmd-v1.0.0/drummer5/session2
- # drummer 6 session 1, 2, and 3
- - /data/e-gmd-v1.0.0/drummer6/session1
- - /data/e-gmd-v1.0.0/drummer6/session2
- - /data/e-gmd-v1.0.0/drummer6/session3
- # drummer 7 session 1, 2 and 3
- - /data/e-gmd-v1.0.0/drummer7/session1
- - /data/e-gmd-v1.0.0/drummer7/session2
- - /data/e-gmd-v1.0.0/drummer7/session3
- # drummer 8 session 1
- - /data/e-gmd-v1.0.0/drummer8/session1
- # drummer 9 session 1
- - /data/e-gmd-v1.0.0/drummer9/session1
- # drummer 10 session 1
- - /data/e-gmd-v1.0.0/drummer10/session1
-
-
- val/AudioDataset.n_examples: 500
- val/AudioLoader.sources:
- # drummer 1 eval session
- - /data/e-gmd-v1.0.0/drummer1/eval_session
- # drummer 5 eval session
- - /data/e-gmd-v1.0.0/drummer5/eval_session
- # drummer 7 eval session
- - /data/e-gmd-v1.0.0/drummer7/eval_session
-
-
-
- test/AudioDataset.n_examples: 1000
- test/AudioLoader.sources:
- # drummer 8 eval session
- - /data/e-gmd-v1.0.0/drummer8/eval_session

conf/vampnet-maestro.yml DELETED
@@ -1,21 +0,0 @@
- $include:
- - conf/vampnet.yml
-
- VampNet.embedding_dim: 512
- VampNet.n_layers: 12
- VampNet.n_heads: 8
-
- AudioDataset.duration: 12.0
-
- train/AudioDataset.n_examples: 10000000
- train/AudioLoader.sources:
- - /data/maestro-reorg/train
-
- val/AudioDataset.n_examples: 500
- val/AudioLoader.sources:
- - /data/maestro-reorg/val
-
-
- test/AudioDataset.n_examples: 1000
- test/AudioLoader.sources:
- - /data/maestro-reorg/test

demo.py CHANGED
@@ -62,6 +62,7 @@ def load_random_audio():
 def ez_vamp(
     input_audio, init_temp, final_temp,
     mask_periodic_amt, mask_periodic_width, num_steps,
+    stretch_factor,
 ):
     print(input_audio)
     sig = at.AudioSignal(input_audio)
@@ -74,7 +75,8 @@ def ez_vamp(
         prefix_dur_s=0.0,
         suffix_dur_s=0.0,
         num_vamps=1,
-        downsample_factor=mask_periodic_amt,
+        downsample_factor=mask_periodic_amt,
+        stretch_factor=stretch_factor,
         periodic_width=mask_periodic_width,
         periodic_dropout=0.0,
         periodic_width_dropout=0.0,
@@ -105,7 +107,7 @@ def vamp(
     num_vamps, mode, use_beats, num_steps, snap_to_beats,
     beat_unmask_drop, mask_periodic_width,
     mask_periodic_dropout, mask_periodic_width_dropout,
-    n_conditioning_codebooks, use_coarse2fine
+    n_conditioning_codebooks, use_coarse2fine, stretch_factor,
 ):
     # try:
     print(input_audio)
@@ -146,6 +148,7 @@ def vamp(
         suffix_dur_s=suffix_s,
         num_vamps=num_vamps,
         downsample_factor=mask_periodic_amt,
+        stretch_factor=stretch_factor,
         periodic_width=mask_periodic_width,
         periodic_dropout=mask_periodic_dropout,
         periodic_width_dropout=mask_periodic_width_dropout,
@@ -158,7 +161,7 @@
 
     if use_coarse2fine:
         zv = interface.coarse_to_fine(zv)
-    # mask = interface.to_signal(mask_z).cpu()
+    mask = interface.to_signal(mask_z).cpu()
 
     sig = interface.to_signal(zv).cpu()
     print("done")
@@ -166,9 +169,9 @@
     out_dir = OUT_DIR / str(uuid.uuid4())
     out_dir.mkdir()
     sig.write(out_dir / "output.wav")
-    # mask.write(out_dir / "mask.wav")
-    # return sig.path_to_file, mask.path_to_file
-    return sig.path_to_file, None
+    mask.write(out_dir / "mask.wav")
+    return sig.path_to_file, mask.path_to_file
+    # return sig.path_to_file, mask_z
     # except Exception as e:
     #     raise gr.Error(f"failed with error: {e}")
 
@@ -180,7 +183,7 @@ def save_vamp(
     mask_up_chk, up_factor,
     num_vamps, mode, output_audio, notes, use_beats, num_steps, snap_to_beats,
     beat_unmask_drop, mask_periodic_width, mask_periodic_dropout, mask_periodic_width_dropout,
-    n_conditioning_codebooks, use_coarse2fine
+    n_conditioning_codebooks, use_coarse2fine, stretch_factor
 ):
     out_dir = OUT_DIR / "saved" / str(uuid.uuid4())
     out_dir.mkdir(parents=True, exist_ok=True)
@@ -215,6 +218,7 @@ def save_vamp(
         "mask_periodic_width_dropout": mask_periodic_width_dropout,
         "n_conditioning_codebooks": n_conditioning_codebooks,
         "use_coarse2fine": use_coarse2fine,
+        "stretch_factor": stretch_factor,
     }
 
     # save with yaml
@@ -333,6 +337,14 @@ with gr.Blocks() as demo:
             precision=0,
         )
 
+        stretch_factor = gr.Slider(
+            label="time stretch factor",
+            minimum=0,
+            maximum=64,
+            step=1,
+            value=1,
+        )
+
         mask_periodic_amt = gr.Slider(
             label="periodic hint (0.0 means no hint, 2 - lots of hints, 8 - a couple of hints, 16 - occasional hint, 32 - very occasional hint, etc)",
             minimum=0,
@@ -501,7 +513,7 @@ with gr.Blocks() as demo:
            num_vamps, mode, use_beats, num_steps, snap_to_beats,
            beat_unmask_drop, mask_periodic_width,
            mask_periodic_dropout, mask_periodic_width_dropout,
-           n_conditioning_codebooks, use_coarse2fine
+           n_conditioning_codebooks, use_coarse2fine, stretch_factor
         ],
         outputs=[output_audio, audio_mask],
         api_name="vamp"
@@ -520,7 +532,7 @@
            notes_text, use_beats, num_steps, snap_to_beats,
            beat_unmask_drop, mask_periodic_width,
            mask_periodic_dropout, mask_periodic_width_dropout,
-           n_conditioning_codebooks, use_coarse2fine
+           n_conditioning_codebooks, use_coarse2fine, stretch_factor
         ],
         outputs=[thank_you, download_file]
     )
@@ -529,7 +541,7 @@
     ez_vamp_button.click(
         fn=ez_vamp,
         inputs=[input_audio, init_temp, final_temp, mask_periodic_amt,
-            mask_periodic_width, num_steps ],
+            mask_periodic_width, num_steps, stretch_factor ],
         outputs=[output_audio],
         api_name="ez_vamp"
    )
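The demo.py changes above are one repeated pattern: add a gr.Slider for the new stretch_factor control, append stretch_factor to each function signature, and thread it through the inputs list of every .click() call. A stripped-down sketch of that wiring pattern (the process function is a hypothetical stand-in, not the real vamp logic):

    import gradio as gr

    def process(audio_path, stretch_factor):
        # stand-in for the real vamp call; just logs the control value
        print(f"stretch_factor={stretch_factor}")
        return audio_path

    with gr.Blocks() as demo:
        input_audio = gr.Audio(type="filepath")
        stretch_factor = gr.Slider(label="time stretch factor",
                                   minimum=0, maximum=64, step=1, value=1)
        output_audio = gr.Audio(type="filepath")
        run = gr.Button("run")
        # a new control has to appear both in the signature and in the inputs list
        run.click(fn=process,
                  inputs=[input_audio, stretch_factor],
                  outputs=[output_audio])

    demo.launch()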
docker-compose.yml DELETED
@@ -1,92 +0,0 @@
-
- version: "3.5"
- services:
-   tensorrt:
-     build:
-       context: .
-       dockerfile: ./deployment_build/dockerfile
-       args:
-         GITHUB_TOKEN: ${GITHUB_TOKEN}
-     profiles:
-       - tensorrt
-     volumes:
-       - ./:/u/home/src
-       - ~/.config/gcloud:/root/.config/gcloud
-     deploy:
-       resources:
-         limits:
-           # match production limits
-           cpus: '7'
-           memory: 25000M
-         reservations:
-           devices:
-             - driver: nvidia
-               count: 1
-               capabilities: [gpu]
-     working_dir: /u/home/src
-     entrypoint:
-       - python
-       - -m
-       - wav2wav.converter
-   base:
-     build:
-       context: .
-       dockerfile: ./Dockerfile
-       args:
-         GITHUB_TOKEN: ${GITHUB_TOKEN}
-     volumes:
-       - .:/u/home/src
-       - ~/.wav2wav:/u/home/.wav2wav
-       - ${PATH_TO_DATA}:/data
-       - ${PATH_TO_RUNS}:/runs
-       - ~/.config/gcloud:/u/home/.config/gcloud
-       - ~/.zsh_history:/u/home/.zsh_history
-     environment:
-       - GITHUB_TOKEN
-       - DISCOURSE_API_USERNAME
-       - DISCOURSE_SERVER
-       - DISCOURSE_API_KEY
-       - HOST_USER_ID
-       - HOST_USER_GID
-       - JUPYTER_TOKEN
-       - PATH_TO_DATA=/data
-       - PATH_TO_RUNS=/runs
-       - TENSORBOARD_PATH
-       - MPLCONFIGDIR=/u/home/.mplconfig
-     shm_size: 32G
-     working_dir: /u/home/src
-     deploy:
-       resources:
-         reservations:
-           devices:
-             - driver: nvidia
-               capabilities: [gpu]
-   dev:
-     extends: base
-     profiles:
-       - interactive
-     stdin_open: true
-     tty: true
-     ports:
-       - 7860:7860
-   jupyter:
-     extends: base
-     ports:
-       - ${JUPYTER_PORT}:8888
-     entrypoint:
-       - /bin/bash
-       - /entry_script_jupyter.sh
-   tensorboard:
-     extends: base
-     ports:
-       - ${TENSORBOARD_PORT}:6006
-     entrypoint:
-       - /bin/bash
-       - /entry_script_tensorboard.sh
-   gradio:
-     extends: base
-     ports:
-       - 7860:7860
-     entrypoint:
-       - /bin/bash
-       - /entry_script_gradio.sh

requirements.txt DELETED
@@ -1,31 +0,0 @@
- argbind>=0.3.1
- pytorch-ignite
- rich
- audiotools @ git+https://github.com/descriptinc/lyrebird-audiotools.git@hf/backup-info
- lac @ git+https://github.com/descriptinc/lyrebird-audio-codec.git@hf/vampnet-temp
- torch==1.13.1
- torchaudio==0.13.1
- tqdm
- tensorboard
- google-cloud-logging==2.2.0
- pytest
- pytest-cov
- pynvml
- psutil
- pandas
- onnx
- onnx-simplifier
- seaborn
- jupyterlab
- jupyterlab-link-share
- pandas
- watchdog
- pesq
- tabulate
- torchmetrics
- codebraid==0.5.0
- jupyter-client==6.1.12
- tensorboardX
- gradio
- einops
- frechet_audio_distance

setup.py CHANGED
@@ -32,12 +32,13 @@ setup(
         "rich",
         "audiotools @ git+https://github.com/hugofloresgarcia/audiotools.git",
         "lac @ git+https://github.com/hugofloresgarcia/lac.git",
-        "wavebeat @ git+https://github.com/hugofloresgarcia/wavebeat.git",
+        # "wavebeat @ git+https://github.com/hugofloresgarcia/wavebeat.git",
         "torch==2.0",
         "tqdm",
         "tensorboard",
         "google-cloud-logging==2.2.0",
         "einops",
-        "frechet_audio_distance"
+        # "frechet_audio_distance",
+        "gradio"
     ],
 )
vampnet/interface.py CHANGED
@@ -249,6 +249,7 @@ class Interface(torch.nn.Module):
         suffix_dur_s: float = 0.0,
         num_vamps: int = 1,
         downsample_factor: int = None,
+        stretch_factor: int = None,
         periodic_width: int = 1,
         periodic_dropout=0.0,
         periodic_width_dropout=0.0,
@@ -269,11 +270,33 @@ class Interface(torch.nn.Module):
         n_prefix = self.s2t(prefix_dur_s)
         n_suffix = self.s2t(suffix_dur_s)
 
+
+        # hmm, should be a better way to do this? think we just need a mask builder class
+        add_random_periodic_offset = True
+
+        if stretch_factor is not None and stretch_factor > 1:
+            print(f"stretching by {stretch_factor}")
+            assert stretch_factor >= 1, "stretch factor must be >= 1"
+            cz = cz.repeat_interleave(stretch_factor, dim=-1)
+
+            # the downsample factor is now relative to the stretched sequence
+            assert downsample_factor is None or downsample_factor <= 2, "downsample_factor must be None when stretch_factor is not None"
+
+            downsample_factor = stretch_factor
+            add_random_periodic_offset = False
+
+            assert n_prefix == 0 and n_suffix == 0, "prefix and suffix must be 0 when stretch_factor is not None"
+            assert ext_mask is None, "ext_mask must be None when stretch_factor is not None"
+
+            # trim cz to the original length
+            cz = cz[:, :, :c_seq_len]
+
+
         assert cz.shape[-1] <= self.s2t(self.coarse.chunk_size_s), f"the sequence of tokens provided must match the one specified in the coarse chunk size, but got {cz.shape[-1]} and {self.s2t(self.coarse.chunk_size_s)}"
         assert n_prefix + n_suffix < c_seq_len, "prefix and suffix must be smaller than the chunk size"
 
         if swap_prefix_suffix:
-            # swap the prefix and suffix regions in c_z
+            # swap the prefix and suffix
             assert n_prefix == n_suffix, "prefix and suffix must be the same size for now"
             cz[:, :, :n_prefix], cz[:, :, c_seq_len-n_suffix:] = cz[:, :, c_seq_len-n_suffix:], cz[:, :, :n_prefix].clone()
 
@@ -295,7 +318,7 @@ class Interface(torch.nn.Module):
             downsample_factor=downsample_factor,
             periodic_width=periodic_width,
             periodic_dropout=periodic_dropout,
-            add_random_periodic_offset=True,
+            add_random_periodic_offset=add_random_periodic_offset,
             periodic_width_dropout=periodic_width_dropout,
             mask=cz_mask,
             ext_mask=ext_mask,
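The stretch_factor branch added above repeats every coarse timestep stretch_factor times along the time axis, trims back to the chunk length, and then leans on the periodic mask (downsample_factor set to stretch_factor, random offset disabled), which appears intended to keep one timestep per repeated group as a hint while the rest are regenerated. A standalone sketch of just the tensor step, with toy shapes:

    import torch

    # fake coarse codes: (batch, n_codebooks, time)
    cz = torch.arange(6).reshape(1, 1, 6)
    stretch_factor = 2
    c_seq_len = cz.shape[-1]

    # repeat every timestep stretch_factor times along the time axis...
    stretched = cz.repeat_interleave(stretch_factor, dim=-1)
    # tensor([[[0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5]]])

    # ...then trim back to the model's chunk length, as the diff does
    stretched = stretched[:, :, :c_seq_len]
    # tensor([[[0, 0, 1, 1, 2, 2]]])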
vampnet/modules/base.py CHANGED
@@ -71,7 +71,7 @@ class VampBase(at.ml.BaseModel):
             probs[i, :, -n:] = 0.0
 
         # if we have a downsample factor, set the mask prob to 0
-        if downsample_factor is not None:
+        if downsample_factor is not None and downsample_factor > 0:
             if not isinstance(downsample_factor, torch.Tensor):
                 downsample_factor = scalar_to_batch_tensor(downsample_factor, x.shape[0])
             for i, factor in enumerate(downsample_factor):
@@ -200,7 +200,6 @@ class VampBase(at.ml.BaseModel):
         # find where the mask token is and replace it with silence in the audio
         for tstep in range(z.shape[-1]):
             if torch.any(z[:, :, tstep] == self.mask_token):
-                print("mask token found at step", tstep)
                 sample_idx_0 = tstep * codec.hop_length
                 sample_idx_1 = sample_idx_0 + codec.hop_length
                 signal.samples[:, :, sample_idx_0:sample_idx_1] = 0.0
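For context on the downsample_factor > 0 guard above: per the comment in the hunk, a downsample factor zeroes the masking probability at every factor-th timestep so those tokens survive as periodic hints, and the added check keeps a factor of 0 (the "no hint" slider value in the demo) from entering that branch. A rough standalone sketch of the idea, not the VampBase implementation:

    import torch

    batch, seq_len = 2, 12
    probs = torch.ones(batch, seq_len)   # per-timestep probability of being masked
    downsample_factor = 4

    if downsample_factor is not None and downsample_factor > 0:
        for i in range(batch):
            # keep (never mask) one timestep out of every `downsample_factor`
            probs[i, ::downsample_factor] = 0.0

    print(probs[0])
    # tensor([0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1.])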