Hugo Flores Garcia committed on
Commit
c940f25
1 Parent(s): 881d56d

fix dropout bug for masks, refactor interfaces, add finetune setup script

Browse files
conf/generated/berta-goldman-speech/c2f.yml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ AudioDataset.duration: 3.0
4
+ AudioDataset.loudness_cutoff: -40.0
5
+ VampNet.embedding_dim: 1280
6
+ VampNet.n_codebooks: 14
7
+ VampNet.n_conditioning_codebooks: 4
8
+ VampNet.n_heads: 20
9
+ VampNet.n_layers: 16
10
+ fine_tune: true
11
+ save_path: ./runs/berta-goldman-speech/c2f
12
+ train/AudioLoader.sources:
13
+ - /media/CHONK/hugo/Berta-Caceres-2015-Goldman-Speech.mp3
14
+ val/AudioLoader.sources:
15
+ - /media/CHONK/hugo/Berta-Caceres-2015-Goldman-Speech.mp3
conf/generated/berta-goldman-speech/coarse.yml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ fine_tune: true
4
+ save_path: ./runs/berta-goldman-speech/coarse
5
+ train/AudioLoader.sources:
6
+ - /media/CHONK/hugo/Berta-Caceres-2015-Goldman-Speech.mp3
7
+ val/AudioLoader.sources:
8
+ - /media/CHONK/hugo/Berta-Caceres-2015-Goldman-Speech.mp3
conf/generated/berta-goldman-speech/interface.yml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ AudioLoader.sources:
2
+ - /media/CHONK/hugo/Berta-Caceres-2015-Goldman-Speech.mp3
3
+ Interface.coarse2fine_ckpt: ./runs/berta-goldman-speech/c2f/best/vampnet/weights.pth
4
+ Interface.coarse_ckpt: ./runs/berta-goldman-speech/coarse/best/vampnet/weights.pth
5
+ Interface.codec_ckpt: ./models/spotdl/codec.pth
conf/generated/nasralla/c2f.yml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ AudioDataset.duration: 3.0
4
+ AudioDataset.loudness_cutoff: -40.0
5
+ VampNet.embedding_dim: 1280
6
+ VampNet.n_codebooks: 14
7
+ VampNet.n_conditioning_codebooks: 4
8
+ VampNet.n_heads: 20
9
+ VampNet.n_layers: 16
10
+ fine_tune: true
11
+ save_path: ./runs/nasralla/c2f
12
+ train/AudioLoader.sources:
13
+ - /media/CHONK/hugo/nasralla
14
+ val/AudioLoader.sources:
15
+ - /media/CHONK/hugo/nasralla
conf/generated/nasralla/coarse.yml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ $include:
2
+ - conf/lora/lora.yml
3
+ fine_tune: true
4
+ save_path: ./runs/nasralla/coarse
5
+ train/AudioLoader.sources:
6
+ - /media/CHONK/hugo/nasralla
7
+ val/AudioLoader.sources:
8
+ - /media/CHONK/hugo/nasralla
conf/generated/nasralla/interface.yml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ AudioLoader.sources:
2
+ - /media/CHONK/hugo/nasralla
3
+ Interface.coarse2fine_ckpt: ./runs/nasralla/c2f/best/vampnet/weights.pth
4
+ Interface.coarse_ckpt: ./runs/nasralla/coarse/best/vampnet/weights.pth
5
+ Interface.codec_ckpt: ./models/spotdl/codec.pth
conf/interface/spotdl.yml CHANGED
@@ -7,5 +7,6 @@ Interface.coarse2fine_chunk_size_s: 3
7
 
8
 
9
  AudioLoader.sources:
10
- - /data/spotdl/audio/val
11
- - /data/spotdl/audio/test
 
 
7
 
8
 
9
  AudioLoader.sources:
10
+ # - /media/CHONK/hugo/spotdl/subsets/jazz-blues
11
+ - /media/CHONK/null
12
+
demo.py CHANGED
@@ -63,9 +63,11 @@ def load_random_audio():
63
 
64
 
65
  def _vamp(data, return_mask=False):
66
- print(data)
67
- print(data[input_audio])
68
  sig = at.AudioSignal(data[input_audio])
 
 
69
 
70
  # TODO: random pitch shift of segments in the signal to prompt! window size should be a parameter, pitch shift width should be a parameter
71
 
@@ -98,7 +100,9 @@ def _vamp(data, return_mask=False):
98
  mask = pmask.dropout(mask, data[dropout])
99
  mask = pmask.codebook_unmask(mask, ncc)
100
 
101
- print(f"created mask with: linear random {data[rand_mask_intensity]}, inpaint {data[prefix_s]}:{data[suffix_s]}, periodic {data[periodic_p]}:{data[periodic_w]}, dropout {data[dropout]}")
 
 
102
 
103
  zv, mask_z = interface.coarse_vamp(
104
  z,
@@ -114,8 +118,7 @@ def _vamp(data, return_mask=False):
114
  sig = interface.to_signal(zv).cpu()
115
  print("done")
116
 
117
- out_dir = OUT_DIR / str(uuid.uuid4())
118
- out_dir.mkdir()
119
 
120
  sig.write(out_dir / "output.wav")
121
 
@@ -136,13 +139,13 @@ def save_vamp(data):
136
  out_dir = OUT_DIR / "saved" / str(uuid.uuid4())
137
  out_dir.mkdir(parents=True, exist_ok=True)
138
 
139
- sig_in = at.AudioSignal(input_audio)
140
- sig_out = at.AudioSignal(output_audio)
141
 
142
  sig_in.write(out_dir / "input.wav")
143
  sig_out.write(out_dir / "output.wav")
144
 
145
- data = {
146
  "init_temp": data[init_temp],
147
  "final_temp": data[final_temp],
148
  "prefix_s": data[prefix_s],
@@ -159,7 +162,7 @@ def save_vamp(data):
159
 
160
  # save with yaml
161
  with open(out_dir / "data.yaml", "w") as f:
162
- yaml.dump(data, f)
163
 
164
  import zipfile
165
  zip_path = out_dir.with_suffix(".zip")
@@ -321,6 +324,8 @@ with gr.Blocks() as demo:
321
  type="filepath"
322
  )
323
 
 
 
324
 
325
  # with gr.Column():
326
  # with gr.Accordion(label="beat unmask (how much time around the beat should be hinted?)"):
@@ -386,9 +391,15 @@ with gr.Blocks() as demo:
386
  api_name="vamp"
387
  )
388
 
 
 
 
 
 
 
389
  save_button.click(
390
  fn=save_vamp,
391
- inputs=_inputs | {notes_text},
392
  outputs=[thank_you, download_file]
393
  )
394
 
 
63
 
64
 
65
  def _vamp(data, return_mask=False):
66
+ out_dir = OUT_DIR / str(uuid.uuid4())
67
+ out_dir.mkdir()
68
  sig = at.AudioSignal(data[input_audio])
69
+ #pitch shift input
70
+ sig = sig.shift_pitch(data[input_pitch_shift])
71
 
72
  # TODO: random pitch shift of segments in the signal to prompt! window size should be a parameter, pitch shift width should be a parameter
73
 
 
100
  mask = pmask.dropout(mask, data[dropout])
101
  mask = pmask.codebook_unmask(mask, ncc)
102
 
103
+ print(f"created mask with: linear random {data[rand_mask_intensity]}, inpaint {data[prefix_s]}:{data[suffix_s]}, periodic {data[periodic_p]}:{data[periodic_w]}, dropout {data[dropout]}, codebook unmask {ncc}, onset mask {data[onset_mask_width]}, num steps {data[num_steps]}, init temp {data[init_temp]}, final temp {data[final_temp]}, use coarse2fine {data[use_coarse2fine]}")
104
+ # save the mask as a txt file
105
+ np.savetxt(out_dir / "mask.txt", mask[:,0,:].long().cpu().numpy())
106
 
107
  zv, mask_z = interface.coarse_vamp(
108
  z,
 
118
  sig = interface.to_signal(zv).cpu()
119
  print("done")
120
 
121
+
 
122
 
123
  sig.write(out_dir / "output.wav")
124
 
 
139
  out_dir = OUT_DIR / "saved" / str(uuid.uuid4())
140
  out_dir.mkdir(parents=True, exist_ok=True)
141
 
142
+ sig_in = at.AudioSignal(data[input_audio])
143
+ sig_out = at.AudioSignal(data[output_audio])
144
 
145
  sig_in.write(out_dir / "input.wav")
146
  sig_out.write(out_dir / "output.wav")
147
 
148
+ _data = {
149
  "init_temp": data[init_temp],
150
  "final_temp": data[final_temp],
151
  "prefix_s": data[prefix_s],
 
162
 
163
  # save with yaml
164
  with open(out_dir / "data.yaml", "w") as f:
165
+ yaml.dump(_data, f)
166
 
167
  import zipfile
168
  zip_path = out_dir.with_suffix(".zip")
 
324
  type="filepath"
325
  )
326
 
327
+ use_as_input_button = gr.Button("use as input")
328
+
329
 
330
  # with gr.Column():
331
  # with gr.Accordion(label="beat unmask (how much time around the beat should be hinted?)"):
 
391
  api_name="vamp"
392
  )
393
 
394
+ use_as_input_button.click(
395
+ fn=lambda x: x,
396
+ inputs=[output_audio],
397
+ outputs=[input_audio]
398
+ )
399
+
400
  save_button.click(
401
  fn=save_vamp,
402
+ inputs=_inputs | {notes_text, output_audio},
403
  outputs=[thank_you, download_file]
404
  )
405
 
scripts/exp/fine_tune.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Generate fine-tuning configs for a named run and seed starter weights.

Writes conf/generated/<name>/{c2f,coarse,interface}.yml and copies the
pretrained spotdl checkpoints into ./runs/<name>/{c2f,coarse}/starter/.

Usage:
    python scripts/exp/fine_tune.py <audio_file_or_folder> <name>
"""
import shutil
from pathlib import Path

import argbind
import yaml


@argbind.bind(without_prefix=True, positional=True)
def fine_tune(audio_file_or_folder: str, name: str):
    """Emit the three generated config files and copy starter weights.

    Args:
        audio_file_or_folder: audio source used for both the train and
            val AudioLoader splits (a single file or a directory).
        name: run name; used for both conf/generated/<name>/ and
            ./runs/<name>/.
    """
    conf_dir = Path("conf")
    assert conf_dir.exists(), "conf directory not found. are you in the vampnet directory?"

    conf_dir = conf_dir / "generated"
    conf_dir.mkdir(exist_ok=True)

    finetune_dir = conf_dir / name
    finetune_dir.mkdir(exist_ok=True)

    # c2f additionally overrides the VampNet architecture hyperparameters
    # and dataset windowing; coarse only needs the LoRA defaults.
    finetune_c2f_conf = {
        "$include": ["conf/lora/lora.yml"],
        "fine_tune": True,
        "train/AudioLoader.sources": [audio_file_or_folder],
        "val/AudioLoader.sources": [audio_file_or_folder],
        "VampNet.n_codebooks": 14,
        "VampNet.n_conditioning_codebooks": 4,
        "VampNet.embedding_dim": 1280,
        "VampNet.n_layers": 16,
        "VampNet.n_heads": 20,
        "AudioDataset.duration": 3.0,
        "AudioDataset.loudness_cutoff": -40.0,
        "save_path": f"./runs/{name}/c2f",
    }

    finetune_coarse_conf = {
        "$include": ["conf/lora/lora.yml"],
        "fine_tune": True,
        "train/AudioLoader.sources": [audio_file_or_folder],
        "val/AudioLoader.sources": [audio_file_or_folder],
        "save_path": f"./runs/{name}/coarse",
    }

    interface_conf = {
        "Interface.coarse_ckpt": f"./runs/{name}/coarse/best/vampnet/weights.pth",
        "Interface.coarse2fine_ckpt": f"./runs/{name}/c2f/best/vampnet/weights.pth",
        "Interface.codec_ckpt": "./models/spotdl/codec.pth",
        "AudioLoader.sources": [audio_file_or_folder],
    }

    # save the confs
    confs = {
        "c2f.yml": finetune_c2f_conf,
        "coarse.yml": finetune_coarse_conf,
        "interface.yml": interface_conf,
    }
    for filename, conf in confs.items():
        with open(finetune_dir / filename, "w") as f:
            yaml.dump(conf, f)

    def pmkdir(path):
        # ensure the destination's parent directory exists, then hand the
        # path straight back so it can be used inline as a copy target
        Path(path).parent.mkdir(exist_ok=True, parents=True)
        return path

    # copy the starter weights into the run dirs so training can resume
    # from them with --tag starter
    shutil.copy("./models/spotdl/c2f.pth", pmkdir(f"./runs/{name}/c2f/starter/vampnet/weights.pth"))
    shutil.copy("./models/spotdl/coarse.pth", pmkdir(f"./runs/{name}/coarse/starter/vampnet/weights.pth"))

    print(f"generated confs in {finetune_dir}. run training jobs with `python scripts/exp/train.py --args.load {finetune_dir}/<c2f/coarse>.yml --resume --load_weights --tag starter` ")


if __name__ == "__main__":
    args = argbind.parse_args()

    with argbind.scope(args):
        fine_tune()
+
vampnet/mask.py CHANGED
@@ -151,9 +151,13 @@ def dropout(
151
  mask: torch.Tensor,
152
  p: float,
153
  ):
154
- # negate the mask (we want the 0s to be 1s, since we want to drop the prompt, not the mask)
155
- mask = (~(mask.bool())).long()
156
- return torch.nn.functional.dropout(mask.float(), p=p, training=True).long().bool().long()
 
 
 
 
157
 
158
  def mask_or(
159
  mask1: torch.Tensor,
@@ -191,7 +195,8 @@ def onset_mask(
191
  onset_indices = librosa.onset.onset_detect(
192
  y=sig.clone().to_mono().samples.cpu().numpy()[0, 0],
193
  sr=sig.sample_rate,
194
- hop_length=interface.codec.hop_length
 
195
  )
196
 
197
  # create a mask, set onset
 
def dropout(
    mask: torch.Tensor,
    p: float,
):
    """Randomly re-mask a fraction ``p`` of the *prompt* positions.

    Convention: 1 = masked (to be generated), 0 = prompt (kept). The
    mask is inverted so the Bernoulli draw acts on the prompt positions:
    each 0 flips to 1 with probability ``p``. Hence ``p == 0`` is a
    no-op and ``p == 1`` masks everything.

    Args:
        mask: binary tensor of 0s and 1s.
        p: drop probability in [0, 1].

    Returns:
        A ``long`` tensor with the same shape as ``mask``.
    """
    assert 0 <= p <= 1, "p must be between 0 and 1"
    assert mask.max() <= 1, "mask must be binary"
    assert mask.min() >= 0, "mask must be binary"
    # keep each prompt position with probability (1 - p)
    kept = torch.bernoulli((~mask.bool()).float() * (1 - p))
    # bernoulli already returns exact 0s/1s, so no rounding is needed
    return (~kept.bool()).long()
161
 
162
  def mask_or(
163
  mask1: torch.Tensor,
 
195
  onset_indices = librosa.onset.onset_detect(
196
  y=sig.clone().to_mono().samples.cpu().numpy()[0, 0],
197
  sr=sig.sample_rate,
198
+ hop_length=interface.codec.hop_length,
199
+ backtrack=True,
200
  )
201
 
202
  # create a mask, set onset