root committed on
Commit a344f64 · 1 Parent(s): de86ffd

initial commit

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. README.md +3 -3
  2. app.py +232 -0
  3. configs/inference.yaml +284 -0
  4. configs/inference_1.5.yaml +302 -0
  5. configs/inference_2.yaml +302 -0
  6. configs/inference_long.yaml +284 -0
  7. configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml +255 -0
  8. configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml +183 -0
  9. configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml +483 -0
  10. configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml +284 -0
  11. data/__pycache__/data.cpython-38.pyc +0 -0
  12. data/data.py +669 -0
  13. data/prepare_each_dataset.py +0 -0
  14. eval/README.md +1 -0
  15. eval/__init__.py +0 -0
  16. eval/inference.py +229 -0
  17. eval/inference.sh +55 -0
  18. eval/interactive.sh +8 -0
  19. eval/keep_run.sh +64 -0
  20. eval/submit.sh +54 -0
  21. eval/submit_2.sh +49 -0
  22. my_laion_clap/CLAP/LICENSE +121 -0
  23. my_laion_clap/CLAP/MANIFEST.in +3 -0
  24. my_laion_clap/CLAP/README.md +287 -0
  25. my_laion_clap/CLAP/assets/audioclip-arch.png +0 -0
  26. my_laion_clap/CLAP/assets/clap-zeroshot.PNG +0 -0
  27. my_laion_clap/CLAP/assets/logo.PNG +0 -0
  28. my_laion_clap/CLAP/experiment_scripts/esc50_api.py +48 -0
  29. my_laion_clap/CLAP/experiment_scripts/eval_retrieval_freesound.sh +63 -0
  30. my_laion_clap/CLAP/experiment_scripts/finetune-esc50.sh +70 -0
  31. my_laion_clap/CLAP/experiment_scripts/finetune-fsd50k.sh +70 -0
  32. my_laion_clap/CLAP/experiment_scripts/htsat-roberta-large-dataset-fusion.sh +70 -0
  33. my_laion_clap/CLAP/experiment_scripts/train-htsat-roberta.sh +66 -0
  34. my_laion_clap/CLAP/experiment_scripts/train-only-clotho.sh +28 -0
  35. my_laion_clap/CLAP/experiment_scripts/train-pann-roberta.sh +66 -0
  36. my_laion_clap/CLAP/experiment_scripts/zeroshot_esc50.sh +19 -0
  37. my_laion_clap/CLAP/pyproject.toml +54 -0
  38. my_laion_clap/CLAP/requirements.txt +16 -0
  39. my_laion_clap/CLAP/src/laion_clap/__init__.py +5 -0
  40. my_laion_clap/CLAP/src/laion_clap/__pycache__/__init__.cpython-38.pyc +0 -0
  41. my_laion_clap/CLAP/src/laion_clap/__pycache__/hook.cpython-38.pyc +0 -0
  42. my_laion_clap/CLAP/src/laion_clap/clap_module/__init__.py +8 -0
  43. my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/__init__.cpython-38.pyc +0 -0
  44. my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/factory.cpython-38.pyc +0 -0
  45. my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/feature_fusion.cpython-38.pyc +0 -0
  46. my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/htsat.cpython-38.pyc +0 -0
  47. my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/loss.cpython-38.pyc +0 -0
  48. my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/model.cpython-38.pyc +0 -0
  49. my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/openai.cpython-38.pyc +0 -0
  50. my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/pann_model.cpython-38.pyc +0 -0
README.md CHANGED
@@ -1,14 +1,14 @@
  ---
  title: Audio Flamingo 2
- emoji: 🐢
+ emoji: 🏃
- colorFrom: purple
+ colorFrom: yellow
  colorTo: red
  sdk: gradio
  sdk_version: 5.15.0
  app_file: app.py
  pinned: false
  license: apache-2.0
- short_description: NVIDIA Audio Flamingo 2 Demo
+ short_description: Audio Flamingo 2 Demo
  ---
 
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,232 @@
+ import os
+ import yaml
+ import json
+ import torch
+ import spaces
+ import librosa
+ import argparse
+ import numpy as np
+ import gradio as gr
+ from tqdm import tqdm
+ import soundfile as sf
+ from pydub import AudioSegment
+ from safetensors.torch import load_file
+ from huggingface_hub import snapshot_download
+
+ from data.data import get_audiotext_dataloader
+ from src.factory import create_model_and_transforms
+ from train.train_utils import Dict2Class, get_autocast, get_cast_dtype
+
+ def int16_to_float32(x):
+     return (x / 32767.0).astype(np.float32)
+
+ def float32_to_int16(x):
+     x = np.clip(x, a_min=-1., a_max=1.)
+     return (x * 32767.).astype(np.int16)
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ snapshot_download(repo_id="SreyanG-NVIDIA/audio-flamingo-2", local_dir="./")
+
+ config = yaml.load(open("configs/inference.yaml"), Loader=yaml.FullLoader)
+
+ data_config = config['data_config']
+ model_config = config['model_config']
+ clap_config = config['clap_config']
+ args = Dict2Class(config['train_config'])
+
+ model, tokenizer = create_model_and_transforms(
+     **model_config,
+     clap_config=clap_config,
+     use_local_files=args.offline,
+     gradient_checkpointing=args.gradient_checkpointing,
+     freeze_lm_embeddings=args.freeze_lm_embeddings,
+ )
+
+ device_id = 0
+ model = model.to(device_id)
+ model.eval()
+
+ # Load metadata
+ with open("safe_ckpt/metadata.json", "r") as f:
+     metadata = json.load(f)
+
+ # Reconstruct the full state_dict
+ state_dict = {}
+
+ # Load each SafeTensors chunk
+ for chunk_name in metadata:
+     chunk_path = f"safe_ckpt/{chunk_name}.safetensors"
+     chunk_tensors = load_file(chunk_path)
+
+     # Merge tensors into state_dict
+     state_dict.update(chunk_tensors)
+
+ x, y = model.load_state_dict(state_dict, False)
+
+ autocast = get_autocast(
+     args.precision, cache_enabled=(not args.fsdp)
+ )
+
+ cast_dtype = get_cast_dtype(args.precision)
+
+ def get_num_windows(T, sr):
+
+     window_length = int(float(clap_config["window_length"]) * sr)
+     window_overlap = int(float(clap_config["window_overlap"]) * sr)
+     max_num_window = int(clap_config["max_num_window"])
+
+     num_windows = 1
+     if T <= window_length:
+         num_windows = 1
+         full_length = window_length
+     elif T >= (max_num_window * window_length - (max_num_window - 1) * window_overlap):
+         num_windows = max_num_window
+         full_length = (max_num_window * window_length - (max_num_window - 1) * window_overlap)
+     else:
+         num_windows = 1 + int(np.ceil((T - window_length) / float(window_length - window_overlap)))
+         full_length = num_windows * window_length - (num_windows - 1) * window_overlap
+
+     return num_windows, full_length
+
+
+ def read_audio(file_path, target_sr=16000, duration=30.0, start=0.0):
+
+     if file_path.endswith('.mp3'):
+         audio = AudioSegment.from_file(file_path)
+         if len(audio) > (start + duration) * 1000:
+             audio = audio[start * 1000:(start + duration) * 1000]
+
+         if audio.frame_rate != target_sr:
+             audio = audio.set_frame_rate(target_sr)
+
+         if audio.channels > 1:
+             audio = audio.set_channels(1)
+
+         data = np.array(audio.get_array_of_samples())
+         if audio.sample_width == 2:
+             data = data.astype(np.float32) / np.iinfo(np.int16).max
+         elif audio.sample_width == 4:
+             data = data.astype(np.float32) / np.iinfo(np.int32).max
+         else:
+             raise ValueError("Unsupported bit depth: {}".format(audio.sample_width))
+
+     else:
+         with sf.SoundFile(file_path) as audio:
+             original_sr = audio.samplerate
+             channels = audio.channels
+
+             max_frames = int((start + duration) * original_sr)
+
+             audio.seek(int(start * original_sr))
+             frames_to_read = min(max_frames, len(audio))
+             data = audio.read(frames_to_read)
+
+             if data.max() > 1 or data.min() < -1:
+                 data = data / max(abs(data.max()), abs(data.min()))
+
+         if original_sr != target_sr:
+             if channels == 1:
+                 data = librosa.resample(data.flatten(), orig_sr=original_sr, target_sr=target_sr)
+             else:
+                 data = librosa.resample(data.T, orig_sr=original_sr, target_sr=target_sr)[0]
+         else:
+             if channels != 1:
+                 data = data.T[0]
+
+     if data.min() >= 0:
+         data = 2 * data / abs(data.max()) - 1.0
+     else:
+         data = data / max(abs(data.max()), abs(data.min()))
+
+     assert len(data.shape) == 1, data.shape
+     return data
+
+ def load_audio(audio_path):
+
+     sr = 16000
+     window_length = int(float(clap_config["window_length"]) * sr)
+     window_overlap = int(float(clap_config["window_overlap"]) * sr)
+     max_num_window = int(clap_config["max_num_window"])
+     duration = max_num_window * (clap_config["window_length"] - clap_config["window_overlap"]) + clap_config["window_overlap"]
+
+     audio_data = read_audio(audio_path, sr, duration, 0.0)  # hard-code audio start to 0.0
+     T = len(audio_data)
+     num_windows, full_length = get_num_windows(T, sr)
+
+     # pad with zeros so the clip exactly covers full_length
+     if full_length > T:
+         audio_data = np.append(audio_data, np.zeros(full_length - T))
+
+     audio_data = audio_data.reshape(1, -1)
+     audio_data_tensor = torch.from_numpy(int16_to_float32(float32_to_int16(audio_data))).float()
+
+     audio_clips = []
+     audio_embed_mask = torch.ones(num_windows)
+     for i in range(num_windows):
+         start = i * (window_length - window_overlap)
+         audio_data_tensor_this = audio_data_tensor[:, start:start+window_length]
+         audio_clips.append(audio_data_tensor_this)
+
+     if len(audio_clips) < max_num_window:
+         audio_clips = audio_clips[:max_num_window]
+         audio_embed_mask = audio_embed_mask[:max_num_window]
+
+     audio_clips = torch.cat(audio_clips)
+
+     return audio_clips, audio_embed_mask
+
+ @spaces.GPU
+ def predict(filepath, question):
+
+     audio_clips, audio_embed_mask = load_audio(filepath)
+     audio_clips = audio_clips.to(device_id, dtype=cast_dtype, non_blocking=True)
+     audio_embed_mask = audio_embed_mask.to(device_id, dtype=cast_dtype, non_blocking=True)
+
+     text_prompt = str(question).lower()
+     text_output = str(question).lower()
+
+     sample = f"<audio>{text_prompt.strip()}{tokenizer.sep_token}"
+     # None<|endofchunk|>{tokenizer.eos_token}"
+
+     text = tokenizer(
+         sample,
+         max_length=512,
+         padding="longest",
+         truncation="only_first",
+         return_tensors="pt"
+     )
+
+     input_ids = text["input_ids"].to(device_id, non_blocking=True)
+
+     media_token_id = tokenizer.encode("<audio>")[-1]
+     sep_token_id = tokenizer.sep_token_id
+
+     prompt = input_ids
+
+     with torch.no_grad():
+         output = model.generate(
+             audio_x=audio_clips.unsqueeze(0),
+             audio_x_mask=audio_embed_mask.unsqueeze(0),
+             lang_x=prompt,
+             eos_token_id=tokenizer.eos_token_id,
+             max_new_tokens=256,
+             temperature=0.0)[0]
+
+     output_decoded = tokenizer.decode(output).split(tokenizer.sep_token)[-1].replace(tokenizer.eos_token, '').replace(tokenizer.pad_token, '').replace('<|endofchunk|>', '')
+
+     return output_decoded
+
+ link = "TBD"
+ text = "[Github]"
+ paper_link = "https://github.com/NVIDIA/audio-flamingo/"
+ paper_text = "TBD"
+ demo = gr.Interface(fn=predict,
+     inputs=[gr.Audio(type="filepath"), gr.Textbox(value='Describe the audio.', label='Edit the textbox to ask your own questions!')],
+     outputs=[gr.Textbox(label="Audio Flamingo 2 Output")],
+     cache_examples=True,
+     title="Audio Flamingo 2 Demo",
+     description="Audio Flamingo 2 is NVIDIA's latest Large Audio-Language Model, capable of understanding audio inputs and answering open-ended questions about them. " + f"<a href='{paper_link}'>{paper_text}</a> " + f"<a href='{link}'>{text}</a> <br>" +
+     "**Audio Flamingo 2 is not an ASR model and has limited ability to recognize speech content. It primarily focuses on perception and understanding of non-speech sounds and music.**<br>" +
+     "The demo is hosted on the Stage 2 checkpoints and supports up to 90 seconds of audio. Stage 3 checkpoints that support up to 5 minutes will be released at a later point.")
+ demo.launch(share=True)
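
Note (not part of the commit): the get_num_windows / load_audio logic above slices a clip into fixed-length CLAP windows and zero-pads the tail. A minimal standalone sketch of that arithmetic, assuming the configs/inference.yaml values (10 s windows, 0 s overlap, at most 9 windows); the sample durations are illustrative only.

# Standalone sketch of the windowing math in app.py, assuming 10 s windows,
# 0 s overlap, and at most 9 windows (the configs/inference.yaml settings).
import numpy as np

SR = 16000
WINDOW_LENGTH = int(10.0 * SR)   # samples per window
WINDOW_OVERLAP = int(0.0 * SR)   # overlap between consecutive windows
MAX_NUM_WINDOW = 9               # 9 x 10 s = at most 90 s of audio

def get_num_windows(T):
    """Return (num_windows, full_length) for a clip of T samples."""
    longest = MAX_NUM_WINDOW * WINDOW_LENGTH - (MAX_NUM_WINDOW - 1) * WINDOW_OVERLAP
    if T <= WINDOW_LENGTH:
        return 1, WINDOW_LENGTH
    if T >= longest:
        return MAX_NUM_WINDOW, longest
    num = 1 + int(np.ceil((T - WINDOW_LENGTH) / float(WINDOW_LENGTH - WINDOW_OVERLAP)))
    return num, num * WINDOW_LENGTH - (num - 1) * WINDOW_OVERLAP

# A 25 s clip needs 3 windows and is zero-padded to 30 s; a 2 min clip is capped at 9 windows.
for seconds in (5, 25, 120):
    n, full = get_num_windows(seconds * SR)
    print(f"{seconds:4d} s -> {n} window(s), padded/truncated to {full / SR:.0f} s")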
configs/inference.yaml ADDED
@@ -0,0 +1,284 @@
1
+ train_config:
2
+ expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed-sft
3
+ run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed-sft-3
4
+ delete_previous_checkpoint: true
5
+ batch_size: 8
6
+ gradient_accumulation_steps: 2
7
+ seed: 42
8
+ learning_rate: 0.00002
9
+ lr_scheduler: constant
10
+ loss_multiplier: 1.0
11
+ warmup_steps: 1875
12
+ weight_decay: 0.1
13
+ precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
14
+ gradient_checkpointing: False
15
+ num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
16
+ offline: false
17
+ freeze_lm_embeddings: false
18
+ logging_steps: 10
19
+ dist_backend: nccl
20
+ dist_url: env:// # tcp://localhost:7000
21
+ no_set_device_rank: false
22
+ fsdp: true
23
+ fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
24
+ fsdp_sharding_strategy: full # full, hybrid
25
+ horovod: false
26
+
27
+ # instruction tuning hparams
28
+ # sft_config:
29
+ # pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed_ckpt_stage1/
30
+ # pretrained_ckpt: checkpoint_199.pt
31
+ # unfreeze_full_lm: false
32
+
33
+ data_config:
34
+ dataset_blending_global_weight: 0.005
35
+
36
+ dataset_blending_config:
37
+
38
+ MMAUQA/train:
39
+ weight: 1.5
40
+
41
+ AudioSet-Temporal-Speech-Audio-QA/train:
42
+ weight: 1.0
43
+
44
+ CompA-R-AQA/train:
45
+ weight: 1.0
46
+
47
+ # Audio QA
48
+ Clotho-AQA-AQA/train:
49
+ weight: 1.0
50
+
51
+ OpenAQA-AQA/train:
52
+ weight: 1.0
53
+
54
+ SalmonnQA/train:
55
+ weight: 1.0
56
+
57
+ AudioEntailmentQA/train:
58
+ weight: 1.0
59
+
60
+ # Audio Captioning
61
+
62
+ Clotho-v2-AudioCaptioning/train:
63
+ weight: 1.0
64
+
65
+ audiocaps-AudioCaptioning/train:
66
+ weight: 1.0
67
+
68
+ Epidemic_sound-AudioCaptioning/train:
69
+ weight: 1.0
70
+
71
+ MACS-AudioCaptioning/train:
72
+ weight: 1.0
73
+
74
+ # Audio Classification
75
+
76
+ FSD50k-EventClassification/train:
77
+ weight: 1.0
78
+
79
+ CochlScene-SceneClassification/train:
80
+ weight: 1.0
81
+
82
+ NonSpeech7k-EventClassification/train:
83
+ weight: 1.0
84
+
85
+ chime-home-EventClassification/train:
86
+ weight: 1.0
87
+
88
+ SONYC-UST-EventClassification/train:
89
+ weight: 1.0
90
+
91
+ # Speech Emotion Classification
92
+
93
+ MELD-EmotionClassification/train:
94
+ weight: 0.5
95
+
96
+ MELD-SentimentClassification/train:
97
+ weight: 0.5
98
+
99
+ emov-db-EmotionClassification/train:
100
+ weight: 1.0
101
+
102
+ jl-corpus-EmotionClassification/train:
103
+ weight: 6.0
104
+
105
+ tess-EmotionClassification/train:
106
+ weight: 2.5
107
+
108
+ IEMOCAP-EmotionClassification/train:
109
+ weight: 3.0
110
+
111
+ OMGEmotion-EmotionClassification/train:
112
+ weight: 3.0
113
+
114
+ VocalSound-VocalClassification/train:
115
+ weight: 1.5
116
+
117
+ # Music QA
118
+
119
+ Music-AVQA-AQA_All/train:
120
+ weight: 3.0
121
+
122
+ MU-LLAMA-AQA/train:
123
+ weight: 1.0
124
+
125
+ # Music Captioning
126
+
127
+ LP-MusicCaps-MSD-AudioCaptioning/train:
128
+ weight: 0.06
129
+
130
+ LP-MusicCaps-MC-AudioCaptioning/train:
131
+ weight: 2.0
132
+
133
+ LP-MusicCaps-MTT-AudioCaptioning/train:
134
+ weight: 1.0
135
+
136
+ MusicCaps-AudioCaptioning/train:
137
+ weight: 6.0
138
+
139
+ musdbhq-captioning/train:
140
+ weight: 2.0
141
+
142
+ # Music Understanding
143
+
144
+ NSynth-MIR/train:
145
+ weight: 0.2
146
+
147
+ mtg-jamendo-MusicTagging/train:
148
+ weight: 0.1
149
+
150
+ FMA-GenreClassification/train:
151
+ weight: 0.5
152
+
153
+ musdbhq-InstrClassification/train:
154
+ weight: 0.8
155
+
156
+ LLARK_FMA-mir/train:
157
+ weight: 1.0
158
+
159
+ LLARK_FMA-reasoning/train:
160
+ weight: 1.0
161
+
162
+ LLARK_MagnaTagATune-mir/train:
163
+ weight: 1.0
164
+
165
+ LLARK_MTG-Jamendo-reasoning/train:
166
+ weight: 1.0
167
+
168
+ LLARK_MagnaTagATune-reasoning/train:
169
+ weight: 1.0
170
+
171
+ LLARK_MTG-Jamendo-mir/train:
172
+ weight: 1.0
173
+
174
+ MusicBenchQA/train:
175
+ weight: 1.0
176
+
177
+ dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
178
+ data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
179
+ dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
180
+ max_tokens: 512
181
+ num_workers: 4
182
+
183
+ valid_dataset_config:
184
+
185
+ Clotho-AQA-AQA/test: true
186
+
187
+ Clotho-v2-AudioCaptioning/test: true
188
+ audiocaps-AudioCaptioning/test: true
189
+
190
+ FSD50k-EventClassification/test: true
191
+ CochlScene-SceneClassification/test: true
192
+ NonSpeech7k-EventClassification/test: true
193
+ SONYC-UST-EventClassification/test: true
194
+
195
+ MELD-EmotionClassification/test: true
196
+ MELD-SentimentClassification/test: true
197
+ emov-db-EmotionClassification/val: true
198
+ jl-corpus-EmotionClassification/val: true
199
+ tess-EmotionClassification/val: true
200
+ IEMOCAP-EmotionClassification/val: true
201
+ OMGEmotion-EmotionClassification/val: true
202
+ VocalSound-VocalClassification/test: true
203
+
204
+ Music-AVQA-AQA_All/test: true
205
+ MU-LLAMA-AQA/test: true
206
+
207
+ LP-MusicCaps-MSD-AudioCaptioning/test: true
208
+ LP-MusicCaps-MC-AudioCaptioning/test: true
209
+ LP-MusicCaps-MTT-AudioCaptioning/test: true
210
+ MusicCaps-AudioCaptioning/test: true
211
+
212
+ NSynth-MIR/test: true
213
+ mtg-jamendo-MusicTagging/val: true
214
+ musdbhq-InstrClassification/test: true
215
+
216
+ # # zero shot
217
+ # CREMA-D-EmotionClassification/train:
218
+ # prefix_prob: 1.0
219
+
220
+ # ravdess-EmotionClassification/train:
221
+ # prefix_prob: 1.0
222
+
223
+ # UrbanSound8K-EventClassification/train:
224
+ # prefix_prob: 1.0
225
+
226
+ # ESC50-EventClassification/train:
227
+ # prefix_prob: 1.0
228
+
229
+ # DCASE17Task4-SceneClassification/test:
230
+ # prefix_prob: 1.0
231
+
232
+ # GTZAN-GenreClassification/train:
233
+ # prefix_prob: 1.0
234
+
235
+ # Medley-solos-DB-InstrClassification/test:
236
+ # prefix_prob: 1.0
237
+
238
+ clap_config:
239
+ method: nvclap-large
240
+ audio_embed_dim: 2048
241
+ checkpoint: clap_ckpt/epoch_15.pt
242
+
243
+ window_length: 10.0 # seconds
244
+ window_overlap: 0.0 # seconds
245
+ max_num_window: 9 # 1.5 minutes
246
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
247
+ finetune: true
248
+
249
+ whisper_config:
250
+ method: whisper-large-v3
251
+ path: openai/whisper-large-v3
252
+ audio_embed_dim: 1280
253
+ sampling_rate: 16000
254
+
255
+ window_length: 30.0 # seconds
256
+ window_overlap: 0.0 # seconds
257
+ max_num_window: 1 # 5 minutes
258
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
259
+
260
+ mert_config:
261
+ method: mert-v1
262
+ path: m-a-p/MERT-v1-330M
263
+ audio_embed_dim: 1024
264
+ sampling_rate: 24000
265
+
266
+ window_length: 10.0 # seconds
267
+ window_overlap: 0.0 # seconds
268
+ max_num_window: 1 # 5 minutes
269
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
270
+
271
+ model_config:
272
+ cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
273
+
274
+ lang_encoder_path: Qwen/Qwen2.5-3B
275
+ tokenizer_path: Qwen/Qwen2.5-3B
276
+ cross_attn_every_n_layers: 1
277
+ audio_transformer_kwargs: {
278
+ n_head: 8,
279
+ n_layers: 3,
280
+ d_inner: 2048,
281
+ max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
282
+ max_window_per_audio: 1, # must = max_num_window
283
+ common_encoder_embed_dim: 1024
284
+ }
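
As a quick check (not part of the commit), the clap_config windowing fields above determine the longest audio the demo accepts, via the same duration formula load_audio uses in app.py; a minimal sketch:

# Sketch: derive the maximum supported duration from configs/inference.yaml,
# mirroring the `duration` computation in app.py's load_audio.
import yaml

with open("configs/inference.yaml") as f:
    clap = yaml.load(f, Loader=yaml.FullLoader)["clap_config"]

max_seconds = clap["max_num_window"] * (clap["window_length"] - clap["window_overlap"]) + clap["window_overlap"]
print(max_seconds)  # 9 * (10.0 - 0.0) + 0.0 = 90.0 s, the "90 seconds" quoted in the demo description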
configs/inference_1.5.yaml ADDED
@@ -0,0 +1,302 @@
1
+ train_config:
2
+ expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-1.5b
3
+ run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-1.5B-sft
4
+ delete_previous_checkpoint: true
5
+ batch_size: 32
6
+ gradient_accumulation_steps: 2
7
+ seed: 42
8
+ learning_rate: 0.00002
9
+ lr_scheduler: constant
10
+ loss_multiplier: 1.0
11
+ warmup_steps: 1875
12
+ weight_decay: 0.1
13
+ precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
14
+ gradient_checkpointing: False
15
+ num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
16
+ offline: false
17
+ freeze_lm_embeddings: false
18
+ logging_steps: 10
19
+ dist_backend: nccl
20
+ dist_url: env:// # tcp://localhost:7000
21
+ no_set_device_rank: false
22
+ fsdp: true
23
+ fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
24
+ fsdp_sharding_strategy: full # full, hybrid
25
+ horovod: false
26
+
27
+ # instruction tuning hparams
28
+ # sft_config:
29
+ # pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed/
30
+ # pretrained_ckpt: checkpoint_199.pt
31
+ # unfreeze_full_lm: false
32
+
33
+ data_config:
34
+ dataset_blending_global_weight: 0.005
35
+
36
+ dataset_blending_config:
37
+
38
+ MMAUQA/train:
39
+ weight: 1.5
40
+
41
+ AudioSet-Temporal-Speech-Audio-QA/train:
42
+ weight: 1.0
43
+
44
+ CompA-R-AQA/train:
45
+ weight: 1.0
46
+
47
+ # Audio QA
48
+ Clotho-AQA-AQA/train:
49
+ weight: 1.0
50
+
51
+ OpenAQA-AQA/train:
52
+ weight: 1.0
53
+
54
+ SalmonnQA/train:
55
+ weight: 0.8
56
+
57
+ AudioEntailmentQA/train:
58
+ weight: 1.0
59
+
60
+ # Audio Captioning
61
+
62
+ Clotho-v2-AudioCaptioning/train:
63
+ weight: 1.0
64
+
65
+ audiocaps-AudioCaptioning/train:
66
+ weight: 1.0
67
+
68
+ Epidemic_sound-AudioCaptioning/train:
69
+ weight: 1.0
70
+
71
+ MACS-AudioCaptioning/train:
72
+ weight: 1.0
73
+
74
+ # Audio Classification
75
+
76
+ UrbanSound8K-EventClassification/train:
77
+ weight: 0.5
78
+
79
+ TUT-EventClassification/train:
80
+ weight: 2.0
81
+
82
+ FSD50k-EventClassification/train:
83
+ weight: 1.0
84
+
85
+ CochlScene-SceneClassification/train:
86
+ weight: 1.0
87
+
88
+ NonSpeech7k-EventClassification/train:
89
+ weight: 1.0
90
+
91
+ chime-home-EventClassification/train:
92
+ weight: 1.0
93
+
94
+ SONYC-UST-EventClassification/train:
95
+ weight: 1.0
96
+
97
+ # Speech Emotion Classification
98
+
99
+ MELD-EmotionClassification/train:
100
+ weight: 0.5
101
+
102
+ MELD-SentimentClassification/train:
103
+ weight: 0.5
104
+
105
+ emov-db-EmotionClassification/train:
106
+ weight: 1.0
107
+
108
+ jl-corpus-EmotionClassification/train:
109
+ weight: 6.0
110
+
111
+ tess-EmotionClassification/train:
112
+ weight: 2.5
113
+
114
+ IEMOCAP-EmotionClassification/train:
115
+ weight: 3.0
116
+
117
+ OMGEmotion-EmotionClassification/train:
118
+ weight: 3.0
119
+
120
+ VocalSound-VocalClassification/train:
121
+ weight: 1.5
122
+
123
+ # Music QA
124
+
125
+ Music-AVQA-AQA_All/train:
126
+ weight: 3.0
127
+
128
+ MU-LLAMA-AQA/train:
129
+ weight: 1.0
130
+
131
+ # Music Captioning
132
+
133
+ LP-MusicCaps-MSD-AudioCaptioning/train:
134
+ weight: 0.06
135
+
136
+ LP-MusicCaps-MC-AudioCaptioning/train:
137
+ weight: 2.0
138
+
139
+ LP-MusicCaps-MTT-AudioCaptioning/train:
140
+ weight: 1.0
141
+
142
+ MusicCaps-AudioCaptioning/train:
143
+ weight: 6.0
144
+
145
+ musdbhq-captioning/train:
146
+ weight: 2.0
147
+
148
+ # Music Understanding
149
+
150
+ Medley-solos-DB-InstrClassification/train:
151
+ weight: 1.5
152
+
153
+ GTZAN-GenreClassification/train:
154
+ weight: 2.0
155
+
156
+ NSynth-MIR/train:
157
+ weight: 0.4
158
+
159
+ NSynth-Instrument/train:
160
+ weight: 1.5
161
+
162
+ NSynth-Source/train:
163
+ weight: 1.5
164
+
165
+ mtg-jamendo-MusicTagging/train:
166
+ weight: 1.0
167
+
168
+ FMA-GenreClassification/train:
169
+ weight: 1.0
170
+
171
+ musdbhq-InstrClassification/train:
172
+ weight: 1.0
173
+
174
+ LLARK_FMA-mir/train:
175
+ weight: 1.0
176
+
177
+ LLARK_FMA-reasoning/train:
178
+ weight: 1.0
179
+
180
+ LLARK_MagnaTagATune-mir/train:
181
+ weight: 1.0
182
+
183
+ LLARK_MTG-Jamendo-reasoning/train:
184
+ weight: 1.0
185
+
186
+ LLARK_MagnaTagATune-reasoning/train:
187
+ weight: 1.0
188
+
189
+ LLARK_MTG-Jamendo-mir/train:
190
+ weight: 1.0
191
+
192
+ MusicBenchQA/train:
193
+ weight: 1.0
194
+
195
+ dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
196
+ data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
197
+ dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/dataset_blending.json
198
+ max_tokens: 512
199
+ num_workers: 4
200
+
201
+ valid_dataset_config:
202
+
203
+ Clotho-AQA-AQA/test: true
204
+
205
+ Clotho-v2-AudioCaptioning/test: true
206
+ audiocaps-AudioCaptioning/test: true
207
+
208
+ FSD50k-EventClassification/test: true
209
+ CochlScene-SceneClassification/test: true
210
+ NonSpeech7k-EventClassification/test: true
211
+ SONYC-UST-EventClassification/test: true
212
+
213
+ MELD-EmotionClassification/test: true
214
+ MELD-SentimentClassification/test: true
215
+ emov-db-EmotionClassification/val: true
216
+ jl-corpus-EmotionClassification/val: true
217
+ tess-EmotionClassification/val: true
218
+ IEMOCAP-EmotionClassification/val: true
219
+ OMGEmotion-EmotionClassification/val: true
220
+ VocalSound-VocalClassification/test: true
221
+
222
+ Music-AVQA-AQA_All/test: true
223
+ MU-LLAMA-AQA/test: true
224
+
225
+ LP-MusicCaps-MSD-AudioCaptioning/test: true
226
+ LP-MusicCaps-MC-AudioCaptioning/test: true
227
+ LP-MusicCaps-MTT-AudioCaptioning/test: true
228
+ MusicCaps-AudioCaptioning/test: true
229
+
230
+ NSynth-MIR/test: true
231
+ mtg-jamendo-MusicTagging/val: true
232
+ musdbhq-InstrClassification/test: true
233
+
234
+ # zero shot
235
+ # CREMA-D-EmotionClassification/train:
236
+ # prefix_prob: 1.0
237
+
238
+ # ravdess-EmotionClassification/train:
239
+ # prefix_prob: 1.0
240
+
241
+ # UrbanSound8K-EventClassification/train:
242
+ # prefix_prob: 1.0
243
+
244
+ # ESC50-EventClassification/train:
245
+ # prefix_prob: 1.0
246
+
247
+ # DCASE17Task4-SceneClassification/test:
248
+ # prefix_prob: 1.0
249
+
250
+ # GTZAN-GenreClassification/train:
251
+ # prefix_prob: 1.0
252
+
253
+ # Medley-solos-DB-InstrClassification/test:
254
+ # prefix_prob: 1.0
255
+
256
+ clap_config:
257
+ method: nvclap-large
258
+ audio_embed_dim: 2048
259
+ checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
260
+
261
+ window_length: 10.0 # seconds
262
+ window_overlap: 0.0 # seconds
263
+ max_num_window: 9 # 1.5 minutes
264
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
265
+ finetune: true
266
+
267
+ whisper_config:
268
+ method: whisper-large-v3
269
+ path: openai/whisper-large-v3
270
+ audio_embed_dim: 1280
271
+ sampling_rate: 16000
272
+
273
+ window_length: 30.0 # seconds
274
+ window_overlap: 0.0 # seconds
275
+ max_num_window: 1 # 5 minutes
276
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
277
+
278
+ mert_config:
279
+ method: mert-v1
280
+ path: m-a-p/MERT-v1-330M
281
+ audio_embed_dim: 1024
282
+ sampling_rate: 24000
283
+
284
+ window_length: 10.0 # seconds
285
+ window_overlap: 0.0 # seconds
286
+ max_num_window: 1 # 5 minutes
287
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
288
+
289
+ model_config:
290
+ cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
291
+
292
+ lang_encoder_path: Qwen/Qwen2.5-1.5B
293
+ tokenizer_path: Qwen/Qwen2.5-1.5B
294
+ cross_attn_every_n_layers: 1
295
+ audio_transformer_kwargs: {
296
+ n_head: 8,
297
+ n_layers: 3,
298
+ d_inner: 2048,
299
+ max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
300
+ max_window_per_audio: 1, # must = max_num_window
301
+ common_encoder_embed_dim: 1024
302
+ }
configs/inference_2.yaml ADDED
@@ -0,0 +1,302 @@
1
+ train_config:
2
+ expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers
3
+ run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed
4
+ delete_previous_checkpoint: true
5
+ batch_size: 4
6
+ gradient_accumulation_steps: 2
7
+ seed: 42
8
+ learning_rate: 0.00002
9
+ lr_scheduler: constant
10
+ loss_multiplier: 1.0
11
+ warmup_steps: 1875
12
+ weight_decay: 0.1
13
+ precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
14
+ gradient_checkpointing: False
15
+ num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
16
+ offline: false
17
+ freeze_lm_embeddings: false
18
+ logging_steps: 10
19
+ dist_backend: nccl
20
+ dist_url: env:// # tcp://localhost:7000
21
+ no_set_device_rank: false
22
+ fsdp: true
23
+ fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
24
+ fsdp_sharding_strategy: full # full, hybrid
25
+ horovod: false
26
+
27
+ # instruction tuning hparams
28
+ sft_config:
29
+ pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed/
30
+ pretrained_ckpt: checkpoint_199.pt
31
+ unfreeze_full_lm: false
32
+
33
+ data_config:
34
+ dataset_blending_global_weight: 0.005
35
+
36
+ dataset_blending_config:
37
+
38
+ MMAUQA/train:
39
+ weight: 1.5
40
+
41
+ AudioSet-Temporal-Speech-Audio-QA/train:
42
+ weight: 1.0
43
+
44
+ CompA-R-AQA/train:
45
+ weight: 1.0
46
+
47
+ # Audio QA
48
+ Clotho-AQA-AQA/train:
49
+ weight: 1.0
50
+
51
+ OpenAQA-AQA/train:
52
+ weight: 1.0
53
+
54
+ SalmonnQA/train:
55
+ weight: 0.8
56
+
57
+ AudioEntailmentQA/train:
58
+ weight: 1.0
59
+
60
+ # Audio Captioning
61
+
62
+ Clotho-v2-AudioCaptioning/train:
63
+ weight: 1.0
64
+
65
+ audiocaps-AudioCaptioning/train:
66
+ weight: 1.0
67
+
68
+ Epidemic_sound-AudioCaptioning/train:
69
+ weight: 1.0
70
+
71
+ MACS-AudioCaptioning/train:
72
+ weight: 1.0
73
+
74
+ # Audio Classification
75
+
76
+ UrbanSound8K-EventClassification/train:
77
+ weight: 0.5
78
+
79
+ TUT-EventClassification/train:
80
+ weight: 2.0
81
+
82
+ FSD50k-EventClassification/train:
83
+ weight: 1.0
84
+
85
+ CochlScene-SceneClassification/train:
86
+ weight: 1.0
87
+
88
+ NonSpeech7k-EventClassification/train:
89
+ weight: 1.0
90
+
91
+ chime-home-EventClassification/train:
92
+ weight: 1.0
93
+
94
+ SONYC-UST-EventClassification/train:
95
+ weight: 1.0
96
+
97
+ # Speech Emotion Classification
98
+
99
+ MELD-EmotionClassification/train:
100
+ weight: 0.5
101
+
102
+ MELD-SentimentClassification/train:
103
+ weight: 0.5
104
+
105
+ emov-db-EmotionClassification/train:
106
+ weight: 1.0
107
+
108
+ jl-corpus-EmotionClassification/train:
109
+ weight: 6.0
110
+
111
+ tess-EmotionClassification/train:
112
+ weight: 2.5
113
+
114
+ IEMOCAP-EmotionClassification/train:
115
+ weight: 3.0
116
+
117
+ OMGEmotion-EmotionClassification/train:
118
+ weight: 3.0
119
+
120
+ VocalSound-VocalClassification/train:
121
+ weight: 1.5
122
+
123
+ # Music QA
124
+
125
+ Music-AVQA-AQA_All/train:
126
+ weight: 3.0
127
+
128
+ MU-LLAMA-AQA/train:
129
+ weight: 1.0
130
+
131
+ # Music Captioning
132
+
133
+ LP-MusicCaps-MSD-AudioCaptioning/train:
134
+ weight: 0.06
135
+
136
+ LP-MusicCaps-MC-AudioCaptioning/train:
137
+ weight: 2.0
138
+
139
+ LP-MusicCaps-MTT-AudioCaptioning/train:
140
+ weight: 1.0
141
+
142
+ MusicCaps-AudioCaptioning/train:
143
+ weight: 6.0
144
+
145
+ musdbhq-captioning/train:
146
+ weight: 2.0
147
+
148
+ # Music Understanding
149
+
150
+ Medley-solos-DB-InstrClassification/train:
151
+ weight: 1.5
152
+
153
+ GTZAN-GenreClassification/train:
154
+ weight: 2.0
155
+
156
+ NSynth-MIR/train:
157
+ weight: 0.4
158
+
159
+ NSynth-Instrument/train:
160
+ weight: 1.5
161
+
162
+ NSynth-Source/train:
163
+ weight: 1.5
164
+
165
+ mtg-jamendo-MusicTagging/train:
166
+ weight: 1.0
167
+
168
+ FMA-GenreClassification/train:
169
+ weight: 1.0
170
+
171
+ musdbhq-InstrClassification/train:
172
+ weight: 1.0
173
+
174
+ LLARK_FMA-mir/train:
175
+ weight: 1.0
176
+
177
+ LLARK_FMA-reasoning/train:
178
+ weight: 1.0
179
+
180
+ LLARK_MagnaTagATune-mir/train:
181
+ weight: 1.0
182
+
183
+ LLARK_MTG-Jamendo-reasoning/train:
184
+ weight: 1.0
185
+
186
+ LLARK_MagnaTagATune-reasoning/train:
187
+ weight: 1.0
188
+
189
+ LLARK_MTG-Jamendo-mir/train:
190
+ weight: 1.0
191
+
192
+ MusicBenchQA/train:
193
+ weight: 1.0
194
+
195
+ dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
196
+ data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
197
+ dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/dataset_blending.json
198
+ max_tokens: 512
199
+ num_workers: 4
200
+
201
+ valid_dataset_config:
202
+
203
+ Clotho-AQA-AQA/test: true
204
+
205
+ Clotho-v2-AudioCaptioning/test: true
206
+ audiocaps-AudioCaptioning/test: true
207
+
208
+ FSD50k-EventClassification/test: true
209
+ CochlScene-SceneClassification/test: true
210
+ NonSpeech7k-EventClassification/test: true
211
+ SONYC-UST-EventClassification/test: true
212
+
213
+ MELD-EmotionClassification/test: true
214
+ MELD-SentimentClassification/test: true
215
+ emov-db-EmotionClassification/val: true
216
+ jl-corpus-EmotionClassification/val: true
217
+ tess-EmotionClassification/val: true
218
+ IEMOCAP-EmotionClassification/val: true
219
+ OMGEmotion-EmotionClassification/val: true
220
+ VocalSound-VocalClassification/test: true
221
+
222
+ Music-AVQA-AQA_All/test: true
223
+ MU-LLAMA-AQA/test: true
224
+
225
+ LP-MusicCaps-MSD-AudioCaptioning/test: true
226
+ LP-MusicCaps-MC-AudioCaptioning/test: true
227
+ LP-MusicCaps-MTT-AudioCaptioning/test: true
228
+ MusicCaps-AudioCaptioning/test: true
229
+
230
+ NSynth-MIR/test: true
231
+ mtg-jamendo-MusicTagging/val: true
232
+ musdbhq-InstrClassification/test: true
233
+
234
+ # zero shot
235
+ # CREMA-D-EmotionClassification/train:
236
+ # prefix_prob: 1.0
237
+
238
+ # ravdess-EmotionClassification/train:
239
+ # prefix_prob: 1.0
240
+
241
+ # UrbanSound8K-EventClassification/train:
242
+ # prefix_prob: 1.0
243
+
244
+ # ESC50-EventClassification/train:
245
+ # prefix_prob: 1.0
246
+
247
+ # DCASE17Task4-SceneClassification/test:
248
+ # prefix_prob: 1.0
249
+
250
+ # GTZAN-GenreClassification/train:
251
+ # prefix_prob: 1.0
252
+
253
+ # Medley-solos-DB-InstrClassification/test:
254
+ # prefix_prob: 1.0
255
+
256
+ clap_config:
257
+ method: nvclap-large
258
+ audio_embed_dim: 2048
259
+ checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
260
+
261
+ window_length: 10.0 # seconds
262
+ window_overlap: 0.0 # seconds
263
+ max_num_window: 9 # 1.5 minutes
264
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
265
+ finetune: true
266
+
267
+ whisper_config:
268
+ method: whisper-large-v3
269
+ path: openai/whisper-large-v3
270
+ audio_embed_dim: 1280
271
+ sampling_rate: 16000
272
+
273
+ window_length: 30.0 # seconds
274
+ window_overlap: 0.0 # seconds
275
+ max_num_window: 1 # 5 minutes
276
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
277
+
278
+ mert_config:
279
+ method: mert-v1
280
+ path: m-a-p/MERT-v1-330M
281
+ audio_embed_dim: 1024
282
+ sampling_rate: 24000
283
+
284
+ window_length: 10.0 # seconds
285
+ window_overlap: 0.0 # seconds
286
+ max_num_window: 1 # 5 minutes
287
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
288
+
289
+ model_config:
290
+ cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
291
+
292
+ lang_encoder_path: Qwen/Qwen2.5-3B
293
+ tokenizer_path: Qwen/Qwen2.5-3B
294
+ cross_attn_every_n_layers: 1
295
+ audio_transformer_kwargs: {
296
+ n_head: 8,
297
+ n_layers: 3,
298
+ d_inner: 2048,
299
+ max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
300
+ max_window_per_audio: 1, # must = max_num_window
301
+ common_encoder_embed_dim: 1024
302
+ }
configs/inference_long.yaml ADDED
@@ -0,0 +1,284 @@
1
+ train_config:
2
+ expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed-sft
3
+ run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed-sft-long
4
+ delete_previous_checkpoint: true
5
+ batch_size: 2
6
+ gradient_accumulation_steps: 2
7
+ seed: 42
8
+ learning_rate: 0.00002
9
+ lr_scheduler: constant
10
+ loss_multiplier: 1.0
11
+ warmup_steps: 1875
12
+ weight_decay: 0.1
13
+ precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
14
+ gradient_checkpointing: False
15
+ num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
16
+ offline: false
17
+ freeze_lm_embeddings: false
18
+ logging_steps: 10
19
+ dist_backend: nccl
20
+ dist_url: env:// # tcp://localhost:7000
21
+ no_set_device_rank: false
22
+ fsdp: true
23
+ fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
24
+ fsdp_sharding_strategy: full # full, hybrid
25
+ horovod: false
26
+
27
+ # instruction tuning hparams
28
+ # sft_config:
29
+ # pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed_ckpt_stage1/
30
+ # pretrained_ckpt: checkpoint_199.pt
31
+ # unfreeze_full_lm: false
32
+
33
+ data_config:
34
+ dataset_blending_global_weight: 0.005
35
+
36
+ dataset_blending_config:
37
+
38
+ MMAUQA/train:
39
+ weight: 1.5
40
+
41
+ AudioSet-Temporal-Speech-Audio-QA/train:
42
+ weight: 1.0
43
+
44
+ CompA-R-AQA/train:
45
+ weight: 1.0
46
+
47
+ # Audio QA
48
+ Clotho-AQA-AQA/train:
49
+ weight: 1.0
50
+
51
+ OpenAQA-AQA/train:
52
+ weight: 1.0
53
+
54
+ SalmonnQA/train:
55
+ weight: 1.0
56
+
57
+ AudioEntailmentQA/train:
58
+ weight: 1.0
59
+
60
+ # Audio Captioning
61
+
62
+ Clotho-v2-AudioCaptioning/train:
63
+ weight: 1.0
64
+
65
+ audiocaps-AudioCaptioning/train:
66
+ weight: 1.0
67
+
68
+ Epidemic_sound-AudioCaptioning/train:
69
+ weight: 1.0
70
+
71
+ MACS-AudioCaptioning/train:
72
+ weight: 1.0
73
+
74
+ # Audio Classification
75
+
76
+ FSD50k-EventClassification/train:
77
+ weight: 1.0
78
+
79
+ CochlScene-SceneClassification/train:
80
+ weight: 1.0
81
+
82
+ NonSpeech7k-EventClassification/train:
83
+ weight: 1.0
84
+
85
+ chime-home-EventClassification/train:
86
+ weight: 1.0
87
+
88
+ SONYC-UST-EventClassification/train:
89
+ weight: 1.0
90
+
91
+ # Speech Emotion Classification
92
+
93
+ MELD-EmotionClassification/train:
94
+ weight: 0.5
95
+
96
+ MELD-SentimentClassification/train:
97
+ weight: 0.5
98
+
99
+ emov-db-EmotionClassification/train:
100
+ weight: 1.0
101
+
102
+ jl-corpus-EmotionClassification/train:
103
+ weight: 6.0
104
+
105
+ tess-EmotionClassification/train:
106
+ weight: 2.5
107
+
108
+ IEMOCAP-EmotionClassification/train:
109
+ weight: 3.0
110
+
111
+ OMGEmotion-EmotionClassification/train:
112
+ weight: 3.0
113
+
114
+ VocalSound-VocalClassification/train:
115
+ weight: 1.5
116
+
117
+ # Music QA
118
+
119
+ Music-AVQA-AQA_All/train:
120
+ weight: 3.0
121
+
122
+ MU-LLAMA-AQA/train:
123
+ weight: 1.0
124
+
125
+ # Music Captioning
126
+
127
+ LP-MusicCaps-MSD-AudioCaptioning/train:
128
+ weight: 0.06
129
+
130
+ LP-MusicCaps-MC-AudioCaptioning/train:
131
+ weight: 2.0
132
+
133
+ LP-MusicCaps-MTT-AudioCaptioning/train:
134
+ weight: 1.0
135
+
136
+ MusicCaps-AudioCaptioning/train:
137
+ weight: 6.0
138
+
139
+ musdbhq-captioning/train:
140
+ weight: 2.0
141
+
142
+ # Music Understanding
143
+
144
+ NSynth-MIR/train:
145
+ weight: 0.2
146
+
147
+ mtg-jamendo-MusicTagging/train:
148
+ weight: 0.1
149
+
150
+ FMA-GenreClassification/train:
151
+ weight: 0.5
152
+
153
+ musdbhq-InstrClassification/train:
154
+ weight: 0.8
155
+
156
+ LLARK_FMA-mir/train:
157
+ weight: 1.0
158
+
159
+ LLARK_FMA-reasoning/train:
160
+ weight: 1.0
161
+
162
+ LLARK_MagnaTagATune-mir/train:
163
+ weight: 1.0
164
+
165
+ LLARK_MTG-Jamendo-reasoning/train:
166
+ weight: 1.0
167
+
168
+ LLARK_MagnaTagATune-reasoning/train:
169
+ weight: 1.0
170
+
171
+ LLARK_MTG-Jamendo-mir/train:
172
+ weight: 1.0
173
+
174
+ MusicBenchQA/train:
175
+ weight: 1.0
176
+
177
+ dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
178
+ data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
179
+ dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
180
+ max_tokens: 512
181
+ num_workers: 4
182
+
183
+ valid_dataset_config:
184
+
185
+ Clotho-AQA-AQA/test: true
186
+
187
+ Clotho-v2-AudioCaptioning/test: true
188
+ audiocaps-AudioCaptioning/test: true
189
+
190
+ FSD50k-EventClassification/test: true
191
+ CochlScene-SceneClassification/test: true
192
+ NonSpeech7k-EventClassification/test: true
193
+ SONYC-UST-EventClassification/test: true
194
+
195
+ MELD-EmotionClassification/test: true
196
+ MELD-SentimentClassification/test: true
197
+ emov-db-EmotionClassification/val: true
198
+ jl-corpus-EmotionClassification/val: true
199
+ tess-EmotionClassification/val: true
200
+ IEMOCAP-EmotionClassification/val: true
201
+ OMGEmotion-EmotionClassification/val: true
202
+ VocalSound-VocalClassification/test: true
203
+
204
+ Music-AVQA-AQA_All/test: true
205
+ MU-LLAMA-AQA/test: true
206
+
207
+ LP-MusicCaps-MSD-AudioCaptioning/test: true
208
+ LP-MusicCaps-MC-AudioCaptioning/test: true
209
+ LP-MusicCaps-MTT-AudioCaptioning/test: true
210
+ MusicCaps-AudioCaptioning/test: true
211
+
212
+ NSynth-MIR/test: true
213
+ mtg-jamendo-MusicTagging/val: true
214
+ musdbhq-InstrClassification/test: true
215
+
216
+ # # zero shot
217
+ # CREMA-D-EmotionClassification/train:
218
+ # prefix_prob: 1.0
219
+
220
+ # ravdess-EmotionClassification/train:
221
+ # prefix_prob: 1.0
222
+
223
+ # UrbanSound8K-EventClassification/train:
224
+ # prefix_prob: 1.0
225
+
226
+ # ESC50-EventClassification/train:
227
+ # prefix_prob: 1.0
228
+
229
+ # DCASE17Task4-SceneClassification/test:
230
+ # prefix_prob: 1.0
231
+
232
+ # GTZAN-GenreClassification/train:
233
+ # prefix_prob: 1.0
234
+
235
+ # Medley-solos-DB-InstrClassification/test:
236
+ # prefix_prob: 1.0
237
+
238
+ clap_config:
239
+ method: nvclap-large
240
+ audio_embed_dim: 2048
241
+ checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
242
+
243
+ window_length: 10.0 # seconds
244
+ window_overlap: 0.0 # seconds
245
+ max_num_window: 30 # 5 minutes
246
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
247
+ finetune: true
248
+
249
+ whisper_config:
250
+ method: whisper-large-v3
251
+ path: openai/whisper-large-v3
252
+ audio_embed_dim: 1280
253
+ sampling_rate: 16000
254
+
255
+ window_length: 30.0 # seconds
256
+ window_overlap: 0.0 # seconds
257
+ max_num_window: 1 # 5 minutes
258
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
259
+
260
+ mert_config:
261
+ method: mert-v1
262
+ path: m-a-p/MERT-v1-330M
263
+ audio_embed_dim: 1024
264
+ sampling_rate: 24000
265
+
266
+ window_length: 10.0 # seconds
267
+ window_overlap: 0.0 # seconds
268
+ max_num_window: 1 # 5 minutes
269
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
270
+
271
+ model_config:
272
+ cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
273
+
274
+ lang_encoder_path: Qwen/Qwen2.5-3B
275
+ tokenizer_path: Qwen/Qwen2.5-3B
276
+ cross_attn_every_n_layers: 1
277
+ audio_transformer_kwargs: {
278
+ n_head: 8,
279
+ n_layers: 3,
280
+ d_inner: 2048,
281
+ max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
282
+ max_window_per_audio: 1, # must = max_num_window
283
+ common_encoder_embed_dim: 1024
284
+ }
configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml ADDED
@@ -0,0 +1,255 @@
1
+ train_config:
2
+ expdir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/v1.0_optimlmax1.3b_foundation
3
+ run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node
4
+ delete_previous_checkpoint: true
5
+ batch_size: 6
6
+ gradient_accumulation_steps: 2 # 4 nodes
7
+ seed: 42
8
+ learning_rate: 0.0001
9
+ lr_scheduler: constant
10
+ loss_multiplier: 1.0
11
+ warmup_steps: 1875
12
+ weight_decay: 0.1
13
+ precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
14
+ gradient_checkpointing: False
15
+ num_epochs: 100 # num_epochs * dataset_blending_global_weight = 1
16
+ offline: false
17
+ freeze_lm_embeddings: true
18
+ logging_steps: 10
19
+ dist_backend: nccl
20
+ dist_url: env:// # tcp://localhost:7000
21
+ no_set_device_rank: false
22
+ fsdp: true
23
+ fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
24
+ fsdp_sharding_strategy: full # full, hybrid
25
+ horovod: false
26
+
27
+ data_config:
28
+ dataset_blending_global_weight: 0.01
29
+
30
+ dataset_blending_config:
31
+
32
+ # Audio QA
33
+ OpenAQA-AQA/train:
34
+ weight: 1.0
35
+ prefix_prob: 0.0
36
+ augmentations:
37
+ do_nothing: 1.0
38
+
39
+ # Audio Captioning
40
+
41
+ BBCSoundEffects-AudioDescription/train:
42
+ weight: 5.0
43
+ prefix_prob: 0.5
44
+ augmentations:
45
+ do_nothing: 1.0
46
+
47
+ CLAP_freesound-AudioCaptioning/train:
48
+ weight: 1.0
49
+ prefix_prob: 0.5
50
+ augmentations:
51
+ do_nothing: 1.0
52
+
53
+ SoundDescs-AudioDescription/train:
54
+ weight: 1.0
55
+ prefix_prob: 0.5
56
+ augmentations:
57
+ do_nothing: 1.0
58
+
59
+ WavCaps-AudioSet_SL-AudioCaptioning/train:
60
+ weight: 1.0
61
+ prefix_prob: 0.5
62
+ augmentations:
63
+ do_nothing: 1.0
64
+
65
+ WavCaps-BBC_Sound_Effects-AudioCaptioning/train:
66
+ weight: 2
67
+ prefix_prob: 0.5
68
+ augmentations:
69
+ do_nothing: 1.0
70
+
71
+ WavCaps-FreeSound-AudioCaptioning/train:
72
+ weight: 2
73
+ prefix_prob: 0.5
74
+ augmentations:
75
+ do_nothing: 1.0
76
+
77
+ WavCaps-SoundBible-AudioCaptioning/train:
78
+ weight: 5
79
+ prefix_prob: 0.5
80
+ augmentations:
81
+ do_nothing: 1.0
82
+
83
+ # Audio Classification
84
+
85
+ AudioSetFullwoAudioMusicCaps-EventClassification/train:
86
+ weight: 1.0
87
+ prefix_prob: 0.5
88
+ augmentations:
89
+ num_words: 0.8
90
+ do_nothing: 0.2
91
+
92
+ AudioSet-EventClassification/train:
93
+ weight: 5.0
94
+ prefix_prob: 0.5
95
+ augmentations:
96
+ num_words: 0.8
97
+ do_nothing: 0.2
98
+
99
+ Clotho-AQA-EventClassification/train:
100
+ weight: 5.0
101
+ prefix_prob: 0.5
102
+ augmentations:
103
+ num_words: 0.8
104
+ do_nothing: 0.2
105
+
106
+ WavText5K-Tagging/train:
107
+ weight: 3.0
108
+ prefix_prob: 0.5
109
+ augmentations:
110
+ num_words: 0.8
111
+ do_nothing: 0.2
112
+
113
+ # Speech Emotion Classification
114
+
115
+ MSP-PODCAST-Publish-1.9-EmotionClassification/train:
116
+ weight: 1.8
117
+ prefix_prob: 0.5
118
+ augmentations:
119
+ provide_all_labels: 0.9
120
+ do_nothing: 0.1
121
+ MSP-PODCAST-Publish-1.9-EmotionClassification/interleaved_knn-train:
122
+ weight: 1.2
123
+ prefix_prob: 0.5
124
+ augmentations:
125
+ provide_all_labels: 0.9
126
+ do_nothing: 0.1
127
+
128
+ MELD-EmotionClassification/train:
129
+ weight: 1.8
130
+ prefix_prob: 0.5
131
+ augmentations:
132
+ provide_all_labels: 0.9
133
+ do_nothing: 0.1
134
+ MELD-EmotionClassification/interleaved_knn-train:
135
+ weight: 1.2
136
+ prefix_prob: 0.5
137
+ augmentations:
138
+ provide_all_labels: 0.9
139
+ do_nothing: 0.1
140
+
141
+ MELD-SentimentClassification/train:
142
+ weight: 1.8
143
+ prefix_prob: 0.5
144
+ augmentations:
145
+ provide_all_labels: 0.9
146
+ do_nothing: 0.1
147
+ MELD-SentimentClassification/interleaved_knn-train:
148
+ weight: 1.2
149
+ prefix_prob: 0.5
150
+ augmentations:
151
+ provide_all_labels: 0.9
152
+ do_nothing: 0.1
153
+
154
+ # Music QA
155
+
156
+ Music-AVQA-AVQA_All/train:
157
+ weight: 3.0
158
+ prefix_prob: 0.5
159
+ augmentations:
160
+ AQA_binary_instruction: 1.0
161
+
162
+ MU-LLAMA-AQA/train:
163
+ weight: 1.8
164
+ prefix_prob: 0.5
165
+ augmentations:
166
+ do_nothing: 1.0
167
+ MU-LLAMA-AQA/interleaved_knn-train:
168
+ weight: 1.2
169
+ prefix_prob: 0.5
170
+ augmentations:
171
+ do_nothing: 1.0
172
+
173
+ # Music Captioning
174
+
175
+ LP-MusicCaps-MSD-AudioCaptioning/train:
176
+ weight: 1.0
177
+ prefix_prob: 0.5
178
+ augmentations:
179
+ do_nothing: 1.0
180
+
181
+ # Music Understanding
182
+
183
+ NSynth-MIR/train:
184
+ weight: 0.6
185
+ prefix_prob: 0.5
186
+ augmentations:
187
+ do_nothing: 1.0
188
+ NSynth-MIR/interleaved_knn-train:
189
+ weight: 0.4
190
+ prefix_prob: 0.5
191
+ augmentations:
192
+ do_nothing: 1.0
193
+
194
+ mtg-jamendo-MusicTagging/train:
195
+ weight: 1.0
196
+ prefix_prob: 0.5
197
+ augmentations:
198
+ do_nothing: 1.0
199
+
200
+ dataset_file_root: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/dataset_files
201
+ data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
202
+ dataset_blending_output: dataset_blending.json
203
+ max_tokens: 512
204
+ num_workers: 4
205
+
206
+ valid_dataset_config:
207
+ CLAP_freesound-AudioCaptioning/test: true
208
+ SoundDescs-AudioDescription/test: true
209
+ Clotho-AQA-EventClassification/test: true
210
+
211
+ MSP-PODCAST-Publish-1.9-EmotionClassification/test: true
212
+ MSP-PODCAST-Publish-1.9-EmotionClassification/interleaved_knn-test: true
213
+ MELD-EmotionClassification/test: true
214
+ MELD-EmotionClassification/interleaved_knn-test: true
215
+ MELD-SentimentClassification/test: true
216
+ MELD-SentimentClassification/interleaved_knn-test: true
217
+
218
+ MU-LLAMA-AQA/test: true
219
+ LP-MusicCaps-MSD-AudioCaptioning/val: true
220
+ NSynth-MIR/test: true
221
+ NSynth-MIR/interleaved_knn-test: true
222
+ mtg-jamendo-MusicTagging/val: true
223
+
224
+ clap_config:
225
+ # method: laion-clap
226
+ # audio_embed_dim: 512
227
+ # model_name: 630k-fusion-best
228
+ # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/laion-clap-pretrained/laion_clap/630k-fusion-best.pt
229
+
230
+ method: microsoft-clap
231
+ audio_embed_dim: 1024
232
+ config_root: /home/zkong/audio_flamingo/audio_flamingo_v1/v1.0_optimlmax1.3b_foundation/my_ms_clap/src/configs
233
+ # model_name: '2023'
234
+ # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/CLAP_weights_2023.pth
235
+ model_name: 'clapcap'
236
+ checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/clapcap_weights_2023.pth
237
+
238
+ window_length: 7.0 # seconds
239
+ window_overlap: 5.25 # seconds
240
+ max_num_window: 16 # 35 seconds
241
+ max_num_fewshot: 4 # number of fewshot samples (including the final one)
242
+
243
+ model_config:
244
+ cache_dir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/LLM_pretrained/.cache
245
+
246
+ lang_encoder_path: facebook/opt-iml-max-1.3b
247
+ tokenizer_path: facebook/opt-iml-max-1.3b
248
+ cross_attn_every_n_layers: 1
249
+ audio_transformer_kwargs: {
250
+ n_head: 8,
251
+ n_layers: 3,
252
+ d_inner: 2048,
253
+ max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
254
+ max_window_per_audio: 16, # must = max_num_window
255
+ }
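
For the overlapping-window setup above (7 s windows with 5.25 s overlap, up to 16 windows), the audio span actually covered follows the same formula used in get_num_windows; a small illustrative calculation (not part of the commit):

# Covered span = N * window_length - (N - 1) * window_overlap
window_length, window_overlap, max_num_window = 7.0, 5.25, 16
covered = max_num_window * window_length - (max_num_window - 1) * window_overlap
print(covered)  # 33.25 s, roughly the "35 seconds" noted in the max_num_window comment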
configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml ADDED
@@ -0,0 +1,183 @@
1
+ train_config:
2
+ expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed
3
+ run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed
4
+ delete_previous_checkpoint: true
5
+ batch_size: 4
6
+ gradient_accumulation_steps: 2 # 4 nodes
7
+ seed: 42
8
+ learning_rate: 0.0001
9
+ lr_scheduler: constant
10
+ loss_multiplier: 1.0
11
+ warmup_steps: 1875
12
+ weight_decay: 0.1
13
+ precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
14
+ gradient_checkpointing: False
15
+ num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
16
+ offline: false
17
+ freeze_lm_embeddings: false
18
+ logging_steps: 10
19
+ dist_backend: nccl
20
+ dist_url: env:// # tcp://localhost:7000
21
+ no_set_device_rank: false
22
+ fsdp: true
23
+ fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
24
+ fsdp_sharding_strategy: full # full, hybrid
25
+ horovod: false
26
+
27
+ data_config:
28
+ dataset_blending_global_weight: 0.005
29
+
30
+ dataset_blending_config:
31
+
32
+ # Audio QA
33
+ OpenAQA-AQA/train:
34
+ weight: 1.0
35
+
36
+ AudioSet-Temporal-Speech-Audio-QA/train:
37
+ weight: 2.0
38
+
39
+ CompA-R-AQA/train:
40
+ weight: 2.0
41
+
42
+ # Audio Captioning
43
+
44
+ BBCSoundEffects-AudioDescription/train:
45
+ weight: 5.0
46
+
47
+ CLAP_freesound-AudioCaptioning/train:
48
+ weight: 1.0
49
+
50
+ SoundDescs-AudioDescription/train:
51
+ weight: 1.0
52
+
53
+ WavCaps-AudioSet_SL-AudioCaptioning/train:
54
+ weight: 1.0
55
+
56
+ WavCaps-BBC_Sound_Effects-AudioCaptioning/train:
57
+ weight: 2.0
58
+
59
+ WavCaps-FreeSound-AudioCaptioning/train:
60
+ weight: 2.0
61
+
62
+ WavCaps-SoundBible-AudioCaptioning/train:
63
+ weight: 5.0
64
+
65
+ Ego-10-AudioCaptioning/train:
66
+ weight: 2.0
67
+
68
+ Ego-30-AudioCaptioning/train:
69
+ weight: 2.0
70
+
71
+ # Audio Classification
72
+
73
+ AudioSetFullwoAudioMusicCaps-EventClassification/train:
74
+ weight: 1.0
75
+
76
+ AudioSet-EventClassification/train:
77
+ weight: 5.0
78
+
79
+ Clotho-AQA-EventClassification/train:
80
+ weight: 5.0
81
+
82
+ WavText5K-Tagging/train:
83
+ weight: 3.0
84
+
85
+ # Speech Emotion Classification
86
+
87
+ MSP-PODCAST-Publish-1.9-EmotionClassification/train:
88
+ weight: 3.0
89
+
90
+ MELD-EmotionClassification/train:
91
+ weight: 3.0
92
+
93
+ MELD-SentimentClassification/train:
94
+ weight: 3.0
95
+
96
+ # Music QA
97
+
98
+ Music-AVQA-AVQA_All/train:
99
+ weight: 3.0
100
+
101
+ MU-LLAMA-AQA/train:
102
+ weight: 3.0
103
+
104
+ # Music Captioning
105
+
106
+ LP-MusicCaps-MSD-AudioCaptioning/train:
107
+ weight: 1.0
108
+
109
+ # Music Understanding
110
+
111
+ NSynth-MIR/train:
112
+ weight: 1.0
113
+
114
+ mtg-jamendo-MusicTagging/train:
115
+ weight: 1.0
116
+
117
+ dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
118
+ data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
119
+ dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
120
+ max_tokens: 512
121
+ num_workers: 4
122
+
123
+ valid_dataset_config:
124
+ CLAP_freesound-AudioCaptioning/test: true
125
+ SoundDescs-AudioDescription/test: true
126
+ Clotho-AQA-EventClassification/test: true
127
+
128
+ MSP-PODCAST-Publish-1.9-EmotionClassification/test: true
129
+ MELD-EmotionClassification/test: true
130
+ MELD-SentimentClassification/test: true
131
+
132
+ MU-LLAMA-AQA/test: true
133
+ LP-MusicCaps-MSD-AudioCaptioning/val: true
134
+ NSynth-MIR/test: true
135
+ mtg-jamendo-MusicTagging/val: true
136
+
137
+ clap_config:
138
+ method: nvclap-large
139
+ audio_embed_dim: 2048
140
+ checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
141
+
142
+ window_length: 10.0 # seconds
143
+ window_overlap: 0.0 # seconds
144
+ max_num_window: 3 # 30 seconds
145
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
146
+
147
+ whisper_config:
148
+ method: whisper-large-v3
149
+ path: openai/whisper-large-v3
150
+ audio_embed_dim: 1280
151
+ sampling_rate: 16000
152
+
153
+ window_length: 30.0 # seconds
154
+ window_overlap: 0.0 # seconds
155
+ max_num_window: 1 # 30 seconds
156
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
157
+ finetune: true
158
+
159
+ mert_config:
160
+ method: mert-v1
161
+ path: m-a-p/MERT-v1-330M
162
+ audio_embed_dim: 1024
163
+ sampling_rate: 24000
164
+
165
+ window_length: 10.0 # seconds
166
+ window_overlap: 0.0 # seconds
167
+ max_num_window: 1 # 10 seconds
168
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
169
+
170
+ model_config:
171
+ cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
172
+
173
+ lang_encoder_path: Qwen/Qwen2.5-3B
174
+ tokenizer_path: Qwen/Qwen2.5-3B
175
+ cross_attn_every_n_layers: 1
176
+ audio_transformer_kwargs: {
177
+ n_head: 8,
178
+ n_layers: 3,
179
+ d_inner: 2048,
180
+ max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
181
+ max_window_per_audio: 1, # must = max_num_window
182
+ common_encoder_embed_dim: 1024
183
+ }
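Several comments in this config state constraints that tie values together (e.g. `num_epochs * dataset_blending_global_weight = 1`, `max_num_media >= max_num_window * num_fewshot_samples`). A small, hypothetical sanity check under those stated assumptions:

```python
# Hypothetical sanity check for the constraints stated in the comments above.
num_epochs = 200
dataset_blending_global_weight = 0.005
assert abs(num_epochs * dataset_blending_global_weight - 1.0) < 1e-9   # one full pass over the blend

max_num_window = 3        # clap_config
max_num_fewshot = 1       # clap_config
max_num_media = 128       # audio_transformer_kwargs
assert max_num_media >= max_num_window * max_num_fewshot
```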
configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml ADDED
@@ -0,0 +1,483 @@
1
+ train_config:
2
+ expdir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/v1.0_optimlmax1.3b_foundation
3
+ run_name: run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node
4
+ delete_previous_checkpoint: true
5
+ batch_size: 4
6
+ gradient_accumulation_steps: 1
7
+ seed: 42
8
+ learning_rate: 0.00002
9
+ lr_scheduler: constant
10
+ loss_multiplier: 1.0
11
+ warmup_steps: 1875
12
+ weight_decay: 0.1
13
+ precision: fp32 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
14
+ gradient_checkpointing: False
15
+ num_epochs: 160 # num_epochs * dataset_blending_global_weight = 1
16
+ offline: false
17
+ freeze_lm_embeddings: false
18
+ logging_steps: 10
19
+ dist_backend: nccl
20
+ dist_url: env:// # tcp://localhost:7000
21
+ no_set_device_rank: false
22
+ fsdp: true
23
+ fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
24
+ fsdp_sharding_strategy: full # full, hybrid
25
+ horovod: false
26
+
27
+ # instruction tuning hparams
28
+ sft_config:
29
+ pretrained_path: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/v1.0_optimlmax1.3b_foundation/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node/
30
+ pretrained_ckpt: checkpoint_99.pt
31
+ unfreeze_full_lm: true
32
+
33
+ data_config:
34
+ dataset_blending_global_weight: 0.01
35
+
36
+ dataset_blending_config:
37
+
38
+ # Audio QA
39
+ Clotho-AQA-AQA/train:
40
+ weight: 0.8
41
+ prefix_prob: 1.0
42
+ augmentations:
43
+ AQA_binary_instruction: 1.0
44
+ Clotho-AQA-AQA/interleaved_knn-train:
45
+ weight: 0.2
46
+ prefix_prob: 1.0
47
+ augmentations:
48
+ AQA_binary_instruction: 1.0
49
+
50
+ OpenAQA-AQA/train:
51
+ weight: 1.0
52
+ prefix_prob: 1.0
53
+ augmentations:
54
+ do_nothing: 1.0
55
+
56
+ # Audio Captioning
57
+
58
+ Clotho-v2-AudioCaptioning/train:
59
+ weight: 0.8
60
+ prefix_prob: 1.0
61
+ augmentations:
62
+ AC_short: 1.0
63
+ Clotho-v2-AudioCaptioning/interleaved_knn-train:
64
+ weight: 0.2
65
+ prefix_prob: 1.0
66
+ augmentations:
67
+ AC_short: 1.0
68
+
69
+ audiocaps-AudioCaptioning/train:
70
+ weight: 0.8
71
+ prefix_prob: 1.0
72
+ augmentations:
73
+ AC_short: 1.0
74
+ audiocaps-AudioCaptioning/interleaved_knn-train:
75
+ weight: 0.2
76
+ prefix_prob: 1.0
77
+ augmentations:
78
+ AC_short: 1.0
79
+
80
+ Epidemic_sound-AudioCaptioning/train:
81
+ weight: 0.8
82
+ prefix_prob: 1.0
83
+ augmentations:
84
+ AC_short: 1.0
85
+ Epidemic_sound-AudioCaptioning/interleaved_knn-train:
86
+ weight: 0.2
87
+ prefix_prob: 1.0
88
+ augmentations:
89
+ AC_short: 1.0
90
+
91
+ MACS-AudioCaptioning/train:
92
+ weight: 0.8
93
+ prefix_prob: 1.0
94
+ augmentations:
95
+ AC_short: 1.0
96
+ MACS-AudioCaptioning/interleaved_knn-train:
97
+ weight: 0.2
98
+ prefix_prob: 1.0
99
+ augmentations:
100
+ AC_short: 1.0
101
+
102
+ # Audio Classification
103
+
104
+ FSD50k-EventClassification/train:
105
+ weight: 0.8
106
+ prefix_prob: 1.0
107
+ augmentations:
108
+ default: 1.0
109
+ FSD50k-EventClassification/interleaved_knn-train:
110
+ weight: 0.2
111
+ prefix_prob: 1.0
112
+ augmentations:
113
+ default: 1.0
114
+
115
+ CochlScene-SceneClassification/train:
116
+ weight: 0.8
117
+ prefix_prob: 1.0
118
+ augmentations:
119
+ provide_all_labels: 0.5
120
+ default: 0.5
121
+ CochlScene-SceneClassification/interleaved_knn-train:
122
+ weight: 0.2
123
+ prefix_prob: 1.0
124
+ augmentations:
125
+ provide_all_labels: 0.5
126
+ default: 0.5
127
+
128
+ NonSpeech7k-EventClassification/train:
129
+ weight: 0.8
130
+ prefix_prob: 1.0
131
+ augmentations:
132
+ provide_all_labels: 0.5
133
+ default: 0.5
134
+ NonSpeech7k-EventClassification/interleaved_knn-train:
135
+ weight: 0.2
136
+ prefix_prob: 1.0
137
+ augmentations:
138
+ provide_all_labels: 0.5
139
+ default: 0.5
140
+
141
+ chime-home-EventClassification/train:
142
+ weight: 0.8
143
+ prefix_prob: 1.0
144
+ augmentations:
145
+ default: 0.5
146
+ num_words: 0.5
147
+ chime-home-EventClassification/interleaved_knn-train:
148
+ weight: 0.2
149
+ prefix_prob: 1.0
150
+ augmentations:
151
+ default: 0.5
152
+ num_words: 0.5
153
+
154
+ SONYC-UST-EventClassification/train:
155
+ weight: 0.8
156
+ prefix_prob: 1.0
157
+ augmentations:
158
+ default: 0.5
159
+ num_words: 0.5
160
+ SONYC-UST-EventClassification/interleaved_knn-train:
161
+ weight: 0.2
162
+ prefix_prob: 1.0
163
+ augmentations:
164
+ default: 0.5
165
+ num_words: 0.5
166
+
167
+ # Speech Emotion Classification
168
+
169
+ MELD-EmotionClassification/train:
170
+ weight: 0.5
171
+ prefix_prob: 1.0
172
+ augmentations:
173
+ provide_all_labels: 0.5
174
+ default: 0.5
175
+
176
+ MELD-SentimentClassification/train:
177
+ weight: 0.5
178
+ prefix_prob: 1.0
179
+ augmentations:
180
+ provide_all_labels: 0.1
181
+ default: 0.9
182
+
183
+ emov-db-EmotionClassification/train:
184
+ weight: 1.6
185
+ prefix_prob: 1.0
186
+ augmentations:
187
+ provide_all_labels: 0.5
188
+ default: 0.5
189
+ emov-db-EmotionClassification/interleaved_knn-train:
190
+ weight: 0.4
191
+ prefix_prob: 1.0
192
+ augmentations:
193
+ provide_all_labels: 0.5
194
+ default: 0.5
195
+
196
+ jl-corpus-EmotionClassification/train:
197
+ weight: 6.0
198
+ prefix_prob: 1.0
199
+ augmentations:
200
+ provide_all_labels: 0.5
201
+ default: 0.5
202
+ jl-corpus-EmotionClassification/interleaved_knn-train:
203
+ weight: 1.5
204
+ prefix_prob: 1.0
205
+ augmentations:
206
+ provide_all_labels: 0.5
207
+ default: 0.5
208
+
209
+ tess-EmotionClassification/train:
210
+ weight: 2.0
211
+ prefix_prob: 1.0
212
+ augmentations:
213
+ provide_all_labels: 0.5
214
+ default: 0.5
215
+ tess-EmotionClassification/interleaved_knn-train:
216
+ weight: 0.5
217
+ prefix_prob: 1.0
218
+ augmentations:
219
+ provide_all_labels: 0.5
220
+ default: 0.5
221
+
222
+ IEMOCAP-EmotionClassification/train:
223
+ weight: 2.4
224
+ prefix_prob: 1.0
225
+ augmentations:
226
+ provide_all_labels: 0.5
227
+ default: 0.5
228
+ IEMOCAP-EmotionClassification/interleaved_knn-train:
229
+ weight: 0.6
230
+ prefix_prob: 1.0
231
+ augmentations:
232
+ provide_all_labels: 0.5
233
+ default: 0.5
234
+
235
+ OMGEmotion-EmotionClassification/train:
236
+ weight: 3.0
237
+ prefix_prob: 1.0
238
+ augmentations:
239
+ provide_all_labels: 0.5
240
+ default: 0.5
241
+
242
+ VocalSound-VocalClassification/train:
243
+ weight: 1.0
244
+ prefix_prob: 1.0
245
+ augmentations:
246
+ provide_all_labels: 0.5
247
+ default: 0.5
248
+
249
+ # Music QA
250
+
251
+ Music-AVQA-AQA_All/train:
252
+ weight: 2.0
253
+ prefix_prob: 1.0
254
+ augmentations:
255
+ AQA_binary_instruction: 1.0
256
+ Music-AVQA-AQA_All/interleaved_knn-train:
257
+ weight: 1.0
258
+ prefix_prob: 1.0
259
+ augmentations:
260
+ AQA_binary_instruction: 1.0
261
+
262
+ MU-LLAMA-AQA/train:
263
+ weight: 0.9
264
+ prefix_prob: 1.0
265
+ augmentations:
266
+ do_nothing: 1.0
267
+ MU-LLAMA-AQA/interleaved_knn-train:
268
+ weight: 0.1
269
+ prefix_prob: 1.0
270
+ augmentations:
271
+ do_nothing: 1.0
272
+
273
+ # Music Captioning
274
+
275
+ LP-MusicCaps-MSD-AudioCaptioning/train:
276
+ weight: 0.05 # 1.3M
277
+ prefix_prob: 1.0
278
+ augmentations:
279
+ AC_paragraph: 1.0
280
+ LP-MusicCaps-MSD-AudioCaptioning/interleaved_knn-train:
281
+ weight: 0.05 # 111k
282
+ prefix_prob: 1.0
283
+ augmentations:
284
+ AC_paragraph: 1.0
285
+
286
+ LP-MusicCaps-MC-AudioCaptioning/train:
287
+ weight: 1.6
288
+ prefix_prob: 1.0
289
+ augmentations:
290
+ AC_paragraph: 1.0
291
+ LP-MusicCaps-MC-AudioCaptioning/interleaved_knn-train:
292
+ weight: 0.4
293
+ prefix_prob: 1.0
294
+ augmentations:
295
+ AC_paragraph: 1.0
296
+
297
+ LP-MusicCaps-MTT-AudioCaptioning/train:
298
+ weight: 0.8
299
+ prefix_prob: 1.0
300
+ augmentations:
301
+ AC_long: 1.0
302
+ LP-MusicCaps-MTT-AudioCaptioning/interleaved_knn-train:
303
+ weight: 0.2
304
+ prefix_prob: 1.0
305
+ augmentations:
306
+ AC_long: 1.0
307
+
308
+ MusicCaps-AudioCaptioning/train:
309
+ weight: 6.0
310
+ prefix_prob: 1.0
311
+ augmentations:
312
+ AC_paragraph: 1.0
313
+ MusicCaps-AudioCaptioning/interleaved_knn-train:
314
+ weight: 1.5
315
+ prefix_prob: 1.0
316
+ augmentations:
317
+ AC_paragraph: 1.0
318
+
319
+ SongDescriber-AudioCaptioning/train:
320
+ weight: 0.8
321
+ prefix_prob: 1.0
322
+ augmentations:
323
+ AC_long: 1.0
324
+ SongDescriber-AudioCaptioning/interleaved_knn-train:
325
+ weight: 0.2
326
+ prefix_prob: 1.0
327
+ augmentations:
328
+ AC_long: 1.0
329
+
330
+ # Music Understanding
331
+
332
+ NSynth-MIR/train:
333
+ weight: 0.2 # 289k for weight = 1
334
+ prefix_prob: 1.0
335
+ augmentations:
336
+ do_nothing: 1.0
337
+ NSynth-MIR/interleaved_knn-train:
338
+ weight: 0.2 # 60k for weight = 1
339
+ prefix_prob: 1.0
340
+ augmentations:
341
+ do_nothing: 1.0
342
+
343
+ mtg-jamendo-MusicTagging/train:
344
+ weight: 0.1
345
+ prefix_prob: 1.0
346
+ augmentations:
347
+ default: 1.0
348
+
349
+ FMA-GenreClassification/train:
350
+ weight: 0.4 # 104k for weight = 1
351
+ prefix_prob: 1.0
352
+ augmentations:
353
+ do_nothing: 1.0
354
+ FMA-GenreClassification/interleaved_knn-train:
355
+ weight: 0.3 # 46k for weight = 1
356
+ prefix_prob: 1.0
357
+ augmentations:
358
+ do_nothing: 1.0
359
+
360
+ musdbhq-InstrClassification/train:
361
+ weight: 0.8
362
+ prefix_prob: 1.0
363
+ augmentations:
364
+ provide_all_labels: 0.5
365
+ default: 0.5
366
+
367
+ dataset_file_root: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/dataset_files
368
+ data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
369
+ dataset_blending_output: dataset_blending.json
370
+ max_tokens: 512
371
+ num_workers: 4
372
+
373
+ valid_dataset_config:
374
+
375
+ Clotho-AQA-AQA/test: true
376
+ Clotho-AQA-AQA/interleaved_knn-test: true
377
+
378
+ Clotho-v2-AudioCaptioning/test: true
379
+ Clotho-v2-AudioCaptioning/interleaved_knn-test: true
380
+
381
+ FSD50k-EventClassification/test: true
382
+ FSD50k-EventClassification/interleaved_knn-test: true
383
+
384
+ CochlScene-SceneClassification/test: true
385
+ CochlScene-SceneClassification/interleaved_knn-test: true
386
+
387
+ NonSpeech7k-EventClassification/test: true
388
+ NonSpeech7k-EventClassification/interleaved_knn-test: true
389
+
390
+ SONYC-UST-EventClassification/test: true
391
+ SONYC-UST-EventClassification/interleaved_knn-test: true
392
+
393
+ emov-db-EmotionClassification/val: true
394
+ emov-db-EmotionClassification/interleaved_knn-val: true
395
+
396
+ jl-corpus-EmotionClassification/val: true
397
+ jl-corpus-EmotionClassification/interleaved_knn-val: true
398
+
399
+ tess-EmotionClassification/val: true
400
+ tess-EmotionClassification/interleaved_knn-val: true
401
+
402
+ IEMOCAP-EmotionClassification/test: true
403
+ IEMOCAP-EmotionClassification/interleaved_knn-test: true
404
+
405
+ OMGEmotion-EmotionClassification/val: true
406
+
407
+ Music-AVQA-AQA_All/test: true
408
+ Music-AVQA-AQA_All/interleaved_knn-test: true
409
+
410
+ MU-LLAMA-AQA/test: true
411
+
412
+ LP-MusicCaps-MSD-AudioCaptioning/test: true
413
+ LP-MusicCaps-MC-AudioCaptioning/test: true
414
+ LP-MusicCaps-MTT-AudioCaptioning/test: true
415
+ LP-MusicCaps-MTT-AudioCaptioning/interleaved_knn-test: true
416
+
417
+ NSynth-MIR/test: true
418
+ NSynth-MIR/interleaved_knn-test: true
419
+
420
+ mtg-jamendo-MusicTagging/val: true
421
+
422
+ audiocaps-AudioCaptioning/test: true
423
+ audiocaps-AudioCaptioning/interleaved_knn-test: true
424
+
425
+ MusicCaps-AudioCaptioning/test: true
426
+
427
+ MELD-EmotionClassification/test: true
428
+ MELD-SentimentClassification/test: true
429
+ VocalSound-VocalClassification/test: true
430
+ musdbhq-InstrClassification/test: true
431
+
432
+ # zero shot
433
+
434
+ GTZAN-GenreClassification/train:
435
+ prefix_prob: 1.0
436
+ augmentations:
437
+ provide_all_labels: 1.0
438
+ GTZAN-GenreClassification/interleaved_knn-train:
439
+ prefix_prob: 1.0
440
+ augmentations:
441
+ provide_all_labels: 1.0
442
+
443
+ Medley-solos-DB-InstrClassification/test:
444
+ prefix_prob: 1.0
445
+ augmentations:
446
+ provide_all_labels: 1.0
447
+ Medley-solos-DB-InstrClassification/interleaved_knn-test:
448
+ prefix_prob: 1.0
449
+ augmentations:
450
+ provide_all_labels: 1.0
451
+
452
+ clap_config:
453
+ # method: laion-clap
454
+ # audio_embed_dim: 512
455
+ # model_name: 630k-fusion-best
456
+ # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/laion-clap-pretrained/laion_clap/630k-fusion-best.pt
457
+
458
+ method: microsoft-clap
459
+ audio_embed_dim: 1024
460
+ config_root: /home/zkong/audio_flamingo/audio_flamingo_v1/v1.0_optimlmax1.3b_foundation/my_ms_clap/src/configs
461
+ # model_name: '2023'
462
+ # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/CLAP_weights_2023.pth
463
+ model_name: 'clapcap'
464
+ checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/clapcap_weights_2023.pth
465
+
466
+ window_length: 7.0 # seconds
467
+ window_overlap: 5.25 # seconds
468
+ max_num_window: 16 # 35 seconds
469
+ max_num_fewshot: 4 # number of fewshot samples (including the final one)
470
+
471
+ model_config:
472
+ cache_dir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/LLM_pretrained/.cache
473
+
474
+ lang_encoder_path: facebook/opt-iml-max-1.3b
475
+ tokenizer_path: facebook/opt-iml-max-1.3b
476
+ cross_attn_every_n_layers: 1
477
+ audio_transformer_kwargs: {
478
+ n_head: 8,
479
+ n_layers: 3,
480
+ d_inner: 2048,
481
+ max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
482
+ max_window_per_audio: 16, # must = max_num_window
483
+ }
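For reference, the per-dataset `weight` values above are combined multiplicatively with `dataset_blending_global_weight` when the blend is built (see `blend_dataset` in `data/data.py` later in this commit): each epoch draws roughly `total_num * global_weight * weight` examples from a dataset. A short sketch with made-up example numbers:

```python
# Illustrative only: how many samples one dataset contributes to a blended epoch.
dataset_total_num = 100_000            # hypothetical size of one dataset's train split
dataset_blending_global_weight = 0.01  # data_config above
dataset_weight = 0.8                   # e.g. Clotho-v2-AudioCaptioning/train

weight = dataset_blending_global_weight * dataset_weight
epoch = 0
start_idx = int(epoch * dataset_total_num * weight)
end_idx = int((epoch + 1) * dataset_total_num * weight)
print(end_idx - start_idx)             # 800 samples drawn for epoch 0
```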
configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml ADDED
@@ -0,0 +1,284 @@
1
+ train_config:
2
+ expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed
3
+ run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed-sft
4
+ delete_previous_checkpoint: true
5
+ batch_size: 4
6
+ gradient_accumulation_steps: 2
7
+ seed: 42
8
+ learning_rate: 0.00002
9
+ lr_scheduler: constant
10
+ loss_multiplier: 1.0
11
+ warmup_steps: 1875
12
+ weight_decay: 0.1
13
+ precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
14
+ gradient_checkpointing: False
15
+ num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
16
+ offline: false
17
+ freeze_lm_embeddings: false
18
+ logging_steps: 10
19
+ dist_backend: nccl
20
+ dist_url: env:// # tcp://localhost:7000
21
+ no_set_device_rank: false
22
+ fsdp: true
23
+ fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
24
+ fsdp_sharding_strategy: full # full, hybrid
25
+ horovod: false
26
+
27
+ # instruction tuning hparams
28
+ sft_config:
29
+ pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed_ckpt_stage1/
30
+ pretrained_ckpt: checkpoint_199.pt
31
+ unfreeze_full_lm: false
32
+
33
+ data_config:
34
+ dataset_blending_global_weight: 0.005
35
+
36
+ dataset_blending_config:
37
+
38
+ MMAUQA/train:
39
+ weight: 1.5
40
+
41
+ AudioSet-Temporal-Speech-Audio-QA/train:
42
+ weight: 1.0
43
+
44
+ CompA-R-AQA/train:
45
+ weight: 1.0
46
+
47
+ # Audio QA
48
+ Clotho-AQA-AQA/train:
49
+ weight: 1.0
50
+
51
+ OpenAQA-AQA/train:
52
+ weight: 1.0
53
+
54
+ SalmonnQA/train:
55
+ weight: 1.0
56
+
57
+ AudioEntailmentQA/train:
58
+ weight: 1.0
59
+
60
+ # Audio Captioning
61
+
62
+ Clotho-v2-AudioCaptioning/train:
63
+ weight: 1.0
64
+
65
+ audiocaps-AudioCaptioning/train:
66
+ weight: 1.0
67
+
68
+ Epidemic_sound-AudioCaptioning/train:
69
+ weight: 1.0
70
+
71
+ MACS-AudioCaptioning/train:
72
+ weight: 1.0
73
+
74
+ # Audio Classification
75
+
76
+ FSD50k-EventClassification/train:
77
+ weight: 1.0
78
+
79
+ CochlScene-SceneClassification/train:
80
+ weight: 1.0
81
+
82
+ NonSpeech7k-EventClassification/train:
83
+ weight: 1.0
84
+
85
+ chime-home-EventClassification/train:
86
+ weight: 1.0
87
+
88
+ SONYC-UST-EventClassification/train:
89
+ weight: 1.0
90
+
91
+ # Speech Emotion Classification
92
+
93
+ MELD-EmotionClassification/train:
94
+ weight: 0.5
95
+
96
+ MELD-SentimentClassification/train:
97
+ weight: 0.5
98
+
99
+ emov-db-EmotionClassification/train:
100
+ weight: 1.0
101
+
102
+ jl-corpus-EmotionClassification/train:
103
+ weight: 6.0
104
+
105
+ tess-EmotionClassification/train:
106
+ weight: 2.5
107
+
108
+ IEMOCAP-EmotionClassification/train:
109
+ weight: 3.0
110
+
111
+ OMGEmotion-EmotionClassification/train:
112
+ weight: 3.0
113
+
114
+ VocalSound-VocalClassification/train:
115
+ weight: 1.5
116
+
117
+ # Music QA
118
+
119
+ Music-AVQA-AQA_All/train:
120
+ weight: 3.0
121
+
122
+ MU-LLAMA-AQA/train:
123
+ weight: 1.0
124
+
125
+ # Music Captioning
126
+
127
+ LP-MusicCaps-MSD-AudioCaptioning/train:
128
+ weight: 0.06
129
+
130
+ LP-MusicCaps-MC-AudioCaptioning/train:
131
+ weight: 2.0
132
+
133
+ LP-MusicCaps-MTT-AudioCaptioning/train:
134
+ weight: 1.0
135
+
136
+ MusicCaps-AudioCaptioning/train:
137
+ weight: 6.0
138
+
139
+ musdbhq-captioning/train:
140
+ weight: 2.0
141
+
142
+ # Music Understanding
143
+
144
+ NSynth-MIR/train:
145
+ weight: 0.2
146
+
147
+ mtg-jamendo-MusicTagging/train:
148
+ weight: 0.1
149
+
150
+ FMA-GenreClassification/train:
151
+ weight: 0.5
152
+
153
+ musdbhq-InstrClassification/train:
154
+ weight: 0.8
155
+
156
+ LLARK_FMA-mir/train:
157
+ weight: 1.0
158
+
159
+ LLARK_FMA-reasoning/train:
160
+ weight: 1.0
161
+
162
+ LLARK_MagnaTagATune-mir/train:
163
+ weight: 1.0
164
+
165
+ LLARK_MTG-Jamendo-reasoning/train:
166
+ weight: 1.0
167
+
168
+ LLARK_MagnaTagATune-reasoning/train:
169
+ weight: 1.0
170
+
171
+ LLARK_MTG-Jamendo-mir/train:
172
+ weight: 1.0
173
+
174
+ MusicBenchQA/train:
175
+ weight: 1.0
176
+
177
+ dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data_w_duration
178
+ data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
179
+ dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
180
+ max_tokens: 512
181
+ num_workers: 4
182
+
183
+ valid_dataset_config:
184
+
185
+ Clotho-AQA-AQA/test: true
186
+
187
+ Clotho-v2-AudioCaptioning/test: true
188
+ audiocaps-AudioCaptioning/test: true
189
+
190
+ FSD50k-EventClassification/test: true
191
+ CochlScene-SceneClassification/test: true
192
+ NonSpeech7k-EventClassification/test: true
193
+ SONYC-UST-EventClassification/test: true
194
+
195
+ MELD-EmotionClassification/test: true
196
+ MELD-SentimentClassification/test: true
197
+ emov-db-EmotionClassification/val: true
198
+ jl-corpus-EmotionClassification/val: true
199
+ tess-EmotionClassification/val: true
200
+ IEMOCAP-EmotionClassification/val: true
201
+ OMGEmotion-EmotionClassification/val: true
202
+ VocalSound-VocalClassification/test: true
203
+
204
+ Music-AVQA-AQA_All/test: true
205
+ MU-LLAMA-AQA/test: true
206
+
207
+ LP-MusicCaps-MSD-AudioCaptioning/test: true
208
+ LP-MusicCaps-MC-AudioCaptioning/test: true
209
+ LP-MusicCaps-MTT-AudioCaptioning/test: true
210
+ MusicCaps-AudioCaptioning/test: true
211
+
212
+ NSynth-MIR/test: true
213
+ mtg-jamendo-MusicTagging/val: true
214
+ musdbhq-InstrClassification/test: true
215
+
216
+ # # zero shot
217
+ # CREMA-D-EmotionClassification/train:
218
+ # prefix_prob: 1.0
219
+
220
+ # ravdess-EmotionClassification/train:
221
+ # prefix_prob: 1.0
222
+
223
+ # UrbanSound8K-EventClassification/train:
224
+ # prefix_prob: 1.0
225
+
226
+ # ESC50-EventClassification/train:
227
+ # prefix_prob: 1.0
228
+
229
+ # DCASE17Task4-SceneClassification/test:
230
+ # prefix_prob: 1.0
231
+
232
+ # GTZAN-GenreClassification/train:
233
+ # prefix_prob: 1.0
234
+
235
+ # Medley-solos-DB-InstrClassification/test:
236
+ # prefix_prob: 1.0
237
+
238
+ clap_config:
239
+ method: nvclap-large
240
+ audio_embed_dim: 2048
241
+ checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
242
+
243
+ window_length: 10.0 # seconds
244
+ window_overlap: 0.0 # seconds
245
+ max_num_window: 9 # 1.5 minutes
246
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
247
+ finetune: true
248
+
249
+ whisper_config:
250
+ method: whisper-large-v3
251
+ path: openai/whisper-large-v3
252
+ audio_embed_dim: 1280
253
+ sampling_rate: 16000
254
+
255
+ window_length: 30.0 # seconds
256
+ window_overlap: 0.0 # seconds
257
+ max_num_window: 1 # 5 minutes
258
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
259
+
260
+ mert_config:
261
+ method: mert-v1
262
+ path: m-a-p/MERT-v1-330M
263
+ audio_embed_dim: 1024
264
+ sampling_rate: 24000
265
+
266
+ window_length: 10.0 # seconds
267
+ window_overlap: 0.0 # seconds
268
+ max_num_window: 1 # 5 minutes
269
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
270
+
271
+ model_config:
272
+ cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
273
+
274
+ lang_encoder_path: Qwen/Qwen2.5-3B
275
+ tokenizer_path: Qwen/Qwen2.5-3B
276
+ cross_attn_every_n_layers: 1
277
+ audio_transformer_kwargs: {
278
+ n_head: 8,
279
+ n_layers: 3,
280
+ d_inner: 2048,
281
+ max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
282
+ max_window_per_audio: 1, # must = max_num_window
283
+ common_encoder_embed_dim: 1024
284
+ }
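These YAML files are consumed as plain nested dictionaries; the pattern below mirrors `main()` in `data/data.py` from this commit (only the config path is filled in here):

```python
# Minimal sketch of reading one of these configs (mirrors data/data.py main()).
import yaml

config_path = "configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml"
config = yaml.load(open(config_path), Loader=yaml.FullLoader)

train_config = config["train_config"]
data_config = config["data_config"]
clap_config = config["clap_config"]
whisper_config = config["whisper_config"]
mert_config = config["mert_config"]
model_config = config["model_config"]
print(train_config["run_name"], clap_config["max_num_window"])
```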
data/__pycache__/data.cpython-38.pyc ADDED
Binary file (16.4 kB). View file
 
data/data.py ADDED
@@ -0,0 +1,669 @@
1
+ import functools
2
+ import io
3
+ import json
4
+ import math
5
+ import os
6
+ os.environ["TOKENIZERS_PARALLELISM"] = "false" # disable the tokenizer parallelism warning
7
+ import random
8
+ import re
9
+ import string
10
+ import subprocess
11
+ import sys
12
+ import yaml
13
+
14
+ import numpy as np
15
+
16
+ from collections import defaultdict
17
+ from copy import deepcopy
18
+ from dataclasses import dataclass
19
+ from functools import partial
20
+ from pydub import AudioSegment
21
+ from tqdm import tqdm
22
+
23
+ import torch
24
+ import torchvision
25
+ import torch.nn.functional as F
26
+ from torch.utils.data import DataLoader, Dataset, get_worker_info
27
+ from torch.utils.data.distributed import DistributedSampler
28
+
29
+
30
+ from transformers import AutoTokenizer
31
+
32
+ import librosa
33
+ import soundfile as sf
34
+
35
+ EMOTION_MAP_DICT = {
36
+ 'amused': 'amused' ,
37
+ 'anger': 'angry' , 'angry': 'angry' ,
38
+ 'anxious': 'anxious' ,
39
+ 'apologetic': 'apologetic' ,
40
+ 'assertive': 'assertive' ,
41
+ 'calm': 'calm' ,
42
+ 'concerned': 'concerned' ,
43
+ 'contempt': 'contempt' ,
44
+ 'disgust': 'disgusted' , 'disgusted': 'disgusted' ,
45
+ 'encouraging': 'encouraging' ,
46
+ 'excited': 'excited' ,
47
+ 'fear': 'fearful' , 'fearful': 'fearful' ,
48
+ 'frustated': 'frustated' ,
49
+ 'happy': 'happy' , 'joy': 'happy' ,
50
+ 'neutral': 'neutral' ,
51
+ 'sad': 'sad' , 'sadness': 'sad' ,
52
+ 'sleepy': 'sleepy' ,
53
+ 'surprise': 'surprised' , 'surprised': 'surprised' ,
54
+ 'pleasantly surprised': 'pleasantly surprised' ,
55
+ }
56
+
57
+
58
+ def int16_to_float32(x):
59
+ return (x / 32767.0).astype(np.float32)
60
+
61
+
62
+ def float32_to_int16(x):
63
+ x = np.clip(x, a_min=-1., a_max=1.)
64
+ return (x * 32767.).astype(np.int16)
65
+
66
+
67
+ class DataCollator:
68
+ def __init__(self, tokenizer, clap_config):
69
+
70
+ self.tokenizer = tokenizer
71
+ self.clap_config = clap_config
72
+ self.max_num_window = clap_config["max_num_window"]
73
+
74
+ def __call__(self, batch):
75
+
76
+ filenames, audio_clips, audio_embed_masks, input_ids, attention_masks = zip(*batch)
77
+
78
+ num_windows_all = [sum(audio_embed_mask) for audio_embed_mask in audio_embed_masks]
79
+ max_window_batch = int(max(num_windows_all))
80
+
81
+ if max_window_batch > self.max_num_window:
82
+ max_window_batch = self.max_num_window
83
+
84
+ padded_audio_clips = []
85
+ padded_audio_embed_masks = []
86
+ for audio_clip, audio_embed_mask in zip(audio_clips,audio_embed_masks):
87
+ this_audio_clip_clips = [clip for clip in audio_clip]
88
+ num_windows = len(this_audio_clip_clips)
89
+ if num_windows < max_window_batch:
90
+ for _ in range(max_window_batch - num_windows):
91
+ this_audio_clip_clips.append(torch.zeros_like(this_audio_clip_clips[-1]))
92
+ audio_clip = torch.cat(this_audio_clip_clips)
93
+ audio_embed_mask = torch.zeros(max_window_batch)
94
+ audio_embed_mask[:num_windows] = 1
95
+ elif num_windows > max_window_batch: # truncate to the batch-level window budget
96
+ this_audio_clip_clips = this_audio_clip_clips[:max_window_batch]
97
+ audio_clip = torch.cat(this_audio_clip_clips)
98
+ audio_embed_mask = audio_embed_mask[:max_window_batch]
99
+ else:
100
+ audio_clip = torch.cat(this_audio_clip_clips)
101
+
102
+ padded_audio_clips.append(audio_clip)
103
+ padded_audio_embed_masks.append(audio_embed_mask)
104
+
105
+ audio_clips = torch.cat([x.unsqueeze(0) for x in padded_audio_clips], dim=0)
106
+ audio_embed_mask = torch.cat([x.unsqueeze(0) for x in padded_audio_embed_masks], dim=0)
107
+
108
+ max_length = max([ids.shape[1] for ids in input_ids])
109
+
110
+ padded_input_ids = []
111
+ padded_attention_masks = []
112
+ for ids, mask in zip(input_ids, attention_masks):
113
+ if ids.shape[1] < max_length:
114
+ padded_input_ids.append(
115
+ torch.cat([ids, torch.LongTensor([self.tokenizer.pad_token_id] * (max_length - ids.shape[1])).unsqueeze(0)], dim=1)
116
+ )
117
+ padded_attention_masks.append(
118
+ torch.cat([mask, torch.LongTensor([0] * (max_length - mask.shape[1])).unsqueeze(0)], dim=1)
119
+ )
120
+ else:
121
+ padded_input_ids.append(ids)
122
+ padded_attention_masks.append(mask)
123
+
124
+ padded_input_ids = torch.cat(padded_input_ids, dim=0)
125
+ padded_attention_masks = torch.cat(padded_attention_masks, dim=0).bool()
126
+
127
+ out_dict = dict(
128
+ filenames=filenames,
129
+ audio_clips=audio_clips,
130
+ audio_embed_mask=audio_embed_mask,
131
+ input_ids=padded_input_ids,
132
+ attention_mask=padded_attention_masks
133
+ )
134
+ return out_dict
135
+
136
+
137
+ class AudioTextData(torch.utils.data.Dataset):
138
+ def __init__(
139
+ self,
140
+ dataset_file_root: str,
141
+ data_root: str,
142
+ clap_config: dict,
143
+ dataset_blending_global_weight: float,
144
+ dataset_blending_config: dict,
145
+ dataset_blending_output: str,
146
+ tokenizer,
147
+ max_tokens: int,
148
+ split: str = 'train',
149
+ valid_dataset_config: dict = {},
150
+ valid_dataset_name: str = '',
151
+ epoch: int = 0,
152
+ force_reblend: bool = False,
153
+ sr = 16000,
154
+ **kwargs
155
+ ):
156
+ self.dataset_file_root = dataset_file_root
157
+ self.data_root = data_root
158
+ self.clap_config = clap_config
159
+ self.dataset_blending_global_weight = dataset_blending_global_weight
160
+ self.dataset_blending_config = dataset_blending_config
161
+ self.sr = sr
162
+
163
+ self.split = split
164
+ self.epoch = epoch
165
+ self.force_reblend = force_reblend
166
+
167
+ assert self.split in ['train', 'val', 'test']
168
+
169
+ if self.split == 'train':
170
+ self.data = self.blend_dataset(dataset_blending_config, dataset_blending_output)
171
+
172
+ elif self.split in ['val', 'test']:
173
+ self.valid_data = self.validation_dataset(valid_dataset_config, valid_dataset_name)
174
+
175
+ self.tokenizer = tokenizer
176
+ self.tokenizer.padding_side = "right"
177
+ self.max_tokens = max_tokens
178
+
179
+ @staticmethod
180
+ def shuffle_dict_fixed_rand(dic, seed=0):
181
+ print('randomly shuffling key-value pairs')
182
+
183
+ local_random = np.random.default_rng(seed)
184
+ original_keys = list(dic.keys())
185
+ shuffled_keys = deepcopy(original_keys)
186
+ local_random.shuffle(shuffled_keys)
187
+ shuffling_mapping = {x: y for (x, y) in zip(original_keys, shuffled_keys)}
188
+
189
+ shuffled_dic = {}
190
+ for idx in original_keys:
191
+ shuffled_idx = shuffling_mapping[idx]
192
+ shuffled_dic[idx] = dic[shuffled_idx]
193
+ return shuffled_dic
194
+
195
+ @staticmethod
196
+ def is_broken_file(audiopath):
197
+ BROKEN_FILES = [
198
+ "/lustre/fsw/portfolios/adlr/users/zkong/datasets/FMA/fma_large/023/023431.mp3",
199
+ "/lustre/fsw/portfolios/adlr/users/zkong/datasets/FMA/fma_large/033/033690.mp3",
200
+ "/lustre/fsw/portfolios/adlr/users/zkong/datasets/FMA/fma_large/119/119217.mp3",
201
+ "/lustre/fsw/portfolios/adlr/users/zkong/datasets/FMA/fma_large/119/119222.mp3",
202
+ "/lustre/fsw/portfolios/adlr/users/zkong/datasets/FMA/fma_large/119/119219.mp3",
203
+ "/lustre/fsw/portfolios/adlr/users/zkong/datasets/GTZAN/gtzan/data/genres/jazz/jazz.00054.wav"
204
+ ]
205
+ return audiopath in BROKEN_FILES
206
+
207
+ def _read_dataset_file(self, dataset_file):
208
+ print("reading", dataset_file)
209
+ with open(dataset_file) as f:
210
+ contents = f.read()
211
+ contents = json.loads(contents)
212
+
213
+ if contents['split_path'] is not None:
214
+ abs_path = contents['split_path']
215
+
216
+ """
217
+ for normal data
218
+ contents['data'] = {idx: {
219
+ 'name': rel_path/name,
220
+ 'prompt': prompt,
221
+ 'output': output,
222
+ [optional] 'audio_start': audio_start,
223
+ 'task': task,
224
+ }}
225
+ """
226
+
227
+ if 'interleaved' not in dataset_file:
228
+ for idx in contents["data"]:
229
+ contents["data"][idx]['task'] = contents["flamingo_task"]
230
+ contents["data"][idx]['name'] = os.path.join(
231
+ abs_path, contents["data"][idx]['name']
232
+ )
233
+ return contents
234
+
235
+ def blend_dataset(self, dataset_blending_config, dataset_blending_output):
236
+ if os.path.exists(dataset_blending_output) and not self.force_reblend:
237
+ print("loading blended dataset file from:", dataset_blending_output)
238
+ with open(dataset_blending_output) as f:
239
+ contents = f.read()
240
+ self_data = json.loads(contents)
241
+
242
+ else:
243
+ if not self.force_reblend:
244
+ print("no blended dataset file found; reading all dataset files")
245
+ else:
246
+ print("force reblending dataset at epoch {}; reading all dataset files".format(self.epoch))
247
+
248
+ all_data = {}
249
+ for dataset_name in dataset_blending_config:
250
+ dataset_file = os.path.join(self.dataset_file_root, '{}.json'.format(dataset_name))
251
+ contents = self._read_dataset_file(dataset_file)
252
+ contents['data'] = self.shuffle_dict_fixed_rand(
253
+ contents['data'],
254
+ seed=sum(list(map(ord, dataset_name)))
255
+ )
256
+
257
+ weight_global = float(self.dataset_blending_global_weight)
258
+ weight_dataset = float(dataset_blending_config[dataset_name]["weight"])
259
+ weight = weight_global * weight_dataset
260
+
261
+ all_data[dataset_name] = {
262
+ "contents": contents,
263
+ "weight": weight
264
+ }
265
+
266
+ self_data = {
267
+ "dataset_path": self.data_root,
268
+ "split_path": None,
269
+ "total_num": 0,
270
+ "data": {} # {id: {'name': rel_path/name or [rel_path/names], 'prompt': prompt or [prompts], 'output': output or [outputs], 'task': task, 'interleaved': interleave_method}}
271
+ }
272
+
273
+ for dataset_name in all_data:
274
+ print('blending {}'.format(dataset_name))
275
+
276
+ contents = all_data[dataset_name]["contents"]
277
+ shuffled_contents_data = contents['data']
278
+ weight = all_data[dataset_name]["weight"]
279
+ assert type(weight) == float and weight > 0.0
280
+
281
+ dataset_total_num = contents['total_num']
282
+ start_idx = int(self.epoch * dataset_total_num * weight)
283
+ end_idx = int((self.epoch + 1) * dataset_total_num * weight)
284
+
285
+ for idx in range(start_idx, end_idx):
286
+ if idx > 0 and idx % dataset_total_num == 0:
287
+ print('force shuffling at new epoch {} for dataset {}'.format(idx // dataset_total_num, dataset_name))
288
+ shuffled_contents_data = self.shuffle_dict_fixed_rand(
289
+ contents['data'],
290
+ seed=sum(list(map(ord, '{}-epoch-{}'.format(dataset_name, idx // dataset_total_num))))
291
+ )
292
+
293
+ key = str(idx % dataset_total_num)
294
+ item = shuffled_contents_data[key]
295
+
296
+ found_broken = False
297
+ if type(item['name']) is str:
298
+ audiopath = item['name']
299
+ if self.is_broken_file(audiopath):
300
+ print('cannot read {}'.format(audiopath))
301
+ found_broken = True
302
+
303
+ if found_broken:
304
+ continue
305
+
306
+ self_data['data'][self_data['total_num']] = item
307
+ self_data['total_num'] += 1
308
+
309
+ if not self.force_reblend:
310
+ print('writing blended dataset file to:', dataset_blending_output)
311
+ with open(dataset_blending_output, 'w') as json_file:
312
+ json.dump(self_data, json_file)
313
+ else:
314
+ print('writing reblended dataset file to:', dataset_blending_output.replace('.json', '-reblended.json'))
315
+ with open(dataset_blending_output.replace('.json', '-reblended.json'), 'w') as json_file:
316
+ json.dump(self_data, json_file)
317
+
318
+ return self_data
319
+
320
+ def get_num_windows(self, T, sr):
321
+ clap_config = self.clap_config
322
+ window_length = int(float(clap_config["window_length"]) * sr)
323
+ window_overlap = int(float(clap_config["window_overlap"]) * sr)
324
+ max_num_window = int(clap_config["max_num_window"])
325
+
326
+ num_windows = 1
327
+ if T <= window_length:
328
+ num_windows = 1
329
+ full_length = window_length
330
+ elif T >= (max_num_window * window_length - (max_num_window - 1) * window_overlap):
331
+ num_windows = max_num_window
332
+ full_length = (max_num_window * window_length - (max_num_window - 1) * window_overlap)
333
+ else:
334
+ num_windows = 1 + int(np.ceil((T - window_length) / float(window_length - window_overlap)))
335
+ full_length = num_windows * window_length - (num_windows - 1) * window_overlap
336
+
337
+ return num_windows, full_length
338
+
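A quick worked example of `get_num_windows` above, using the 7.0 s / 5.25 s / 16-window CLAP settings from the ICL configs (this snippet is illustrative and not part of the repository; it only exercises the middle branch plus the clamp):

```python
# Worked example of the window-count formula in get_num_windows.
import numpy as np

sr = 16000
window_length = int(7.0 * sr)
window_overlap = int(5.25 * sr)
max_num_window = 16

T = int(20.0 * sr)  # a 20-second clip
num_windows = 1 + int(np.ceil((T - window_length) / float(window_length - window_overlap)))
num_windows = min(num_windows, max_num_window)
full_length = num_windows * window_length - (num_windows - 1) * window_overlap
print(num_windows, full_length / sr)  # 9 windows, padded to 21.0 s
```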
339
+ def load_audio(self, file_path, target_sr=16000, duration=30.0, start=0.0):
340
+ if file_path.endswith('.mp3'):
341
+ audio = AudioSegment.from_file(file_path)
342
+ if len(audio) > (start + duration) * 1000:
343
+ audio = audio[start * 1000:(start + duration) * 1000]
344
+
345
+ if audio.frame_rate != target_sr:
346
+ audio = audio.set_frame_rate(target_sr)
347
+
348
+ if audio.channels > 1:
349
+ audio = audio.set_channels(1)
350
+
351
+ data = np.array(audio.get_array_of_samples())
352
+ if audio.sample_width == 2:
353
+ data = data.astype(np.float32) / np.iinfo(np.int16).max
354
+ elif audio.sample_width == 4:
355
+ data = data.astype(np.float32) / np.iinfo(np.int32).max
356
+ else:
357
+ raise ValueError("Unsupported bit depth: {}".format(audio.sample_width))
358
+
359
+ else:
360
+ with sf.SoundFile(file_path) as audio:
361
+ original_sr = audio.samplerate
362
+ channels = audio.channels
363
+
364
+ max_frames = int((start + duration) * original_sr)
365
+
366
+ audio.seek(int(start * original_sr))
367
+ frames_to_read = min(max_frames, len(audio))
368
+ data = audio.read(frames_to_read)
369
+
370
+ if data.max() > 1 or data.min() < -1:
371
+ data = data / max(abs(data.max()), abs(data.min()))
372
+
373
+ if original_sr != target_sr:
374
+ if channels == 1:
375
+ data = librosa.resample(data.flatten(), orig_sr=original_sr, target_sr=target_sr)
376
+ else:
377
+ data = librosa.resample(data.T, orig_sr=original_sr, target_sr=target_sr)[0]
378
+ else:
379
+ if channels != 1:
380
+ data = data.T[0]
381
+
382
+ if data.min() >= 0:
383
+ data = 2 * data / abs(data.max()) - 1.0
384
+ else:
385
+ data = data / max(abs(data.max()), abs(data.min()))
386
+
387
+ assert len(data.shape) == 1, data.shape
388
+ return data
389
+
390
+ def compute_sliding_window(self, audio_file, audio_start=0.0, audio="sound"):
391
+ if type(audio_start) == str:
392
+ audio_start = float(audio_start)
393
+
394
+ if audio == "sound":
395
+ encoder_config = self.clap_config
396
+ else:
397
+ raise NotImplementedError
398
+
399
+ if encoder_config["method"] == 'nvclap-large':
400
+ sr = 16000
401
+ else:
402
+ raise NotImplementedError
403
+
404
+ window_length = int(float(encoder_config["window_length"]) * sr)
405
+ window_overlap = int(float(encoder_config["window_overlap"]) * sr)
406
+ max_num_window = int(encoder_config["max_num_window"])
407
+ duration = max_num_window * (encoder_config["window_length"] - encoder_config["window_overlap"]) + encoder_config["window_overlap"]
408
+
409
+ audio_data = self.load_audio(os.path.join(self.data_root, audio_file), sr, duration, audio_start) # already cuts to max duration
410
+ T = len(audio_data)
411
+ num_windows, full_length = self.get_num_windows(T, sr)
412
+
413
+ # pads to the nearest multiple of window_length
414
+ if full_length > T:
415
+ audio_data = np.append(audio_data, np.zeros(full_length - T))
416
+
417
+ audio_data = audio_data.reshape(1, -1)
418
+ audio_data_tensor = torch.from_numpy(int16_to_float32(float32_to_int16(audio_data))).float()
419
+
420
+ audio_clips = []
421
+ audio_embed_mask = torch.ones(num_windows)
422
+ for i in range(num_windows):
423
+ start = i * (window_length - window_overlap)
424
+ audio_data_tensor_this = audio_data_tensor[:, start:start+window_length]
425
+ audio_clips.append(audio_data_tensor_this)
426
+
427
+ return audio_clips, audio_embed_mask
428
+
429
+ def validation_dataset(self, valid_dataset_config, valid_dataset_name):
430
+ dataset_file = os.path.join(self.dataset_file_root, '{}.json'.format(valid_dataset_name))
431
+ contents = self._read_dataset_file(dataset_file)
432
+
433
+ contents['data'] = self.shuffle_dict_fixed_rand(
434
+ contents['data'],
435
+ seed=sum(list(map(ord, valid_dataset_name)))
436
+ )
437
+
438
+ return contents
439
+
440
+ def preprocess_string_for_eval(self, x):
441
+ x = x.rstrip().lstrip()
442
+ x = x.lower()
443
+ return x
444
+
445
+ def _actual_getitem(self, i):
446
+ if self.split == 'train':
447
+ try:
448
+ item = self.data['data'][str(i)]
449
+ except:
450
+ item = self.data['data'][i]
451
+
452
+ if type(item['name']) is str:
453
+ audio_file = item['name']
454
+ audio_start = 0 if 'audio_start' not in item else float(item['audio_start'])
455
+ else:
456
+ raise Exception(f"The item has a {type(item['name'])}. Only single path as a string is supported")
457
+
458
+ # compute window for long audios
459
+ audio_clips, audio_embed_mask = self.compute_sliding_window(audio_file, audio_start, audio="sound")
460
+
461
+ # make the text prompt
462
+ text_prompt = str(item['prompt']).lower()
463
+ text_output = str(item['output']).lower()
464
+
465
+ sample = f"<audio>{text_prompt.strip()}{self.tokenizer.sep_token}{text_output.strip()}<|endofchunk|>{self.tokenizer.eos_token}"
466
+
467
+ text = self.tokenizer(
468
+ sample,
469
+ max_length=self.max_tokens,
470
+ padding="longest",
471
+ truncation="only_first",
472
+ return_tensors="pt"
473
+ )
474
+
475
+ elif self.split in ['val', 'test']:
476
+ try:
477
+ item = self.valid_data['data'][str(i)]
478
+ except:
479
+ item = self.valid_data['data'][i]
480
+
481
+ if type(item['name']) is str:
482
+ audio_file = os.path.join(self.data_root, item['name'])
483
+ audio_start = 0 if 'audio_start' not in item else float(item['audio_start'])
484
+ else:
485
+ raise Exception(f"The item has a {type(item['name'])}. Only single path as a string is supported")
486
+
487
+ # compute window for long audios
488
+ audio_clips, audio_embed_mask = self.compute_sliding_window(audio_file, audio_start, audio="sound")
489
+
490
+ # make the text prompt
491
+ text_prompt = self.preprocess_string_for_eval(str(item['prompt']).lower())
492
+ text_output = self.preprocess_string_for_eval(str(item['output']).lower())
493
+
494
+ sample = f"<audio>{text_prompt.strip()}{self.tokenizer.sep_token}{text_output.strip()}<|endofchunk|>{self.tokenizer.eos_token}"
495
+
496
+ text = self.tokenizer(
497
+ sample,
498
+ max_length=self.max_tokens,
499
+ padding="longest",
500
+ truncation="only_first",
501
+ return_tensors="pt"
502
+ )
503
+
504
+ # audio_clips_clap, audio_embed_mask_clap, audio_clips_speech, audio_embed_mask_speech, audio_clips_music, audio_embed_mask_music,
505
+ return (item['name'], audio_clips, audio_embed_mask, text["input_ids"], text["attention_mask"])
506
+
507
+ def __getitem__(self, i):
508
+ try:
509
+ return self._actual_getitem(i)
510
+ except Exception as e:
511
+ print('batch {} failed with reason {}'.format(i, e))
512
+ try:
513
+ return self._actual_getitem((i-42)%99)
514
+ except:
515
+ return self._actual_getitem((i-84)%99)
516
+
517
+ def __len__(self):
518
+ if self.split == 'train':
519
+ return len(list(self.data['data'].keys()))
520
+
521
+ elif self.split == 'val':
522
+ return min(len(list(self.valid_data['data'].keys())), 64)
523
+
524
+ elif self.split == 'test':
525
+ return len(list(self.valid_data['data'].keys()))
526
+
527
+
528
+ @dataclass
529
+ class DataInfo:
530
+ dataset: Dataset
531
+ dataloader: DataLoader
532
+ sampler: DistributedSampler = None
533
+
534
+ def set_epoch(self, epoch):
535
+ if self.sampler is not None and isinstance(self.sampler, DistributedSampler):
536
+ self.sampler.set_epoch(epoch)
537
+
538
+
539
+ def get_audiotext_dataloader(data_config, clap_config, text_tokenizer, batch_size, split='train', epoch=0, force_reblend=False):
540
+ assert split in ['train', 'val', 'test']
541
+
542
+ data_collator = DataCollator(text_tokenizer, clap_config)
543
+ dataloader_shuffle = False
544
+
545
+ if split == 'train':
546
+ trainset = AudioTextData(
547
+ **data_config,
548
+ clap_config=clap_config,
549
+ tokenizer=text_tokenizer,
550
+ split=split,
551
+ epoch=epoch,
552
+ force_reblend=force_reblend
553
+ )
554
+ sampler = DistributedSampler(trainset, shuffle=True)
555
+ trainloader = DataLoader(
556
+ trainset,
557
+ sampler=sampler,
558
+ batch_size=batch_size,
559
+ shuffle=dataloader_shuffle,
560
+ collate_fn=data_collator,
561
+ num_workers=data_config["num_workers"]
562
+ )
563
+ return DataInfo(dataset=trainset, dataloader=trainloader, sampler=sampler)
564
+
565
+ elif split in ['val', 'test']:
566
+ all_DataInfo = {}
567
+ for valid_dataset_name in list(data_config["valid_dataset_config"].keys()):
568
+ valid_dataset_name = valid_dataset_name.strip()
569
+ validset = AudioTextData(
570
+ **data_config,
571
+ clap_config=clap_config,
572
+ tokenizer=text_tokenizer,
573
+ split=split,
574
+ valid_dataset_name=valid_dataset_name
575
+ )
576
+ if split == 'val':
577
+ # distributed sampler
578
+ all_DataInfo[valid_dataset_name] = DataInfo(
579
+ dataset=validset,
580
+ dataloader=DataLoader(
581
+ validset,
582
+ sampler=DistributedSampler(validset, shuffle=False),
583
+ batch_size=batch_size,
584
+ shuffle=dataloader_shuffle,
585
+ collate_fn=data_collator,
586
+ num_workers=data_config["num_workers"]
587
+ ))
588
+ else:
589
+ # single GPU
590
+ all_DataInfo[valid_dataset_name] = DataInfo(
591
+ dataset=validset,
592
+ dataloader=DataLoader(
593
+ validset,
594
+ batch_size=batch_size,
595
+ shuffle=dataloader_shuffle,
596
+ collate_fn=data_collator,
597
+ num_workers=data_config["num_workers"]
598
+ ))
599
+
600
+ return all_DataInfo
601
+
602
+
603
+ def main():
604
+ import time
605
+ import argparse
606
+
607
+ parser = argparse.ArgumentParser()
608
+ parser.add_argument('-c', '--config', type=str, default='../configs/config.yaml', help='yaml config path')
609
+ args = parser.parse_args()
610
+
611
+ config = yaml.load(open(args.config), Loader=yaml.FullLoader)
612
+
613
+ data_config = config['data_config']
614
+ clap_config = config['clap_config']
615
+ whisper_config = config["whisper_config"]
616
+ mert_config = config["mert_config"]
617
+
618
+ tokenizer_path = "facebook/opt-1.3b"
619
+ cache_dir = '/lustre/fsw/portfolios/adlr/users/sreyang/.cache'
620
+ text_tokenizer = AutoTokenizer.from_pretrained(
621
+ tokenizer_path,
622
+ local_files_only=False,
623
+ trust_remote_code=True,
624
+ cache_dir=cache_dir,
625
+ )
626
+ text_tokenizer.add_special_tokens(
627
+ {"additional_special_tokens": ["<audio>", "<|endofchunk|>"]}
628
+ )
629
+ if text_tokenizer.pad_token is None:
630
+ text_tokenizer.add_special_tokens({"pad_token": "<|PAD_TOKEN|>"})
631
+ if text_tokenizer.sep_token is None:
632
+ text_tokenizer.add_special_tokens({"sep_token": "<SEP>"})
633
+
634
+ trainset = AudioTextData(
635
+ **data_config,
636
+ clap_config=clap_config, tokenizer=text_tokenizer,
637
+ epoch=66, force_reblend=True
638
+ )
639
+
640
+ data_collator = DataCollator(text_tokenizer, clap_config) # DataCollator also needs clap_config for max_num_window
641
+ dataloader = DataLoader(trainset, batch_size=16, shuffle=True, collate_fn=data_collator, num_workers=4)
642
+
643
+ for step, batch in enumerate(dataloader):
644
+ filenames = batch["filenames"]
645
+ audio_clips = batch["audio_clips"]
646
+ audio_embed_mask = batch["audio_embed_mask"]
647
+ input_ids = batch["input_ids"]
648
+ attention_mask = batch["attention_mask"]
649
+
650
+ print(
651
+ 'batch {}:'.format(step+1),
652
+ audio_clips.shape, audio_embed_mask.shape,
653
+ input_ids.shape, attention_mask.shape
654
+ )
655
+
656
+ print('filenames', filenames)
657
+ print('audio_embed_mask', audio_embed_mask)
658
+ print('input_ids', input_ids)
659
+ for input_id in input_ids:
660
+ print('-' * 50)
661
+ print(text_tokenizer.decode(input_id))
662
+ print('attention_mask', attention_mask)
663
+
664
+ if step == 20:
665
+ break
666
+
667
+
668
+ if __name__ == "__main__":
669
+ main()
data/prepare_each_dataset.py ADDED
The diff for this file is too large to render. See raw diff
 
eval/README.md ADDED
@@ -0,0 +1 @@
1
+ # Audio Flamingo Inference
eval/__init__.py ADDED
File without changes
eval/inference.py ADDED
@@ -0,0 +1,229 @@
1
+ import argparse
2
+ import functools
3
+ import glob
4
+ import os
5
+ import random
6
+ import string
7
+ import json
8
+ import sys
9
+ sys.path.append('../')
10
+ from tqdm import tqdm
11
+ import yaml
12
+ from collections import defaultdict
13
+ import io
14
+ import warnings
15
+ import subprocess
16
+ import pickle
17
+
18
+ import numpy as np
19
+ import torch
20
+
21
+ from data.data import get_audiotext_dataloader
22
+ from src.factory import create_model_and_transforms
23
+ from train.train_utils import Dict2Class, get_autocast, get_cast_dtype
24
+
25
+ def inference_this(
26
+ args, data_config, clap_config, model_config, test_dataset_name, tmp_file,
27
+ temperature=1.0, num_beams=3, ckpt=-1, end_batch_idx=-2, verbose=False,
28
+ ):
29
+ os.environ["TOKENIZERS_PARALLELISM"] = "false" # disable the tokenizer parallelism warning
30
+ model, tokenizer = create_model_and_transforms(
31
+ **model_config,
32
+ clap_config=clap_config,
33
+ use_local_files=args.offline,
34
+ gradient_checkpointing=args.gradient_checkpointing,
35
+ freeze_lm_embeddings=args.freeze_lm_embeddings,
36
+ )
37
+
38
+ device_id = 0
39
+ model = model.to(device_id)
40
+ model.eval()
41
+
42
+ if ckpt == -1:
43
+ checkpoint_list = glob.glob(f"{args.expdir}/{args.run_name}/checkpoint_*.pt")
44
+ resume_from_checkpoint = sorted(checkpoint_list, key=lambda x: int(x.split("_")[-1].split(".")[0]))[-1]
45
+ else:
46
+ resume_from_checkpoint = f"{args.expdir}/{args.run_name}/checkpoint_{ckpt}.pt"
47
+ checkpoint = torch.load(resume_from_checkpoint, map_location="cpu")
48
+ msd = checkpoint["model_state_dict"]
49
+ msd = {k.replace("module.", ""): v for k, v in msd.items()}
50
+ x,y = model.load_state_dict(msd, False)
51
+ print(x)
52
+ print(y)
53
+
54
+ autocast = get_autocast(
55
+ args.precision, cache_enabled=(not args.fsdp)
56
+ )
57
+ cast_dtype = get_cast_dtype(args.precision)
58
+
59
+ # model = model.to(dtype=cast_dtype)
60
+
61
+ if test_dataset_name in data_config["valid_dataset_config"]:
62
+ data_config["valid_dataset_config"] = {test_dataset_name: data_config["valid_dataset_config"][test_dataset_name]}
63
+ else:
64
+ data_config["valid_dataset_config"] = {test_dataset_name: True}
65
+
66
+ all_test_AudioTextDataInfo = get_audiotext_dataloader(data_config, clap_config, tokenizer, args.batch_size, split='test')
67
+
68
+ assert test_dataset_name in list(all_test_AudioTextDataInfo.keys()), "{} not a test set".format(test_dataset_name)
69
+ dataloader = all_test_AudioTextDataInfo[test_dataset_name].dataloader
70
+
71
+ deduplicate_tasks = ["Clotho-v2-AudioCaptioning", "audiocaps-AudioCaptioning", "MACS-AudioCaptioning", "LP-MusicCaps-MSD-AudioCaptioning", "LP-MusicCaps-MC-AudioCaptioning"]
72
+ if any([test_dataset_name.startswith(x) for x in deduplicate_tasks]):
73
+ deduplicate = True
74
+ else:
75
+ deduplicate = False
76
+
77
+ if os.path.exists(tmp_file):
78
+ with open(tmp_file, 'rb') as pickle_file:
79
+ tmp_data = pickle.load(pickle_file)
80
+ results_dic = tmp_data['results_dic']
81
+ results = tmp_data['results']
82
+ finished_batches = tmp_data['finished_batches']
83
+ print('reading tmp data from {}: {} batches already computed'.format(tmp_file, finished_batches+1))
84
+
85
+ else:
86
+ tmp_data = {}
87
+ results_dic = {} # for deduplicate
88
+ results = [] # for non-deduplicate
89
+ finished_batches = -1
90
+ print('no tmp data found; will store tmp data to {}'.format(tmp_file))
91
+
92
+ # print(len(dataloader))
93
+ # print('---------------------')
94
+ from itertools import islice
95
+ for batch_idx, batch in tqdm(enumerate(islice(dataloader, finished_batches + 1, None), start=finished_batches + 1)):  # islice start must be >= 0; resume right after the last finished batch
96
+ # for batch_idx, batch in tqdm(enumerate(dataloader)):
97
+ if end_batch_idx > 0 and batch_idx == end_batch_idx:
98
+ break
99
+
100
+ if batch_idx <= finished_batches:
101
+ continue
102
+
103
+ audio_clips = batch["audio_clips"].to(device_id, dtype=cast_dtype, non_blocking=True)
104
+ audio_embed_mask = batch["audio_embed_mask"].to(device_id, dtype=cast_dtype, non_blocking=True)
105
+ input_ids = batch["input_ids"].to(device_id, non_blocking=True)
106
+ filenames = batch["filenames"]
107
+ # print(input_ids)
108
+
109
+ media_token_id = tokenizer.encode("<audio>")[-1]
110
+ sep_token_id = tokenizer.sep_token_id
111
+
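+ # Each row of input_ids is "<prompt> <SEP> <answer>": split at the last <SEP> so the
+ # model only sees the prompt, and keep the remainder as the ground-truth reference.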
112
+ for idx in range(input_ids.shape[0]):
113
+ filename = filenames[idx]
114
+ if type(filename) is list:
115
+ # interleaved data
116
+ filename = filename[-1]
117
+
118
+ input_id = input_ids[idx]
119
+ for sep_location in range(len(input_id)-1, -1, -1):
120
+ # find last <SEP>
121
+ if input_id[sep_location] == sep_token_id:
122
+ break
123
+ # print(tokenizer.decode(input_id))
124
+ prompt = input_id[:sep_location+1]
125
+
126
+ prompt_decoded = tokenizer.decode(prompt).replace(tokenizer.sep_token, '')
127
+ ground_truth_decoded = tokenizer.decode(input_id).split(tokenizer.sep_token)[-1].replace(tokenizer.eos_token, '').replace(tokenizer.pad_token, '').replace('<|endofchunk|>', '')
128
+
129
+ if deduplicate and (filename, prompt_decoded) in results_dic:
+ # this clip was already generated for under the same prompt; only collect the extra reference caption
+ results_dic[(filename, prompt_decoded)]['ground_truth'].append(ground_truth_decoded)
+ else:
+ output = model.generate(
+ audio_x=audio_clips[idx].unsqueeze(0),
+ audio_x_mask=audio_embed_mask[idx].unsqueeze(0),
+ lang_x=prompt.unsqueeze(0),
+ eos_token_id=tokenizer.eos_token_id,
+ max_new_tokens=256,
+ temperature=temperature,
+ )[0]
+ output_decoded = tokenizer.decode(output).split(tokenizer.sep_token)[-1].replace(tokenizer.eos_token, '').replace(tokenizer.pad_token, '').replace('<|endofchunk|>', '')
+
+ if deduplicate:
+ results_dic[(filename, prompt_decoded)] = {
+ 'ground_truth': [ground_truth_decoded],
+ 'output': output_decoded
+ }
+ else:
+ results.append((filename, prompt_decoded, ground_truth_decoded, output_decoded))
156
+
157
+
158
+ tmp_data['results_dic'] = results_dic
159
+ tmp_data['results'] = results
160
+ tmp_data['finished_batches'] = batch_idx
161
+ with open(tmp_file, 'wb') as pickle_file:
162
+ pickle.dump(tmp_data, pickle_file)
163
+
164
+ if deduplicate:
165
+ for (filename, prompt) in results_dic:
166
+ ground_truth = '|'.join(results_dic[(filename, prompt)]['ground_truth'])
167
+ output = results_dic[(filename, prompt)]['output']
168
+ results.append((filename, prompt, ground_truth, output))
169
+
170
+ # if verbose:
171
+ # for filename, prompt, ground_truth, output in results:
172
+ # print('-'*30)
173
+ # print('filename:', filename)
174
+ # print('prompt:', prompt)
175
+ # print('ground_truth:', ground_truth)
176
+ # print('output:', output)
177
+
178
+ return results
179
+
180
+
181
+ def main():
182
+ parser = argparse.ArgumentParser()
183
+ parser.add_argument('-c', '--config', type=str, default='../config/config.yaml', help='yaml config path')
184
+ parser.add_argument('-t', '--task', type=str, help='which task to inference')
185
+ parser.add_argument('-temp', '--temperature', type=float, default=1.0, help='temperature')
186
+ parser.add_argument('-nb', '--num_beams', type=int, default=1, help='num beams for beam search')
187
+ parser.add_argument('--ckpt', type=int, default=-1, help='checkpoint idx, -1 means latest')
188
+ parsed_args = parser.parse_args()
189
+
190
+ print(parsed_args)
191
+
192
+ test_dataset_name = parsed_args.task
193
+
194
+ output_file = os.path.join(
195
+ '../outputs/',
196
+ parsed_args.task.replace('/', '-'),
197
+ '{}-ckpt{}-{}.log'.format(
198
+ parsed_args.config.split('/')[-1][:-5],
199
+ parsed_args.ckpt,
200
+ "sft"
201
+ )
202
+ )
203
+ tmp_file = output_file.replace('.log', '.tmp.pickle')
204
+ print('output file:', output_file)
205
+
206
+ print('no previous log file; generating samples')
207
+
208
+ config = yaml.load(open(parsed_args.config), Loader=yaml.FullLoader)
209
+ # print(config)
210
+ # print('----------------------')
211
+ data_config = config['data_config']
212
+ model_config = config['model_config']
213
+ print(model_config)
214
+ clap_config = config['clap_config']
216
+ mert_config = config['mert_config']
217
+ args = Dict2Class(config['train_config'])
218
+
219
+ results = inference_this(
220
+ args, data_config, clap_config, model_config, test_dataset_name,
221
+ temperature=float(parsed_args.temperature),
222
+ num_beams=int(parsed_args.num_beams),
223
+ ckpt=parsed_args.ckpt,
224
+ verbose=True,
225
+ tmp_file=tmp_file,
226
+ )
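+
+ # Persist the generations to the log path computed above. This tab-separated layout is
+ # an assumption for convenience, not a format required by the original scoring code;
+ # adjust it to whatever your downstream evaluation expects.
+ with open(output_file, 'w') as f:
+ for filename, prompt, ground_truth, output in results:
+ f.write('\t'.join([str(filename), prompt, ground_truth, output]) + '\n')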
227
+
228
+ if __name__ == "__main__":
229
+ main()
eval/inference.sh ADDED
@@ -0,0 +1,55 @@
1
+ #!/bin/bash
2
+
3
+ TO_SUBMIT_JOBS=$(ls ../configs | grep "inference.yaml")
4
+
5
+ ALL_TASK=$1
6
+ # ALL_TASK=""
7
+ # ALL_TASK="${ALL_TASK} MMAU/test"
8
+ # ALL_TASK="${ALL_TASK} MusicCaps-AudioCaptioning/test"
9
+ # ALL_TASK="${ALL_TASK} MusicCaps-AudioCaptioning/test"
10
+ # ALL_TASK="${ALL_TASK} audiocaps-AudioCaptioning/interleaved_knn-test"
11
+ # ALL_TASK="${ALL_TASK} MusicCaps-AudioCaptioning/interleaved_knn-test"
12
+
13
+ # # # ===== Classification =====
14
+ # ALL_TASK="${ALL_TASK} CochlScene-SceneClassification/test"
15
+ # ALL_TASK="${ALL_TASK} NonSpeech7k-EventClassification/test"
16
+
17
+ # # # ===== zero-shot =====
18
+ # ALL_TASK="${ALL_TASK} CREMA-D-EmotionClassification/train"
19
+ # ALL_TASK="${ALL_TASK} ravdess-EmotionClassification/train"
20
+ # ALL_TASK="${ALL_TASK} UrbanSound8K-EventClassification/train"
21
+ # ALL_TASK="${ALL_TASK} GTZAN-GenreClassification/train"
22
+ # ALL_TASK="${ALL_TASK} Medley-solos-DB-InstrClassification/test"
23
+
24
+ for task in ${ALL_TASK}
25
+ do
26
+ OUTFOLDER=${task//\//-} # replace / into -
27
+ mkdir -p ../outputs/$OUTFOLDER
28
+ done
29
+
30
+ temp=0.0
31
+ numbeams=1
32
+ ckpt=199
33
+
34
+ for EXP in $TO_SUBMIT_JOBS
35
+ do
36
+ L=${#EXP}
37
+ NAME=$(echo ${EXP} | cut -c 1-$(($L-5))) # remove last .yaml
38
+
39
+ for task in ${ALL_TASK}
40
+ do
41
+ echo "task: $task, config: $NAME, ckpt: $ckpt"
42
+
43
+ OUTFOLDER=${task//\//-}
44
+ OUTFILE="../outputs/$OUTFOLDER/$NAME-ckpt${ckpt}.log"
45
+ CKPT_DIR="/lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed-sft/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed-sft-3/$NAME"
46
+ python -u inference.py \
47
+ -c ../configs/$EXP \
48
+ -t $task \
49
+ -temp $temp \
50
+ -nb $numbeams \
51
+ --ckpt ${ckpt}
52
+
53
+ done
54
+ wait
55
+ done
eval/interactive.sh ADDED
@@ -0,0 +1,8 @@
1
+ # IMAGE=gitlab-master.nvidia.com/zkong/audio_flamingo_v1/audiolm:0.2
2
+ IMAGE="/lustre/fsw/portfolios/adlr/users/zkong/docker/audiolm-0.2/image.sqsh"
3
+
4
+ submit_job -i -n interactive \
5
+ --gpu 1 \
6
+ --duration 2 \
7
+ --image $IMAGE \
8
+ --mounts /home/zkong,/lustre/fsw/portfolios/adlr/users/zkong
eval/keep_run.sh ADDED
@@ -0,0 +1,64 @@
1
+ # CHECK_EVERY=900
2
+ # DURATION_DAYS=10
3
+ # CHECK_TOTAL=$((DURATION_DAYS*86400/CHECK_EVERY))
4
+ # NEPOCH_PRE=99
5
+ # NEPOCH_SFT=159
6
+ # NAME="audio-gen-train_audiogen"
7
+
8
+ # for (( i = 1; i <= $CHECK_TOTAL; i++ ))
9
+ # do
10
+ # RUNNING_JOBS=$(sacct -o JobName%-150,JobID,Partition%-15,State | grep -v inference | grep RUNNING | grep polar | sort)
11
+ # PENDING_JOBS=$(sacct -o JobName%-150,JobID,Partition%-15,State | grep -v inference | grep PENDING | grep polar | sort)
12
+
13
+ # for STATE in "RUNNING" "PENDING" "NOT-RUN"
14
+ # do
15
+ # echo "===========${STATE}=========="
16
+
17
+ # if [[ ${STATE} == "RUNNING" && ${RUNNING_JOBS} =~ "${NAME}" ]]; then
18
+ # echo ${NAME}
19
+ # elif [[ ${STATE} == "PENDING" && ${PENDING_JOBS} =~ "${NAME}" ]]; then
20
+ # echo ${NAME}
21
+ # elif [[ ${STATE} == "NOT-RUN" && ! ${RUNNING_JOBS} =~ "${NAME}" && ! ${PENDING_JOBS} =~ "${NAME}" ]]; then
22
+
23
+ # base_path="/lustre/fsw/portfolios/adlr/users/sreyang/ckpts/stable_llm/harmonai_train/"
24
+ # # Find the last subfolder
25
+ # last_subfolder=$(ls -d "$base_path"*/ | sort -V | tail -n 1)
26
+ # # Find the last checkpoint in the subfolder
27
+ # last_ckpt=$(ls "$last_subfolder/checkpoints/"*.ckpt | sort -V | tail -n 1)
28
+ # echo $last_ckpt
29
+ # sh submit_job.sh "True" $last_ckpt
30
+ # sleep 1
31
+ # fi
32
+ # done
33
+ # echo "============================"
34
+ # sleep $CHECK_EVERY
35
+ # done
36
+
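+ # Watchdog: every CHECK_EVERY seconds, check sacct for a running or pending job whose
+ # name contains $NAME; if none is found, re-submit the evaluation via submit.sh.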
37
+ CHECK_EVERY=900
38
+ DURATION_DAYS=10
39
+ CHECK_TOTAL=$((DURATION_DAYS*86400/CHECK_EVERY))
40
+ NEPOCH_PRE=99
41
+ NEPOCH_SFT=159
42
+ NAME="eval"
43
+
44
+ for (( i = 1; i <= $CHECK_TOTAL; i++ ))
45
+ do
46
+ RUNNING_JOBS=$(sacct -o JobName%-150,JobID,Partition%-15,State | grep -v inference | grep RUNNING | grep polar | sort)
47
+ PENDING_JOBS=$(sacct -o JobName%-150,JobID,Partition%-15,State | grep -v inference | grep PENDING | grep polar | sort)
48
+
49
+ for STATE in "RUNNING" "PENDING" "NOT-RUN"
50
+ do
51
+ echo "===========${STATE}=========="
52
+
53
+ if [[ ${STATE} == "RUNNING" && ${RUNNING_JOBS} =~ "${NAME}" ]]; then
54
+ echo ${NAME}
55
+ elif [[ ${STATE} == "PENDING" && ${PENDING_JOBS} =~ "${NAME}" ]]; then
56
+ echo ${NAME}
57
+ elif [[ ${STATE} == "NOT-RUN" && ! ${RUNNING_JOBS} =~ "${NAME}" && ! ${PENDING_JOBS} =~ "${NAME}" ]]; then
58
+ sh submit.sh
59
+ sleep 1
60
+ fi
61
+ done
62
+ echo "============================"
63
+ sleep $CHECK_EVERY
64
+ done
eval/submit.sh ADDED
@@ -0,0 +1,54 @@
1
+ IMAGE="/lustre/fsw/portfolios/adlr/users/zkong/docker/audiolm-0.1/image.sqsh"
2
+ NAME=eval
3
+ PARTITION="polar,polar3,polar4"
4
+ MOUNTS="/home/zkong,/lustre/fsw/portfolios/adlr/users/zkong,/lustre/fsw/portfolios/adlr/users/sreyang,/home/sreyang"
5
+
6
+ LOGDIR=/lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_fixed_rotary_all_layers_logs_infer
7
+
8
+ # "MMAU/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test"
9
+ # Predefined list of strings
10
+
11
+
12
+ STRING_LIST=("Music4All/train")
13
+
14
+
15
+ # "MusicCaps-AudioCaptioning/test_2")
16
+
17
+ # "Clotho-AQA-AQA/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test" "MuschoMusicQA/test" "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test" "Music-AVQA-AQA_All/test" "MU-LLAMA-AQA/test" "AudioEntailmentQA/test" "AudioEntailmentQA/test_audiocaps" "SongDescriber-AudioCaptioning/train")
18
+
19
+ # "Clotho-v2-AudioCaptioning/test")
20
+
21
+ # "NSynth-Source/test" "NSynth-Instrument/test" "CochlScene-SceneClassification/test")
22
+
23
+ #"Clotho-AQA-AQA/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test" "MuschoMusicQA/test" "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test" "Music-AVQA-AQA_All/test" "MU-LLAMA-AQA/test" "AudioEntailmentQA/test" "AudioEntailmentQA/test_audiocaps" "SongDescriber-AudioCaptioning/train"
24
+
25
+ #"Clotho-AQA-AQA/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test" "MuschoMusicQA/test" "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test" "Music-AVQA-AQA_All/test" "MU-LLAMA-AQA/test" "AudioEntailmentQA/test" "AudioEntailmentQA/test_audiocaps" "SongDescriber-AudioCaptioning/train")
26
+
27
+ # "Clotho-AQA-AQA/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test" "MuschoMusicQA/test" "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test"
28
+
29
+ # "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test"
30
+ #("Clotho-AQA-AQA/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test")
31
+
32
+ for i in "${STRING_LIST[@]}"; do
33
+
34
+ OUTFILE=$LOGDIR/output_$i-2.out
35
+
36
+ TASK=""
37
+ TASK="${TASK} $i"
38
+
39
+ SUBMIT_SUBPROJECT_NAME="llmservice_fm_audio" submit_job \
40
+ --mounts $MOUNTS \
41
+ --name audio-flamingo-$NAME \
42
+ --duration 4 \
43
+ --partition $PARTITION \
44
+ --gpu 2 \
45
+ --nodes 1 \
46
+ --image $IMAGE \
47
+ --email_mode never \
48
+ --outfile $OUTFILE \
49
+ --logdir $LOGDIR \
50
+ --prolog_command "pip install nnAudio; pip install tokenizers==0.20.3; pip install transformers==4.46.3" \
51
+ --command "sh inference.sh $TASK"
52
+ sleep 30
53
+ done
54
+
eval/submit_2.sh ADDED
@@ -0,0 +1,49 @@
1
+ IMAGE="/lustre/fsw/portfolios/adlr/users/zkong/docker/audiolm-0.1/image.sqsh"
2
+ NAME=eval
3
+ PARTITION="polar,polar2,polar3,polar4"
4
+ MOUNTS="/home/zkong,/lustre/fsw/portfolios/adlr/users/zkong,/lustre/fsw/portfolios/adlr/users/sreyang,/home/sreyang"
5
+
6
+ LOGDIR=/lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_fixed_rotary_all_layers_logs_infer
7
+
8
+ # "MMAU/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test"
9
+ # Predefined list of strings
10
+
11
+
12
+ STRING_LIST=("Clotho-v2-AudioCaptioning/test" "NSynth-Source/test" "NSynth-Instrument/test" "CochlScene-SceneClassification/test" "Clotho-AQA-AQA/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test" "MuschoMusicQA/test" "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test" "Music-AVQA-AQA_All/test" "MU-LLAMA-AQA/test" "AudioEntailmentQA/test" "AudioEntailmentQA/test_audiocaps" "SongDescriber-AudioCaptioning/train")
13
+
14
+ # "Clotho-v2-AudioCaptioning/test")
15
+
16
+ # "NSynth-Source/test" "NSynth-Instrument/test" "CochlScene-SceneClassification/test")
17
+
18
+ #"Clotho-AQA-AQA/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test" "MuschoMusicQA/test" "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test" "Music-AVQA-AQA_All/test" "MU-LLAMA-AQA/test" "AudioEntailmentQA/test" "AudioEntailmentQA/test_audiocaps" "SongDescriber-AudioCaptioning/train"
19
+
20
+ #"Clotho-AQA-AQA/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test" "MuschoMusicQA/test" "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test" "Music-AVQA-AQA_All/test" "MU-LLAMA-AQA/test" "AudioEntailmentQA/test" "AudioEntailmentQA/test_audiocaps" "SongDescriber-AudioCaptioning/train")
21
+
22
+ # "Clotho-AQA-AQA/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test" "MuschoMusicQA/test" "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test"
23
+
24
+ # "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test"
25
+ #("Clotho-AQA-AQA/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test")
26
+
27
+ for i in "${STRING_LIST[@]}"; do
28
+
29
+ OUTFILE=$LOGDIR/output_$i-4096_7.out
30
+
31
+ TASK=""
32
+ TASK="${TASK} $i"
33
+
34
+ SUBMIT_SUBPROJECT_NAME="llmservice_fm_audio" submit_job \
35
+ --mounts $MOUNTS \
36
+ --name audio-flamingo-$NAME \
37
+ --duration 4 \
38
+ --partition $PARTITION \
39
+ --gpu 2 \
40
+ --nodes 1 \
41
+ --image $IMAGE \
42
+ --email_mode never \
43
+ --outfile $OUTFILE \
44
+ --logdir $LOGDIR \
45
+ --prolog_command "pip install nnAudio; pip install tokenizers==0.20.3; pip install transformers==4.46.3" \
46
+ --command "sh inference.sh $TASK"
47
+ sleep 30
48
+ done
49
+
my_laion_clap/CLAP/LICENSE ADDED
@@ -0,0 +1,121 @@
1
+ Creative Commons Legal Code
2
+
3
+ CC0 1.0 Universal
4
+
5
+ CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
6
+ LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
7
+ ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
8
+ INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
9
+ REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
10
+ PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
11
+ THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
12
+ HEREUNDER.
13
+
14
+ Statement of Purpose
15
+
16
+ The laws of most jurisdictions throughout the world automatically confer
17
+ exclusive Copyright and Related Rights (defined below) upon the creator
18
+ and subsequent owner(s) (each and all, an "owner") of an original work of
19
+ authorship and/or a database (each, a "Work").
20
+
21
+ Certain owners wish to permanently relinquish those rights to a Work for
22
+ the purpose of contributing to a commons of creative, cultural and
23
+ scientific works ("Commons") that the public can reliably and without fear
24
+ of later claims of infringement build upon, modify, incorporate in other
25
+ works, reuse and redistribute as freely as possible in any form whatsoever
26
+ and for any purposes, including without limitation commercial purposes.
27
+ These owners may contribute to the Commons to promote the ideal of a free
28
+ culture and the further production of creative, cultural and scientific
29
+ works, or to gain reputation or greater distribution for their Work in
30
+ part through the use and efforts of others.
31
+
32
+ For these and/or other purposes and motivations, and without any
33
+ expectation of additional consideration or compensation, the person
34
+ associating CC0 with a Work (the "Affirmer"), to the extent that he or she
35
+ is an owner of Copyright and Related Rights in the Work, voluntarily
36
+ elects to apply CC0 to the Work and publicly distribute the Work under its
37
+ terms, with knowledge of his or her Copyright and Related Rights in the
38
+ Work and the meaning and intended legal effect of CC0 on those rights.
39
+
40
+ 1. Copyright and Related Rights. A Work made available under CC0 may be
41
+ protected by copyright and related or neighboring rights ("Copyright and
42
+ Related Rights"). Copyright and Related Rights include, but are not
43
+ limited to, the following:
44
+
45
+ i. the right to reproduce, adapt, distribute, perform, display,
46
+ communicate, and translate a Work;
47
+ ii. moral rights retained by the original author(s) and/or performer(s);
48
+ iii. publicity and privacy rights pertaining to a person's image or
49
+ likeness depicted in a Work;
50
+ iv. rights protecting against unfair competition in regards to a Work,
51
+ subject to the limitations in paragraph 4(a), below;
52
+ v. rights protecting the extraction, dissemination, use and reuse of data
53
+ in a Work;
54
+ vi. database rights (such as those arising under Directive 96/9/EC of the
55
+ European Parliament and of the Council of 11 March 1996 on the legal
56
+ protection of databases, and under any national implementation
57
+ thereof, including any amended or successor version of such
58
+ directive); and
59
+ vii. other similar, equivalent or corresponding rights throughout the
60
+ world based on applicable law or treaty, and any national
61
+ implementations thereof.
62
+
63
+ 2. Waiver. To the greatest extent permitted by, but not in contravention
64
+ of, applicable law, Affirmer hereby overtly, fully, permanently,
65
+ irrevocably and unconditionally waives, abandons, and surrenders all of
66
+ Affirmer's Copyright and Related Rights and associated claims and causes
67
+ of action, whether now known or unknown (including existing as well as
68
+ future claims and causes of action), in the Work (i) in all territories
69
+ worldwide, (ii) for the maximum duration provided by applicable law or
70
+ treaty (including future time extensions), (iii) in any current or future
71
+ medium and for any number of copies, and (iv) for any purpose whatsoever,
72
+ including without limitation commercial, advertising or promotional
73
+ purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
74
+ member of the public at large and to the detriment of Affirmer's heirs and
75
+ successors, fully intending that such Waiver shall not be subject to
76
+ revocation, rescission, cancellation, termination, or any other legal or
77
+ equitable action to disrupt the quiet enjoyment of the Work by the public
78
+ as contemplated by Affirmer's express Statement of Purpose.
79
+
80
+ 3. Public License Fallback. Should any part of the Waiver for any reason
81
+ be judged legally invalid or ineffective under applicable law, then the
82
+ Waiver shall be preserved to the maximum extent permitted taking into
83
+ account Affirmer's express Statement of Purpose. In addition, to the
84
+ extent the Waiver is so judged Affirmer hereby grants to each affected
85
+ person a royalty-free, non transferable, non sublicensable, non exclusive,
86
+ irrevocable and unconditional license to exercise Affirmer's Copyright and
87
+ Related Rights in the Work (i) in all territories worldwide, (ii) for the
88
+ maximum duration provided by applicable law or treaty (including future
89
+ time extensions), (iii) in any current or future medium and for any number
90
+ of copies, and (iv) for any purpose whatsoever, including without
91
+ limitation commercial, advertising or promotional purposes (the
92
+ "License"). The License shall be deemed effective as of the date CC0 was
93
+ applied by Affirmer to the Work. Should any part of the License for any
94
+ reason be judged legally invalid or ineffective under applicable law, such
95
+ partial invalidity or ineffectiveness shall not invalidate the remainder
96
+ of the License, and in such case Affirmer hereby affirms that he or she
97
+ will not (i) exercise any of his or her remaining Copyright and Related
98
+ Rights in the Work or (ii) assert any associated claims and causes of
99
+ action with respect to the Work, in either case contrary to Affirmer's
100
+ express Statement of Purpose.
101
+
102
+ 4. Limitations and Disclaimers.
103
+
104
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
105
+ surrendered, licensed or otherwise affected by this document.
106
+ b. Affirmer offers the Work as-is and makes no representations or
107
+ warranties of any kind concerning the Work, express, implied,
108
+ statutory or otherwise, including without limitation warranties of
109
+ title, merchantability, fitness for a particular purpose, non
110
+ infringement, or the absence of latent or other defects, accuracy, or
111
+ the present or absence of errors, whether or not discoverable, all to
112
+ the greatest extent permissible under applicable law.
113
+ c. Affirmer disclaims responsibility for clearing rights of other persons
114
+ that may apply to the Work or any use thereof, including without
115
+ limitation any person's Copyright and Related Rights in the Work.
116
+ Further, Affirmer disclaims responsibility for obtaining any necessary
117
+ consents, permissions or other rights required for any use of the
118
+ Work.
119
+ d. Affirmer understands and acknowledges that Creative Commons is not a
120
+ party to this document and has no duty or obligation with respect to
121
+ this CC0 or use of the Work.
my_laion_clap/CLAP/MANIFEST.in ADDED
@@ -0,0 +1,3 @@
1
+ recursive-include src/laion_clap/clap_module/model_configs *.json
2
+ recursive-include src/laion_clap/clap_module bpe_simple_vocab_16e6.txt.gz
3
+ recursive-include src/laion_clap/training audioset_textmap.npy
my_laion_clap/CLAP/README.md ADDED
@@ -0,0 +1,287 @@
1
+ # CLAP
2
+ <p align="center">
3
+ <img src="https://raw.githubusercontent.com/LAION-AI/CLAP/main/assets/logo.PNG" alt="The Contrastive Language-Audio Pretraining Model Architecture" width="60%"/>
4
+ </p>
5
+ <p align="center">
6
+ <a href="https://arxiv.org/abs/2211.06687"><img src="https://img.shields.io/badge/arXiv-2211.06687-brightgreen.svg?style=flat-square"/></a>
7
+ <a href="https://pypi.org/project/laion-clap"><img src="https://badge.fury.io/py/laion-clap.svg"/></a>
8
+ <a href="https://huggingface.co/docs/transformers/v4.27.2/en/model_doc/clap"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Transformers-blue"/></a>
9
+ </p>
10
+
11
+ ### This repository provides representations of audio and text via Contrastive Language-Audio Pretraining (CLAP)
12
+
13
+ With CLAP, you can extract a latent representation of any given audio and text for your own model, or for different downstream tasks.
14
+
15
+ All code comes officially with the following paper, accepted by the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) 2023:
16
+ - [Large-Scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687)
17
+
18
+ **New Updates:**
19
+
20
+ <b>1. We release new CLAP checkpoints pretrained on music and speech data collections from [our dataset collection repo](https://github.com/LAION-AI/audio-dataset).</b>
21
+
22
+ <b>2. The CLAP model is incorporated into and supported by [HuggingFace Transformers](https://huggingface.co/docs/transformers/v4.27.2/en/model_doc/clap). Many thanks to [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://fr.linkedin.com/in/arthur-zucker-8a0445144) for contributing to the HuggingFace support.</b>
23
+
24
+ ## About this project
25
+
26
+ This is a [LAION](https://laion.ai/) project that aims at learning better audio understanding and collecting more audio data.
27
+ It is an open-source project. We adopt the codebase of [open_clip](https://github.com/mlfoundations/open_clip).
28
+
29
+ Many thanks to <a href="https://github.com/cfoster0/CLAP">@cfoster0</a> for allowing us to use his repo name.
30
+
31
+ ## Architecture
32
+ Contrastive Language-Audio Pretraining (CLAP) follows the CLIP (Contrastive Language-Image Pretraining) architecture, as shown below.
33
+ <p align="center">
34
+ <img src="https://raw.githubusercontent.com/LAION-AI/CLAP/main/assets/audioclip-arch.png" alt="The Contrastive Language-Audio Pretraining Model Architecture" width="60%"/>
35
+ </p>
36
+
37
+ ## Quick Start
38
+ We provide the PyPI library for our CLAP model:
39
+ ```bash
40
+ pip install laion-clap
41
+ ```
42
+
43
+ Then you can follow the usage below or refer to [unit_test.py](https://github.com/LAION-AI/CLAP/blob/laion_clap_pip/src/laion_clap/unit_test.py).
44
+
45
+ For the documentation of the API, please refer to [hook.py](https://github.com/LAION-AI/CLAP/blob/main/src/laion_clap/hook.py).
46
+
47
+ ```python
48
+ import numpy as np
49
+ import librosa
50
+ import torch
51
+ import laion_clap
52
+
53
+ # quantization
54
+ def int16_to_float32(x):
55
+ return (x / 32767.0).astype(np.float32)
56
+
57
+
58
+ def float32_to_int16(x):
59
+ x = np.clip(x, a_min=-1., a_max=1.)
60
+ return (x * 32767.).astype(np.int16)
61
+
62
+ model = laion_clap.CLAP_Module(enable_fusion=False)
63
+ model.load_ckpt() # download the default pretrained checkpoint.
64
+
65
+ # Directly get audio embeddings from audio files
66
+ audio_file = [
67
+ '/home/data/test_clap_short.wav',
68
+ '/home/data/test_clap_long.wav'
69
+ ]
70
+ audio_embed = model.get_audio_embedding_from_filelist(x = audio_file, use_tensor=False)
71
+ print(audio_embed[:,-20:])
72
+ print(audio_embed.shape)
73
+
74
+ # Get audio embeddings from audio data
75
+ audio_data, _ = librosa.load('/home/data/test_clap_short.wav', sr=48000) # sample rate should be 48000
76
+ audio_data = audio_data.reshape(1, -1) # Make it (1,T) or (N,T)
77
+ audio_embed = model.get_audio_embedding_from_data(x = audio_data, use_tensor=False)
78
+ print(audio_embed[:,-20:])
79
+ print(audio_embed.shape)
80
+
81
+ # Directly get audio embeddings from audio files, but return torch tensor
82
+ audio_file = [
83
+ '/home/data/test_clap_short.wav',
84
+ '/home/data/test_clap_long.wav'
85
+ ]
86
+ audio_embed = model.get_audio_embedding_from_filelist(x = audio_file, use_tensor=True)
87
+ print(audio_embed[:,-20:])
88
+ print(audio_embed.shape)
89
+
90
+ # Get audio embeddings from audio data
91
+ audio_data, _ = librosa.load('/home/data/test_clap_short.wav', sr=48000) # sample rate should be 48000
92
+ audio_data = audio_data.reshape(1, -1) # Make it (1,T) or (N,T)
93
+ audio_data = torch.from_numpy(int16_to_float32(float32_to_int16(audio_data))).float() # quantize before send it in to the model
94
+ audio_embed = model.get_audio_embedding_from_data(x = audio_data, use_tensor=True)
95
+ print(audio_embed[:,-20:])
96
+ print(audio_embed.shape)
97
+
98
+ # Get text embeddings from texts:
99
+ text_data = ["I love the contrastive learning", "I love the pretrain model"]
100
+ text_embed = model.get_text_embedding(text_data)
101
+ print(text_embed)
102
+ print(text_embed.shape)
103
+
104
+ # Get text embeddings from texts, but return torch tensor:
105
+ text_data = ["I love the contrastive learning", "I love the pretrain model"]
106
+ text_embed = model.get_text_embedding(text_data, use_tensor=True)
107
+ print(text_embed)
108
+ print(text_embed.shape)
109
+
110
+ ```
111
+
112
+ ## Pretrained Models
113
+ The pretrained checkpoints can be found [here](https://huggingface.co/lukewys/laion_clap/tree/main).
114
+ Please refer to the previous section for how to load and run the checkpoints.
115
+ For the PyPI library, [630k-audioset-best.pt](https://huggingface.co/lukewys/laion_clap/blob/main/630k-audioset-best.pt) and [630k-audioset-fusion-best.pt](https://huggingface.co/lukewys/laion_clap/blob/main/630k-audioset-fusion-best.pt) are our default models (non-fusion and fusion, respectively).
116
+
117
+ We further provide the following pretrained models for different use cases:
118
+
119
+ * For general audio less than 10-sec: [630k-audioset-best.pt](https://huggingface.co/lukewys/laion_clap/blob/main/630k-audioset-best.pt) or [630k-best.pt](https://huggingface.co/lukewys/laion_clap/blob/main/630k-best.pt)
120
+ * For general audio with variable-length: [630k-audioset-fusion-best.pt](https://huggingface.co/lukewys/laion_clap/blob/main/630k-audioset-fusion-best.pt) or [630k-fusion-best.pt](https://huggingface.co/lukewys/laion_clap/blob/main/630k-fusion-best.pt)
121
+ * For music: [music_audioset_epoch_15_esc_90.14.pt](https://huggingface.co/lukewys/laion_clap/blob/main/music_audioset_epoch_15_esc_90.14.pt)
122
+ * For music and speech: [music_speech_epoch_15_esc_89.25.pt](https://huggingface.co/lukewys/laion_clap/blob/main/music_speech_epoch_15_esc_89.25.pt)
123
+ * For speech, music and general audio: [music_speech_audioset_epoch_15_esc_89.98.pt](https://huggingface.co/lukewys/laion_clap/blob/main/music_speech_audioset_epoch_15_esc_89.98.pt)
124
+
125
+ The checkpoint listed here for each model setting is the one with the highest average mAP score during training.
126
+ The average mAP score is calculated by averaging 4 scores: A-->T mAP@10 on AudioCaps, T-->A mAP@10 on AudioCaps, A-->T mAP@10 on Clotho, and T-->A mAP@10 on Clotho.
127
+
128
+ To use the above pretrained models, you need to load the checkpoint yourself, as shown below.
129
+
130
+ Update 2023.4.7: we have released 3 larger CLAP models trained on music and speech datasets in addition to LAION-Audio-630k. Here are descriptions of the models and their performance:
131
+
132
+ - `music_speech_audioset_epoch_15_esc_89.98.pt`: trained on music + speech + Audioset + LAION-Audio-630k. The zeroshot ESC50 performance is 89.98%, the GTZAN performance is 51%.
133
+ - `music_audioset_epoch_15_esc_90.14.pt`: trained on music + Audioset + LAION-Audio-630k. The zeroshot ESC50 performance is 90.14%, the GTZAN performance is 71%.
134
+ - `music_speech_epoch_15_esc_89.25.pt`: trained on music + speech + LAION-Audio-630k. The zeroshot ESC50 performance is 89.25%, the GTZAN performance is 69%.
135
+
136
+ These models use a larger audio encoder. To load them using the pip API:
137
+ ```python
138
+ import laion_clap
139
+ model = laion_clap.CLAP_Module(enable_fusion=False, amodel= 'HTSAT-base')
140
+ model.load_ckpt('checkpoint_path/checkpoint_name.pt')
141
+ ```
142
+
143
+ Please note that this is a temporary release for people who are working on larger-scale downstream tasks.
144
+ We will release a more comprehensive version of the model with detailed experiments in the future.
145
+ Please use these models at your own risk.
146
+
147
+ * None of the new checkpoints were trained with fusion. The training dataset size for `music_speech_audioset_epoch_15_esc_89.98.pt` is around 4M samples. The zeroshot GTZAN score is evaluated using the prompt `This audio is a <genre> song.`
148
+
149
+ <!-- We provide the CLAP's performance on audio classification tasks under the zero-shot setting or the supervised setting. More results can be found at our paper.
150
+ <p align="center">
151
+ <img src="https://raw.githubusercontent.com/LAION-AI/CLAP/main/assets/clap-zeroshot.PNG" alt="Zero-shot Performance" width="100%"/>
152
+ </p> -->
153
+
154
+
155
+
156
+
157
+ ## Environment Installation
158
+ If you want to inspect and reuse our model in your project instead of directly using the pip library, you need to install the same environment as we use; please run the following commands:
159
+ ```bash
160
+ conda create -n clap python=3.10
161
+ conda activate clap
162
+ git clone https://github.com/LAION-AI/CLAP.git
163
+ cd CLAP
164
+ # you can also install pytorch by following the official instruction (https://pytorch.org/get-started/locally/)
165
+ pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0+cu113 -f https://download.pytorch.org/whl/torch_stable.html
166
+ pip install -r requirements.txt
167
+ ```
168
+ ## Dataset format
169
+ We use training data in webdataset format. For details of our dataset please see https://github.com/LAION-AI/audio-dataset.
170
+
171
+ Due to copyright reasons, we cannot release the dataset we trained this model on. However, we released [LAION-audio-630K](https://github.com/LAION-AI/audio-dataset/tree/main/laion-audio-630k), the data source we used to compose the dataset, with links to each audio clip and its caption. Please refer to [LAION-audio-630K](https://github.com/LAION-AI/audio-dataset/tree/main/laion-audio-630k) for more details. You can download the dataset, preprocess it on your own, and train locally. To train on a local dataset, please replace `--remotedata` in the training scripts (see the [experiment_scripts](./experiment_scripts) folder) with `--datasetpath <your dir to datasets>`.
172
+
173
+ You can find an example of our dataset format in [here](https://drive.google.com/drive/folders/1scyH43eQAcrBz-5fAw44C6RNBhC3ejvX?usp=sharing).
174
+ It contains the full ESC50 dataset, split according to the first 5-fold split.
175
+
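+ If you just want to peek at what a shard contains before wiring up training, you can stream it with the `webdataset` package. This is only an illustrative sketch: the shard path and the `flac`/`json` keys follow the ESC50 example linked above, and the caption field name (`text` here, `tag` in the ESC50 metadata) varies by dataset, so inspect your own shards.
+
+ ```python
+ import io
+ import json
+
+ import soundfile as sf
+ import webdataset as wds
+
+ # Each sample is a dict keyed by file extension, with raw bytes as values.
+ dataset = wds.WebDataset("ESC50_1/test/0.tar")
+ for sample in dataset:
+     meta = json.loads(sample["json"])
+     audio, sr = sf.read(io.BytesIO(sample["flac"]))
+     print(sample["__key__"], sr, audio.shape, meta.get("text", meta.get("tag")))
+     break
+ ```
+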
176
+ ## Training, Fine-tuning and Evaluation
177
+ Please find the scripts for training, fine-tuning and evaluation (zero-shot and retrieval) in the [experiment_scripts](./experiment_scripts) folder.
178
+ The scripts included there are the ones we used to train our model on a SLURM cluster.
179
+ You need to change the script to fit your own environment.
180
+ For example, in a single machine multi-GPU setting, you might want to use `torchrun` instead of `srun` to run the script.
181
+ To train on a single GPU machine, use `CUDA_VISIBLE_DEVICES=0 python -m ...` instead of `srun`.
182
+ We use [Weights and Biases](https://wandb.ai/site) for experiment logging. You need to configure Weights and Biases in your environment.
183
+ To train on a local dataset, please replace `--remotedata` in the training scripts (see the [experiment_scripts](./experiment_scripts) folder) with `--datasetpath <your dir to datasets>`.
184
+
185
+ ## Core Code
186
+ Please refer to [main.py](https://github.com/LAION-AI/CLAP/blob/laion_clap_pip/src/laion_clap/training/main.py), [train.py](https://github.com/LAION-AI/CLAP/blob/laion_clap_pip/src/laion_clap/training/train.py), [data.py](https://github.com/LAION-AI/CLAP/blob/laion_clap_pip/src/laion_clap/training/data.py), and [model.py](https://github.com/LAION-AI/CLAP/blob/laion_clap_pip/src/laion_clap/clap_module/model.py) to quickly get familiar with our model.
187
+
188
+
189
+ ## Reproducibility
190
+ An example of the preprocessed Clotho dataset in webdataset format can be downloaded [here](https://drive.google.com/drive/folders/1mU9mBOe11jTFCrQRJQsUa4S-3TlNuYoI?usp=sharing) (by downloading, you agree to the license described in the [Clotho dataset](https://zenodo.org/record/3490684#.Y9ALPeyZP1w)). The audio encoder pretrained on 48kHz AudioSet can be found [here](https://drive.google.com/drive/folders/1SMQyzJvc6DwJNuhQ_WI8tlCFL5HG2vk6?usp=sharing), where `HTSAT-fullset-imagenet-map=0.467.ckpt` is the checkpoint used to initialize our HTSAT audio encoder. You should get similar results by loading from the audio encoder checkpoint and training on the same dataset.
191
+
192
+ The script to train the model on the Clotho dataset is included [here](experiment_scripts/train-only-clotho.sh). You need to change `datasetpath` and `pretrained-audio` to point to your own directories. You can check the [report](https://stability.wandb.io/clap/clap/reports/CLAP-trained-on-Clotho-dataset--VmlldzoyNzY?accessToken=c0erq9hhp7h880jclihd9j9if679s6bylwto33vo14yo5jg40ppe38qeoafoonpz) of the training script on a single A100 GPU for reference.
193
+
194
+ Because most of the datasets have copyright restrictions, unfortunately we cannot directly share the other preprocessed datasets. The captions generated by the keyword-to-caption model for AudioSet can be found [here](https://github.com/LAION-AI/audio-dataset/tree/main/laion-audio-630k#keyword-to-caption-augmentation).
195
+
196
+
197
+ ## Zeroshot Classification with ESC50 official split
198
+
199
+ Here is example code to run zeroshot classification on the **first** ESC50 official split with the pip API:
200
+
201
+ ```python
202
+ import laion_clap
203
+ import glob
204
+ import json
205
+ import torch
206
+ import numpy as np
207
+
208
+ device = torch.device('cuda:0')
209
+
210
+ # download https://drive.google.com/drive/folders/1scyH43eQAcrBz-5fAw44C6RNBhC3ejvX?usp=sharing and extract ./ESC50_1/test/0.tar to ./ESC50_1/test/
211
+ esc50_test_dir = './ESC50_1/test/*/'
212
+ class_index_dict_path = './class_labels/ESC50_class_labels_indices_space.json'
213
+
214
+ # Load the model
215
+ model = laion_clap.CLAP_Module(enable_fusion=False, device=device)
216
+ model.load_ckpt()
217
+
218
+ # Get the class index dict
219
+ class_index_dict = {v: k for v, k in json.load(open(class_index_dict_path)).items()}
220
+
221
+ # Get all the data
222
+ audio_files = sorted(glob.glob(esc50_test_dir + '**/*.flac', recursive=True))
223
+ json_files = sorted(glob.glob(esc50_test_dir + '**/*.json', recursive=True))
224
+ ground_truth_idx = [class_index_dict[json.load(open(jf))['tag'][0]] for jf in json_files]
225
+
226
+ with torch.no_grad():
227
+ ground_truth = torch.tensor(ground_truth_idx).view(-1, 1)
228
+
229
+ # Get text features
230
+ all_texts = ["This is a sound of " + t for t in class_index_dict.keys()]
231
+ text_embed = model.get_text_embedding(all_texts)
232
+ audio_embed = model.get_audio_embedding_from_filelist(x=audio_files)
233
+
234
+ ranking = torch.argsort(torch.tensor(audio_embed) @ torch.tensor(text_embed).t(), descending=True)
235
+ preds = torch.where(ranking == ground_truth)[1]
236
+ preds = preds.cpu().numpy()
237
+
238
+ metrics = {}
239
+ metrics[f"mean_rank"] = preds.mean() + 1
240
+ metrics[f"median_rank"] = np.floor(np.median(preds)) + 1
241
+ for k in [1, 5, 10]:
242
+ metrics[f"R@{k}"] = np.mean(preds < k)
243
+ # map@10
244
+ metrics[f"mAP@10"] = np.mean(np.where(preds < 10, 1 / (preds + 1), 0.0))
245
+
246
+ print(
247
+ f"Zeroshot Classification Results: "
248
+ + "\t".join([f"{k}: {round(v, 4):.4f}" for k, v in metrics.items()])
249
+ )
250
+ ```
251
+
252
+ For the ESC50 dataset, you can either download our processed ESC50 in webdataset format
253
+ from [here](https://drive.google.com/drive/folders/1scyH43eQAcrBz-5fAw44C6RNBhC3ejvX?usp=sharing), and extract the
254
+ `./test/0.tar` to `./test/`. Or you could download the original ESC50 dataset and
255
+ preprocess the labels into the format of `class_labels/ESC50_class_labels_indices_space.json` yourself (replace `_` with space); a minimal sketch of that conversion follows.
256
+
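+ The sketch below assumes you start from the original ESC-50 `meta/esc50.csv` (its `category` column uses underscores and `target` holds the class index) and that the JSON is a plain `{"class name with spaces": index}` mapping, which is what the zeroshot snippet above expects; treat the paths as placeholders for your own layout.
+
+ ```python
+ import csv
+ import json
+
+ labels = {}
+ with open("ESC-50-master/meta/esc50.csv") as f:
+     for row in csv.DictReader(f):
+         # "chirping_birds" -> "chirping birds"
+         labels[row["category"].replace("_", " ")] = int(row["target"])
+
+ with open("class_labels/ESC50_class_labels_indices_space.json", "w") as f:
+     json.dump(labels, f, indent=2)
+ ```
+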
257
+ The result should be the same as the following:
258
+
259
+ For `model = laion_clap.CLAP_Module(enable_fusion=True, device=device)`: `mean_rank: 1.2425 median_rank: 1.0000 R@1: 0.9050 R@5: 0.9900 R@10: 0.9925 mAP@10: 0.9407`
260
+
261
+ For `model = laion_clap.CLAP_Module(enable_fusion=False, device=device)`: `mean_rank: 1.1450 median_rank: 1.0000 R@1: 0.9275 R@5: 0.9975 R@10: 1.0000 mAP@10: 0.9556`
262
+
263
+ Note that these results are slightly higher than those reported in the paper, because we use the train + test data of ESC50 and remove the data overlap from the other training datasets (mainly Freesound).
264
+
265
+ ## Citation
266
+ If you find this project and the LAION-Audio-630K dataset useful, please cite our paper:
267
+ ```
268
+ @inproceedings{laionclap2023,
269
+ title = {Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation},
270
+ author = {Wu*, Yusong and Chen*, Ke and Zhang*, Tianyu and Hui*, Yuchen and Berg-Kirkpatrick, Taylor and Dubnov, Shlomo},
271
+ booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP},
272
+ year = {2023}
273
+ }
274
+ @inproceedings{htsatke2022,
275
+ author = {Ke Chen and Xingjian Du and Bilei Zhu and Zejun Ma and Taylor Berg-Kirkpatrick and Shlomo Dubnov},
276
+ title = {HTS-AT: A Hierarchical Token-Semantic Audio Transformer for Sound Classification and Detection},
277
+ booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP},
278
+ year = {2022}
279
+ }
280
+ ```
281
+
282
+ ## Acknowledgements
283
+
284
+ This project is a work in progress, so the codebase and model might not be perfect or bug-free.
285
+ We greatly appreciate any kind of contribution or issue raised.
286
+ If you find a bug or have any suggestions, please feel free to open an issue or contact us.
287
+ If you would like to actively contribute to this project, please join the LAION Discord.
my_laion_clap/CLAP/assets/audioclip-arch.png ADDED
my_laion_clap/CLAP/assets/clap-zeroshot.PNG ADDED
my_laion_clap/CLAP/assets/logo.PNG ADDED
my_laion_clap/CLAP/experiment_scripts/esc50_api.py ADDED
@@ -0,0 +1,48 @@
1
+ import laion_clap
2
+ import glob
3
+ import json
4
+ import torch
5
+ import numpy as np
6
+
7
+ device = torch.device('cuda:0')
8
+
9
+ # download https://drive.google.com/drive/folders/1scyH43eQAcrBz-5fAw44C6RNBhC3ejvX?usp=sharing and extract ./ESC50_1/test/0.tar to ./ESC50_1/test/
10
+ esc50_test_dir = './ESC50_1/test/*/'
11
+ class_index_dict_path = '/fsx/yusong/CLAP/class_labels/ESC50_class_labels_indices_space.json'
12
+
13
+ # Load the model
14
+ model = laion_clap.CLAP_Module(enable_fusion=False, device=device)
15
+ model.load_ckpt()
16
+
17
+ # Get the class index dict
18
+ class_index_dict = {v: k for v, k in json.load(open(class_index_dict_path)).items()}
19
+
20
+ # Get all the data
21
+ audio_files = sorted(glob.glob(esc50_test_dir + '**/*.flac', recursive=True))
22
+ json_files = sorted(glob.glob(esc50_test_dir + '**/*.json', recursive=True))
23
+ ground_truth_idx = [class_index_dict[json.load(open(jf))['tag'][0]] for jf in json_files]
24
+
25
+ with torch.no_grad():
26
+ ground_truth = torch.tensor(ground_truth_idx).view(-1, 1)
27
+
28
+ # Get text features
29
+ all_texts = ["This is a sound of " + t for t in class_index_dict.keys()]
30
+ text_embed = model.get_text_embedding(all_texts)
31
+ audio_embed = model.get_audio_embedding_from_filelist(x=audio_files)
32
+
33
+ ranking = torch.argsort(torch.tensor(audio_embed) @ torch.tensor(text_embed).t(), descending=True)
34
+ preds = torch.where(ranking == ground_truth)[1]
35
+ preds = preds.cpu().numpy()
36
+
37
+ metrics = {}
38
+ metrics[f"mean_rank"] = preds.mean() + 1
39
+ metrics[f"median_rank"] = np.floor(np.median(preds)) + 1
40
+ for k in [1, 5, 10]:
41
+ metrics[f"R@{k}"] = np.mean(preds < k)
42
+ # map@10
43
+ metrics[f"mAP@10"] = np.mean(np.where(preds < 10, 1 / (preds + 1), 0.0))
44
+
45
+ print(
46
+ f"Zeroshot Classification Results: "
47
+ + "\t".join([f"{k}: {round(v, 4):.4f}" for k, v in metrics.items()])
48
+ )
my_laion_clap/CLAP/experiment_scripts/eval_retrieval_freesound.sh ADDED
@@ -0,0 +1,63 @@
1
+ #!/bin/bash
2
+ #SBATCH --comment clap
3
+ #SBATCH --partition=g40423
4
+ #SBATCH --job-name=mclap
5
+ #SBATCH --nodes 3
6
+ #SBATCH --ntasks-per-node 8
7
+ #SBATCH --cpus-per-gpu=6
8
+ #SBATCH --exclusive
9
+ #SBATCH --output=%x_%j.out
10
+
11
+ module load openmpi
12
+ module load cuda/11.7
13
+ export NCCL_PROTO=simple
14
+ export FI_EFA_FORK_SAFE=1
15
+ export FI_LOG_LEVEL=1
16
+ export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
17
+ export NCCL_DEBUG=info
18
+ export OMPI_MCA_mtl_base_verbose=1
19
+ export FI_EFA_ENABLE_SHM_TRANSFER=0
20
+ export FI_PROVIDER=efa
21
+ export FI_EFA_TX_MIN_CREDITS=64
22
+ export NCCL_TREE_THRESHOLD=0
23
+
24
+ # sent to sub script
25
+ export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
26
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
27
+ export MASTER_PORT=12802
28
+ export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
29
+
30
+ echo go $COUNT_NODE
31
+ echo $HOSTNAMES
32
+
33
+ source /fsx/yusong/clap/bin/activate
34
+ cd /fsx/yusong/CLAP/src
35
+ export TRANSFORMERS_CACHE=/fsx/yusong/transformers_cache
36
+
37
+ srun --comment clap --cpu_bind=v --accel-bind=gn python -m evaluate.eval_retrieval_main \
38
+ --save-frequency 5 \
39
+ --save-top-performance 3 \
40
+ --save-most-recent \
41
+ --dataset-type="webdataset" \
42
+ --precision="fp32" \
43
+ --warmup 0 \
44
+ --batch-size=512 \
45
+ --wd=0.0 \
46
+ --epochs=50 \
47
+ --workers=6 \
48
+ --use-bn-sync \
49
+ --freeze-text \
50
+ --amodel HTSAT-tiny \
51
+ --tmodel roberta \
52
+ --report-to "wandb" \
53
+ --wandb-notes "10.17-freesound-dataset-4#" \
54
+ --datasetnames "freesound_no_overlap_noesc50" \
55
+ --datasetinfos "train" \
56
+ --seed 3407 \
57
+ --remotedata \
58
+ --logs /fsx/clap_logs \
59
+ --gather-with-grad \
60
+ --openai-model-cache-dir /fsx/yusong/transformers_cache \
61
+ --data-filling "repeatpad" \
62
+ --data-truncating "rand_trunc" \
63
+ --pretrained="/fsx/clap_logs/2022_10_17-02_08_21-model_HTSAT-tiny-lr_0.0001-b_96-j_6-p_fp32/checkpoints"
my_laion_clap/CLAP/experiment_scripts/finetune-esc50.sh ADDED
@@ -0,0 +1,70 @@
1
+ #!/bin/bash
2
+ #SBATCH --comment clap
3
+ #SBATCH --partition=g40423
4
+ #SBATCH --job-name=mclap
5
+ #SBATCH --nodes 3
6
+ #SBATCH --ntasks-per-node 8
7
+ #SBATCH --cpus-per-gpu=6
8
+ #SBATCH --exclusive
9
+ #SBATCH --output=%x_%j.out
10
+
11
+ module load openmpi
12
+ module load cuda/11.7
13
+ export NCCL_PROTO=simple
14
+ export FI_EFA_FORK_SAFE=1
15
+ export FI_LOG_LEVEL=1
16
+ export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
17
+ export NCCL_DEBUG=info
18
+ export OMPI_MCA_mtl_base_verbose=1
19
+ export FI_EFA_ENABLE_SHM_TRANSFER=0
20
+ export FI_PROVIDER=efa
21
+ export FI_EFA_TX_MIN_CREDITS=64
22
+ export NCCL_TREE_THRESHOLD=0
23
+
24
+ # sent to sub script
25
+ export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
26
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
27
+ export MASTER_PORT=12802
28
+ export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
29
+
30
+ echo go $COUNT_NODE
31
+ echo $HOSTNAMES
32
+
33
+ source /fsx/yusong/clap/bin/activate
34
+ cd /fsx/yusong/CLAP/src
35
+ export TRANSFORMERS_CACHE=/fsx/yusong/transformers_cache
36
+
37
+ srun --comment clap --cpu_bind=v --accel-bind=gn python -m evaluate.eval_linear_probe \
38
+ --save-frequency 50 \
39
+ --save-top-performance 3 \
40
+ --save-most-recent \
41
+ --dataset-type="webdataset" \
42
+ --precision="fp32" \
43
+ --warmup 0 \
44
+ --batch-size=160 \
45
+ --lr=1e-4 \
46
+ --wd=0.1 \
47
+ --epochs=100 \
48
+ --workers=4 \
49
+ --use-bn-sync \
50
+ --freeze-text \
51
+ --amodel PANN-14 \
52
+ --tmodel roberta \
53
+ --report-to "wandb" \
54
+ --wandb-notes "10.14-finetune-esc50" \
55
+ --datasetnames "esc50" \
56
+ --datasetinfos "train" \
57
+ --seed 3407 \
58
+ --remotedata \
59
+ --logs /fsx/clap_logs \
60
+ --gather-with-grad \
61
+ --lp-loss="ce" \
62
+ --lp-metrics="acc" \
63
+ --lp-lr=1e-4 \
64
+ --lp-mlp \
65
+ --class-label-path="../class_labels/ESC50_class_labels_indices_space.json" \
66
+ --openai-model-cache-dir /fsx/yusong/transformers_cache \
67
+ --pretrained="/fsx/clap_logs/2022_10_14-04_05_14-model_PANN-14-lr_0.0001-b_160-j_6-p_fp32/checkpoints" \
68
+ --data-filling "repeatpad" \
69
+ --data-truncating "rand_trunc" \
70
+ --optimizer "adam"
my_laion_clap/CLAP/experiment_scripts/finetune-fsd50k.sh ADDED
@@ -0,0 +1,70 @@
1
+ #!/bin/bash
2
+ #SBATCH --comment clap
3
+ #SBATCH --partition=g40423
4
+ #SBATCH --job-name=mclap
5
+ #SBATCH --nodes 3
6
+ #SBATCH --ntasks-per-node 8
7
+ #SBATCH --cpus-per-gpu=6
8
+ #SBATCH --exclusive
9
+ #SBATCH --output=%x_%j.out
10
+
11
+ module load openmpi
12
+ module load cuda/11.7
13
+ export NCCL_PROTO=simple
14
+ export FI_EFA_FORK_SAFE=1
15
+ export FI_LOG_LEVEL=1
16
+ export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
17
+ export NCCL_DEBUG=info
18
+ export OMPI_MCA_mtl_base_verbose=1
19
+ export FI_EFA_ENABLE_SHM_TRANSFER=0
20
+ export FI_PROVIDER=efa
21
+ export FI_EFA_TX_MIN_CREDITS=64
22
+ export NCCL_TREE_THRESHOLD=0
23
+
24
+ # sent to sub script
25
+ export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
26
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
27
+ export MASTER_PORT=12802
28
+ export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
29
+
30
+ echo go $COUNT_NODE
31
+ echo $HOSTNAMES
32
+
33
+ source /fsx/yusong/clap/bin/activate
34
+ cd /fsx/yusong/CLAP/src
35
+ export TRANSFORMERS_CACHE=/fsx/yusong/transformers_cache
36
+
37
+ srun --comment clap --cpu_bind=v --accel-bind=gn python -m evaluate.eval_linear_probe \
38
+ --save-frequency 50 \
39
+ --save-top-performance 3 \
40
+ --save-most-recent \
41
+ --dataset-type="webdataset" \
42
+ --precision="fp32" \
43
+ --warmup 0 \
44
+ --batch-size=160 \
45
+ --lr=1e-4 \
46
+ --wd=0.1 \
47
+ --epochs=100 \
48
+ --workers=4 \
49
+ --use-bn-sync \
50
+ --freeze-text \
51
+ --amodel PANN-14 \
52
+ --tmodel roberta \
53
+ --report-to "wandb" \
54
+ --wandb-notes "10.14-finetune-fsd50k" \
55
+ --datasetnames "fsd50k_class_label" \
56
+ --datasetinfos "train" \
57
+ --seed 3407 \
58
+ --remotedata \
59
+ --logs /fsx/clap_logs \
60
+ --gather-with-grad \
61
+ --lp-loss="bce" \
62
+ --lp-metrics="map" \
63
+ --lp-lr=1e-4 \
64
+ --lp-mlp \
65
+ --class-label-path="../class_labels/FSD50k_class_labels_indices.json" \
66
+ --openai-model-cache-dir /fsx/yusong/transformers_cache \
67
+ --pretrained="/fsx/clap_logs/2022_10_14-04_05_14-model_PANN-14-lr_0.0001-b_160-j_6-p_fp32/checkpoints" \
68
+ --data-filling "repeatpad" \
69
+ --data-truncating "rand_trunc" \
70
+ --optimizer "adam"
my_laion_clap/CLAP/experiment_scripts/htsat-roberta-large-dataset-fusion.sh ADDED
@@ -0,0 +1,70 @@
+ #!/bin/bash
+ #SBATCH --comment clap
+ #SBATCH --partition=g40423
+ #SBATCH --job-name=mclap
+ #SBATCH --nodes 3
+ #SBATCH --ntasks-per-node 8
+ #SBATCH --cpus-per-gpu=6
+ #SBATCH --exclusive
+ #SBATCH --output=%x_%j.out
+
+ module load openmpi
+ module load cuda/11.7
+ export NCCL_PROTO=simple
+ export FI_EFA_FORK_SAFE=1
+ export FI_LOG_LEVEL=1
+ export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
+ export NCCL_DEBUG=info
+ export OMPI_MCA_mtl_base_verbose=1
+ export FI_EFA_ENABLE_SHM_TRANSFER=0
+ export FI_PROVIDER=efa
+ export FI_EFA_TX_MIN_CREDITS=64
+ export NCCL_TREE_THRESHOLD=0
+
+ # sent to sub script
+ export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+ export MASTER_PORT=12802
+ export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
+
+ echo go $COUNT_NODE
+ echo $HOSTNAMES
+
+ source /fsx/yusong/clap/bin/activate
+ cd /fsx/yusong/CLAP/src
+ export TRANSFORMERS_CACHE=/fsx/yusong/transformers_cache
+
+ srun --comment clap --cpu_bind=v --accel-bind=gn python -m training.main \
+ --save-frequency 5 \
+ --save-top-performance 3 \
+ --save-most-recent \
+ --dataset-type="webdataset" \
+ --precision="fp32" \
+ --batch-size=96 \
+ --lr=1e-4 \
+ --wd=0.0 \
+ --epochs=45 \
+ --workers=6 \
+ --use-bn-sync \
+ --amodel HTSAT-tiny \
+ --tmodel roberta \
+ --warmup 3200 \
+ --report-to "wandb" \
+ --wandb-notes "10.16-clap-dataset-2#-htsat-roberta-fusion" \
+ --datasetnames "Clotho" "audiocaps" "BBCSoundEffects" "free_to_use_sounds" "paramount_motion" "sonniss_game_effects" "wesoundeffects" "freesound_no_overlap_noesc50" "audiostock" "epidemic_sound_effects" "fsd50k_class_label" "MACS" "WavText5K" \
+ --full-train-dataset "BBCSoundEffects" "free_to_use_sounds" "paramount_motion" "sonniss_game_effects" "wesoundeffects" "audiostock" "epidemic_sound_effects" "fsd50k_class_label" \
+ --exclude-eval-dataset "freesound_no_overlap_noesc50" "MACS" "WavText5K" "fsd50k_class_label" \
+ --datasetinfos "train" "unbalanced_train" \
+ --top-k-checkpoint-select-dataset="Clotho-test" \
+ --top-k-checkpoint-select-metric="mAP@10" \
+ --openai-model-cache-dir /fsx/yusong/transformers_cache \
+ --logs /fsx/clap_logs \
+ --seed 3407 \
+ --remotedata \
+ --gather-with-grad \
+ --optimizer "adam" \
+ --data-filling "repeatpad" \
+ --data-truncating "fusion" \
+ --enable-fusion \
+ --fusion-type "aff_2d" \
+ --pretrained-audio /fsx/yusong/audio_pretrained_model/HTSAT-fullset-imagenet-map=0.467.ckpt
my_laion_clap/CLAP/experiment_scripts/train-htsat-roberta.sh ADDED
@@ -0,0 +1,66 @@
+ #!/bin/bash
+ #SBATCH --comment clap
+ #SBATCH --partition=g40423
+ #SBATCH --job-name=mclap
+ #SBATCH --nodes 3
+ #SBATCH --ntasks-per-node 8
+ #SBATCH --cpus-per-gpu=6
+ #SBATCH --exclusive
+ #SBATCH --output=%x_%j.out
+
+ module load openmpi
+ module load cuda/11.7
+ export NCCL_PROTO=simple
+ export FI_EFA_FORK_SAFE=1
+ export FI_LOG_LEVEL=1
+ export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
+ export NCCL_DEBUG=info
+ export OMPI_MCA_mtl_base_verbose=1
+ export FI_EFA_ENABLE_SHM_TRANSFER=0
+ export FI_PROVIDER=efa
+ export FI_EFA_TX_MIN_CREDITS=64
+ export NCCL_TREE_THRESHOLD=0
+
+ # sent to sub script
+ export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+ export MASTER_PORT=12802
+ export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
+
+ echo go $COUNT_NODE
+ echo $HOSTNAMES
+
+ source /fsx/yusong/clap/bin/activate
+ cd /fsx/yusong/CLAP/src
+ export TRANSFORMERS_CACHE=/fsx/yusong/transformers_cache
+
+ srun --comment clap --cpu_bind=v --accel-bind=gn python -m training.main \
+ --save-frequency 5 \
+ --save-top-performance 3 \
+ --save-most-recent \
+ --dataset-type="webdataset" \
+ --precision="fp32" \
+ --batch-size=96 \
+ --lr=1e-4 \
+ --wd=0.0 \
+ --epochs=45 \
+ --workers=6 \
+ --use-bn-sync \
+ --amodel HTSAT-tiny \
+ --tmodel roberta \
+ --warmup 3200 \
+ --report-to "wandb" \
+ --wandb-notes "10.16-clap-dataset-1#-htsat-roberta" \
+ --datasetnames "Clotho" "audiocaps" \
+ --datasetinfos "train" "unbalanced_train" \
+ --top-k-checkpoint-select-dataset="Clotho-test" \
+ --top-k-checkpoint-select-metric="mAP@10" \
+ --openai-model-cache-dir /fsx/yusong/transformers_cache \
+ --logs /fsx/clap_logs \
+ --seed 3407 \
+ --remotedata \
+ --gather-with-grad \
+ --optimizer "adam" \
+ --data-filling "repeatpad" \
+ --data-truncating "rand_trunc" \
+ --pretrained-audio /fsx/yusong/audio_pretrained_model/HTSAT-fullset-imagenet-map=0.467.ckpt
my_laion_clap/CLAP/experiment_scripts/train-only-clotho.sh ADDED
@@ -0,0 +1,28 @@
+ python -m laion_clap.training.main \
+ --save-frequency 5 \
+ --save-top-performance 3 \
+ --save-most-recent \
+ --dataset-type="webdataset" \
+ --datasetpath="<to-your-directory-containing-Clotho-not-the-path-to-Clotho>" \
+ --precision="fp32" \
+ --batch-size=96 \
+ --lr=1e-4 \
+ --wd=0.0 \
+ --epochs=45 \
+ --workers=6 \
+ --use-bn-sync \
+ --amodel HTSAT-tiny \
+ --tmodel roberta \
+ --warmup 3200 \
+ --datasetnames "Clotho" \
+ --datasetinfos "train" \
+ --top-k-checkpoint-select-dataset="Clotho-test" \
+ --top-k-checkpoint-select-metric="mAP@10" \
+ --logs 'logs' \
+ --seed 3407 \
+ --gather-with-grad \
+ --optimizer "adam" \
+ --data-filling "repeatpad" \
+ --data-truncating "rand_trunc" \
+ --pretrained-audio '<path-to>/HTSAT-fullset-imagenet-map=0.467.ckpt' \
+ --prefetch-factor 2
my_laion_clap/CLAP/experiment_scripts/train-pann-roberta.sh ADDED
@@ -0,0 +1,66 @@
+ #!/bin/bash
+ #SBATCH --comment clap
+ #SBATCH --partition=g40423
+ #SBATCH --job-name=mclap
+ #SBATCH --nodes 3
+ #SBATCH --ntasks-per-node 8
+ #SBATCH --cpus-per-gpu=6
+ #SBATCH --exclusive
+ #SBATCH --output=%x_%j.out
+
+ module load openmpi
+ module load cuda/11.7
+ export NCCL_PROTO=simple
+ export FI_EFA_FORK_SAFE=1
+ export FI_LOG_LEVEL=1
+ export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
+ export NCCL_DEBUG=info
+ export OMPI_MCA_mtl_base_verbose=1
+ export FI_EFA_ENABLE_SHM_TRANSFER=0
+ export FI_PROVIDER=efa
+ export FI_EFA_TX_MIN_CREDITS=64
+ export NCCL_TREE_THRESHOLD=0
+
+ # sent to sub script
+ export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+ export MASTER_PORT=12802
+ export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
+
+ echo go $COUNT_NODE
+ echo $HOSTNAMES
+
+ source /fsx/yusong/clap/bin/activate
+ cd /fsx/yusong/CLAP/src
+ export TRANSFORMERS_CACHE=/fsx/yusong/transformers_cache
+
+ srun --comment clap --cpu_bind=v --accel-bind=gn python -m training.main \
+ --save-frequency 5 \
+ --save-top-performance 3 \
+ --save-most-recent \
+ --dataset-type="webdataset" \
+ --precision="fp32" \
+ --batch-size=96 \
+ --lr=1e-4 \
+ --wd=0.0 \
+ --epochs=45 \
+ --workers=6 \
+ --use-bn-sync \
+ --amodel PANN-14 \
+ --tmodel roberta \
+ --warmup 500 \
+ --report-to "wandb" \
+ --wandb-notes "10.16-clap-dataset-1#-pann-roberta" \
+ --datasetnames "Clotho" "audiocaps" \
+ --datasetinfos "train" "unbalanced_train" \
+ --top-k-checkpoint-select-dataset="Clotho-test" \
+ --top-k-checkpoint-select-metric="mAP@10" \
+ --openai-model-cache-dir /fsx/yusong/transformers_cache \
+ --logs /fsx/clap_logs \
+ --seed 3407 \
+ --remotedata \
+ --gather-with-grad \
+ --optimizer "adam" \
+ --data-filling "repeatpad" \
+ --data-truncating "rand_trunc" \
+ --pretrained-audio /fsx/yusong/audio_pretrained_model/PANN-fullset-map=0.439.ckpt
my_laion_clap/CLAP/experiment_scripts/zeroshot_esc50.sh ADDED
@@ -0,0 +1,19 @@
+ # run from CLAP directory
+ python -m evaluate.eval_zeroshot_classification \
+ --dataset-type="webdataset" \
+ --precision="fp32" \
+ --batch-size=512 \
+ --workers=6 \
+ --amodel HTSAT-tiny \
+ --tmodel roberta \
+ --datasetnames "esc50_no_overlap" \
+ --remotedata \
+ --datasetinfos "train" \
+ --seed 3407 \
+ --logs ./logs \
+ --data-filling "repeatpad" \
+ --data-truncating "rand_trunc" \
+ --freeze-text \
+ --class-label-path="../class_labels/ESC50_class_labels_indices_space.json" \
+ --pretrained="/fsx/clap_logs/2023_02_18-00_03_45-model_HTSAT-tiny-lr_0.0001-b_96-j_6-p_fp32/checkpoints"
+
my_laion_clap/CLAP/pyproject.toml ADDED
@@ -0,0 +1,54 @@
+ [build-system]
+ requires = ["setuptools>=61.0"]
+ build-backend = "setuptools.build_meta"
+ [project]
+ name = "laion_clap"
+ version = "1.1.4"
+ authors = [
+ { name="Ke Chen", email="knutchen@ucsd.edu" },
+ { name="Yusong Wu" },
+ { name="Tianyu Zhang" },
+ { name="Yuchen Hui" }
+ ]
+ maintainers = [
+ { name="Ke Chen", email="knutchen@ucsd.edu" },
+ { name="Yusong Wu" },
+ { name="Tianyu Zhang" },
+ { name="Yuchen Hui" }
+ ]
+ description = "Contrastive Language-Audio Pretraining Model from LAION"
+ license = {file = "LICENSE"}
+ readme = "README.md"
+ requires-python = ">=3.7"
+ dependencies = [
+ "numpy==1.23.5",
+ "soundfile",
+ "librosa",
+ "torchlibrosa",
+ "ftfy",
+ "braceexpand",
+ "webdataset",
+ "wget",
+ "wandb",
+ "llvmlite",
+ "scipy",
+ "scikit-learn",
+ "pandas",
+ "h5py",
+ "tqdm",
+ "regex",
+ "transformers",
+ "progressbar"
+ ]
+ classifiers = [
+ 'Development Status :: 3 - Alpha',
+ 'Intended Audience :: Developers',
+ 'Intended Audience :: Science/Research',
+ 'License :: OSI Approved :: Apache Software License',
+ 'Topic :: Scientific/Engineering :: Artificial Intelligence',
+ ]
+
+
+ [project.urls]
+ "Homepage" = "https://github.com/LAION-AI/CLAP"
+ "Bug Tracker" = "https://github.com/LAION-AI/CLAP/issues"
my_laion_clap/CLAP/requirements.txt ADDED
@@ -0,0 +1,16 @@
+ soundfile
+ librosa
+ torchlibrosa
+ ftfy
+ braceexpand
+ webdataset
+ wget
+ wandb
+ llvmlite
+ scipy
+ scikit-learn
+ pandas
+ h5py
+ tqdm
+ regex
+ transformers<=4.30.2
my_laion_clap/CLAP/src/laion_clap/__init__.py ADDED
@@ -0,0 +1,5 @@
+ import os
+ import sys
+ dir_path = os.path.dirname(os.path.abspath(__file__))
+ sys.path.append(dir_path)
+ from .hook import CLAP_Module
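This package __init__ only puts the package directory on sys.path and re-exports CLAP_Module from hook.py. For orientation, a minimal usage sketch of that hook API follows; it mirrors the upstream LAION-CLAP interface, and the file names, prompt strings, and reliance on the default checkpoint download are illustrative assumptions rather than anything this commit configures.

    # Minimal sketch of the CLAP_Module hook API re-exported above (upstream LAION-CLAP).
    # Audio file names are placeholders; load_ckpt() without arguments fetches a default checkpoint.
    import laion_clap  # assumes the vendored package under my_laion_clap/CLAP/src is importable

    model = laion_clap.CLAP_Module(enable_fusion=False, amodel='HTSAT-tiny', tmodel='roberta')
    model.load_ckpt()  # or pass an explicit checkpoint path

    # Joint embeddings for audio files and free-form text
    audio_embed = model.get_audio_embedding_from_filelist(x=['dog_bark.wav', 'rain.wav'], use_tensor=False)
    text_embed = model.get_text_embedding(['a dog barking', 'rain falling on a window'], use_tensor=False)
    print(audio_embed.shape, text_embed.shape)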
my_laion_clap/CLAP/src/laion_clap/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (372 Bytes).
my_laion_clap/CLAP/src/laion_clap/__pycache__/hook.cpython-38.pyc ADDED
Binary file (7.78 kB).
my_laion_clap/CLAP/src/laion_clap/clap_module/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ from .factory import list_models, create_model, create_model_and_transforms, add_model_config
2
+ from .loss import ClipLoss, gather_features, LPLoss, lp_gather_features, LPMetrics
3
+ from .model import CLAP, CLAPTextCfg, CLAPVisionCfg, CLAPAudioCfp, convert_weights_to_fp16, trace_model
4
+ from .openai import load_openai_model, list_openai_models
5
+ from .pretrained import list_pretrained, list_pretrained_tag_models, list_pretrained_model_tags,\
6
+ get_pretrained_url, download_pretrained
7
+ from .tokenizer import SimpleTokenizer, tokenize
8
+ from .transform import image_transform
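This clap_module __init__ re-exports the lower-level building blocks (factory, loss, model, openai, pretrained, tokenizer, transform). A small sketch of the factory entry point is given below; it only calls list_models(), which the import line above shows to exist, and it assumes the vendored package directory is on sys.path.

    # Sketch: enumerate the model configuration names registered by the factory module.
    # Assumes my_laion_clap/CLAP/src is importable so laion_clap.clap_module resolves.
    from laion_clap.clap_module import list_models

    print(list_models())  # configuration names such as HTSAT-tiny or PANN-14, as used in the scripts above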
my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (1.02 kB).
my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/factory.cpython-38.pyc ADDED
Binary file (6.64 kB).
my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/feature_fusion.cpython-38.pyc ADDED
Binary file (4.23 kB).
my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/htsat.cpython-38.pyc ADDED
Binary file (30.6 kB).
my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/loss.cpython-38.pyc ADDED
Binary file (7.98 kB).
my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/model.cpython-38.pyc ADDED
Binary file (23.8 kB).
my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/openai.cpython-38.pyc ADDED
Binary file (4.51 kB).
my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/pann_model.cpython-38.pyc ADDED
Binary file (13.2 kB).