root committed on
Commit a344f64 · 1 Parent(s): de86ffd

initial commit

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. README.md +3 -3
  2. app.py +232 -0
  3. configs/inference.yaml +284 -0
  4. configs/inference_1.5.yaml +302 -0
  5. configs/inference_2.yaml +302 -0
  6. configs/inference_long.yaml +284 -0
  7. configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml +255 -0
  8. configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml +183 -0
  9. configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml +483 -0
  10. configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml +284 -0
  11. data/__pycache__/data.cpython-38.pyc +0 -0
  12. data/data.py +669 -0
  13. data/prepare_each_dataset.py +0 -0
  14. eval/README.md +1 -0
  15. eval/__init__.py +0 -0
  16. eval/inference.py +229 -0
  17. eval/inference.sh +55 -0
  18. eval/interactive.sh +8 -0
  19. eval/keep_run.sh +64 -0
  20. eval/submit.sh +54 -0
  21. eval/submit_2.sh +49 -0
  22. my_laion_clap/CLAP/LICENSE +121 -0
  23. my_laion_clap/CLAP/MANIFEST.in +3 -0
  24. my_laion_clap/CLAP/README.md +287 -0
  25. my_laion_clap/CLAP/assets/audioclip-arch.png +0 -0
  26. my_laion_clap/CLAP/assets/clap-zeroshot.PNG +0 -0
  27. my_laion_clap/CLAP/assets/logo.PNG +0 -0
  28. my_laion_clap/CLAP/experiment_scripts/esc50_api.py +48 -0
  29. my_laion_clap/CLAP/experiment_scripts/eval_retrieval_freesound.sh +63 -0
  30. my_laion_clap/CLAP/experiment_scripts/finetune-esc50.sh +70 -0
  31. my_laion_clap/CLAP/experiment_scripts/finetune-fsd50k.sh +70 -0
  32. my_laion_clap/CLAP/experiment_scripts/htsat-roberta-large-dataset-fusion.sh +70 -0
  33. my_laion_clap/CLAP/experiment_scripts/train-htsat-roberta.sh +66 -0
  34. my_laion_clap/CLAP/experiment_scripts/train-only-clotho.sh +28 -0
  35. my_laion_clap/CLAP/experiment_scripts/train-pann-roberta.sh +66 -0
  36. my_laion_clap/CLAP/experiment_scripts/zeroshot_esc50.sh +19 -0
  37. my_laion_clap/CLAP/pyproject.toml +54 -0
  38. my_laion_clap/CLAP/requirements.txt +16 -0
  39. my_laion_clap/CLAP/src/laion_clap/__init__.py +5 -0
  40. my_laion_clap/CLAP/src/laion_clap/__pycache__/__init__.cpython-38.pyc +0 -0
  41. my_laion_clap/CLAP/src/laion_clap/__pycache__/hook.cpython-38.pyc +0 -0
  42. my_laion_clap/CLAP/src/laion_clap/clap_module/__init__.py +8 -0
  43. my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/__init__.cpython-38.pyc +0 -0
  44. my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/factory.cpython-38.pyc +0 -0
  45. my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/feature_fusion.cpython-38.pyc +0 -0
  46. my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/htsat.cpython-38.pyc +0 -0
  47. my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/loss.cpython-38.pyc +0 -0
  48. my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/model.cpython-38.pyc +0 -0
  49. my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/openai.cpython-38.pyc +0 -0
  50. my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/pann_model.cpython-38.pyc +0 -0
README.md CHANGED
@@ -1,14 +1,14 @@
  ---
  title: Audio Flamingo 2
- emoji: 🐢
+ emoji: 🏃
- colorFrom: purple
+ colorFrom: yellow
  colorTo: red
  sdk: gradio
  sdk_version: 5.15.0
  app_file: app.py
  pinned: false
  license: apache-2.0
- short_description: NVIDIA Audio Flamingo 2 Demo
+ short_description: Audio Flamingo 2 Demo
  ---
 
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,232 @@
+ import os
+ import yaml
+ import json
+ import torch
+ import spaces
+ import librosa
+ import argparse
+ import numpy as np
+ import gradio as gr
+ from tqdm import tqdm
+ import soundfile as sf
+ from pydub import AudioSegment
+ from safetensors.torch import load_file
+ from huggingface_hub import snapshot_download
+
+ from data.data import get_audiotext_dataloader
+ from src.factory import create_model_and_transforms
+ from train.train_utils import Dict2Class, get_autocast, get_cast_dtype
+
+ def int16_to_float32(x):
+     return (x / 32767.0).astype(np.float32)
+
+ def float32_to_int16(x):
+     x = np.clip(x, a_min=-1., a_max=1.)
+     return (x * 32767.).astype(np.int16)
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ snapshot_download(repo_id="SreyanG-NVIDIA/audio-flamingo-2", local_dir="./")
+
+ config = yaml.load(open("configs/inference.yaml"), Loader=yaml.FullLoader)
+
+ data_config = config['data_config']
+ model_config = config['model_config']
+ clap_config = config['clap_config']
+ args = Dict2Class(config['train_config'])
+
+ model, tokenizer = create_model_and_transforms(
+     **model_config,
+     clap_config=clap_config,
+     use_local_files=args.offline,
+     gradient_checkpointing=args.gradient_checkpointing,
+     freeze_lm_embeddings=args.freeze_lm_embeddings,
+ )
+
+ device_id = 0
+ model = model.to(device_id)
+ model.eval()
+
+ # Load metadata
+ with open("safe_ckpt/metadata.json", "r") as f:
+     metadata = json.load(f)
+
+ # Reconstruct the full state_dict
+ state_dict = {}
+
+ # Load each SafeTensors chunk
+ for chunk_name in metadata:
+     chunk_path = f"safe_ckpt/{chunk_name}.safetensors"
+     chunk_tensors = load_file(chunk_path)
+
+     # Merge tensors into state_dict
+     state_dict.update(chunk_tensors)
+
+ x, y = model.load_state_dict(state_dict, False)
+
+ autocast = get_autocast(
+     args.precision, cache_enabled=(not args.fsdp)
+ )
+
+ cast_dtype = get_cast_dtype(args.precision)
+
+ def get_num_windows(T, sr):
+
+     window_length = int(float(clap_config["window_length"]) * sr)
+     window_overlap = int(float(clap_config["window_overlap"]) * sr)
+     max_num_window = int(clap_config["max_num_window"])
+
+     num_windows = 1
+     if T <= window_length:
+         num_windows = 1
+         full_length = window_length
+     elif T >= (max_num_window * window_length - (max_num_window - 1) * window_overlap):
+         num_windows = max_num_window
+         full_length = (max_num_window * window_length - (max_num_window - 1) * window_overlap)
+     else:
+         num_windows = 1 + int(np.ceil((T - window_length) / float(window_length - window_overlap)))
+         full_length = num_windows * window_length - (num_windows - 1) * window_overlap
+
+     return num_windows, full_length
+
+
+ def read_audio(file_path, target_sr=16000, duration=30.0, start=0.0):
+
+     if file_path.endswith('.mp3'):
+         audio = AudioSegment.from_file(file_path)
+         if len(audio) > (start + duration) * 1000:
+             audio = audio[start * 1000:(start + duration) * 1000]
+
+         if audio.frame_rate != target_sr:
+             audio = audio.set_frame_rate(target_sr)
+
+         if audio.channels > 1:
+             audio = audio.set_channels(1)
+
+         data = np.array(audio.get_array_of_samples())
+         if audio.sample_width == 2:
+             data = data.astype(np.float32) / np.iinfo(np.int16).max
+         elif audio.sample_width == 4:
+             data = data.astype(np.float32) / np.iinfo(np.int32).max
+         else:
+             raise ValueError("Unsupported bit depth: {}".format(audio.sample_width))
+
+     else:
+         with sf.SoundFile(file_path) as audio:
+             original_sr = audio.samplerate
+             channels = audio.channels
+
+             max_frames = int((start + duration) * original_sr)
+
+             audio.seek(int(start * original_sr))
+             frames_to_read = min(max_frames, len(audio))
+             data = audio.read(frames_to_read)
+
+             if data.max() > 1 or data.min() < -1:
+                 data = data / max(abs(data.max()), abs(data.min()))
+
+         if original_sr != target_sr:
+             if channels == 1:
+                 data = librosa.resample(data.flatten(), orig_sr=original_sr, target_sr=target_sr)
+             else:
+                 data = librosa.resample(data.T, orig_sr=original_sr, target_sr=target_sr)[0]
+         else:
+             if channels != 1:
+                 data = data.T[0]
+
+     if data.min() >= 0:
+         data = 2 * data / abs(data.max()) - 1.0
+     else:
+         data = data / max(abs(data.max()), abs(data.min()))
+
+     assert len(data.shape) == 1, data.shape
+     return data
+
+ def load_audio(audio_path):
+
+     sr = 16000
+     window_length = int(float(clap_config["window_length"]) * sr)
+     window_overlap = int(float(clap_config["window_overlap"]) * sr)
+     max_num_window = int(clap_config["max_num_window"])
+     duration = max_num_window * (clap_config["window_length"] - clap_config["window_overlap"]) + clap_config["window_overlap"]
+
+     audio_data = read_audio(audio_path, sr, duration, 0.0)  # hard-code audio start to 0.0
+     T = len(audio_data)
+     num_windows, full_length = get_num_windows(T, sr)
+
+     # pad with zeros so the clip exactly covers full_length
+     if full_length > T:
+         audio_data = np.append(audio_data, np.zeros(full_length - T))
+
+     audio_data = audio_data.reshape(1, -1)
+     audio_data_tensor = torch.from_numpy(int16_to_float32(float32_to_int16(audio_data))).float()
+
+     audio_clips = []
+     audio_embed_mask = torch.ones(num_windows)
+     for i in range(num_windows):
+         start = i * (window_length - window_overlap)
+         audio_data_tensor_this = audio_data_tensor[:, start:start+window_length]
+         audio_clips.append(audio_data_tensor_this)
+
+     if len(audio_clips) < max_num_window:
+         audio_clips = audio_clips[:max_num_window]
+         audio_embed_mask = audio_embed_mask[:max_num_window]
+
+     audio_clips = torch.cat(audio_clips)
+
+     return audio_clips, audio_embed_mask
+
+ @spaces.GPU
+ def predict(filepath, question):
+
+     audio_clips, audio_embed_mask = load_audio(filepath)
+     audio_clips = audio_clips.to(device_id, dtype=cast_dtype, non_blocking=True)
+     audio_embed_mask = audio_embed_mask.to(device_id, dtype=cast_dtype, non_blocking=True)
+
+     text_prompt = str(question).lower()
+     text_output = str(question).lower()
+
+     sample = f"<audio>{text_prompt.strip()}{tokenizer.sep_token}"
+     # None<|endofchunk|>{tokenizer.eos_token}"
+
+     text = tokenizer(
+         sample,
+         max_length=512,
+         padding="longest",
+         truncation="only_first",
+         return_tensors="pt"
+     )
+
+     input_ids = text["input_ids"].to(device_id, non_blocking=True)
+
+     media_token_id = tokenizer.encode("<audio>")[-1]
+     sep_token_id = tokenizer.sep_token_id
+
+     prompt = input_ids
+
+     with torch.no_grad():
+         output = model.generate(
+             audio_x=audio_clips.unsqueeze(0),
+             audio_x_mask=audio_embed_mask.unsqueeze(0),
+             lang_x=prompt,
+             eos_token_id=tokenizer.eos_token_id,
+             max_new_tokens=256,
+             temperature=0.0)[0]
+
+     output_decoded = tokenizer.decode(output).split(tokenizer.sep_token)[-1].replace(tokenizer.eos_token, '').replace(tokenizer.pad_token, '').replace('<|endofchunk|>', '')
+
+     return output_decoded
+
+ link = "TBD"
+ text = "[Github]"
+ paper_link = "https://github.com/NVIDIA/audio-flamingo/"
+ paper_text = "TBD"
+ demo = gr.Interface(fn=predict,
+     inputs=[gr.Audio(type="filepath"), gr.Textbox(value='Describe the audio.', label='Edit the textbox to ask your own questions!')],
+     outputs=[gr.Textbox(label="Audio Flamingo 2 Output")],
+     cache_examples=True,
+     title="Audio Flamingo 2 Demo",
+     description="Audio Flamingo 2 is NVIDIA's latest Large Audio-Language Model, capable of understanding audio inputs and answering open-ended questions about them. " + f"<a href='{paper_link}'>{paper_text}</a> " + f"<a href='{link}'>{text}</a> <br>" +
+     "**Audio Flamingo 2 is not an ASR model and has limited ability to recognize speech content. It primarily focuses on perception and understanding of non-speech sounds and music.**<br>" +
+     "The demo is hosted on the Stage 2 checkpoints and supports up to 90 seconds of audio. Stage 3 checkpoints that support up to 5 minutes will be released at a later point.")
+ demo.launch(share=True)
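
Note (not part of the commit): the get_num_windows / load_audio logic above slices a clip into fixed-length CLAP windows and zero-pads the tail. A minimal standalone sketch of that arithmetic, assuming the configs/inference.yaml values (10 s windows, 0 s overlap, at most 9 windows); the sample durations are illustrative only.

# Standalone sketch of the windowing math in app.py, assuming 10 s windows,
# 0 s overlap, and at most 9 windows (the configs/inference.yaml settings).
import numpy as np

SR = 16000
WINDOW_LENGTH = int(10.0 * SR)   # samples per window
WINDOW_OVERLAP = int(0.0 * SR)   # overlap between consecutive windows
MAX_NUM_WINDOW = 9               # 9 x 10 s = at most 90 s of audio

def get_num_windows(T):
    """Return (num_windows, full_length) for a clip of T samples."""
    longest = MAX_NUM_WINDOW * WINDOW_LENGTH - (MAX_NUM_WINDOW - 1) * WINDOW_OVERLAP
    if T <= WINDOW_LENGTH:
        return 1, WINDOW_LENGTH
    if T >= longest:
        return MAX_NUM_WINDOW, longest
    num = 1 + int(np.ceil((T - WINDOW_LENGTH) / float(WINDOW_LENGTH - WINDOW_OVERLAP)))
    return num, num * WINDOW_LENGTH - (num - 1) * WINDOW_OVERLAP

# A 25 s clip needs 3 windows and is zero-padded to 30 s; a 2 min clip is capped at 9 windows.
for seconds in (5, 25, 120):
    n, full = get_num_windows(seconds * SR)
    print(f"{seconds:4d} s -> {n} window(s), padded/truncated to {full / SR:.0f} s")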
configs/inference.yaml ADDED
@@ -0,0 +1,284 @@
1
+ train_config:
2
+ expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed-sft
3
+ run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed-sft-3
4
+ delete_previous_checkpoint: true
5
+ batch_size: 8
6
+ gradient_accumulation_steps: 2
7
+ seed: 42
8
+ learning_rate: 0.00002
9
+ lr_scheduler: constant
10
+ loss_multiplier: 1.0
11
+ warmup_steps: 1875
12
+ weight_decay: 0.1
13
+ precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
14
+ gradient_checkpointing: False
15
+ num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
16
+ offline: false
17
+ freeze_lm_embeddings: false
18
+ logging_steps: 10
19
+ dist_backend: nccl
20
+ dist_url: env:// # tcp://localhost:7000
21
+ no_set_device_rank: false
22
+ fsdp: true
23
+ fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
24
+ fsdp_sharding_strategy: full # full, hybrid
25
+ horovod: false
26
+
27
+ # instruction tuning hparams
28
+ # sft_config:
29
+ # pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed_ckpt_stage1/
30
+ # pretrained_ckpt: checkpoint_199.pt
31
+ # unfreeze_full_lm: false
32
+
33
+ data_config:
34
+ dataset_blending_global_weight: 0.005
35
+
36
+ dataset_blending_config:
37
+
38
+ MMAUQA/train:
39
+ weight: 1.5
40
+
41
+ AudioSet-Temporal-Speech-Audio-QA/train:
42
+ weight: 1.0
43
+
44
+ CompA-R-AQA/train:
45
+ weight: 1.0
46
+
47
+ # Audio QA
48
+ Clotho-AQA-AQA/train:
49
+ weight: 1.0
50
+
51
+ OpenAQA-AQA/train:
52
+ weight: 1.0
53
+
54
+ SalmonnQA/train:
55
+ weight: 1.0
56
+
57
+ AudioEntailmentQA/train:
58
+ weight: 1.0
59
+
60
+ # Audio Captioning
61
+
62
+ Clotho-v2-AudioCaptioning/train:
63
+ weight: 1.0
64
+
65
+ audiocaps-AudioCaptioning/train:
66
+ weight: 1.0
67
+
68
+ Epidemic_sound-AudioCaptioning/train:
69
+ weight: 1.0
70
+
71
+ MACS-AudioCaptioning/train:
72
+ weight: 1.0
73
+
74
+ # Audio Classification
75
+
76
+ FSD50k-EventClassification/train:
77
+ weight: 1.0
78
+
79
+ CochlScene-SceneClassification/train:
80
+ weight: 1.0
81
+
82
+ NonSpeech7k-EventClassification/train:
83
+ weight: 1.0
84
+
85
+ chime-home-EventClassification/train:
86
+ weight: 1.0
87
+
88
+ SONYC-UST-EventClassification/train:
89
+ weight: 1.0
90
+
91
+ # Speech Emotion Classification
92
+
93
+ MELD-EmotionClassification/train:
94
+ weight: 0.5
95
+
96
+ MELD-SentimentClassification/train:
97
+ weight: 0.5
98
+
99
+ emov-db-EmotionClassification/train:
100
+ weight: 1.0
101
+
102
+ jl-corpus-EmotionClassification/train:
103
+ weight: 6.0
104
+
105
+ tess-EmotionClassification/train:
106
+ weight: 2.5
107
+
108
+ IEMOCAP-EmotionClassification/train:
109
+ weight: 3.0
110
+
111
+ OMGEmotion-EmotionClassification/train:
112
+ weight: 3.0
113
+
114
+ VocalSound-VocalClassification/train:
115
+ weight: 1.5
116
+
117
+ # Music QA
118
+
119
+ Music-AVQA-AQA_All/train:
120
+ weight: 3.0
121
+
122
+ MU-LLAMA-AQA/train:
123
+ weight: 1.0
124
+
125
+ # Music Captioning
126
+
127
+ LP-MusicCaps-MSD-AudioCaptioning/train:
128
+ weight: 0.06
129
+
130
+ LP-MusicCaps-MC-AudioCaptioning/train:
131
+ weight: 2.0
132
+
133
+ LP-MusicCaps-MTT-AudioCaptioning/train:
134
+ weight: 1.0
135
+
136
+ MusicCaps-AudioCaptioning/train:
137
+ weight: 6.0
138
+
139
+ musdbhq-captioning/train:
140
+ weight: 2.0
141
+
142
+ # Music Understanding
143
+
144
+ NSynth-MIR/train:
145
+ weight: 0.2
146
+
147
+ mtg-jamendo-MusicTagging/train:
148
+ weight: 0.1
149
+
150
+ FMA-GenreClassification/train:
151
+ weight: 0.5
152
+
153
+ musdbhq-InstrClassification/train:
154
+ weight: 0.8
155
+
156
+ LLARK_FMA-mir/train:
157
+ weight: 1.0
158
+
159
+ LLARK_FMA-reasoning/train:
160
+ weight: 1.0
161
+
162
+ LLARK_MagnaTagATune-mir/train:
163
+ weight: 1.0
164
+
165
+ LLARK_MTG-Jamendo-reasoning/train:
166
+ weight: 1.0
167
+
168
+ LLARK_MagnaTagATune-reasoning/train:
169
+ weight: 1.0
170
+
171
+ LLARK_MTG-Jamendo-mir/train:
172
+ weight: 1.0
173
+
174
+ MusicBenchQA/train:
175
+ weight: 1.0
176
+
177
+ dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
178
+ data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
179
+ dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
180
+ max_tokens: 512
181
+ num_workers: 4
182
+
183
+ valid_dataset_config:
184
+
185
+ Clotho-AQA-AQA/test: true
186
+
187
+ Clotho-v2-AudioCaptioning/test: true
188
+ audiocaps-AudioCaptioning/test: true
189
+
190
+ FSD50k-EventClassification/test: true
191
+ CochlScene-SceneClassification/test: true
192
+ NonSpeech7k-EventClassification/test: true
193
+ SONYC-UST-EventClassification/test: true
194
+
195
+ MELD-EmotionClassification/test: true
196
+ MELD-SentimentClassification/test: true
197
+ emov-db-EmotionClassification/val: true
198
+ jl-corpus-EmotionClassification/val: true
199
+ tess-EmotionClassification/val: true
200
+ IEMOCAP-EmotionClassification/val: true
201
+ OMGEmotion-EmotionClassification/val: true
202
+ VocalSound-VocalClassification/test: true
203
+
204
+ Music-AVQA-AQA_All/test: true
205
+ MU-LLAMA-AQA/test: true
206
+
207
+ LP-MusicCaps-MSD-AudioCaptioning/test: true
208
+ LP-MusicCaps-MC-AudioCaptioning/test: true
209
+ LP-MusicCaps-MTT-AudioCaptioning/test: true
210
+ MusicCaps-AudioCaptioning/test: true
211
+
212
+ NSynth-MIR/test: true
213
+ mtg-jamendo-MusicTagging/val: true
214
+ musdbhq-InstrClassification/test: true
215
+
216
+ # # zero shot
217
+ # CREMA-D-EmotionClassification/train:
218
+ # prefix_prob: 1.0
219
+
220
+ # ravdess-EmotionClassification/train:
221
+ # prefix_prob: 1.0
222
+
223
+ # UrbanSound8K-EventClassification/train:
224
+ # prefix_prob: 1.0
225
+
226
+ # ESC50-EventClassification/train:
227
+ # prefix_prob: 1.0
228
+
229
+ # DCASE17Task4-SceneClassification/test:
230
+ # prefix_prob: 1.0
231
+
232
+ # GTZAN-GenreClassification/train:
233
+ # prefix_prob: 1.0
234
+
235
+ # Medley-solos-DB-InstrClassification/test:
236
+ # prefix_prob: 1.0
237
+
238
+ clap_config:
239
+ method: nvclap-large
240
+ audio_embed_dim: 2048
241
+ checkpoint: clap_ckpt/epoch_15.pt
242
+
243
+ window_length: 10.0 # seconds
244
+ window_overlap: 0.0 # seconds
245
+ max_num_window: 9 # 1.5 minutes
246
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
247
+ finetune: true
248
+
249
+ whisper_config:
250
+ method: whisper-large-v3
251
+ path: openai/whisper-large-v3
252
+ audio_embed_dim: 1280
253
+ sampling_rate: 16000
254
+
255
+ window_length: 30.0 # seconds
256
+ window_overlap: 0.0 # seconds
257
+ max_num_window: 1 # 5 minutes
258
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
259
+
260
+ mert_config:
261
+ method: mert-v1
262
+ path: m-a-p/MERT-v1-330M
263
+ audio_embed_dim: 1024
264
+ sampling_rate: 24000
265
+
266
+ window_length: 10.0 # seconds
267
+ window_overlap: 0.0 # seconds
268
+ max_num_window: 1 # 5 minutes
269
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
270
+
271
+ model_config:
272
+ cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
273
+
274
+ lang_encoder_path: Qwen/Qwen2.5-3B
275
+ tokenizer_path: Qwen/Qwen2.5-3B
276
+ cross_attn_every_n_layers: 1
277
+ audio_transformer_kwargs: {
278
+ n_head: 8,
279
+ n_layers: 3,
280
+ d_inner: 2048,
281
+ max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
282
+ max_window_per_audio: 1, # must = max_num_window
283
+ common_encoder_embed_dim: 1024
284
+ }
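
As a quick check (not part of the commit), the clap_config windowing fields above determine the longest audio the demo accepts, via the same duration formula load_audio uses in app.py; a minimal sketch:

# Sketch: derive the maximum supported duration from configs/inference.yaml,
# mirroring the `duration` computation in app.py's load_audio.
import yaml

with open("configs/inference.yaml") as f:
    clap = yaml.load(f, Loader=yaml.FullLoader)["clap_config"]

max_seconds = clap["max_num_window"] * (clap["window_length"] - clap["window_overlap"]) + clap["window_overlap"]
print(max_seconds)  # 9 * (10.0 - 0.0) + 0.0 = 90.0 s, the "90 seconds" quoted in the demo description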
configs/inference_1.5.yaml ADDED
@@ -0,0 +1,302 @@
1
+ train_config:
2
+ expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-1.5b
3
+ run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-1.5B-sft
4
+ delete_previous_checkpoint: true
5
+ batch_size: 32
6
+ gradient_accumulation_steps: 2
7
+ seed: 42
8
+ learning_rate: 0.00002
9
+ lr_scheduler: constant
10
+ loss_multiplier: 1.0
11
+ warmup_steps: 1875
12
+ weight_decay: 0.1
13
+ precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
14
+ gradient_checkpointing: False
15
+ num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
16
+ offline: false
17
+ freeze_lm_embeddings: false
18
+ logging_steps: 10
19
+ dist_backend: nccl
20
+ dist_url: env:// # tcp://localhost:7000
21
+ no_set_device_rank: false
22
+ fsdp: true
23
+ fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
24
+ fsdp_sharding_strategy: full # full, hybrid
25
+ horovod: false
26
+
27
+ # instruction tuning hparams
28
+ # sft_config:
29
+ # pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed/
30
+ # pretrained_ckpt: checkpoint_199.pt
31
+ # unfreeze_full_lm: false
32
+
33
+ data_config:
34
+ dataset_blending_global_weight: 0.005
35
+
36
+ dataset_blending_config:
37
+
38
+ MMAUQA/train:
39
+ weight: 1.5
40
+
41
+ AudioSet-Temporal-Speech-Audio-QA/train:
42
+ weight: 1.0
43
+
44
+ CompA-R-AQA/train:
45
+ weight: 1.0
46
+
47
+ # Audio QA
48
+ Clotho-AQA-AQA/train:
49
+ weight: 1.0
50
+
51
+ OpenAQA-AQA/train:
52
+ weight: 1.0
53
+
54
+ SalmonnQA/train:
55
+ weight: 0.8
56
+
57
+ AudioEntailmentQA/train:
58
+ weight: 1.0
59
+
60
+ # Audio Captioning
61
+
62
+ Clotho-v2-AudioCaptioning/train:
63
+ weight: 1.0
64
+
65
+ audiocaps-AudioCaptioning/train:
66
+ weight: 1.0
67
+
68
+ Epidemic_sound-AudioCaptioning/train:
69
+ weight: 1.0
70
+
71
+ MACS-AudioCaptioning/train:
72
+ weight: 1.0
73
+
74
+ # Audio Classification
75
+
76
+ UrbanSound8K-EventClassification/train:
77
+ weight: 0.5
78
+
79
+ TUT-EventClassification/train:
80
+ weight: 2.0
81
+
82
+ FSD50k-EventClassification/train:
83
+ weight: 1.0
84
+
85
+ CochlScene-SceneClassification/train:
86
+ weight: 1.0
87
+
88
+ NonSpeech7k-EventClassification/train:
89
+ weight: 1.0
90
+
91
+ chime-home-EventClassification/train:
92
+ weight: 1.0
93
+
94
+ SONYC-UST-EventClassification/train:
95
+ weight: 1.0
96
+
97
+ # Speech Emotion Classification
98
+
99
+ MELD-EmotionClassification/train:
100
+ weight: 0.5
101
+
102
+ MELD-SentimentClassification/train:
103
+ weight: 0.5
104
+
105
+ emov-db-EmotionClassification/train:
106
+ weight: 1.0
107
+
108
+ jl-corpus-EmotionClassification/train:
109
+ weight: 6.0
110
+
111
+ tess-EmotionClassification/train:
112
+ weight: 2.5
113
+
114
+ IEMOCAP-EmotionClassification/train:
115
+ weight: 3.0
116
+
117
+ OMGEmotion-EmotionClassification/train:
118
+ weight: 3.0
119
+
120
+ VocalSound-VocalClassification/train:
121
+ weight: 1.5
122
+
123
+ # Music QA
124
+
125
+ Music-AVQA-AQA_All/train:
126
+ weight: 3.0
127
+
128
+ MU-LLAMA-AQA/train:
129
+ weight: 1.0
130
+
131
+ # Music Captioning
132
+
133
+ LP-MusicCaps-MSD-AudioCaptioning/train:
134
+ weight: 0.06
135
+
136
+ LP-MusicCaps-MC-AudioCaptioning/train:
137
+ weight: 2.0
138
+
139
+ LP-MusicCaps-MTT-AudioCaptioning/train:
140
+ weight: 1.0
141
+
142
+ MusicCaps-AudioCaptioning/train:
143
+ weight: 6.0
144
+
145
+ musdbhq-captioning/train:
146
+ weight: 2.0
147
+
148
+ # Music Understanding
149
+
150
+ Medley-solos-DB-InstrClassification/train:
151
+ weight: 1.5
152
+
153
+ GTZAN-GenreClassification/train:
154
+ weight: 2.0
155
+
156
+ NSynth-MIR/train:
157
+ weight: 0.4
158
+
159
+ NSynth-Instrument/train:
160
+ weight: 1.5
161
+
162
+ NSynth-Source/train:
163
+ weight: 1.5
164
+
165
+ mtg-jamendo-MusicTagging/train:
166
+ weight: 1.0
167
+
168
+ FMA-GenreClassification/train:
169
+ weight: 1.0
170
+
171
+ musdbhq-InstrClassification/train:
172
+ weight: 1.0
173
+
174
+ LLARK_FMA-mir/train:
175
+ weight: 1.0
176
+
177
+ LLARK_FMA-reasoning/train:
178
+ weight: 1.0
179
+
180
+ LLARK_MagnaTagATune-mir/train:
181
+ weight: 1.0
182
+
183
+ LLARK_MTG-Jamendo-reasoning/train:
184
+ weight: 1.0
185
+
186
+ LLARK_MagnaTagATune-reasoning/train:
187
+ weight: 1.0
188
+
189
+ LLARK_MTG-Jamendo-mir/train:
190
+ weight: 1.0
191
+
192
+ MusicBenchQA/train:
193
+ weight: 1.0
194
+
195
+ dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
196
+ data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
197
+ dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/dataset_blending.json
198
+ max_tokens: 512
199
+ num_workers: 4
200
+
201
+ valid_dataset_config:
202
+
203
+ Clotho-AQA-AQA/test: true
204
+
205
+ Clotho-v2-AudioCaptioning/test: true
206
+ audiocaps-AudioCaptioning/test: true
207
+
208
+ FSD50k-EventClassification/test: true
209
+ CochlScene-SceneClassification/test: true
210
+ NonSpeech7k-EventClassification/test: true
211
+ SONYC-UST-EventClassification/test: true
212
+
213
+ MELD-EmotionClassification/test: true
214
+ MELD-SentimentClassification/test: true
215
+ emov-db-EmotionClassification/val: true
216
+ jl-corpus-EmotionClassification/val: true
217
+ tess-EmotionClassification/val: true
218
+ IEMOCAP-EmotionClassification/val: true
219
+ OMGEmotion-EmotionClassification/val: true
220
+ VocalSound-VocalClassification/test: true
221
+
222
+ Music-AVQA-AQA_All/test: true
223
+ MU-LLAMA-AQA/test: true
224
+
225
+ LP-MusicCaps-MSD-AudioCaptioning/test: true
226
+ LP-MusicCaps-MC-AudioCaptioning/test: true
227
+ LP-MusicCaps-MTT-AudioCaptioning/test: true
228
+ MusicCaps-AudioCaptioning/test: true
229
+
230
+ NSynth-MIR/test: true
231
+ mtg-jamendo-MusicTagging/val: true
232
+ musdbhq-InstrClassification/test: true
233
+
234
+ # zero shot
235
+ # CREMA-D-EmotionClassification/train:
236
+ # prefix_prob: 1.0
237
+
238
+ # ravdess-EmotionClassification/train:
239
+ # prefix_prob: 1.0
240
+
241
+ # UrbanSound8K-EventClassification/train:
242
+ # prefix_prob: 1.0
243
+
244
+ # ESC50-EventClassification/train:
245
+ # prefix_prob: 1.0
246
+
247
+ # DCASE17Task4-SceneClassification/test:
248
+ # prefix_prob: 1.0
249
+
250
+ # GTZAN-GenreClassification/train:
251
+ # prefix_prob: 1.0
252
+
253
+ # Medley-solos-DB-InstrClassification/test:
254
+ # prefix_prob: 1.0
255
+
256
+ clap_config:
257
+ method: nvclap-large
258
+ audio_embed_dim: 2048
259
+ checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
260
+
261
+ window_length: 10.0 # seconds
262
+ window_overlap: 0.0 # seconds
263
+ max_num_window: 9 # 1.5 minutes
264
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
265
+ finetune: true
266
+
267
+ whisper_config:
268
+ method: whisper-large-v3
269
+ path: openai/whisper-large-v3
270
+ audio_embed_dim: 1280
271
+ sampling_rate: 16000
272
+
273
+ window_length: 30.0 # seconds
274
+ window_overlap: 0.0 # seconds
275
+ max_num_window: 1 # 5 minutes
276
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
277
+
278
+ mert_config:
279
+ method: mert-v1
280
+ path: m-a-p/MERT-v1-330M
281
+ audio_embed_dim: 1024
282
+ sampling_rate: 24000
283
+
284
+ window_length: 10.0 # seconds
285
+ window_overlap: 0.0 # seconds
286
+ max_num_window: 1 # 5 minutes
287
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
288
+
289
+ model_config:
290
+ cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
291
+
292
+ lang_encoder_path: Qwen/Qwen2.5-1.5B
293
+ tokenizer_path: Qwen/Qwen2.5-1.5B
294
+ cross_attn_every_n_layers: 1
295
+ audio_transformer_kwargs: {
296
+ n_head: 8,
297
+ n_layers: 3,
298
+ d_inner: 2048,
299
+ max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
300
+ max_window_per_audio: 1, # must = max_num_window
301
+ common_encoder_embed_dim: 1024
302
+ }
configs/inference_2.yaml ADDED
@@ -0,0 +1,302 @@
1
+ train_config:
2
+ expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers
3
+ run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed
4
+ delete_previous_checkpoint: true
5
+ batch_size: 4
6
+ gradient_accumulation_steps: 2
7
+ seed: 42
8
+ learning_rate: 0.00002
9
+ lr_scheduler: constant
10
+ loss_multiplier: 1.0
11
+ warmup_steps: 1875
12
+ weight_decay: 0.1
13
+ precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
14
+ gradient_checkpointing: False
15
+ num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
16
+ offline: false
17
+ freeze_lm_embeddings: false
18
+ logging_steps: 10
19
+ dist_backend: nccl
20
+ dist_url: env:// # tcp://localhost:7000
21
+ no_set_device_rank: false
22
+ fsdp: true
23
+ fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
24
+ fsdp_sharding_strategy: full # full, hybrid
25
+ horovod: false
26
+
27
+ # instruction tuning hparams
28
+ sft_config:
29
+ pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed/
30
+ pretrained_ckpt: checkpoint_199.pt
31
+ unfreeze_full_lm: false
32
+
33
+ data_config:
34
+ dataset_blending_global_weight: 0.005
35
+
36
+ dataset_blending_config:
37
+
38
+ MMAUQA/train:
39
+ weight: 1.5
40
+
41
+ AudioSet-Temporal-Speech-Audio-QA/train:
42
+ weight: 1.0
43
+
44
+ CompA-R-AQA/train:
45
+ weight: 1.0
46
+
47
+ # Audio QA
48
+ Clotho-AQA-AQA/train:
49
+ weight: 1.0
50
+
51
+ OpenAQA-AQA/train:
52
+ weight: 1.0
53
+
54
+ SalmonnQA/train:
55
+ weight: 0.8
56
+
57
+ AudioEntailmentQA/train:
58
+ weight: 1.0
59
+
60
+ # Audio Captioning
61
+
62
+ Clotho-v2-AudioCaptioning/train:
63
+ weight: 1.0
64
+
65
+ audiocaps-AudioCaptioning/train:
66
+ weight: 1.0
67
+
68
+ Epidemic_sound-AudioCaptioning/train:
69
+ weight: 1.0
70
+
71
+ MACS-AudioCaptioning/train:
72
+ weight: 1.0
73
+
74
+ # Audio Classification
75
+
76
+ UrbanSound8K-EventClassification/train:
77
+ weight: 0.5
78
+
79
+ TUT-EventClassification/train:
80
+ weight: 2.0
81
+
82
+ FSD50k-EventClassification/train:
83
+ weight: 1.0
84
+
85
+ CochlScene-SceneClassification/train:
86
+ weight: 1.0
87
+
88
+ NonSpeech7k-EventClassification/train:
89
+ weight: 1.0
90
+
91
+ chime-home-EventClassification/train:
92
+ weight: 1.0
93
+
94
+ SONYC-UST-EventClassification/train:
95
+ weight: 1.0
96
+
97
+ # Speech Emotion Classification
98
+
99
+ MELD-EmotionClassification/train:
100
+ weight: 0.5
101
+
102
+ MELD-SentimentClassification/train:
103
+ weight: 0.5
104
+
105
+ emov-db-EmotionClassification/train:
106
+ weight: 1.0
107
+
108
+ jl-corpus-EmotionClassification/train:
109
+ weight: 6.0
110
+
111
+ tess-EmotionClassification/train:
112
+ weight: 2.5
113
+
114
+ IEMOCAP-EmotionClassification/train:
115
+ weight: 3.0
116
+
117
+ OMGEmotion-EmotionClassification/train:
118
+ weight: 3.0
119
+
120
+ VocalSound-VocalClassification/train:
121
+ weight: 1.5
122
+
123
+ # Music QA
124
+
125
+ Music-AVQA-AQA_All/train:
126
+ weight: 3.0
127
+
128
+ MU-LLAMA-AQA/train:
129
+ weight: 1.0
130
+
131
+ # Music Captioning
132
+
133
+ LP-MusicCaps-MSD-AudioCaptioning/train:
134
+ weight: 0.06
135
+
136
+ LP-MusicCaps-MC-AudioCaptioning/train:
137
+ weight: 2.0
138
+
139
+ LP-MusicCaps-MTT-AudioCaptioning/train:
140
+ weight: 1.0
141
+
142
+ MusicCaps-AudioCaptioning/train:
143
+ weight: 6.0
144
+
145
+ musdbhq-captioning/train:
146
+ weight: 2.0
147
+
148
+ # Music Understanding
149
+
150
+ Medley-solos-DB-InstrClassification/train:
151
+ weight: 1.5
152
+
153
+ GTZAN-GenreClassification/train:
154
+ weight: 2.0
155
+
156
+ NSynth-MIR/train:
157
+ weight: 0.4
158
+
159
+ NSynth-Instrument/train:
160
+ weight: 1.5
161
+
162
+ NSynth-Source/train:
163
+ weight: 1.5
164
+
165
+ mtg-jamendo-MusicTagging/train:
166
+ weight: 1.0
167
+
168
+ FMA-GenreClassification/train:
169
+ weight: 1.0
170
+
171
+ musdbhq-InstrClassification/train:
172
+ weight: 1.0
173
+
174
+ LLARK_FMA-mir/train:
175
+ weight: 1.0
176
+
177
+ LLARK_FMA-reasoning/train:
178
+ weight: 1.0
179
+
180
+ LLARK_MagnaTagATune-mir/train:
181
+ weight: 1.0
182
+
183
+ LLARK_MTG-Jamendo-reasoning/train:
184
+ weight: 1.0
185
+
186
+ LLARK_MagnaTagATune-reasoning/train:
187
+ weight: 1.0
188
+
189
+ LLARK_MTG-Jamendo-mir/train:
190
+ weight: 1.0
191
+
192
+ MusicBenchQA/train:
193
+ weight: 1.0
194
+
195
+ dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
196
+ data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
197
+ dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/dataset_blending.json
198
+ max_tokens: 512
199
+ num_workers: 4
200
+
201
+ valid_dataset_config:
202
+
203
+ Clotho-AQA-AQA/test: true
204
+
205
+ Clotho-v2-AudioCaptioning/test: true
206
+ audiocaps-AudioCaptioning/test: true
207
+
208
+ FSD50k-EventClassification/test: true
209
+ CochlScene-SceneClassification/test: true
210
+ NonSpeech7k-EventClassification/test: true
211
+ SONYC-UST-EventClassification/test: true
212
+
213
+ MELD-EmotionClassification/test: true
214
+ MELD-SentimentClassification/test: true
215
+ emov-db-EmotionClassification/val: true
216
+ jl-corpus-EmotionClassification/val: true
217
+ tess-EmotionClassification/val: true
218
+ IEMOCAP-EmotionClassification/val: true
219
+ OMGEmotion-EmotionClassification/val: true
220
+ VocalSound-VocalClassification/test: true
221
+
222
+ Music-AVQA-AQA_All/test: true
223
+ MU-LLAMA-AQA/test: true
224
+
225
+ LP-MusicCaps-MSD-AudioCaptioning/test: true
226
+ LP-MusicCaps-MC-AudioCaptioning/test: true
227
+ LP-MusicCaps-MTT-AudioCaptioning/test: true
228
+ MusicCaps-AudioCaptioning/test: true
229
+
230
+ NSynth-MIR/test: true
231
+ mtg-jamendo-MusicTagging/val: true
232
+ musdbhq-InstrClassification/test: true
233
+
234
+ # zero shot
235
+ # CREMA-D-EmotionClassification/train:
236
+ # prefix_prob: 1.0
237
+
238
+ # ravdess-EmotionClassification/train:
239
+ # prefix_prob: 1.0
240
+
241
+ # UrbanSound8K-EventClassification/train:
242
+ # prefix_prob: 1.0
243
+
244
+ # ESC50-EventClassification/train:
245
+ # prefix_prob: 1.0
246
+
247
+ # DCASE17Task4-SceneClassification/test:
248
+ # prefix_prob: 1.0
249
+
250
+ # GTZAN-GenreClassification/train:
251
+ # prefix_prob: 1.0
252
+
253
+ # Medley-solos-DB-InstrClassification/test:
254
+ # prefix_prob: 1.0
255
+
256
+ clap_config:
257
+ method: nvclap-large
258
+ audio_embed_dim: 2048
259
+ checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
260
+
261
+ window_length: 10.0 # seconds
262
+ window_overlap: 0.0 # seconds
263
+ max_num_window: 9 # 1.5 minutes
264
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
265
+ finetune: true
266
+
267
+ whisper_config:
268
+ method: whisper-large-v3
269
+ path: openai/whisper-large-v3
270
+ audio_embed_dim: 1280
271
+ sampling_rate: 16000
272
+
273
+ window_length: 30.0 # seconds
274
+ window_overlap: 0.0 # seconds
275
+ max_num_window: 1 # 5 minutes
276
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
277
+
278
+ mert_config:
279
+ method: mert-v1
280
+ path: m-a-p/MERT-v1-330M
281
+ audio_embed_dim: 1024
282
+ sampling_rate: 24000
283
+
284
+ window_length: 10.0 # seconds
285
+ window_overlap: 0.0 # seconds
286
+ max_num_window: 1 # 5 minutes
287
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
288
+
289
+ model_config:
290
+ cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
291
+
292
+ lang_encoder_path: Qwen/Qwen2.5-3B
293
+ tokenizer_path: Qwen/Qwen2.5-3B
294
+ cross_attn_every_n_layers: 1
295
+ audio_transformer_kwargs: {
296
+ n_head: 8,
297
+ n_layers: 3,
298
+ d_inner: 2048,
299
+ max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
300
+ max_window_per_audio: 1, # must = max_num_window
301
+ common_encoder_embed_dim: 1024
302
+ }
configs/inference_long.yaml ADDED
@@ -0,0 +1,284 @@
1
+ train_config:
2
+ expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed-sft
3
+ run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed-sft-long
4
+ delete_previous_checkpoint: true
5
+ batch_size: 2
6
+ gradient_accumulation_steps: 2
7
+ seed: 42
8
+ learning_rate: 0.00002
9
+ lr_scheduler: constant
10
+ loss_multiplier: 1.0
11
+ warmup_steps: 1875
12
+ weight_decay: 0.1
13
+ precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
14
+ gradient_checkpointing: False
15
+ num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
16
+ offline: false
17
+ freeze_lm_embeddings: false
18
+ logging_steps: 10
19
+ dist_backend: nccl
20
+ dist_url: env:// # tcp://localhost:7000
21
+ no_set_device_rank: false
22
+ fsdp: true
23
+ fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
24
+ fsdp_sharding_strategy: full # full, hybrid
25
+ horovod: false
26
+
27
+ # instruction tuning hparams
28
+ # sft_config:
29
+ # pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed_ckpt_stage1/
30
+ # pretrained_ckpt: checkpoint_199.pt
31
+ # unfreeze_full_lm: false
32
+
33
+ data_config:
34
+ dataset_blending_global_weight: 0.005
35
+
36
+ dataset_blending_config:
37
+
38
+ MMAUQA/train:
39
+ weight: 1.5
40
+
41
+ AudioSet-Temporal-Speech-Audio-QA/train:
42
+ weight: 1.0
43
+
44
+ CompA-R-AQA/train:
45
+ weight: 1.0
46
+
47
+ # Audio QA
48
+ Clotho-AQA-AQA/train:
49
+ weight: 1.0
50
+
51
+ OpenAQA-AQA/train:
52
+ weight: 1.0
53
+
54
+ SalmonnQA/train:
55
+ weight: 1.0
56
+
57
+ AudioEntailmentQA/train:
58
+ weight: 1.0
59
+
60
+ # Audio Captioning
61
+
62
+ Clotho-v2-AudioCaptioning/train:
63
+ weight: 1.0
64
+
65
+ audiocaps-AudioCaptioning/train:
66
+ weight: 1.0
67
+
68
+ Epidemic_sound-AudioCaptioning/train:
69
+ weight: 1.0
70
+
71
+ MACS-AudioCaptioning/train:
72
+ weight: 1.0
73
+
74
+ # Audio Classification
75
+
76
+ FSD50k-EventClassification/train:
77
+ weight: 1.0
78
+
79
+ CochlScene-SceneClassification/train:
80
+ weight: 1.0
81
+
82
+ NonSpeech7k-EventClassification/train:
83
+ weight: 1.0
84
+
85
+ chime-home-EventClassification/train:
86
+ weight: 1.0
87
+
88
+ SONYC-UST-EventClassification/train:
89
+ weight: 1.0
90
+
91
+ # Speech Emotion Classification
92
+
93
+ MELD-EmotionClassification/train:
94
+ weight: 0.5
95
+
96
+ MELD-SentimentClassification/train:
97
+ weight: 0.5
98
+
99
+ emov-db-EmotionClassification/train:
100
+ weight: 1.0
101
+
102
+ jl-corpus-EmotionClassification/train:
103
+ weight: 6.0
104
+
105
+ tess-EmotionClassification/train:
106
+ weight: 2.5
107
+
108
+ IEMOCAP-EmotionClassification/train:
109
+ weight: 3.0
110
+
111
+ OMGEmotion-EmotionClassification/train:
112
+ weight: 3.0
113
+
114
+ VocalSound-VocalClassification/train:
115
+ weight: 1.5
116
+
117
+ # Music QA
118
+
119
+ Music-AVQA-AQA_All/train:
120
+ weight: 3.0
121
+
122
+ MU-LLAMA-AQA/train:
123
+ weight: 1.0
124
+
125
+ # Music Captioning
126
+
127
+ LP-MusicCaps-MSD-AudioCaptioning/train:
128
+ weight: 0.06
129
+
130
+ LP-MusicCaps-MC-AudioCaptioning/train:
131
+ weight: 2.0
132
+
133
+ LP-MusicCaps-MTT-AudioCaptioning/train:
134
+ weight: 1.0
135
+
136
+ MusicCaps-AudioCaptioning/train:
137
+ weight: 6.0
138
+
139
+ musdbhq-captioning/train:
140
+ weight: 2.0
141
+
142
+ # Music Understanding
143
+
144
+ NSynth-MIR/train:
145
+ weight: 0.2
146
+
147
+ mtg-jamendo-MusicTagging/train:
148
+ weight: 0.1
149
+
150
+ FMA-GenreClassification/train:
151
+ weight: 0.5
152
+
153
+ musdbhq-InstrClassification/train:
154
+ weight: 0.8
155
+
156
+ LLARK_FMA-mir/train:
157
+ weight: 1.0
158
+
159
+ LLARK_FMA-reasoning/train:
160
+ weight: 1.0
161
+
162
+ LLARK_MagnaTagATune-mir/train:
163
+ weight: 1.0
164
+
165
+ LLARK_MTG-Jamendo-reasoning/train:
166
+ weight: 1.0
167
+
168
+ LLARK_MagnaTagATune-reasoning/train:
169
+ weight: 1.0
170
+
171
+ LLARK_MTG-Jamendo-mir/train:
172
+ weight: 1.0
173
+
174
+ MusicBenchQA/train:
175
+ weight: 1.0
176
+
177
+ dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
178
+ data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
179
+ dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
180
+ max_tokens: 512
181
+ num_workers: 4
182
+
183
+ valid_dataset_config:
184
+
185
+ Clotho-AQA-AQA/test: true
186
+
187
+ Clotho-v2-AudioCaptioning/test: true
188
+ audiocaps-AudioCaptioning/test: true
189
+
190
+ FSD50k-EventClassification/test: true
191
+ CochlScene-SceneClassification/test: true
192
+ NonSpeech7k-EventClassification/test: true
193
+ SONYC-UST-EventClassification/test: true
194
+
195
+ MELD-EmotionClassification/test: true
196
+ MELD-SentimentClassification/test: true
197
+ emov-db-EmotionClassification/val: true
198
+ jl-corpus-EmotionClassification/val: true
199
+ tess-EmotionClassification/val: true
200
+ IEMOCAP-EmotionClassification/val: true
201
+ OMGEmotion-EmotionClassification/val: true
202
+ VocalSound-VocalClassification/test: true
203
+
204
+ Music-AVQA-AQA_All/test: true
205
+ MU-LLAMA-AQA/test: true
206
+
207
+ LP-MusicCaps-MSD-AudioCaptioning/test: true
208
+ LP-MusicCaps-MC-AudioCaptioning/test: true
209
+ LP-MusicCaps-MTT-AudioCaptioning/test: true
210
+ MusicCaps-AudioCaptioning/test: true
211
+
212
+ NSynth-MIR/test: true
213
+ mtg-jamendo-MusicTagging/val: true
214
+ musdbhq-InstrClassification/test: true
215
+
216
+ # # zero shot
217
+ # CREMA-D-EmotionClassification/train:
218
+ # prefix_prob: 1.0
219
+
220
+ # ravdess-EmotionClassification/train:
221
+ # prefix_prob: 1.0
222
+
223
+ # UrbanSound8K-EventClassification/train:
224
+ # prefix_prob: 1.0
225
+
226
+ # ESC50-EventClassification/train:
227
+ # prefix_prob: 1.0
228
+
229
+ # DCASE17Task4-SceneClassification/test:
230
+ # prefix_prob: 1.0
231
+
232
+ # GTZAN-GenreClassification/train:
233
+ # prefix_prob: 1.0
234
+
235
+ # Medley-solos-DB-InstrClassification/test:
236
+ # prefix_prob: 1.0
237
+
238
+ clap_config:
239
+ method: nvclap-large
240
+ audio_embed_dim: 2048
241
+ checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
242
+
243
+ window_length: 10.0 # seconds
244
+ window_overlap: 0.0 # seconds
245
+ max_num_window: 30 # 5 minutes
246
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
247
+ finetune: true
248
+
249
+ whisper_config:
250
+ method: whisper-large-v3
251
+ path: openai/whisper-large-v3
252
+ audio_embed_dim: 1280
253
+ sampling_rate: 16000
254
+
255
+ window_length: 30.0 # seconds
256
+ window_overlap: 0.0 # seconds
257
+ max_num_window: 1 # 5 minutes
258
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
259
+
260
+ mert_config:
261
+ method: mert-v1
262
+ path: m-a-p/MERT-v1-330M
263
+ audio_embed_dim: 1024
264
+ sampling_rate: 24000
265
+
266
+ window_length: 10.0 # seconds
267
+ window_overlap: 0.0 # seconds
268
+ max_num_window: 1 # 5 minutes
269
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
270
+
271
+ model_config:
272
+ cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
273
+
274
+ lang_encoder_path: Qwen/Qwen2.5-3B
275
+ tokenizer_path: Qwen/Qwen2.5-3B
276
+ cross_attn_every_n_layers: 1
277
+ audio_transformer_kwargs: {
278
+ n_head: 8,
279
+ n_layers: 3,
280
+ d_inner: 2048,
281
+ max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
282
+ max_window_per_audio: 1, # must = max_num_window
283
+ common_encoder_embed_dim: 1024
284
+ }
configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml ADDED
@@ -0,0 +1,255 @@
1
+ train_config:
2
+ expdir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/v1.0_optimlmax1.3b_foundation
3
+ run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node
4
+ delete_previous_checkpoint: true
5
+ batch_size: 6
6
+ gradient_accumulation_steps: 2 # 4 nodes
7
+ seed: 42
8
+ learning_rate: 0.0001
9
+ lr_scheduler: constant
10
+ loss_multiplier: 1.0
11
+ warmup_steps: 1875
12
+ weight_decay: 0.1
13
+ precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
14
+ gradient_checkpointing: False
15
+ num_epochs: 100 # num_epochs * dataset_blending_global_weight = 1
16
+ offline: false
17
+ freeze_lm_embeddings: true
18
+ logging_steps: 10
19
+ dist_backend: nccl
20
+ dist_url: env:// # tcp://localhost:7000
21
+ no_set_device_rank: false
22
+ fsdp: true
23
+ fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
24
+ fsdp_sharding_strategy: full # full, hybrid
25
+ horovod: false
26
+
27
+ data_config:
28
+ dataset_blending_global_weight: 0.01
29
+
30
+ dataset_blending_config:
31
+
32
+ # Audio QA
33
+ OpenAQA-AQA/train:
34
+ weight: 1.0
35
+ prefix_prob: 0.0
36
+ augmentations:
37
+ do_nothing: 1.0
38
+
39
+ # Audio Captioning
40
+
41
+ BBCSoundEffects-AudioDescription/train:
42
+ weight: 5.0
43
+ prefix_prob: 0.5
44
+ augmentations:
45
+ do_nothing: 1.0
46
+
47
+ CLAP_freesound-AudioCaptioning/train:
48
+ weight: 1.0
49
+ prefix_prob: 0.5
50
+ augmentations:
51
+ do_nothing: 1.0
52
+
53
+ SoundDescs-AudioDescription/train:
54
+ weight: 1.0
55
+ prefix_prob: 0.5
56
+ augmentations:
57
+ do_nothing: 1.0
58
+
59
+ WavCaps-AudioSet_SL-AudioCaptioning/train:
60
+ weight: 1.0
61
+ prefix_prob: 0.5
62
+ augmentations:
63
+ do_nothing: 1.0
64
+
65
+ WavCaps-BBC_Sound_Effects-AudioCaptioning/train:
66
+ weight: 2
67
+ prefix_prob: 0.5
68
+ augmentations:
69
+ do_nothing: 1.0
70
+
71
+ WavCaps-FreeSound-AudioCaptioning/train:
72
+ weight: 2
73
+ prefix_prob: 0.5
74
+ augmentations:
75
+ do_nothing: 1.0
76
+
77
+ WavCaps-SoundBible-AudioCaptioning/train:
78
+ weight: 5
79
+ prefix_prob: 0.5
80
+ augmentations:
81
+ do_nothing: 1.0
82
+
83
+ # Audio Classification
84
+
85
+ AudioSetFullwoAudioMusicCaps-EventClassification/train:
86
+ weight: 1.0
87
+ prefix_prob: 0.5
88
+ augmentations:
89
+ num_words: 0.8
90
+ do_nothing: 0.2
91
+
92
+ AudioSet-EventClassification/train:
93
+ weight: 5.0
94
+ prefix_prob: 0.5
95
+ augmentations:
96
+ num_words: 0.8
97
+ do_nothing: 0.2
98
+
99
+ Clotho-AQA-EventClassification/train:
100
+ weight: 5.0
101
+ prefix_prob: 0.5
102
+ augmentations:
103
+ num_words: 0.8
104
+ do_nothing: 0.2
105
+
106
+ WavText5K-Tagging/train:
107
+ weight: 3.0
108
+ prefix_prob: 0.5
109
+ augmentations:
110
+ num_words: 0.8
111
+ do_nothing: 0.2
112
+
113
+ # Speech Emotion Classification
114
+
115
+ MSP-PODCAST-Publish-1.9-EmotionClassification/train:
116
+ weight: 1.8
117
+ prefix_prob: 0.5
118
+ augmentations:
119
+ provide_all_labels: 0.9
120
+ do_nothing: 0.1
121
+ MSP-PODCAST-Publish-1.9-EmotionClassification/interleaved_knn-train:
122
+ weight: 1.2
123
+ prefix_prob: 0.5
124
+ augmentations:
125
+ provide_all_labels: 0.9
126
+ do_nothing: 0.1
127
+
128
+ MELD-EmotionClassification/train:
129
+ weight: 1.8
130
+ prefix_prob: 0.5
131
+ augmentations:
132
+ provide_all_labels: 0.9
133
+ do_nothing: 0.1
134
+ MELD-EmotionClassification/interleaved_knn-train:
135
+ weight: 1.2
136
+ prefix_prob: 0.5
137
+ augmentations:
138
+ provide_all_labels: 0.9
139
+ do_nothing: 0.1
140
+
141
+ MELD-SentimentClassification/train:
142
+ weight: 1.8
143
+ prefix_prob: 0.5
144
+ augmentations:
145
+ provide_all_labels: 0.9
146
+ do_nothing: 0.1
147
+ MELD-SentimentClassification/interleaved_knn-train:
148
+ weight: 1.2
149
+ prefix_prob: 0.5
150
+ augmentations:
151
+ provide_all_labels: 0.9
152
+ do_nothing: 0.1
153
+
154
+ # Music QA
155
+
156
+ Music-AVQA-AVQA_All/train:
157
+ weight: 3.0
158
+ prefix_prob: 0.5
159
+ augmentations:
160
+ AQA_binary_instruction: 1.0
161
+
162
+ MU-LLAMA-AQA/train:
163
+ weight: 1.8
164
+ prefix_prob: 0.5
165
+ augmentations:
166
+ do_nothing: 1.0
167
+ MU-LLAMA-AQA/interleaved_knn-train:
168
+ weight: 1.2
169
+ prefix_prob: 0.5
170
+ augmentations:
171
+ do_nothing: 1.0
172
+
173
+ # Music Captioning
174
+
175
+ LP-MusicCaps-MSD-AudioCaptioning/train:
176
+ weight: 1.0
177
+ prefix_prob: 0.5
178
+ augmentations:
179
+ do_nothing: 1.0
180
+
181
+ # Music Understanding
182
+
183
+ NSynth-MIR/train:
184
+ weight: 0.6
185
+ prefix_prob: 0.5
186
+ augmentations:
187
+ do_nothing: 1.0
188
+ NSynth-MIR/interleaved_knn-train:
189
+ weight: 0.4
190
+ prefix_prob: 0.5
191
+ augmentations:
192
+ do_nothing: 1.0
193
+
194
+ mtg-jamendo-MusicTagging/train:
195
+ weight: 1.0
196
+ prefix_prob: 0.5
197
+ augmentations:
198
+ do_nothing: 1.0
199
+
200
+ dataset_file_root: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/dataset_files
201
+ data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
202
+ dataset_blending_output: dataset_blending.json
203
+ max_tokens: 512
204
+ num_workers: 4
205
+
206
+ valid_dataset_config:
207
+ CLAP_freesound-AudioCaptioning/test: true
208
+ SoundDescs-AudioDescription/test: true
209
+ Clotho-AQA-EventClassification/test: true
210
+
211
+ MSP-PODCAST-Publish-1.9-EmotionClassification/test: true
212
+ MSP-PODCAST-Publish-1.9-EmotionClassification/interleaved_knn-test: true
213
+ MELD-EmotionClassification/test: true
214
+ MELD-EmotionClassification/interleaved_knn-test: true
215
+ MELD-SentimentClassification/test: true
216
+ MELD-SentimentClassification/interleaved_knn-test: true
217
+
218
+ MU-LLAMA-AQA/test: true
219
+ LP-MusicCaps-MSD-AudioCaptioning/val: true
220
+ NSynth-MIR/test: true
221
+ NSynth-MIR/interleaved_knn-test: true
222
+ mtg-jamendo-MusicTagging/val: true
223
+
224
+ clap_config:
225
+ # method: laion-clap
226
+ # audio_embed_dim: 512
227
+ # model_name: 630k-fusion-best
228
+ # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/laion-clap-pretrained/laion_clap/630k-fusion-best.pt
229
+
230
+ method: microsoft-clap
231
+ audio_embed_dim: 1024
232
+ config_root: /home/zkong/audio_flamingo/audio_flamingo_v1/v1.0_optimlmax1.3b_foundation/my_ms_clap/src/configs
233
+ # model_name: '2023'
234
+ # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/CLAP_weights_2023.pth
235
+ model_name: 'clapcap'
236
+ checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/clapcap_weights_2023.pth
237
+
238
+ window_length: 7.0 # seconds
239
+ window_overlap: 5.25 # seconds
240
+ max_num_window: 16 # 35 seconds
241
+ max_num_fewshot: 4 # number of fewshot samples (including the final one)
242
+
243
+ model_config:
244
+ cache_dir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/LLM_pretrained/.cache
245
+
246
+ lang_encoder_path: facebook/opt-iml-max-1.3b
247
+ tokenizer_path: facebook/opt-iml-max-1.3b
248
+ cross_attn_every_n_layers: 1
249
+ audio_transformer_kwargs: {
250
+ n_head: 8,
251
+ n_layers: 3,
252
+ d_inner: 2048,
253
+ max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
254
+ max_window_per_audio: 16, # must = max_num_window
255
+ }
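
For the overlapping-window setup above (7 s windows with 5.25 s overlap, up to 16 windows), the audio span actually covered follows the same formula used in get_num_windows; a small illustrative calculation (not part of the commit):

# Covered span = N * window_length - (N - 1) * window_overlap
window_length, window_overlap, max_num_window = 7.0, 5.25, 16
covered = max_num_window * window_length - (max_num_window - 1) * window_overlap
print(covered)  # 33.25 s, roughly the "35 seconds" noted in the max_num_window comment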
configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml ADDED
@@ -0,0 +1,183 @@
1
+ train_config:
2
+ expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed
3
+ run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed
4
+ delete_previous_checkpoint: true
5
+ batch_size: 4
6
+ gradient_accumulation_steps: 2 # 4 nodes
7
+ seed: 42
8
+ learning_rate: 0.0001
9
+ lr_scheduler: constant
10
+ loss_multiplier: 1.0
11
+ warmup_steps: 1875
12
+ weight_decay: 0.1
13
+ precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
14
+ gradient_checkpointing: False
15
+ num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
16
+ offline: false
17
+ freeze_lm_embeddings: false
18
+ logging_steps: 10
19
+ dist_backend: nccl
20
+ dist_url: env:// # tcp://localhost:7000
21
+ no_set_device_rank: false
22
+ fsdp: true
23
+ fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
24
+ fsdp_sharding_strategy: full # full, hybrid
25
+ horovod: false
26
+
27
+ data_config:
28
+ dataset_blending_global_weight: 0.005
29
+
30
+ dataset_blending_config:
31
+
32
+ # Audio QA
33
+ OpenAQA-AQA/train:
34
+ weight: 1.0
35
+
36
+ AudioSet-Temporal-Speech-Audio-QA/train:
37
+ weight: 2.0
38
+
39
+ CompA-R-AQA/train:
40
+ weight: 2.0
41
+
42
+ # Audio Captioning
43
+
44
+ BBCSoundEffects-AudioDescription/train:
45
+ weight: 5.0
46
+
47
+ CLAP_freesound-AudioCaptioning/train:
48
+ weight: 1.0
49
+
50
+ SoundDescs-AudioDescription/train:
51
+ weight: 1.0
52
+
53
+ WavCaps-AudioSet_SL-AudioCaptioning/train:
54
+ weight: 1.0
55
+
56
+ WavCaps-BBC_Sound_Effects-AudioCaptioning/train:
57
+ weight: 2.0
58
+
59
+ WavCaps-FreeSound-AudioCaptioning/train:
60
+ weight: 2.0
61
+
62
+ WavCaps-SoundBible-AudioCaptioning/train:
63
+ weight: 5.0
64
+
65
+ Ego-10-AudioCaptioning/train:
66
+ weight: 2.0
67
+
68
+ Ego-30-AudioCaptioning/train:
69
+ weight: 2.0
70
+
71
+ # Audio Classification
72
+
73
+ AudioSetFullwoAudioMusicCaps-EventClassification/train:
74
+ weight: 1.0
75
+
76
+ AudioSet-EventClassification/train:
77
+ weight: 5.0
78
+
79
+ Clotho-AQA-EventClassification/train:
80
+ weight: 5.0
81
+
82
+ WavText5K-Tagging/train:
83
+ weight: 3.0
84
+
85
+ # Speech Emotion Classification
86
+
87
+ MSP-PODCAST-Publish-1.9-EmotionClassification/train:
88
+ weight: 3.0
89
+
90
+ MELD-EmotionClassification/train:
91
+ weight: 3.0
92
+
93
+ MELD-SentimentClassification/train:
94
+ weight: 3.0
95
+
96
+ # Music QA
97
+
98
+ Music-AVQA-AVQA_All/train:
99
+ weight: 3.0
100
+
101
+ MU-LLAMA-AQA/train:
102
+ weight: 3.0
103
+
104
+ # Music Captioning
105
+
106
+ LP-MusicCaps-MSD-AudioCaptioning/train:
107
+ weight: 1.0
108
+
109
+ # Music Understanding
110
+
111
+ NSynth-MIR/train:
112
+ weight: 1.0
113
+
114
+ mtg-jamendo-MusicTagging/train:
115
+ weight: 1.0
116
+
117
+ dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
118
+ data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
119
+ dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
120
+ max_tokens: 512
121
+ num_workers: 4
122
+
123
+ valid_dataset_config:
124
+ CLAP_freesound-AudioCaptioning/test: true
125
+ SoundDescs-AudioDescription/test: true
126
+ Clotho-AQA-EventClassification/test: true
127
+
128
+ MSP-PODCAST-Publish-1.9-EmotionClassification/test: true
129
+ MELD-EmotionClassification/test: true
130
+ MELD-SentimentClassification/test: true
131
+
132
+ MU-LLAMA-AQA/test: true
133
+ LP-MusicCaps-MSD-AudioCaptioning/val: true
134
+ NSynth-MIR/test: true
135
+ mtg-jamendo-MusicTagging/val: true
136
+
137
+ clap_config:
138
+ method: nvclap-large
139
+ audio_embed_dim: 2048
140
+ checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
141
+
142
+ window_length: 10.0 # seconds
143
+ window_overlap: 0.0 # seconds
144
+ max_num_window: 3 # 30 seconds
145
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
146
+
147
+ whisper_config:
148
+ method: whisper-large-v3
149
+ path: openai/whisper-large-v3
150
+ audio_embed_dim: 1280
151
+ sampling_rate: 16000
152
+
153
+ window_length: 30.0 # seconds
154
+ window_overlap: 0.0 # seconds
155
+ max_num_window: 1 # 30 seconds
156
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
157
+ finetune: true
158
+
159
+ mert_config:
160
+ method: mert-v1
161
+ path: m-a-p/MERT-v1-330M
162
+ audio_embed_dim: 1024
163
+ sampling_rate: 24000
164
+
165
+ window_length: 10.0 # seconds
166
+ window_overlap: 0.0 # seconds
167
+ max_num_window: 1 # 10 seconds
168
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
169
+
170
+ model_config:
171
+ cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
172
+
173
+ lang_encoder_path: Qwen/Qwen2.5-3B
174
+ tokenizer_path: Qwen/Qwen2.5-3B
175
+ cross_attn_every_n_layers: 1
176
+ audio_transformer_kwargs: {
177
+ n_head: 8,
178
+ n_layers: 3,
179
+ d_inner: 2048,
180
+ max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
181
+ max_window_per_audio: 1, # must = max_num_window
182
+ common_encoder_embed_dim: 1024
183
+ }
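Several comments in this config state constraints that tie values together (e.g. `num_epochs * dataset_blending_global_weight = 1`, `max_num_media >= max_num_window * num_fewshot_samples`). A small, hypothetical sanity check under those stated assumptions:

```python
# Hypothetical sanity check for the constraints stated in the comments above.
num_epochs = 200
dataset_blending_global_weight = 0.005
assert abs(num_epochs * dataset_blending_global_weight - 1.0) < 1e-9   # one full pass over the blend

max_num_window = 3        # clap_config
max_num_fewshot = 1       # clap_config
max_num_media = 128       # audio_transformer_kwargs
assert max_num_media >= max_num_window * max_num_fewshot
```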
configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml ADDED
@@ -0,0 +1,483 @@
1
+ train_config:
2
+ expdir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/v1.0_optimlmax1.3b_foundation
3
+ run_name: run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node
4
+ delete_previous_checkpoint: true
5
+ batch_size: 4
6
+ gradient_accumulation_steps: 1
7
+ seed: 42
8
+ learning_rate: 0.00002
9
+ lr_scheduler: constant
10
+ loss_multiplier: 1.0
11
+ warmup_steps: 1875
12
+ weight_decay: 0.1
13
+ precision: fp32 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
14
+ gradient_checkpointing: False
15
+ num_epochs: 160 # num_epochs * dataset_blending_global_weight = 1
16
+ offline: false
17
+ freeze_lm_embeddings: false
18
+ logging_steps: 10
19
+ dist_backend: nccl
20
+ dist_url: env:// # tcp://localhost:7000
21
+ no_set_device_rank: false
22
+ fsdp: true
23
+ fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
24
+ fsdp_sharding_strategy: full # full, hybrid
25
+ horovod: false
26
+
27
+ # instruction tuning hparams
28
+ sft_config:
29
+ pretrained_path: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/v1.0_optimlmax1.3b_foundation/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node/
30
+ pretrained_ckpt: checkpoint_99.pt
31
+ unfreeze_full_lm: true
32
+
33
+ data_config:
34
+ dataset_blending_global_weight: 0.01
35
+
36
+ dataset_blending_config:
37
+
38
+ # Audio QA
39
+ Clotho-AQA-AQA/train:
40
+ weight: 0.8
41
+ prefix_prob: 1.0
42
+ augmentations:
43
+ AQA_binary_instruction: 1.0
44
+ Clotho-AQA-AQA/interleaved_knn-train:
45
+ weight: 0.2
46
+ prefix_prob: 1.0
47
+ augmentations:
48
+ AQA_binary_instruction: 1.0
49
+
50
+ OpenAQA-AQA/train:
51
+ weight: 1.0
52
+ prefix_prob: 1.0
53
+ augmentations:
54
+ do_nothing: 1.0
55
+
56
+ # Audio Captioning
57
+
58
+ Clotho-v2-AudioCaptioning/train:
59
+ weight: 0.8
60
+ prefix_prob: 1.0
61
+ augmentations:
62
+ AC_short: 1.0
63
+ Clotho-v2-AudioCaptioning/interleaved_knn-train:
64
+ weight: 0.2
65
+ prefix_prob: 1.0
66
+ augmentations:
67
+ AC_short: 1.0
68
+
69
+ audiocaps-AudioCaptioning/train:
70
+ weight: 0.8
71
+ prefix_prob: 1.0
72
+ augmentations:
73
+ AC_short: 1.0
74
+ audiocaps-AudioCaptioning/interleaved_knn-train:
75
+ weight: 0.2
76
+ prefix_prob: 1.0
77
+ augmentations:
78
+ AC_short: 1.0
79
+
80
+ Epidemic_sound-AudioCaptioning/train:
81
+ weight: 0.8
82
+ prefix_prob: 1.0
83
+ augmentations:
84
+ AC_short: 1.0
85
+ Epidemic_sound-AudioCaptioning/interleaved_knn-train:
86
+ weight: 0.2
87
+ prefix_prob: 1.0
88
+ augmentations:
89
+ AC_short: 1.0
90
+
91
+ MACS-AudioCaptioning/train:
92
+ weight: 0.8
93
+ prefix_prob: 1.0
94
+ augmentations:
95
+ AC_short: 1.0
96
+ MACS-AudioCaptioning/interleaved_knn-train:
97
+ weight: 0.2
98
+ prefix_prob: 1.0
99
+ augmentations:
100
+ AC_short: 1.0
101
+
102
+ # Audio Classification
103
+
104
+ FSD50k-EventClassification/train:
105
+ weight: 0.8
106
+ prefix_prob: 1.0
107
+ augmentations:
108
+ default: 1.0
109
+ FSD50k-EventClassification/interleaved_knn-train:
110
+ weight: 0.2
111
+ prefix_prob: 1.0
112
+ augmentations:
113
+ default: 1.0
114
+
115
+ CochlScene-SceneClassification/train:
116
+ weight: 0.8
117
+ prefix_prob: 1.0
118
+ augmentations:
119
+ provide_all_labels: 0.5
120
+ default: 0.5
121
+ CochlScene-SceneClassification/interleaved_knn-train:
122
+ weight: 0.2
123
+ prefix_prob: 1.0
124
+ augmentations:
125
+ provide_all_labels: 0.5
126
+ default: 0.5
127
+
128
+ NonSpeech7k-EventClassification/train:
129
+ weight: 0.8
130
+ prefix_prob: 1.0
131
+ augmentations:
132
+ provide_all_labels: 0.5
133
+ default: 0.5
134
+ NonSpeech7k-EventClassification/interleaved_knn-train:
135
+ weight: 0.2
136
+ prefix_prob: 1.0
137
+ augmentations:
138
+ provide_all_labels: 0.5
139
+ default: 0.5
140
+
141
+ chime-home-EventClassification/train:
142
+ weight: 0.8
143
+ prefix_prob: 1.0
144
+ augmentations:
145
+ default: 0.5
146
+ num_words: 0.5
147
+ chime-home-EventClassification/interleaved_knn-train:
148
+ weight: 0.2
149
+ prefix_prob: 1.0
150
+ augmentations:
151
+ default: 0.5
152
+ num_words: 0.5
153
+
154
+ SONYC-UST-EventClassification/train:
155
+ weight: 0.8
156
+ prefix_prob: 1.0
157
+ augmentations:
158
+ default: 0.5
159
+ num_words: 0.5
160
+ SONYC-UST-EventClassification/interleaved_knn-train:
161
+ weight: 0.2
162
+ prefix_prob: 1.0
163
+ augmentations:
164
+ default: 0.5
165
+ num_words: 0.5
166
+
167
+ # Speech Emotion Classification
168
+
169
+ MELD-EmotionClassification/train:
170
+ weight: 0.5
171
+ prefix_prob: 1.0
172
+ augmentations:
173
+ provide_all_labels: 0.5
174
+ default: 0.5
175
+
176
+ MELD-SentimentClassification/train:
177
+ weight: 0.5
178
+ prefix_prob: 1.0
179
+ augmentations:
180
+ provide_all_labels: 0.1
181
+ default: 0.9
182
+
183
+ emov-db-EmotionClassification/train:
184
+ weight: 1.6
185
+ prefix_prob: 1.0
186
+ augmentations:
187
+ provide_all_labels: 0.5
188
+ default: 0.5
189
+ emov-db-EmotionClassification/interleaved_knn-train:
190
+ weight: 0.4
191
+ prefix_prob: 1.0
192
+ augmentations:
193
+ provide_all_labels: 0.5
194
+ default: 0.5
195
+
196
+ jl-corpus-EmotionClassification/train:
197
+ weight: 6.0
198
+ prefix_prob: 1.0
199
+ augmentations:
200
+ provide_all_labels: 0.5
201
+ default: 0.5
202
+ jl-corpus-EmotionClassification/interleaved_knn-train:
203
+ weight: 1.5
204
+ prefix_prob: 1.0
205
+ augmentations:
206
+ provide_all_labels: 0.5
207
+ default: 0.5
208
+
209
+ tess-EmotionClassification/train:
210
+ weight: 2.0
211
+ prefix_prob: 1.0
212
+ augmentations:
213
+ provide_all_labels: 0.5
214
+ default: 0.5
215
+ tess-EmotionClassification/interleaved_knn-train:
216
+ weight: 0.5
217
+ prefix_prob: 1.0
218
+ augmentations:
219
+ provide_all_labels: 0.5
220
+ default: 0.5
221
+
222
+ IEMOCAP-EmotionClassification/train:
223
+ weight: 2.4
224
+ prefix_prob: 1.0
225
+ augmentations:
226
+ provide_all_labels: 0.5
227
+ default: 0.5
228
+ IEMOCAP-EmotionClassification/interleaved_knn-train:
229
+ weight: 0.6
230
+ prefix_prob: 1.0
231
+ augmentations:
232
+ provide_all_labels: 0.5
233
+ default: 0.5
234
+
235
+ OMGEmotion-EmotionClassification/train:
236
+ weight: 3.0
237
+ prefix_prob: 1.0
238
+ augmentations:
239
+ provide_all_labels: 0.5
240
+ default: 0.5
241
+
242
+ VocalSound-VocalClassification/train:
243
+ weight: 1.0
244
+ prefix_prob: 1.0
245
+ augmentations:
246
+ provide_all_labels: 0.5
247
+ default: 0.5
248
+
249
+ # Music QA
250
+
251
+ Music-AVQA-AQA_All/train:
252
+ weight: 2.0
253
+ prefix_prob: 1.0
254
+ augmentations:
255
+ AQA_binary_instruction: 1.0
256
+ Music-AVQA-AQA_All/interleaved_knn-train:
257
+ weight: 1.0
258
+ prefix_prob: 1.0
259
+ augmentations:
260
+ AQA_binary_instruction: 1.0
261
+
262
+ MU-LLAMA-AQA/train:
263
+ weight: 0.9
264
+ prefix_prob: 1.0
265
+ augmentations:
266
+ do_nothing: 1.0
267
+ MU-LLAMA-AQA/interleaved_knn-train:
268
+ weight: 0.1
269
+ prefix_prob: 1.0
270
+ augmentations:
271
+ do_nothing: 1.0
272
+
273
+ # Music Captioning
274
+
275
+ LP-MusicCaps-MSD-AudioCaptioning/train:
276
+ weight: 0.05 # 1.3M
277
+ prefix_prob: 1.0
278
+ augmentations:
279
+ AC_paragraph: 1.0
280
+ LP-MusicCaps-MSD-AudioCaptioning/interleaved_knn-train:
281
+ weight: 0.05 # 111k
282
+ prefix_prob: 1.0
283
+ augmentations:
284
+ AC_paragraph: 1.0
285
+
286
+ LP-MusicCaps-MC-AudioCaptioning/train:
287
+ weight: 1.6
288
+ prefix_prob: 1.0
289
+ augmentations:
290
+ AC_paragraph: 1.0
291
+ LP-MusicCaps-MC-AudioCaptioning/interleaved_knn-train:
292
+ weight: 0.4
293
+ prefix_prob: 1.0
294
+ augmentations:
295
+ AC_paragraph: 1.0
296
+
297
+ LP-MusicCaps-MTT-AudioCaptioning/train:
298
+ weight: 0.8
299
+ prefix_prob: 1.0
300
+ augmentations:
301
+ AC_long: 1.0
302
+ LP-MusicCaps-MTT-AudioCaptioning/interleaved_knn-train:
303
+ weight: 0.2
304
+ prefix_prob: 1.0
305
+ augmentations:
306
+ AC_long: 1.0
307
+
308
+ MusicCaps-AudioCaptioning/train:
309
+ weight: 6.0
310
+ prefix_prob: 1.0
311
+ augmentations:
312
+ AC_paragraph: 1.0
313
+ MusicCaps-AudioCaptioning/interleaved_knn-train:
314
+ weight: 1.5
315
+ prefix_prob: 1.0
316
+ augmentations:
317
+ AC_paragraph: 1.0
318
+
319
+ SongDescriber-AudioCaptioning/train:
320
+ weight: 0.8
321
+ prefix_prob: 1.0
322
+ augmentations:
323
+ AC_long: 1.0
324
+ SongDescriber-AudioCaptioning/interleaved_knn-train:
325
+ weight: 0.2
326
+ prefix_prob: 1.0
327
+ augmentations:
328
+ AC_long: 1.0
329
+
330
+ # Music Understanding
331
+
332
+ NSynth-MIR/train:
333
+ weight: 0.2 # 289k for weight = 1
334
+ prefix_prob: 1.0
335
+ augmentations:
336
+ do_nothing: 1.0
337
+ NSynth-MIR/interleaved_knn-train:
338
+ weight: 0.2 # 60k for weight = 1
339
+ prefix_prob: 1.0
340
+ augmentations:
341
+ do_nothing: 1.0
342
+
343
+ mtg-jamendo-MusicTagging/train:
344
+ weight: 0.1
345
+ prefix_prob: 1.0
346
+ augmentations:
347
+ default: 1.0
348
+
349
+ FMA-GenreClassification/train:
350
+ weight: 0.4 # 104k for weight = 1
351
+ prefix_prob: 1.0
352
+ augmentations:
353
+ do_nothing: 1.0
354
+ FMA-GenreClassification/interleaved_knn-train:
355
+ weight: 0.3 # 46k for weight = 1
356
+ prefix_prob: 1.0
357
+ augmentations:
358
+ do_nothing: 1.0
359
+
360
+ musdbhq-InstrClassification/train:
361
+ weight: 0.8
362
+ prefix_prob: 1.0
363
+ augmentations:
364
+ provide_all_labels: 0.5
365
+ default: 0.5
366
+
367
+ dataset_file_root: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/dataset_files
368
+ data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
369
+ dataset_blending_output: dataset_blending.json
370
+ max_tokens: 512
371
+ num_workers: 4
372
+
373
+ valid_dataset_config:
374
+
375
+ Clotho-AQA-AQA/test: true
376
+ Clotho-AQA-AQA/interleaved_knn-test: true
377
+
378
+ Clotho-v2-AudioCaptioning/test: true
379
+ Clotho-v2-AudioCaptioning/interleaved_knn-test: true
380
+
381
+ FSD50k-EventClassification/test: true
382
+ FSD50k-EventClassification/interleaved_knn-test: true
383
+
384
+ CochlScene-SceneClassification/test: true
385
+ CochlScene-SceneClassification/interleaved_knn-test: true
386
+
387
+ NonSpeech7k-EventClassification/test: true
388
+ NonSpeech7k-EventClassification/interleaved_knn-test: true
389
+
390
+ SONYC-UST-EventClassification/test: true
391
+ SONYC-UST-EventClassification/interleaved_knn-test: true
392
+
393
+ emov-db-EmotionClassification/val: true
394
+ emov-db-EmotionClassification/interleaved_knn-val: true
395
+
396
+ jl-corpus-EmotionClassification/val: true
397
+ jl-corpus-EmotionClassification/interleaved_knn-val: true
398
+
399
+ tess-EmotionClassification/val: true
400
+ tess-EmotionClassification/interleaved_knn-val: true
401
+
402
+ IEMOCAP-EmotionClassification/test: true
403
+ IEMOCAP-EmotionClassification/interleaved_knn-test: true
404
+
405
+ OMGEmotion-EmotionClassification/val: true
406
+
407
+ Music-AVQA-AQA_All/test: true
408
+ Music-AVQA-AQA_All/interleaved_knn-test: true
409
+
410
+ MU-LLAMA-AQA/test: true
411
+
412
+ LP-MusicCaps-MSD-AudioCaptioning/test: true
413
+ LP-MusicCaps-MC-AudioCaptioning/test: true
414
+ LP-MusicCaps-MTT-AudioCaptioning/test: true
415
+ LP-MusicCaps-MTT-AudioCaptioning/interleaved_knn-test: true
416
+
417
+ NSynth-MIR/test: true
418
+ NSynth-MIR/interleaved_knn-test: true
419
+
420
+ mtg-jamendo-MusicTagging/val: true
421
+
422
+ audiocaps-AudioCaptioning/test: true
423
+ audiocaps-AudioCaptioning/interleaved_knn-test: true
424
+
425
+ MusicCaps-AudioCaptioning/test: true
426
+
427
+ MELD-EmotionClassification/test: true
428
+ MELD-SentimentClassification/test: true
429
+ VocalSound-VocalClassification/test: true
430
+ musdbhq-InstrClassification/test: true
431
+
432
+ # zero shot
433
+
434
+ GTZAN-GenreClassification/train:
435
+ prefix_prob: 1.0
436
+ augmentations:
437
+ provide_all_labels: 1.0
438
+ GTZAN-GenreClassification/interleaved_knn-train:
439
+ prefix_prob: 1.0
440
+ augmentations:
441
+ provide_all_labels: 1.0
442
+
443
+ Medley-solos-DB-InstrClassification/test:
444
+ prefix_prob: 1.0
445
+ augmentations:
446
+ provide_all_labels: 1.0
447
+ Medley-solos-DB-InstrClassification/interleaved_knn-test:
448
+ prefix_prob: 1.0
449
+ augmentations:
450
+ provide_all_labels: 1.0
451
+
452
+ clap_config:
453
+ # method: laion-clap
454
+ # audio_embed_dim: 512
455
+ # model_name: 630k-fusion-best
456
+ # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/laion-clap-pretrained/laion_clap/630k-fusion-best.pt
457
+
458
+ method: microsoft-clap
459
+ audio_embed_dim: 1024
460
+ config_root: /home/zkong/audio_flamingo/audio_flamingo_v1/v1.0_optimlmax1.3b_foundation/my_ms_clap/src/configs
461
+ # model_name: '2023'
462
+ # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/CLAP_weights_2023.pth
463
+ model_name: 'clapcap'
464
+ checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/clapcap_weights_2023.pth
465
+
466
+ window_length: 7.0 # seconds
467
+ window_overlap: 5.25 # seconds
468
+ max_num_window: 16 # 35 seconds
469
+ max_num_fewshot: 4 # number of fewshot samples (including the final one)
470
+
471
+ model_config:
472
+ cache_dir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/LLM_pretrained/.cache
473
+
474
+ lang_encoder_path: facebook/opt-iml-max-1.3b
475
+ tokenizer_path: facebook/opt-iml-max-1.3b
476
+ cross_attn_every_n_layers: 1
477
+ audio_transformer_kwargs: {
478
+ n_head: 8,
479
+ n_layers: 3,
480
+ d_inner: 2048,
481
+ max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
482
+ max_window_per_audio: 16, # must = max_num_window
483
+ }
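For reference, the per-dataset `weight` values above are combined multiplicatively with `dataset_blending_global_weight` when the blend is built (see `blend_dataset` in `data/data.py` later in this commit): each epoch draws roughly `total_num * global_weight * weight` examples from a dataset. A short sketch with made-up example numbers:

```python
# Illustrative only: how many samples one dataset contributes to a blended epoch.
dataset_total_num = 100_000            # hypothetical size of one dataset's train split
dataset_blending_global_weight = 0.01  # data_config above
dataset_weight = 0.8                   # e.g. Clotho-v2-AudioCaptioning/train

weight = dataset_blending_global_weight * dataset_weight
epoch = 0
start_idx = int(epoch * dataset_total_num * weight)
end_idx = int((epoch + 1) * dataset_total_num * weight)
print(end_idx - start_idx)             # 800 samples drawn for epoch 0
```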
configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml ADDED
@@ -0,0 +1,284 @@
1
+ train_config:
2
+ expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed
3
+ run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed-sft
4
+ delete_previous_checkpoint: true
5
+ batch_size: 4
6
+ gradient_accumulation_steps: 2
7
+ seed: 42
8
+ learning_rate: 0.00002
9
+ lr_scheduler: constant
10
+ loss_multiplier: 1.0
11
+ warmup_steps: 1875
12
+ weight_decay: 0.1
13
+ precision: amp_bf16 # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
14
+ gradient_checkpointing: False
15
+ num_epochs: 200 # num_epochs * dataset_blending_global_weight = 1
16
+ offline: false
17
+ freeze_lm_embeddings: false
18
+ logging_steps: 10
19
+ dist_backend: nccl
20
+ dist_url: env:// # tcp://localhost:7000
21
+ no_set_device_rank: false
22
+ fsdp: true
23
+ fsdp_use_orig_params: false # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
24
+ fsdp_sharding_strategy: full # full, hybrid
25
+ horovod: false
26
+
27
+ # instruction tuning hparams
28
+ sft_config:
29
+ pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed_ckpt_stage1/
30
+ pretrained_ckpt: checkpoint_199.pt
31
+ unfreeze_full_lm: false
32
+
33
+ data_config:
34
+ dataset_blending_global_weight: 0.005
35
+
36
+ dataset_blending_config:
37
+
38
+ MMAUQA/train:
39
+ weight: 1.5
40
+
41
+ AudioSet-Temporal-Speech-Audio-QA/train:
42
+ weight: 1.0
43
+
44
+ CompA-R-AQA/train:
45
+ weight: 1.0
46
+
47
+ # Audio QA
48
+ Clotho-AQA-AQA/train:
49
+ weight: 1.0
50
+
51
+ OpenAQA-AQA/train:
52
+ weight: 1.0
53
+
54
+ SalmonnQA/train:
55
+ weight: 1.0
56
+
57
+ AudioEntailmentQA/train:
58
+ weight: 1.0
59
+
60
+ # Audio Captioning
61
+
62
+ Clotho-v2-AudioCaptioning/train:
63
+ weight: 1.0
64
+
65
+ audiocaps-AudioCaptioning/train:
66
+ weight: 1.0
67
+
68
+ Epidemic_sound-AudioCaptioning/train:
69
+ weight: 1.0
70
+
71
+ MACS-AudioCaptioning/train:
72
+ weight: 1.0
73
+
74
+ # Audio Classification
75
+
76
+ FSD50k-EventClassification/train:
77
+ weight: 1.0
78
+
79
+ CochlScene-SceneClassification/train:
80
+ weight: 1.0
81
+
82
+ NonSpeech7k-EventClassification/train:
83
+ weight: 1.0
84
+
85
+ chime-home-EventClassification/train:
86
+ weight: 1.0
87
+
88
+ SONYC-UST-EventClassification/train:
89
+ weight: 1.0
90
+
91
+ # Speech Emotion Classification
92
+
93
+ MELD-EmotionClassification/train:
94
+ weight: 0.5
95
+
96
+ MELD-SentimentClassification/train:
97
+ weight: 0.5
98
+
99
+ emov-db-EmotionClassification/train:
100
+ weight: 1.0
101
+
102
+ jl-corpus-EmotionClassification/train:
103
+ weight: 6.0
104
+
105
+ tess-EmotionClassification/train:
106
+ weight: 2.5
107
+
108
+ IEMOCAP-EmotionClassification/train:
109
+ weight: 3.0
110
+
111
+ OMGEmotion-EmotionClassification/train:
112
+ weight: 3.0
113
+
114
+ VocalSound-VocalClassification/train:
115
+ weight: 1.5
116
+
117
+ # Music QA
118
+
119
+ Music-AVQA-AQA_All/train:
120
+ weight: 3.0
121
+
122
+ MU-LLAMA-AQA/train:
123
+ weight: 1.0
124
+
125
+ # Music Captioning
126
+
127
+ LP-MusicCaps-MSD-AudioCaptioning/train:
128
+ weight: 0.06
129
+
130
+ LP-MusicCaps-MC-AudioCaptioning/train:
131
+ weight: 2.0
132
+
133
+ LP-MusicCaps-MTT-AudioCaptioning/train:
134
+ weight: 1.0
135
+
136
+ MusicCaps-AudioCaptioning/train:
137
+ weight: 6.0
138
+
139
+ musdbhq-captioning/train:
140
+ weight: 2.0
141
+
142
+ # Music Understanding
143
+
144
+ NSynth-MIR/train:
145
+ weight: 0.2
146
+
147
+ mtg-jamendo-MusicTagging/train:
148
+ weight: 0.1
149
+
150
+ FMA-GenreClassification/train:
151
+ weight: 0.5
152
+
153
+ musdbhq-InstrClassification/train:
154
+ weight: 0.8
155
+
156
+ LLARK_FMA-mir/train:
157
+ weight: 1.0
158
+
159
+ LLARK_FMA-reasoning/train:
160
+ weight: 1.0
161
+
162
+ LLARK_MagnaTagATune-mir/train:
163
+ weight: 1.0
164
+
165
+ LLARK_MTG-Jamendo-reasoning/train:
166
+ weight: 1.0
167
+
168
+ LLARK_MagnaTagATune-reasoning/train:
169
+ weight: 1.0
170
+
171
+ LLARK_MTG-Jamendo-mir/train:
172
+ weight: 1.0
173
+
174
+ MusicBenchQA/train:
175
+ weight: 1.0
176
+
177
+ dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data_w_duration
178
+ data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
179
+ dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
180
+ max_tokens: 512
181
+ num_workers: 4
182
+
183
+ valid_dataset_config:
184
+
185
+ Clotho-AQA-AQA/test: true
186
+
187
+ Clotho-v2-AudioCaptioning/test: true
188
+ audiocaps-AudioCaptioning/test: true
189
+
190
+ FSD50k-EventClassification/test: true
191
+ CochlScene-SceneClassification/test: true
192
+ NonSpeech7k-EventClassification/test: true
193
+ SONYC-UST-EventClassification/test: true
194
+
195
+ MELD-EmotionClassification/test: true
196
+ MELD-SentimentClassification/test: true
197
+ emov-db-EmotionClassification/val: true
198
+ jl-corpus-EmotionClassification/val: true
199
+ tess-EmotionClassification/val: true
200
+ IEMOCAP-EmotionClassification/val: true
201
+ OMGEmotion-EmotionClassification/val: true
202
+ VocalSound-VocalClassification/test: true
203
+
204
+ Music-AVQA-AQA_All/test: true
205
+ MU-LLAMA-AQA/test: true
206
+
207
+ LP-MusicCaps-MSD-AudioCaptioning/test: true
208
+ LP-MusicCaps-MC-AudioCaptioning/test: true
209
+ LP-MusicCaps-MTT-AudioCaptioning/test: true
210
+ MusicCaps-AudioCaptioning/test: true
211
+
212
+ NSynth-MIR/test: true
213
+ mtg-jamendo-MusicTagging/val: true
214
+ musdbhq-InstrClassification/test: true
215
+
216
+ # # zero shot
217
+ # CREMA-D-EmotionClassification/train:
218
+ # prefix_prob: 1.0
219
+
220
+ # ravdess-EmotionClassification/train:
221
+ # prefix_prob: 1.0
222
+
223
+ # UrbanSound8K-EventClassification/train:
224
+ # prefix_prob: 1.0
225
+
226
+ # ESC50-EventClassification/train:
227
+ # prefix_prob: 1.0
228
+
229
+ # DCASE17Task4-SceneClassification/test:
230
+ # prefix_prob: 1.0
231
+
232
+ # GTZAN-GenreClassification/train:
233
+ # prefix_prob: 1.0
234
+
235
+ # Medley-solos-DB-InstrClassification/test:
236
+ # prefix_prob: 1.0
237
+
238
+ clap_config:
239
+ method: nvclap-large
240
+ audio_embed_dim: 2048
241
+ checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
242
+
243
+ window_length: 10.0 # seconds
244
+ window_overlap: 0.0 # seconds
245
+ max_num_window: 9 # 1.5 minutes
246
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
247
+ finetune: true
248
+
249
+ whisper_config:
250
+ method: whisper-large-v3
251
+ path: openai/whisper-large-v3
252
+ audio_embed_dim: 1280
253
+ sampling_rate: 16000
254
+
255
+ window_length: 30.0 # seconds
256
+ window_overlap: 0.0 # seconds
257
+ max_num_window: 1 # 5 minutes
258
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
259
+
260
+ mert_config:
261
+ method: mert-v1
262
+ path: m-a-p/MERT-v1-330M
263
+ audio_embed_dim: 1024
264
+ sampling_rate: 24000
265
+
266
+ window_length: 10.0 # seconds
267
+ window_overlap: 0.0 # seconds
268
+ max_num_window: 1 # 5 minutes
269
+ max_num_fewshot: 1 # number of fewshot samples (including the final one)
270
+
271
+ model_config:
272
+ cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
273
+
274
+ lang_encoder_path: Qwen/Qwen2.5-3B
275
+ tokenizer_path: Qwen/Qwen2.5-3B
276
+ cross_attn_every_n_layers: 1
277
+ audio_transformer_kwargs: {
278
+ n_head: 8,
279
+ n_layers: 3,
280
+ d_inner: 2048,
281
+ max_num_media: 128, # must be >= max_num_window * num_fewshot_samples (4)
282
+ max_window_per_audio: 1, # must = max_num_window
283
+ common_encoder_embed_dim: 1024
284
+ }
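These YAML files are consumed as plain nested dictionaries; the pattern below mirrors `main()` in `data/data.py` from this commit (only the config path is filled in here):

```python
# Minimal sketch of reading one of these configs (mirrors data/data.py main()).
import yaml

config_path = "configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml"
config = yaml.load(open(config_path), Loader=yaml.FullLoader)

train_config = config["train_config"]
data_config = config["data_config"]
clap_config = config["clap_config"]
whisper_config = config["whisper_config"]
mert_config = config["mert_config"]
model_config = config["model_config"]
print(train_config["run_name"], clap_config["max_num_window"])
```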
data/__pycache__/data.cpython-38.pyc ADDED
Binary file (16.4 kB). View file
 
data/data.py ADDED
@@ -0,0 +1,669 @@
1
+ import functools
2
+ import io
3
+ import json
4
+ import math
5
+ import os
6
+ os.environ["TOKENIZERS_PARALLELISM"] = "false" # disable the tokenizer parallelism warning
7
+ import random
8
+ import re
9
+ import string
10
+ import subprocess
11
+ import sys
12
+ import yaml
13
+
14
+ import numpy as np
15
+
16
+ from collections import defaultdict
17
+ from copy import deepcopy
18
+ from dataclasses import dataclass
19
+ from functools import partial
20
+ from pydub import AudioSegment
21
+ from tqdm import tqdm
22
+
23
+ import torch
24
+ import torchvision
25
+ import torch.nn.functional as F
26
+ from torch.utils.data import DataLoader, Dataset, get_worker_info
27
+ from torch.utils.data.distributed import DistributedSampler
28
+
29
+
30
+ from transformers import AutoTokenizer
31
+
32
+ import librosa
33
+ import soundfile as sf
34
+
35
+ EMOTION_MAP_DICT = {
36
+ 'amused': 'amused' ,
37
+ 'anger': 'angry' , 'angry': 'angry' ,
38
+ 'anxious': 'anxious' ,
39
+ 'apologetic': 'apologetic' ,
40
+ 'assertive': 'assertive' ,
41
+ 'calm': 'calm' ,
42
+ 'concerned': 'concerned' ,
43
+ 'contempt': 'contempt' ,
44
+ 'disgust': 'disgusted' , 'disgusted': 'disgusted' ,
45
+ 'encouraging': 'encouraging' ,
46
+ 'excited': 'excited' ,
47
+ 'fear': 'fearful' , 'fearful': 'fearful' ,
48
+ 'frustated': 'frustated' ,
49
+ 'happy': 'happy' , 'joy': 'happy' ,
50
+ 'neutral': 'neutral' ,
51
+ 'sad': 'sad' , 'sadness': 'sad' ,
52
+ 'sleepy': 'sleepy' ,
53
+ 'surprise': 'surprised' , 'surprised': 'surprised' ,
54
+ 'pleasantly surprised': 'pleasantly surprised' ,
55
+ }
56
+
57
+
58
+ def int16_to_float32(x):
59
+ return (x / 32767.0).astype(np.float32)
60
+
61
+
62
+ def float32_to_int16(x):
63
+ x = np.clip(x, a_min=-1., a_max=1.)
64
+ return (x * 32767.).astype(np.int16)
65
+
66
+
67
+ class DataCollator:
68
+ def __init__(self, tokenizer, clap_config):
69
+
70
+ self.tokenizer = tokenizer
71
+ self.clap_config = clap_config
72
+ self.max_num_window = clap_config["max_num_window"]
73
+
74
+ def __call__(self, batch):
75
+
76
+ filenames, audio_clips, audio_embed_masks, input_ids, attention_masks = zip(*batch)
77
+
78
+ num_windows_all = [sum(audio_embed_mask) for audio_embed_mask in audio_embed_masks]
79
+ max_window_batch = int(max(num_windows_all))
80
+
81
+ if max_window_batch > self.max_num_window:
82
+ max_window_batch = self.max_num_window
83
+
84
+ padded_audio_clips = []
85
+ padded_audio_embed_masks = []
86
+ for audio_clip, audio_embed_mask in zip(audio_clips,audio_embed_masks):
87
+ this_audio_clip_clips = [clip for clip in audio_clip]
88
+ num_windows = len(this_audio_clip_clips)
89
+ if num_windows < max_window_batch:
90
+ for _ in range(max_window_batch - num_windows):
91
+ this_audio_clip_clips.append(torch.zeros_like(this_audio_clip_clips[-1]))
92
+ audio_clip = torch.cat(this_audio_clip_clips)
93
+ audio_embed_mask = torch.zeros(max_window_batch)
94
+ audio_embed_mask[:num_windows] = 1
95
+ elif num_windows > max_window_batch: # truncate to the batch-level window budget
96
+ this_audio_clip_clips = this_audio_clip_clips[:max_window_batch]
97
+ audio_clip = torch.cat(this_audio_clip_clips)
98
+ audio_embed_mask = audio_embed_mask[:max_window_batch]
99
+ else:
100
+ audio_clip = torch.cat(this_audio_clip_clips)
101
+
102
+ padded_audio_clips.append(audio_clip)
103
+ padded_audio_embed_masks.append(audio_embed_mask)
104
+
105
+ audio_clips = torch.cat([x.unsqueeze(0) for x in padded_audio_clips], dim=0)
106
+ audio_embed_mask = torch.cat([x.unsqueeze(0) for x in padded_audio_embed_masks], dim=0)
107
+
108
+ max_length = max([ids.shape[1] for ids in input_ids])
109
+
110
+ padded_input_ids = []
111
+ padded_attention_masks = []
112
+ for ids, mask in zip(input_ids, attention_masks):
113
+ if ids.shape[1] < max_length:
114
+ padded_input_ids.append(
115
+ torch.cat([ids, torch.LongTensor([self.tokenizer.pad_token_id] * (max_length - ids.shape[1])).unsqueeze(0)], dim=1)
116
+ )
117
+ padded_attention_masks.append(
118
+ torch.cat([mask, torch.LongTensor([0] * (max_length - mask.shape[1])).unsqueeze(0)], dim=1)
119
+ )
120
+ else:
121
+ padded_input_ids.append(ids)
122
+ padded_attention_masks.append(mask)
123
+
124
+ padded_input_ids = torch.cat(padded_input_ids, dim=0)
125
+ padded_attention_masks = torch.cat(padded_attention_masks, dim=0).bool()
126
+
127
+ out_dict = dict(
128
+ filenames=filenames,
129
+ audio_clips=audio_clips,
130
+ audio_embed_mask=audio_embed_mask,
131
+ input_ids=padded_input_ids,
132
+ attention_mask=padded_attention_masks
133
+ )
134
+ return out_dict
135
+
136
+
137
+ class AudioTextData(torch.utils.data.Dataset):
138
+ def __init__(
139
+ self,
140
+ dataset_file_root: str,
141
+ data_root: str,
142
+ clap_config: dict,
143
+ dataset_blending_global_weight: float,
144
+ dataset_blending_config: dict,
145
+ dataset_blending_output: str,
146
+ tokenizer,
147
+ max_tokens: int,
148
+ split: str = 'train',
149
+ valid_dataset_config: dict = {},
150
+ valid_dataset_name: str = '',
151
+ epoch: int = 0,
152
+ force_reblend: bool = False,
153
+ sr = 16000,
154
+ **kwargs
155
+ ):
156
+ self.dataset_file_root = dataset_file_root
157
+ self.data_root = data_root
158
+ self.clap_config = clap_config
159
+ self.dataset_blending_global_weight = dataset_blending_global_weight
160
+ self.dataset_blending_config = dataset_blending_config
161
+ self.sr = sr
162
+
163
+ self.split = split
164
+ self.epoch = epoch
165
+ self.force_reblend = force_reblend
166
+
167
+ assert self.split in ['train', 'val', 'test']
168
+
169
+ if self.split == 'train':
170
+ self.data = self.blend_dataset(dataset_blending_config, dataset_blending_output)
171
+
172
+ elif self.split in ['val', 'test']:
173
+ self.valid_data = self.validation_dataset(valid_dataset_config, valid_dataset_name)
174
+
175
+ self.tokenizer = tokenizer
176
+ self.tokenizer.padding_side = "right"
177
+ self.max_tokens = max_tokens
178
+
179
+ @staticmethod
180
+ def shuffle_dict_fixed_rand(dic, seed=0):
181
+ print('randomly shuffling key-value pairs')
182
+
183
+ local_random = np.random.default_rng(seed)
184
+ original_keys = list(dic.keys())
185
+ shuffled_keys = deepcopy(original_keys)
186
+ local_random.shuffle(shuffled_keys)
187
+ shuffling_mapping = {x: y for (x, y) in zip(original_keys, shuffled_keys)}
188
+
189
+ shuffled_dic = {}
190
+ for idx in original_keys:
191
+ shuffled_idx = shuffling_mapping[idx]
192
+ shuffled_dic[idx] = dic[shuffled_idx]
193
+ return shuffled_dic
194
+
195
+ @staticmethod
196
+ def is_broken_file(audiopath):
197
+ BROKEN_FILES = [
198
+ "/lustre/fsw/portfolios/adlr/users/zkong/datasets/FMA/fma_large/023/023431.mp3",
199
+ "/lustre/fsw/portfolios/adlr/users/zkong/datasets/FMA/fma_large/033/033690.mp3",
200
+ "/lustre/fsw/portfolios/adlr/users/zkong/datasets/FMA/fma_large/119/119217.mp3",
201
+ "/lustre/fsw/portfolios/adlr/users/zkong/datasets/FMA/fma_large/119/119222.mp3",
202
+ "/lustre/fsw/portfolios/adlr/users/zkong/datasets/FMA/fma_large/119/119219.mp3",
203
+ "/lustre/fsw/portfolios/adlr/users/zkong/datasets/GTZAN/gtzan/data/genres/jazz/jazz.00054.wav"
204
+ ]
205
+ return audiopath in BROKEN_FILES
206
+
207
+ def _read_dataset_file(self, dataset_file):
208
+ print("reading", dataset_file)
209
+ with open(dataset_file) as f:
210
+ contents = f.read()
211
+ contents = json.loads(contents)
212
+
213
+ if contents['split_path'] is not None:
214
+ abs_path = contents['split_path']
215
+
216
+ """
217
+ for normal data
218
+ contents['data'] = {idx: {
219
+ 'name': rel_path/name,
220
+ 'prompt': prompt,
221
+ 'output': output,
222
+ [optional] 'audio_start': audio_start,
223
+ 'task': task,
224
+ }}
225
+ """
226
+
227
+ if 'interleaved' not in dataset_file:
228
+ for idx in contents["data"]:
229
+ contents["data"][idx]['task'] = contents["flamingo_task"]
230
+ contents["data"][idx]['name'] = os.path.join(
231
+ abs_path, contents["data"][idx]['name']
232
+ )
233
+ return contents
234
+
235
+ def blend_dataset(self, dataset_blending_config, dataset_blending_output):
236
+ if os.path.exists(dataset_blending_output) and not self.force_reblend:
237
+ print("loading blended dataset file from:", dataset_blending_output)
238
+ with open(dataset_blending_output) as f:
239
+ contents = f.read()
240
+ self_data = json.loads(contents)
241
+
242
+ else:
243
+ if not self.force_reblend:
244
+ print("no blended dataset file found; reading all dataset files")
245
+ else:
246
+ print("force reblending dataset at epoch {}; reading all dataset files".format(self.epoch))
247
+
248
+ all_data = {}
249
+ for dataset_name in dataset_blending_config:
250
+ dataset_file = os.path.join(self.dataset_file_root, '{}.json'.format(dataset_name))
251
+ contents = self._read_dataset_file(dataset_file)
252
+ contents['data'] = self.shuffle_dict_fixed_rand(
253
+ contents['data'],
254
+ seed=sum(list(map(ord, dataset_name)))
255
+ )
256
+
257
+ weight_global = float(self.dataset_blending_global_weight)
258
+ weight_dataset = float(dataset_blending_config[dataset_name]["weight"])
259
+ weight = weight_global * weight_dataset
260
+
261
+ all_data[dataset_name] = {
262
+ "contents": contents,
263
+ "weight": weight
264
+ }
265
+
266
+ self_data = {
267
+ "dataset_path": self.data_root,
268
+ "split_path": None,
269
+ "total_num": 0,
270
+ "data": {} # {id: {'name': rel_path/name or [rel_path/names], 'prompt': prompt or [prompts], 'output': output or [outputs], 'task': task, 'interleaved': interleave_method}}
271
+ }
272
+
273
+ for dataset_name in all_data:
274
+ print('blending {}'.format(dataset_name))
275
+
276
+ contents = all_data[dataset_name]["contents"]
277
+ shuffled_contents_data = contents['data']
278
+ weight = all_data[dataset_name]["weight"]
279
+ assert type(weight) == float and weight > 0.0
280
+
281
+ dataset_total_num = contents['total_num']
282
+ start_idx = int(self.epoch * dataset_total_num * weight)
283
+ end_idx = int((self.epoch + 1) * dataset_total_num * weight)
284
+
285
+ for idx in range(start_idx, end_idx):
286
+ if idx > 0 and idx % dataset_total_num == 0:
287
+ print('force shuffling at new epoch {} for dataset {}'.format(idx // dataset_total_num, dataset_name))
288
+ shuffled_contents_data = self.shuffle_dict_fixed_rand(
289
+ contents['data'],
290
+ seed=sum(list(map(ord, '{}-epoch-{}'.format(dataset_name, idx // dataset_total_num))))
291
+ )
292
+
293
+ key = str(idx % dataset_total_num)
294
+ item = shuffled_contents_data[key]
295
+
296
+ found_broken = False
297
+ if type(item['name']) is str:
298
+ audiopath = item['name']
299
+ if self.is_broken_file(audiopath):
300
+ print('cannot read {}'.format(audiopath))
301
+ found_broken = True
302
+
303
+ if found_broken:
304
+ continue
305
+
306
+ self_data['data'][self_data['total_num']] = item
307
+ self_data['total_num'] += 1
308
+
309
+ if not self.force_reblend:
310
+ print('writing blended dataset file to:', dataset_blending_output)
311
+ with open(dataset_blending_output, 'w') as json_file:
312
+ json.dump(self_data, json_file)
313
+ else:
314
+ print('writing reblended dataset file to:', dataset_blending_output.replace('.json', '-reblended.json'))
315
+ with open(dataset_blending_output.replace('.json', '-reblended.json'), 'w') as json_file:
316
+ json.dump(self_data, json_file)
317
+
318
+ return self_data
319
+
320
+ def get_num_windows(self, T, sr):
321
+ clap_config = self.clap_config
322
+ window_length = int(float(clap_config["window_length"]) * sr)
323
+ window_overlap = int(float(clap_config["window_overlap"]) * sr)
324
+ max_num_window = int(clap_config["max_num_window"])
325
+
326
+ num_windows = 1
327
+ if T <= window_length:
328
+ num_windows = 1
329
+ full_length = window_length
330
+ elif T >= (max_num_window * window_length - (max_num_window - 1) * window_overlap):
331
+ num_windows = max_num_window
332
+ full_length = (max_num_window * window_length - (max_num_window - 1) * window_overlap)
333
+ else:
334
+ num_windows = 1 + int(np.ceil((T - window_length) / float(window_length - window_overlap)))
335
+ full_length = num_windows * window_length - (num_windows - 1) * window_overlap
336
+
337
+ return num_windows, full_length
338
+
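A quick worked example of `get_num_windows` above, using the 7.0 s / 5.25 s / 16-window CLAP settings from the ICL configs (this snippet is illustrative and not part of the repository; it only exercises the middle branch plus the clamp):

```python
# Worked example of the window-count formula in get_num_windows.
import numpy as np

sr = 16000
window_length = int(7.0 * sr)
window_overlap = int(5.25 * sr)
max_num_window = 16

T = int(20.0 * sr)  # a 20-second clip
num_windows = 1 + int(np.ceil((T - window_length) / float(window_length - window_overlap)))
num_windows = min(num_windows, max_num_window)
full_length = num_windows * window_length - (num_windows - 1) * window_overlap
print(num_windows, full_length / sr)  # 9 windows, padded to 21.0 s
```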
339
+ def load_audio(self, file_path, target_sr=16000, duration=30.0, start=0.0):
340
+ if file_path.endswith('.mp3'):
341
+ audio = AudioSegment.from_file(file_path)
342
+ if len(audio) > (start + duration) * 1000:
343
+ audio = audio[start * 1000:(start + duration) * 1000]
344
+
345
+ if audio.frame_rate != target_sr:
346
+ audio = audio.set_frame_rate(target_sr)
347
+
348
+ if audio.channels > 1:
349
+ audio = audio.set_channels(1)
350
+
351
+ data = np.array(audio.get_array_of_samples())
352
+ if audio.sample_width == 2:
353
+ data = data.astype(np.float32) / np.iinfo(np.int16).max
354
+ elif audio.sample_width == 4:
355
+ data = data.astype(np.float32) / np.iinfo(np.int32).max
356
+ else:
357
+ raise ValueError("Unsupported bit depth: {}".format(audio.sample_width))
358
+
359
+ else:
360
+ with sf.SoundFile(file_path) as audio:
361
+ original_sr = audio.samplerate
362
+ channels = audio.channels
363
+
364
+ max_frames = int((start + duration) * original_sr)
365
+
366
+ audio.seek(int(start * original_sr))
367
+ frames_to_read = min(max_frames, len(audio))
368
+ data = audio.read(frames_to_read)
369
+
370
+ if data.max() > 1 or data.min() < -1:
371
+ data = data / max(abs(data.max()), abs(data.min()))
372
+
373
+ if original_sr != target_sr:
374
+ if channels == 1:
375
+ data = librosa.resample(data.flatten(), orig_sr=original_sr, target_sr=target_sr)
376
+ else:
377
+ data = librosa.resample(data.T, orig_sr=original_sr, target_sr=target_sr)[0]
378
+ else:
379
+ if channels != 1:
380
+ data = data.T[0]
381
+
382
+ if data.min() >= 0:
383
+ data = 2 * data / abs(data.max()) - 1.0
384
+ else:
385
+ data = data / max(abs(data.max()), abs(data.min()))
386
+
387
+ assert len(data.shape) == 1, data.shape
388
+ return data
389
+
390
+ def compute_sliding_window(self, audio_file, audio_start=0.0, audio="sound"):
391
+ if type(audio_start) == str:
392
+ audio_start = float(audio_start)
393
+
394
+ if audio == "sound":
395
+ encoder_config = self.clap_config
396
+ else:
397
+ raise NotImplementedError
398
+
399
+ if encoder_config["method"] == 'nvclap-large':
400
+ sr = 16000
401
+ else:
402
+ raise NotImplementedError
403
+
404
+ window_length = int(float(encoder_config["window_length"]) * sr)
405
+ window_overlap = int(float(encoder_config["window_overlap"]) * sr)
406
+ max_num_window = int(encoder_config["max_num_window"])
407
+ duration = max_num_window * (encoder_config["window_length"] - encoder_config["window_overlap"]) + encoder_config["window_overlap"]
408
+
409
+ audio_data = self.load_audio(os.path.join(self.data_root, audio_file), sr, duration, audio_start) # already cuts to max duration
410
+ T = len(audio_data)
411
+ num_windows, full_length = self.get_num_windows(T, sr)
412
+
413
+ # pads to the nearest multiple of window_length
414
+ if full_length > T:
415
+ audio_data = np.append(audio_data, np.zeros(full_length - T))
416
+
417
+ audio_data = audio_data.reshape(1, -1)
418
+ audio_data_tensor = torch.from_numpy(int16_to_float32(float32_to_int16(audio_data))).float()
419
+
420
+ audio_clips = []
421
+ audio_embed_mask = torch.ones(num_windows)
422
+ for i in range(num_windows):
423
+ start = i * (window_length - window_overlap)
424
+ audio_data_tensor_this = audio_data_tensor[:, start:start+window_length]
425
+ audio_clips.append(audio_data_tensor_this)
426
+
427
+ return audio_clips, audio_embed_mask
428
+
429
+ def validation_dataset(self, valid_dataset_config, valid_dataset_name):
430
+ dataset_file = os.path.join(self.dataset_file_root, '{}.json'.format(valid_dataset_name))
431
+ contents = self._read_dataset_file(dataset_file)
432
+
433
+ contents['data'] = self.shuffle_dict_fixed_rand(
434
+ contents['data'],
435
+ seed=sum(list(map(ord, valid_dataset_name)))
436
+ )
437
+
438
+ return contents
439
+
440
+ def preprocess_string_for_eval(self, x):
441
+ x = x.rstrip().lstrip()
442
+ x = x.lower()
443
+ return x
444
+
445
+ def _actual_getitem(self, i):
446
+ if self.split == 'train':
447
+ try:
448
+ item = self.data['data'][str(i)]
449
+ except:
450
+ item = self.data['data'][i]
451
+
452
+ if type(item['name']) is str:
453
+ audio_file = item['name']
454
+ audio_start = 0 if 'audio_start' not in item else float(item['audio_start'])
455
+ else:
456
+ raise Exception(f"The item has a {type(item['name'])}. Only single path as a string is supported")
457
+
458
+ # compute window for long audios
459
+ audio_clips, audio_embed_mask = self.compute_sliding_window(audio_file, audio_start, audio="sound")
460
+
461
+ # make the text prompt
462
+ text_prompt = str(item['prompt']).lower()
463
+ text_output = str(item['output']).lower()
464
+
465
+ sample = f"<audio>{text_prompt.strip()}{self.tokenizer.sep_token}{text_output.strip()}<|endofchunk|>{self.tokenizer.eos_token}"
466
+
467
+ text = self.tokenizer(
468
+ sample,
469
+ max_length=self.max_tokens,
470
+ padding="longest",
471
+ truncation="only_first",
472
+ return_tensors="pt"
473
+ )
474
+
475
+ elif self.split in ['val', 'test']:
476
+ try:
477
+ item = self.valid_data['data'][str(i)]
478
+ except:
479
+ item = self.valid_data['data'][i]
480
+
481
+ if type(item['name']) is str:
482
+ audio_file = os.path.join(self.data_root, item['name'])
483
+ audio_start = 0 if 'audio_start' not in item else float(item['audio_start'])
484
+ else:
485
+ raise Exception(f"The item has a {type(item['name'])}. Only single path as a string is supported")
486
+
487
+ # compute window for long audios
488
+ audio_clips, audio_embed_mask = self.compute_sliding_window(audio_file, audio_start, audio="sound")
489
+
490
+ # make the text prompt
491
+ text_prompt = self.preprocess_string_for_eval(str(item['prompt']).lower())
492
+ text_output = self.preprocess_string_for_eval(str(item['output']).lower())
493
+
494
+ sample = f"<audio>{text_prompt.strip()}{self.tokenizer.sep_token}{text_output.strip()}<|endofchunk|>{self.tokenizer.eos_token}"
495
+
496
+ text = self.tokenizer(
497
+ sample,
498
+ max_length=self.max_tokens,
499
+ padding="longest",
500
+ truncation="only_first",
501
+ return_tensors="pt"
502
+ )
503
+
504
+ # audio_clips_clap, audio_embed_mask_clap, audio_clips_speech, audio_embed_mask_speech, audio_clips_music, audio_embed_mask_music,
505
+ return (item['name'], audio_clips, audio_embed_mask, text["input_ids"], text["attention_mask"])
506
+
507
+ def __getitem__(self, i):
508
+ try:
509
+ return self._actual_getitem(i)
510
+ except Exception as e:
511
+ print('batch {} failed with reason {}'.format(i, e))
512
+ try:
513
+ return self._actual_getitem((i-42)%99)
514
+ except:
515
+ return self._actual_getitem((i-84)%99)
516
+
517
+ def __len__(self):
518
+ if self.split == 'train':
519
+ return len(list(self.data['data'].keys()))
520
+
521
+ elif self.split == 'val':
522
+ return min(len(list(self.valid_data['data'].keys())), 64)
523
+
524
+ elif self.split == 'test':
525
+ return len(list(self.valid_data['data'].keys()))
526
+
527
+
528
+ @dataclass
529
+ class DataInfo:
530
+ dataset: Dataset
531
+ dataloader: DataLoader
532
+ sampler: DistributedSampler = None
533
+
534
+ def set_epoch(self, epoch):
535
+ if self.sampler is not None and isinstance(self.sampler, DistributedSampler):
536
+ self.sampler.set_epoch(epoch)
537
+
538
+
539
+ def get_audiotext_dataloader(data_config, clap_config, text_tokenizer, batch_size, split='train', epoch=0, force_reblend=False):
540
+ assert split in ['train', 'val', 'test']
541
+
542
+ data_collator = DataCollator(text_tokenizer, clap_config)
543
+ dataloader_shuffle = False
544
+
545
+ if split == 'train':
546
+ trainset = AudioTextData(
547
+ **data_config,
548
+ clap_config=clap_config,
549
+ tokenizer=text_tokenizer,
550
+ split=split,
551
+ epoch=epoch,
552
+ force_reblend=force_reblend
553
+ )
554
+ sampler = DistributedSampler(trainset, shuffle=True)
555
+ trainloader = DataLoader(
556
+ trainset,
557
+ sampler=sampler,
558
+ batch_size=batch_size,
559
+ shuffle=dataloader_shuffle,
560
+ collate_fn=data_collator,
561
+ num_workers=data_config["num_workers"]
562
+ )
563
+ return DataInfo(dataset=trainset, dataloader=trainloader, sampler=sampler)
564
+
565
+ elif split in ['val', 'test']:
566
+ all_DataInfo = {}
567
+ for valid_dataset_name in list(data_config["valid_dataset_config"].keys()):
568
+ valid_dataset_name = valid_dataset_name.strip()
569
+ validset = AudioTextData(
570
+ **data_config,
571
+ clap_config=clap_config,
572
+ tokenizer=text_tokenizer,
573
+ split=split,
574
+ valid_dataset_name=valid_dataset_name
575
+ )
576
+ if split == 'val':
577
+ # distributed sampler
578
+ all_DataInfo[valid_dataset_name] = DataInfo(
579
+ dataset=validset,
580
+ dataloader=DataLoader(
581
+ validset,
582
+ sampler=DistributedSampler(validset, shuffle=False),
583
+ batch_size=batch_size,
584
+ shuffle=dataloader_shuffle,
585
+ collate_fn=data_collator,
586
+ num_workers=data_config["num_workers"]
587
+ ))
588
+ else:
589
+ # single GPU
590
+ all_DataInfo[valid_dataset_name] = DataInfo(
591
+ dataset=validset,
592
+ dataloader=DataLoader(
593
+ validset,
594
+ batch_size=batch_size,
595
+ shuffle=dataloader_shuffle,
596
+ collate_fn=data_collator,
597
+ num_workers=data_config["num_workers"]
598
+ ))
599
+
600
+ return all_DataInfo
601
+
602
+
603
+ def main():
604
+ import time
605
+ import argparse
606
+
607
+ parser = argparse.ArgumentParser()
608
+ parser.add_argument('-c', '--config', type=str, default='../configs/config.yaml', help='yaml config path')
609
+ args = parser.parse_args()
610
+
611
+ config = yaml.load(open(args.config), Loader=yaml.FullLoader)
612
+
613
+ data_config = config['data_config']
614
+ clap_config = config['clap_config']
615
+ whisper_config = config["whisper_config"]
616
+ mert_config = config["mert_config"]
617
+
618
+ tokenizer_path = "facebook/opt-1.3b"
619
+ cache_dir = '/lustre/fsw/portfolios/adlr/users/sreyang/.cache'
620
+ text_tokenizer = AutoTokenizer.from_pretrained(
621
+ tokenizer_path,
622
+ local_files_only=False,
623
+ trust_remote_code=True,
624
+ cache_dir=cache_dir,
625
+ )
626
+ text_tokenizer.add_special_tokens(
627
+ {"additional_special_tokens": ["<audio>", "<|endofchunk|>"]}
628
+ )
629
+ if text_tokenizer.pad_token is None:
630
+ text_tokenizer.add_special_tokens({"pad_token": "<|PAD_TOKEN|>"})
631
+ if text_tokenizer.sep_token is None:
632
+ text_tokenizer.add_special_tokens({"sep_token": "<SEP>"})
633
+
634
+ trainset = AudioTextData(
635
+ **data_config,
636
+ clap_config=clap_config, tokenizer=text_tokenizer,
637
+ epoch=66, force_reblend=True
638
+ )
639
+
640
+ data_collator = DataCollator(text_tokenizer, clap_config) # DataCollator also needs clap_config for max_num_window
641
+ dataloader = DataLoader(trainset, batch_size=16, shuffle=True, collate_fn=data_collator, num_workers=4)
642
+
643
+ for step, batch in enumerate(dataloader):
644
+ filenames = batch["filenames"]
645
+ audio_clips = batch["audio_clips"]
646
+ audio_embed_mask = batch["audio_embed_mask"]
647
+ input_ids = batch["input_ids"]
648
+ attention_mask = batch["attention_mask"]
649
+
650
+ print(
651
+ 'batch {}:'.format(step+1),
652
+ audio_clips.shape, audio_embed_mask.shape,
653
+ input_ids.shape, attention_mask.shape
654
+ )
655
+
656
+ print('filenames', filenames)
657
+ print('audio_embed_mask', audio_embed_mask)
658
+ print('input_ids', input_ids)
659
+ for input_id in input_ids:
660
+ print('-' * 50)
661
+ print(text_tokenizer.decode(input_id))
662
+ print('attention_mask', attention_mask)
663
+
664
+ if step == 20:
665
+ break
666
+
667
+
668
+ if __name__ == "__main__":
669
+ main()
data/prepare_each_dataset.py ADDED
The diff for this file is too large to render. See raw diff
 
eval/README.md ADDED
@@ -0,0 +1 @@
1
+ # Audio Flamingo Inference
eval/__init__.py ADDED
File without changes
eval/inference.py ADDED
@@ -0,0 +1,229 @@
1
+ import argparse
2
+ import functools
3
+ import glob
4
+ import os
5
+ import random
6
+ import string
7
+ import json
8
+ import sys
9
+ sys.path.append('../')
10
+ from tqdm import tqdm
11
+ import yaml
12
+ from collections import defaultdict
13
+ import io
14
+ import warnings
15
+ import subprocess
16
+ import pickle
17
+
18
+ import numpy as np
19
+ import torch
20
+
21
+ from data.data import get_audiotext_dataloader
22
+ from src.factory import create_model_and_transforms
23
+ from train.train_utils import Dict2Class, get_autocast, get_cast_dtype
24
+
25
+ def inference_this(
26
+ args, data_config, clap_config, model_config, test_dataset_name, tmp_file,
27
+ temperature=1.0, num_beams=3, ckpt=-1, end_batch_idx=-2, verbose=False,
28
+ ):
29
+ os.environ["TOKENIZERS_PARALLELISM"] = "false" # disable the tokenizer parallelism warning
30
+ model, tokenizer = create_model_and_transforms(
31
+ **model_config,
32
+ clap_config=clap_config,
33
+ use_local_files=args.offline,
34
+ gradient_checkpointing=args.gradient_checkpointing,
35
+ freeze_lm_embeddings=args.freeze_lm_embeddings,
36
+ )
37
+
38
+ device_id = 0
39
+ model = model.to(device_id)
40
+ model.eval()
41
+
42
+ if ckpt == -1:
43
+ checkpoint_list = glob.glob(f"{args.expdir}/{args.run_name}/checkpoint_*.pt")
44
+ resume_from_checkpoint = sorted(checkpoint_list, key=lambda x: int(x.split("_")[-1].split(".")[0]))[-1]
45
+ else:
46
+ resume_from_checkpoint = f"{args.expdir}/{args.run_name}/checkpoint_{ckpt}.pt"
47
+ checkpoint = torch.load(resume_from_checkpoint, map_location="cpu")
48
+ msd = checkpoint["model_state_dict"]
49
+ msd = {k.replace("module.", ""): v for k, v in msd.items()}
50
+ x,y = model.load_state_dict(msd, False)
51
+ print(x)
52
+ print(y)
53
+
54
+ autocast = get_autocast(
55
+ args.precision, cache_enabled=(not args.fsdp)
56
+ )
57
+ cast_dtype = get_cast_dtype(args.precision)
58
+
59
+ # model = model.to(dtype=cast_dtype)
60
+
61
+ if test_dataset_name in data_config["valid_dataset_config"]:
62
+ data_config["valid_dataset_config"] = {test_dataset_name: data_config["valid_dataset_config"][test_dataset_name]}
63
+ else:
64
+ data_config["valid_dataset_config"] = {test_dataset_name: True}
65
+
66
+ all_test_AudioTextDataInfo = get_audiotext_dataloader(data_config, clap_config, tokenizer, args.batch_size, split='test')
67
+
68
+ assert test_dataset_name in list(all_test_AudioTextDataInfo.keys()), "{} not a test set".format(test_dataset_name)
69
+ dataloader = all_test_AudioTextDataInfo[test_dataset_name].dataloader
70
+
71
+ deduplicate_tasks = ["Clotho-v2-AudioCaptioning", "audiocaps-AudioCaptioning", "MACS-AudioCaptioning", "LP-MusicCaps-MSD-AudioCaptioning", "LP-MusicCaps-MC-AudioCaptioning"]
72
+ if any([test_dataset_name.startswith(x) for x in deduplicate_tasks]):
73
+ deduplicate = True
74
+ else:
75
+ deduplicate = False
76
+
77
+ if os.path.exists(tmp_file):
78
+ with open(tmp_file, 'rb') as pickle_file:
79
+ tmp_data = pickle.load(pickle_file)
80
+ results_dic = tmp_data['results_dic']
81
+ results = tmp_data['results']
82
+ finished_batches = tmp_data['finished_batches']
83
+ print('reading tmp data from {}: {} batches already computed'.format(tmp_file, finished_batches+1))
84
+
85
+ else:
86
+ tmp_data = {}
87
+ results_dic = {} # for deduplicate
88
+ results = [] # for non-deduplicate
89
+ finished_batches = -1
90
+ print('no tmp data found; will store tmp data to {}'.format(tmp_file))
91
+
92
+ # print(len(dataloader))
93
+ # print('---------------------')
94
+ from itertools import islice
95
+ for batch_idx, batch in tqdm(enumerate(islice(dataloader, finished_batches + 1, None), start=finished_batches + 1)):  # islice start must be >= 0; resume right after the last finished batch
96
+ # for batch_idx, batch in tqdm(enumerate(dataloader)):
97
+ if end_batch_idx > 0 and batch_idx == end_batch_idx:
98
+ break
99
+
100
+ if batch_idx <= finished_batches:
101
+ continue
102
+
103
+ audio_clips = batch["audio_clips"].to(device_id, dtype=cast_dtype, non_blocking=True)
104
+ audio_embed_mask = batch["audio_embed_mask"].to(device_id, dtype=cast_dtype, non_blocking=True)
105
+ input_ids = batch["input_ids"].to(device_id, non_blocking=True)
106
+ filenames = batch["filenames"]
107
+ # print(input_ids)
108
+
109
+ media_token_id = tokenizer.encode("<audio>")[-1]
110
+ sep_token_id = tokenizer.sep_token_id
111
+
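+ # Each row of input_ids is "<prompt> <SEP> <answer>": split at the last <SEP> so the
+ # model only sees the prompt, and keep the remainder as the ground-truth reference.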
112
+ for idx in range(input_ids.shape[0]):
113
+ filename = filenames[idx]
114
+ if type(filename) is list:
115
+ # interleaved data
116
+ filename = filename[-1]
117
+
118
+ input_id = input_ids[idx]
119
+ for sep_location in range(len(input_id)-1, -1, -1):
120
+ # find last <SEP>
121
+ if input_id[sep_location] == sep_token_id:
122
+ break
123
+ # print(tokenizer.decode(input_id))
124
+ prompt = input_id[:sep_location+1]
125
+
126
+ prompt_decoded = tokenizer.decode(prompt).replace(tokenizer.sep_token, '')
127
+ ground_truth_decoded = tokenizer.decode(input_id).split(tokenizer.sep_token)[-1].replace(tokenizer.eos_token, '').replace(tokenizer.pad_token, '').replace('<|endofchunk|>', '')
128
+
129
+ if deduplicate and (filename, prompt_decoded) in results_dic:
+ # this clip was already generated for under the same prompt; only collect the extra reference caption
+ results_dic[(filename, prompt_decoded)]['ground_truth'].append(ground_truth_decoded)
+ else:
+ output = model.generate(
+ audio_x=audio_clips[idx].unsqueeze(0),
+ audio_x_mask=audio_embed_mask[idx].unsqueeze(0),
+ lang_x=prompt.unsqueeze(0),
+ eos_token_id=tokenizer.eos_token_id,
+ max_new_tokens=256,
+ temperature=temperature,
+ )[0]
+ output_decoded = tokenizer.decode(output).split(tokenizer.sep_token)[-1].replace(tokenizer.eos_token, '').replace(tokenizer.pad_token, '').replace('<|endofchunk|>', '')
+
+ if deduplicate:
+ results_dic[(filename, prompt_decoded)] = {
+ 'ground_truth': [ground_truth_decoded],
+ 'output': output_decoded
+ }
+ else:
+ results.append((filename, prompt_decoded, ground_truth_decoded, output_decoded))
156
+
157
+
158
+ tmp_data['results_dic'] = results_dic
159
+ tmp_data['results'] = results
160
+ tmp_data['finished_batches'] = batch_idx
161
+ with open(tmp_file, 'wb') as pickle_file:
162
+ pickle.dump(tmp_data, pickle_file)
163
+
164
+ if deduplicate:
165
+ for (filename, prompt) in results_dic:
166
+ ground_truth = '|'.join(results_dic[(filename, prompt)]['ground_truth'])
167
+ output = results_dic[(filename, prompt)]['output']
168
+ results.append((filename, prompt, ground_truth, output))
169
+
170
+ # if verbose:
171
+ # for filename, prompt, ground_truth, output in results:
172
+ # print('-'*30)
173
+ # print('filename:', filename)
174
+ # print('prompt:', prompt)
175
+ # print('ground_truth:', ground_truth)
176
+ # print('output:', output)
177
+
178
+ return results
179
+
180
+
181
+ def main():
182
+ parser = argparse.ArgumentParser()
183
+ parser.add_argument('-c', '--config', type=str, default='../config/config.yaml', help='yaml config path')
184
+ parser.add_argument('-t', '--task', type=str, help='which task to inference')
185
+ parser.add_argument('-temp', '--temperature', type=float, default=1.0, help='temperature')
186
+ parser.add_argument('-nb', '--num_beams', type=int, default=1, help='num beams for beam search')
187
+ parser.add_argument('--ckpt', type=int, default=-1, help='checkpoint idx, -1 means latest')
188
+ parsed_args = parser.parse_args()
189
+
190
+ print(parsed_args)
191
+
192
+ test_dataset_name = parsed_args.task
193
+
194
+ output_file = os.path.join(
195
+ '../outputs/',
196
+ parsed_args.task.replace('/', '-'),
197
+ '{}-ckpt{}-{}.log'.format(
198
+ parsed_args.config.split('/')[-1][:-5],
199
+ parsed_args.ckpt,
200
+ "sft"
201
+ )
202
+ )
203
+ tmp_file = output_file.replace('.log', '.tmp.pickle')
204
+ print('output file:', output_file)
205
+
206
+ print('no previous log file; generating samples')
207
+
208
+ config = yaml.load(open(parsed_args.config), Loader=yaml.FullLoader)
209
+ # print(config)
210
+ # print('----------------------')
211
+ data_config = config['data_config']
212
+ model_config = config['model_config']
213
+ print(model_config)
214
+ clap_config = config['clap_config']
216
+ mert_config = config['mert_config']
217
+ args = Dict2Class(config['train_config'])
218
+
219
+ results = inference_this(
220
+ args, data_config, clap_config, model_config, test_dataset_name,
221
+ temperature=float(parsed_args.temperature),
222
+ num_beams=int(parsed_args.num_beams),
223
+ ckpt=parsed_args.ckpt,
224
+ verbose=True,
225
+ tmp_file=tmp_file,
226
+ )
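+
+ # Persist the generations to the log path computed above. This tab-separated layout is
+ # an assumption for convenience, not a format required by the original scoring code;
+ # adjust it to whatever your downstream evaluation expects.
+ with open(output_file, 'w') as f:
+ for filename, prompt, ground_truth, output in results:
+ f.write('\t'.join([str(filename), prompt, ground_truth, output]) + '\n')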
227
+
228
+ if __name__ == "__main__":
229
+ main()
eval/inference.sh ADDED
@@ -0,0 +1,55 @@
1
+ #!/bin/bash
2
+
3
+ TO_SUBMIT_JOBS=$(ls ../configs | grep "inference.yaml")
4
+
5
+ ALL_TASK=$1
6
+ # ALL_TASK=""
7
+ # ALL_TASK="${ALL_TASK} MMAU/test"
8
+ # ALL_TASK="${ALL_TASK} MusicCaps-AudioCaptioning/test"
9
+ # ALL_TASK="${ALL_TASK} MusicCaps-AudioCaptioning/test"
10
+ # ALL_TASK="${ALL_TASK} audiocaps-AudioCaptioning/interleaved_knn-test"
11
+ # ALL_TASK="${ALL_TASK} MusicCaps-AudioCaptioning/interleaved_knn-test"
12
+
13
+ # # # ===== Classification =====
14
+ # ALL_TASK="${ALL_TASK} CochlScene-SceneClassification/test"
15
+ # ALL_TASK="${ALL_TASK} NonSpeech7k-EventClassification/test"
16
+
17
+ # # # ===== zero-shot =====
18
+ # ALL_TASK="${ALL_TASK} CREMA-D-EmotionClassification/train"
19
+ # ALL_TASK="${ALL_TASK} ravdess-EmotionClassification/train"
20
+ # ALL_TASK="${ALL_TASK} UrbanSound8K-EventClassification/train"
21
+ # ALL_TASK="${ALL_TASK} GTZAN-GenreClassification/train"
22
+ # ALL_TASK="${ALL_TASK} Medley-solos-DB-InstrClassification/test"
23
+
24
+ for task in ${ALL_TASK}
25
+ do
26
+ OUTFOLDER=${task//\//-} # replace / into -
27
+ mkdir -p ../outputs/$OUTFOLDER
28
+ done
29
+
30
+ temp=0.0
31
+ numbeams=1
32
+ ckpt=199
33
+
34
+ for EXP in $TO_SUBMIT_JOBS
35
+ do
36
+ L=${#EXP}
37
+ NAME=$(echo ${EXP} | cut -c 1-$(($L-5))) # remove last .yaml
38
+
39
+ for task in ${ALL_TASK}
40
+ do
41
+ echo "task: $task, config: $NAME, ckpt: $ckpt"
42
+
43
+ OUTFOLDER=${task//\//-}
44
+ OUTFILE="../outputs/$OUTFOLDER/$NAME-ckpt${ckpt}.log"
45
+ CKPT_DIR="/lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed-sft/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed-sft-3/$NAME"
46
+ python -u inference.py \
47
+ -c ../configs/$EXP \
48
+ -t $task \
49
+ -temp $temp \
50
+ -nb $numbeams \
51
+ --ckpt ${ckpt}
52
+
53
+ done
54
+ wait
55
+ done
eval/interactive.sh ADDED
@@ -0,0 +1,8 @@
1
+ # IMAGE=gitlab-master.nvidia.com/zkong/audio_flamingo_v1/audiolm:0.2
2
+ IMAGE="/lustre/fsw/portfolios/adlr/users/zkong/docker/audiolm-0.2/image.sqsh"
3
+
4
+ submit_job -i -n interactive \
5
+ --gpu 1 \
6
+ --duration 2 \
7
+ --image $IMAGE \
8
+ --mounts /home/zkong,/lustre/fsw/portfolios/adlr/users/zkong
eval/keep_run.sh ADDED
@@ -0,0 +1,64 @@
1
+ # CHECK_EVERY=900
2
+ # DURATION_DAYS=10
3
+ # CHECK_TOTAL=$((DURATION_DAYS*86400/CHECK_EVERY))
4
+ # NEPOCH_PRE=99
5
+ # NEPOCH_SFT=159
6
+ # NAME="audio-gen-train_audiogen"
7
+
8
+ # for (( i = 1; i <= $CHECK_TOTAL; i++ ))
9
+ # do
10
+ # RUNNING_JOBS=$(sacct -o JobName%-150,JobID,Partition%-15,State | grep -v inference | grep RUNNING | grep polar | sort)
11
+ # PENDING_JOBS=$(sacct -o JobName%-150,JobID,Partition%-15,State | grep -v inference | grep PENDING | grep polar | sort)
12
+
13
+ # for STATE in "RUNNING" "PENDING" "NOT-RUN"
14
+ # do
15
+ # echo "===========${STATE}=========="
16
+
17
+ # if [[ ${STATE} == "RUNNING" && ${RUNNING_JOBS} =~ "${NAME}" ]]; then
18
+ # echo ${NAME}
19
+ # elif [[ ${STATE} == "PENDING" && ${PENDING_JOBS} =~ "${NAME}" ]]; then
20
+ # echo ${NAME}
21
+ # elif [[ ${STATE} == "NOT-RUN" && ! ${RUNNING_JOBS} =~ "${NAME}" && ! ${PENDING_JOBS} =~ "${NAME}" ]]; then
22
+
23
+ # base_path="/lustre/fsw/portfolios/adlr/users/sreyang/ckpts/stable_llm/harmonai_train/"
24
+ # # Find the last subfolder
25
+ # last_subfolder=$(ls -d "$base_path"*/ | sort -V | tail -n 1)
26
+ # # Find the last checkpoint in the subfolder
27
+ # last_ckpt=$(ls "$last_subfolder/checkpoints/"*.ckpt | sort -V | tail -n 1)
28
+ # echo $last_ckpt
29
+ # sh submit_job.sh "True" $last_ckpt
30
+ # sleep 1
31
+ # fi
32
+ # done
33
+ # echo "============================"
34
+ # sleep $CHECK_EVERY
35
+ # done
36
+
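+ # Watchdog: every CHECK_EVERY seconds, check sacct for a running or pending job whose
+ # name contains $NAME; if none is found, re-submit the evaluation via submit.sh.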
37
+ CHECK_EVERY=900
38
+ DURATION_DAYS=10
39
+ CHECK_TOTAL=$((DURATION_DAYS*86400/CHECK_EVERY))
40
+ NEPOCH_PRE=99
41
+ NEPOCH_SFT=159
42
+ NAME="eval"
43
+
44
+ for (( i = 1; i <= $CHECK_TOTAL; i++ ))
45
+ do
46
+ RUNNING_JOBS=$(sacct -o JobName%-150,JobID,Partition%-15,State | grep -v inference | grep RUNNING | grep polar | sort)
47
+ PENDING_JOBS=$(sacct -o JobName%-150,JobID,Partition%-15,State | grep -v inference | grep PENDING | grep polar | sort)
48
+
49
+ for STATE in "RUNNING" "PENDING" "NOT-RUN"
50
+ do
51
+ echo "===========${STATE}=========="
52
+
53
+ if [[ ${STATE} == "RUNNING" && ${RUNNING_JOBS} =~ "${NAME}" ]]; then
54
+ echo ${NAME}
55
+ elif [[ ${STATE} == "PENDING" && ${PENDING_JOBS} =~ "${NAME}" ]]; then
56
+ echo ${NAME}
57
+ elif [[ ${STATE} == "NOT-RUN" && ! ${RUNNING_JOBS} =~ "${NAME}" && ! ${PENDING_JOBS} =~ "${NAME}" ]]; then
58
+ sh submit.sh
59
+ sleep 1
60
+ fi
61
+ done
62
+ echo "============================"
63
+ sleep $CHECK_EVERY
64
+ done
eval/submit.sh ADDED
@@ -0,0 +1,54 @@
1
+ IMAGE="/lustre/fsw/portfolios/adlr/users/zkong/docker/audiolm-0.1/image.sqsh"
2
+ NAME=eval
3
+ PARTITION="polar,polar3,polar4"
4
+ MOUNTS="/home/zkong,/lustre/fsw/portfolios/adlr/users/zkong,/lustre/fsw/portfolios/adlr/users/sreyang,/home/sreyang"
5
+
6
+ LOGDIR=/lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_fixed_rotary_all_layers_logs_infer
7
+
8
+ # "MMAU/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test"
9
+ # Predefined list of strings
10
+
11
+
12
+ STRING_LIST=("Music4All/train")
13
+
14
+
15
+ # "MusicCaps-AudioCaptioning/test_2")
16
+
17
+ # "Clotho-AQA-AQA/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test" "MuschoMusicQA/test" "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test" "Music-AVQA-AQA_All/test" "MU-LLAMA-AQA/test" "AudioEntailmentQA/test" "AudioEntailmentQA/test_audiocaps" "SongDescriber-AudioCaptioning/train")
18
+
19
+ # "Clotho-v2-AudioCaptioning/test")
20
+
21
+ # "NSynth-Source/test" "NSynth-Instrument/test" "CochlScene-SceneClassification/test")
22
+
23
+ #"Clotho-AQA-AQA/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test" "MuschoMusicQA/test" "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test" "Music-AVQA-AQA_All/test" "MU-LLAMA-AQA/test" "AudioEntailmentQA/test" "AudioEntailmentQA/test_audiocaps" "SongDescriber-AudioCaptioning/train"
24
+
25
+ #"Clotho-AQA-AQA/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test" "MuschoMusicQA/test" "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test" "Music-AVQA-AQA_All/test" "MU-LLAMA-AQA/test" "AudioEntailmentQA/test" "AudioEntailmentQA/test_audiocaps" "SongDescriber-AudioCaptioning/train")
26
+
27
+ # "Clotho-AQA-AQA/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test" "MuschoMusicQA/test" "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test"
28
+
29
+ # "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test"
30
+ #("Clotho-AQA-AQA/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test")
31
+
32
+ for i in "${STRING_LIST[@]}"; do
33
+
34
+ OUTFILE=$LOGDIR/output_$i-2.out
35
+
36
+ TASK=""
37
+ TASK="${TASK} $i"
38
+
39
+ SUBMIT_SUBPROJECT_NAME="llmservice_fm_audio" submit_job \
40
+ --mounts $MOUNTS \
41
+ --name audio-flamingo-$NAME \
42
+ --duration 4 \
43
+ --partition $PARTITION \
44
+ --gpu 2 \
45
+ --nodes 1 \
46
+ --image $IMAGE \
47
+ --email_mode never \
48
+ --outfile $OUTFILE \
49
+ --logdir $LOGDIR \
50
+ --prolog_command "pip install nnAudio; pip install tokenizers==0.20.3; pip install transformers==4.46.3" \
51
+ --command "sh inference.sh $TASK"
52
+ sleep 30
53
+ done
54
+
eval/submit_2.sh ADDED
@@ -0,0 +1,49 @@
1
+ IMAGE="/lustre/fsw/portfolios/adlr/users/zkong/docker/audiolm-0.1/image.sqsh"
2
+ NAME=eval
3
+ PARTITION="polar,polar2,polar3,polar4"
4
+ MOUNTS="/home/zkong,/lustre/fsw/portfolios/adlr/users/zkong,/lustre/fsw/portfolios/adlr/users/sreyang,/home/sreyang"
5
+
6
+ LOGDIR=/lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_fixed_rotary_all_layers_logs_infer
7
+
8
+ # "MMAU/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test"
9
+ # Predefined list of strings
10
+
11
+
12
+ STRING_LIST=("Clotho-v2-AudioCaptioning/test" "NSynth-Source/test" "NSynth-Instrument/test" "CochlScene-SceneClassification/test" "Clotho-AQA-AQA/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test" "MuschoMusicQA/test" "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test" "Music-AVQA-AQA_All/test" "MU-LLAMA-AQA/test" "AudioEntailmentQA/test" "AudioEntailmentQA/test_audiocaps" "SongDescriber-AudioCaptioning/train")
13
+
14
+ # "Clotho-v2-AudioCaptioning/test")
15
+
16
+ # "NSynth-Source/test" "NSynth-Instrument/test" "CochlScene-SceneClassification/test")
17
+
18
+ #"Clotho-AQA-AQA/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test" "MuschoMusicQA/test" "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test" "Music-AVQA-AQA_All/test" "MU-LLAMA-AQA/test" "AudioEntailmentQA/test" "AudioEntailmentQA/test_audiocaps" "SongDescriber-AudioCaptioning/train"
19
+
20
+ #"Clotho-AQA-AQA/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test" "MuschoMusicQA/test" "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test" "Music-AVQA-AQA_All/test" "MU-LLAMA-AQA/test" "AudioEntailmentQA/test" "AudioEntailmentQA/test_audiocaps" "SongDescriber-AudioCaptioning/train")
21
+
22
+ # "Clotho-AQA-AQA/test" "MusicCaps-AudioCaptioning/test" "audiocaps-AudioCaptioning/test" "FSD50k-EventClassification/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test" "MuschoMusicQA/test" "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test"
23
+
24
+ # "CREMA-D-EmotionClassification/train" "ravdess-EmotionClassification/train" "UrbanSound8K-EventClassification/train" "ESC50-EventClassification/train" "DCASE17Task4-SceneClassification/test" "GTZAN-GenreClassification/train" "Medley-solos-DB-InstrClassification/test"
25
+ #("Clotho-AQA-AQA/test" "AudioHalQA/test_compa" "MMAU/test" "AIR-Bench/test")
26
+
27
+ for i in "${STRING_LIST[@]}"; do
28
+
29
+ OUTFILE=$LOGDIR/output_$i-4096_7.out
30
+
31
+ TASK=""
32
+ TASK="${TASK} $i"
33
+
34
+ SUBMIT_SUBPROJECT_NAME="llmservice_fm_audio" submit_job \
35
+ --mounts $MOUNTS \
36
+ --name audio-flamingo-$NAME \
37
+ --duration 4 \
38
+ --partition $PARTITION \
39
+ --gpu 2 \
40
+ --nodes 1 \
41
+ --image $IMAGE \
42
+ --email_mode never \
43
+ --outfile $OUTFILE \
44
+ --logdir $LOGDIR \
45
+ --prolog_command "pip install nnAudio; pip install tokenizers==0.20.3; pip install transformers==4.46.3" \
46
+ --command "sh inference.sh $TASK"
47
+ sleep 30
48
+ done
49
+
my_laion_clap/CLAP/LICENSE ADDED
@@ -0,0 +1,121 @@
1
+ Creative Commons Legal Code
2
+
3
+ CC0 1.0 Universal
4
+
5
+ CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
6
+ LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
7
+ ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
8
+ INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
9
+ REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
10
+ PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
11
+ THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
12
+ HEREUNDER.
13
+
14
+ Statement of Purpose
15
+
16
+ The laws of most jurisdictions throughout the world automatically confer
17
+ exclusive Copyright and Related Rights (defined below) upon the creator
18
+ and subsequent owner(s) (each and all, an "owner") of an original work of
19
+ authorship and/or a database (each, a "Work").
20
+
21
+ Certain owners wish to permanently relinquish those rights to a Work for
22
+ the purpose of contributing to a commons of creative, cultural and
23
+ scientific works ("Commons") that the public can reliably and without fear
24
+ of later claims of infringement build upon, modify, incorporate in other
25
+ works, reuse and redistribute as freely as possible in any form whatsoever
26
+ and for any purposes, including without limitation commercial purposes.
27
+ These owners may contribute to the Commons to promote the ideal of a free
28
+ culture and the further production of creative, cultural and scientific
29
+ works, or to gain reputation or greater distribution for their Work in
30
+ part through the use and efforts of others.
31
+
32
+ For these and/or other purposes and motivations, and without any
33
+ expectation of additional consideration or compensation, the person
34
+ associating CC0 with a Work (the "Affirmer"), to the extent that he or she
35
+ is an owner of Copyright and Related Rights in the Work, voluntarily
36
+ elects to apply CC0 to the Work and publicly distribute the Work under its
37
+ terms, with knowledge of his or her Copyright and Related Rights in the
38
+ Work and the meaning and intended legal effect of CC0 on those rights.
39
+
40
+ 1. Copyright and Related Rights. A Work made available under CC0 may be
41
+ protected by copyright and related or neighboring rights ("Copyright and
42
+ Related Rights"). Copyright and Related Rights include, but are not
43
+ limited to, the following:
44
+
45
+ i. the right to reproduce, adapt, distribute, perform, display,
46
+ communicate, and translate a Work;
47
+ ii. moral rights retained by the original author(s) and/or performer(s);
48
+ iii. publicity and privacy rights pertaining to a person's image or
49
+ likeness depicted in a Work;
50
+ iv. rights protecting against unfair competition in regards to a Work,
51
+ subject to the limitations in paragraph 4(a), below;
52
+ v. rights protecting the extraction, dissemination, use and reuse of data
53
+ in a Work;
54
+ vi. database rights (such as those arising under Directive 96/9/EC of the
55
+ European Parliament and of the Council of 11 March 1996 on the legal
56
+ protection of databases, and under any national implementation
57
+ thereof, including any amended or successor version of such
58
+ directive); and
59
+ vii. other similar, equivalent or corresponding rights throughout the
60
+ world based on applicable law or treaty, and any national
61
+ implementations thereof.
62
+
63
+ 2. Waiver. To the greatest extent permitted by, but not in contravention
64
+ of, applicable law, Affirmer hereby overtly, fully, permanently,
65
+ irrevocably and unconditionally waives, abandons, and surrenders all of
66
+ Affirmer's Copyright and Related Rights and associated claims and causes
67
+ of action, whether now known or unknown (including existing as well as
68
+ future claims and causes of action), in the Work (i) in all territories
69
+ worldwide, (ii) for the maximum duration provided by applicable law or
70
+ treaty (including future time extensions), (iii) in any current or future
71
+ medium and for any number of copies, and (iv) for any purpose whatsoever,
72
+ including without limitation commercial, advertising or promotional
73
+ purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
74
+ member of the public at large and to the detriment of Affirmer's heirs and
75
+ successors, fully intending that such Waiver shall not be subject to
76
+ revocation, rescission, cancellation, termination, or any other legal or
77
+ equitable action to disrupt the quiet enjoyment of the Work by the public
78
+ as contemplated by Affirmer's express Statement of Purpose.
79
+
80
+ 3. Public License Fallback. Should any part of the Waiver for any reason
81
+ be judged legally invalid or ineffective under applicable law, then the
82
+ Waiver shall be preserved to the maximum extent permitted taking into
83
+ account Affirmer's express Statement of Purpose. In addition, to the
84
+ extent the Waiver is so judged Affirmer hereby grants to each affected
85
+ person a royalty-free, non transferable, non sublicensable, non exclusive,
86
+ irrevocable and unconditional license to exercise Affirmer's Copyright and
87
+ Related Rights in the Work (i) in all territories worldwide, (ii) for the
88
+ maximum duration provided by applicable law or treaty (including future
89
+ time extensions), (iii) in any current or future medium and for any number
90
+ of copies, and (iv) for any purpose whatsoever, including without
91
+ limitation commercial, advertising or promotional purposes (the
92
+ "License"). The License shall be deemed effective as of the date CC0 was
93
+ applied by Affirmer to the Work. Should any part of the License for any
94
+ reason be judged legally invalid or ineffective under applicable law, such
95
+ partial invalidity or ineffectiveness shall not invalidate the remainder
96
+ of the License, and in such case Affirmer hereby affirms that he or she
97
+ will not (i) exercise any of his or her remaining Copyright and Related
98
+ Rights in the Work or (ii) assert any associated claims and causes of
99
+ action with respect to the Work, in either case contrary to Affirmer's
100
+ express Statement of Purpose.
101
+
102
+ 4. Limitations and Disclaimers.
103
+
104
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
105
+ surrendered, licensed or otherwise affected by this document.
106
+ b. Affirmer offers the Work as-is and makes no representations or
107
+ warranties of any kind concerning the Work, express, implied,
108
+ statutory or otherwise, including without limitation warranties of
109
+ title, merchantability, fitness for a particular purpose, non
110
+ infringement, or the absence of latent or other defects, accuracy, or
111
+ the present or absence of errors, whether or not discoverable, all to
112
+ the greatest extent permissible under applicable law.
113
+ c. Affirmer disclaims responsibility for clearing rights of other persons
114
+ that may apply to the Work or any use thereof, including without
115
+ limitation any person's Copyright and Related Rights in the Work.
116
+ Further, Affirmer disclaims responsibility for obtaining any necessary
117
+ consents, permissions or other rights required for any use of the
118
+ Work.
119
+ d. Affirmer understands and acknowledges that Creative Commons is not a
120
+ party to this document and has no duty or obligation with respect to
121
+ this CC0 or use of the Work.
my_laion_clap/CLAP/MANIFEST.in ADDED
@@ -0,0 +1,3 @@
1
+ recursive-include src/laion_clap/clap_module/model_configs *.json
2
+ recursive-include src/laion_clap/clap_module bpe_simple_vocab_16e6.txt.gz
3
+ recursive-include src/laion_clap/training audioset_textmap.npy
my_laion_clap/CLAP/README.md ADDED
@@ -0,0 +1,287 @@
1
+ # CLAP
2
+ <p align="center">
3
+ <img src="https://raw.githubusercontent.com/LAION-AI/CLAP/main/assets/logo.PNG" alt="The Contrastive Language-Audio Pretraining Model Architecture" width="60%"/>
4
+ </p>
5
+ <p align="center">
6
+ <a href="https://arxiv.org/abs/2211.06687"><img src="https://img.shields.io/badge/arXiv-2211.06687-brightgreen.svg?style=flat-square"/></a>
7
+ <a href="https://pypi.org/project/laion-clap"><img src="https://badge.fury.io/py/laion-clap.svg"/></a>
8
+ <a href="https://huggingface.co/docs/transformers/v4.27.2/en/model_doc/clap"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Transformers-blue"/></a>
9
+ </p>
10
+
11
+ ### This repository provides representations of audio and text via Contrastive Language-Audio Pretraining (CLAP)
12
+
13
+ With CLAP, you can extract a latent representation of any given audio and text for your own model, or for different downstream tasks.
14
+
15
+ All code comes officially with the following paper, accepted by the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) 2023:
16
+ - [Large-Scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687)
17
+
18
+ **New Updates:**
19
+
20
+ <b>1. We release new CLAP checkpoints pretrained on music and speech data collections from [our dataset collection repo](https://github.com/LAION-AI/audio-dataset).</b>
21
+
22
+ <b>2. The CLAP model is incorporated into and supported by [HuggingFace Transformers](https://huggingface.co/docs/transformers/v4.27.2/en/model_doc/clap). Many thanks to [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://fr.linkedin.com/in/arthur-zucker-8a0445144) for contributing to the HuggingFace support.</b>
23
+
24
+ ## About this project
25
+
26
+ This is a [LAION](https://laion.ai/) project that aims at learning better audio understanding and collecting more audio data.
27
+ It is an open-source project. We adopt the codebase of [open_clip](https://github.com/mlfoundations/open_clip).
28
+
29
+ Many thanks to <a href="https://github.com/cfoster0/CLAP">@cfoster0</a> for allowing us to use his repo name.
30
+
31
+ ## Architecture
32
+ Contrastive Language-Audio Pretraining (CLAP) follows the CLIP (Contrastive Language-Image Pretraining) architecture, as shown below.
33
+ <p align="center">
34
+ <img src="https://raw.githubusercontent.com/LAION-AI/CLAP/main/assets/audioclip-arch.png" alt="The Contrastive Language-Audio Pretraining Model Architecture" width="60%"/>
35
+ </p>
36
+
37
+ ## Quick Start
38
+ We provide the PyPI library for our CLAP model:
39
+ ```bash
40
+ pip install laion-clap
41
+ ```
42
+
43
+ Then you can follow the usage below or refer to [unit_test.py](https://github.com/LAION-AI/CLAP/blob/laion_clap_pip/src/laion_clap/unit_test.py).
44
+
45
+ For the documentation of the API, please refer to [hook.py](https://github.com/LAION-AI/CLAP/blob/main/src/laion_clap/hook.py).
46
+
47
+ ```python
48
+ import numpy as np
49
+ import librosa
50
+ import torch
51
+ import laion_clap
52
+
53
+ # quantization
54
+ def int16_to_float32(x):
55
+ return (x / 32767.0).astype(np.float32)
56
+
57
+
58
+ def float32_to_int16(x):
59
+ x = np.clip(x, a_min=-1., a_max=1.)
60
+ return (x * 32767.).astype(np.int16)
61
+
62
+ model = laion_clap.CLAP_Module(enable_fusion=False)
63
+ model.load_ckpt() # download the default pretrained checkpoint.
64
+
65
+ # Directly get audio embeddings from audio files
66
+ audio_file = [
67
+ '/home/data/test_clap_short.wav',
68
+ '/home/data/test_clap_long.wav'
69
+ ]
70
+ audio_embed = model.get_audio_embedding_from_filelist(x = audio_file, use_tensor=False)
71
+ print(audio_embed[:,-20:])
72
+ print(audio_embed.shape)
73
+
74
+ # Get audio embeddings from audio data
75
+ audio_data, _ = librosa.load('/home/data/test_clap_short.wav', sr=48000) # sample rate should be 48000
76
+ audio_data = audio_data.reshape(1, -1) # Make it (1,T) or (N,T)
77
+ audio_embed = model.get_audio_embedding_from_data(x = audio_data, use_tensor=False)
78
+ print(audio_embed[:,-20:])
79
+ print(audio_embed.shape)
80
+
81
+ # Directly get audio embeddings from audio files, but return torch tensor
82
+ audio_file = [
83
+ '/home/data/test_clap_short.wav',
84
+ '/home/data/test_clap_long.wav'
85
+ ]
86
+ audio_embed = model.get_audio_embedding_from_filelist(x = audio_file, use_tensor=True)
87
+ print(audio_embed[:,-20:])
88
+ print(audio_embed.shape)
89
+
90
+ # Get audio embeddings from audio data
91
+ audio_data, _ = librosa.load('/home/data/test_clap_short.wav', sr=48000) # sample rate should be 48000
92
+ audio_data = audio_data.reshape(1, -1) # Make it (1,T) or (N,T)
93
+ audio_data = torch.from_numpy(int16_to_float32(float32_to_int16(audio_data))).float() # quantize before send it in to the model
94
+ audio_embed = model.get_audio_embedding_from_data(x = audio_data, use_tensor=True)
95
+ print(audio_embed[:,-20:])
96
+ print(audio_embed.shape)
97
+
98
+ # Get text embeddings from texts:
99
+ text_data = ["I love the contrastive learning", "I love the pretrain model"]
100
+ text_embed = model.get_text_embedding(text_data)
101
+ print(text_embed)
102
+ print(text_embed.shape)
103
+
104
+ # Get text embeddings from texts, but return torch tensor:
105
+ text_data = ["I love the contrastive learning", "I love the pretrain model"]
106
+ text_embed = model.get_text_embedding(text_data, use_tensor=True)
107
+ print(text_embed)
108
+ print(text_embed.shape)
109
+
110
+ ```
111
+
112
+ ## Pretrained Models
113
+ The pretrained checkpoints can be found [here](https://huggingface.co/lukewys/laion_clap/tree/main).
114
+ Please refer to the previous section for how to load and run the checkpoints.
115
+ For the PyPI library, [630k-audioset-best.pt](https://huggingface.co/lukewys/laion_clap/blob/main/630k-audioset-best.pt) and [630k-audioset-fusion-best.pt](https://huggingface.co/lukewys/laion_clap/blob/main/630k-audioset-fusion-best.pt) are our default models (non-fusion and fusion, respectively).
116
+
117
+ We further provide the following pretrained models for different use cases:
118
+
119
+ * For general audio less than 10-sec: [630k-audioset-best.pt](https://huggingface.co/lukewys/laion_clap/blob/main/630k-audioset-best.pt) or [630k-best.pt](https://huggingface.co/lukewys/laion_clap/blob/main/630k-best.pt)
120
+ * For general audio with variable-length: [630k-audioset-fusion-best.pt](https://huggingface.co/lukewys/laion_clap/blob/main/630k-audioset-fusion-best.pt) or [630k-fusion-best.pt](https://huggingface.co/lukewys/laion_clap/blob/main/630k-fusion-best.pt)
121
+ * For music: [music_audioset_epoch_15_esc_90.14.pt](https://huggingface.co/lukewys/laion_clap/blob/main/music_audioset_epoch_15_esc_90.14.pt)
122
+ * For music and speech: [music_speech_epoch_15_esc_89.25.pt](https://huggingface.co/lukewys/laion_clap/blob/main/music_speech_epoch_15_esc_89.25.pt)
123
+ * For speech, music and general audio: [music_speech_audioset_epoch_15_esc_89.98.pt](https://huggingface.co/lukewys/laion_clap/blob/main/music_speech_audioset_epoch_15_esc_89.98.pt)
124
+
125
+ The checkpoint listed here for each model setting is the one with the highest average mAP score during training.
126
+ The average mAP score is calculated by averaging 4 scores: A-->T mAP@10 on AudioCaps, T-->A mAP@10 on AudioCaps, A-->T mAP@10 on Clotho, and T-->A mAP@10 on Clotho.
127
+
128
+ To use the above pretrained models, you need to load the checkpoint yourself, as shown below.
129
+
130
+ Update 2023.4.7: we have released 3 larger CLAP models trained on music and speech datasets in addition to LAION-Audio-630k. Here are descriptions of the models and their performance:
131
+
132
+ - `music_speech_audioset_epoch_15_esc_89.98.pt`: trained on music + speech + Audioset + LAION-Audio-630k. The zeroshot ESC50 performance is 89.98%, the GTZAN performance is 51%.
133
+ - `music_audioset_epoch_15_esc_90.14.pt`: trained on music + Audioset + LAION-Audio-630k. The zeroshot ESC50 performance is 90.14%, the GTZAN performance is 71%.
134
+ - `music_speech_epoch_15_esc_89.25.pt`: trained on music + speech + LAION-Audio-630k. The zeroshot ESC50 performance is 89.25%, the GTZAN performance is 69%.
135
+
136
+ These models use a larger audio encoder. To load them using the pip API:
137
+ ```python
138
+ import laion_clap
139
+ model = laion_clap.CLAP_Module(enable_fusion=False, amodel= 'HTSAT-base')
140
+ model.load_ckpt('checkpoint_path/checkpoint_name.pt')
141
+ ```
142
+
143
+ Please note that this is a temporary release for people who are working on larger-scale downstream tasks.
144
+ We will release a more comprehensive version of the model with detailed experiments in the future.
145
+ Please use these models at your own risk.
146
+
147
+ * None of the new checkpoints were trained with fusion. The training dataset size for `music_speech_audioset_epoch_15_esc_89.98.pt` is around 4M samples. The zeroshot GTZAN score is evaluated using the prompt `This audio is a <genre> song.`
148
+
149
+ <!-- We provide the CLAP's performance on audio classification tasks under the zero-shot setting or the supervised setting. More results can be found at our paper.
150
+ <p align="center">
151
+ <img src="https://raw.githubusercontent.com/LAION-AI/CLAP/main/assets/clap-zeroshot.PNG" alt="Zero-shot Performance" width="100%"/>
152
+ </p> -->
153
+
154
+
155
+
156
+
157
+ ## Environment Installation
158
+ If you want to inspect and reuse our model in your project instead of directly using the pip library, you need to install the same environment as we use; please run the following commands:
159
+ ```bash
160
+ conda create -n clap python=3.10
161
+ conda activate clap
162
+ git clone https://github.com/LAION-AI/CLAP.git
163
+ cd CLAP
164
+ # you can also install pytorch by following the official instruction (https://pytorch.org/get-started/locally/)
165
+ pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0+cu113 -f https://download.pytorch.org/whl/torch_stable.html
166
+ pip install -r requirements.txt
167
+ ```
168
+ ## Dataset format
169
+ We use training data in webdataset format. For details of our dataset please see https://github.com/LAION-AI/audio-dataset.
170
+
171
+ Due to copyright reasons, we cannot release the dataset we trained this model on. However, we released [LAION-audio-630K](https://github.com/LAION-AI/audio-dataset/tree/main/laion-audio-630k), the data source we used to compose the dataset, with links to each audio clip and its caption. Please refer to [LAION-audio-630K](https://github.com/LAION-AI/audio-dataset/tree/main/laion-audio-630k) for more details. You can download the dataset, preprocess it on your own, and train locally. To train on a local dataset, please replace `--remotedata` in the training scripts (see the [experiment_scripts](./experiment_scripts) folder) with `--datasetpath <your dir to datasets>`.
172
+
173
+ You can find an example of our dataset format in [here](https://drive.google.com/drive/folders/1scyH43eQAcrBz-5fAw44C6RNBhC3ejvX?usp=sharing).
174
+ It contains the full ESC50 dataset, split according to the first 5-fold split.
175
+
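+ If you just want to peek at what a shard contains before wiring up training, you can stream it with the `webdataset` package. This is only an illustrative sketch: the shard path and the `flac`/`json` keys follow the ESC50 example linked above, and the caption field name (`text` here, `tag` in the ESC50 metadata) varies by dataset, so inspect your own shards.
+
+ ```python
+ import io
+ import json
+
+ import soundfile as sf
+ import webdataset as wds
+
+ # Each sample is a dict keyed by file extension, with raw bytes as values.
+ dataset = wds.WebDataset("ESC50_1/test/0.tar")
+ for sample in dataset:
+     meta = json.loads(sample["json"])
+     audio, sr = sf.read(io.BytesIO(sample["flac"]))
+     print(sample["__key__"], sr, audio.shape, meta.get("text", meta.get("tag")))
+     break
+ ```
+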
176
+ ## Training, Fine-tuning and Evaluation
177
+ Please find the scripts for training, fine-tuning and evaluation (zero-shot and retrieval) in the [experiment_scripts](./experiment_scripts) folder.
178
+ The scripts included there are the ones we used to train our model on a SLURM cluster.
179
+ You need to change the script to fit your own environment.
180
+ For example, in a single machine multi-GPU setting, you might want to use `torchrun` instead of `srun` to run the script.
181
+ To train on a single GPU machine, use `CUDA_VISIBLE_DEVICES=0 python -m ...` instead of `srun`.
182
+ We use [Weights and Biases](https://wandb.ai/site) for experiment logging. You need to configure Weights and Biases in your environment.
183
+ To train on a local dataset, please replace `--remotedata` in the training scripts (see the [experiment_scripts](./experiment_scripts) folder) with `--datasetpath <your dir to datasets>`.
184
+
185
+ ## Core Code
186
+ Please refer to [main.py](https://github.com/LAION-AI/CLAP/blob/laion_clap_pip/src/laion_clap/training/main.py), [train.py](https://github.com/LAION-AI/CLAP/blob/laion_clap_pip/src/laion_clap/training/train.py), [data.py](https://github.com/LAION-AI/CLAP/blob/laion_clap_pip/src/laion_clap/training/data.py), and [model.py](https://github.com/LAION-AI/CLAP/blob/laion_clap_pip/src/laion_clap/clap_module/model.py) to quickly get familiar with our model.
187
+
188
+
189
+ ## Reproducibility
190
+ An example of the preprocessed Clotho dataset in webdataset format can be downloaded [here](https://drive.google.com/drive/folders/1mU9mBOe11jTFCrQRJQsUa4S-3TlNuYoI?usp=sharing) (by downloading, you agree to the license described in the [Clotho dataset](https://zenodo.org/record/3490684#.Y9ALPeyZP1w)). The audio encoder pretrained on 48kHz AudioSet can be found [here](https://drive.google.com/drive/folders/1SMQyzJvc6DwJNuhQ_WI8tlCFL5HG2vk6?usp=sharing), where `HTSAT-fullset-imagenet-map=0.467.ckpt` is the checkpoint used to initialize our HTSAT audio encoder. You should get similar results by loading from the audio encoder checkpoint and training on the same dataset.
191
+
192
+ The script to train the model on the Clotho dataset is included [here](experiment_scripts/train-only-clotho.sh). You need to change `datasetpath` and `pretrained-audio` to point to your own directories. You can check the [report](https://stability.wandb.io/clap/clap/reports/CLAP-trained-on-Clotho-dataset--VmlldzoyNzY?accessToken=c0erq9hhp7h880jclihd9j9if679s6bylwto33vo14yo5jg40ppe38qeoafoonpz) of the training script on a single A100 GPU for reference.
193
+
194
+ Because most of the datasets have copyright restrictions, unfortunately we cannot directly share the other preprocessed datasets. The captions generated by the keyword-to-caption model for AudioSet can be found [here](https://github.com/LAION-AI/audio-dataset/tree/main/laion-audio-630k#keyword-to-caption-augmentation).
195
+
196
+
197
+ ## Zeroshot Classification with ESC50 official split
198
+
199
+ Here is example code to run zeroshot classification on the **first** ESC50 official split with the pip API:
200
+
201
+ ```python
202
+ import laion_clap
203
+ import glob
204
+ import json
205
+ import torch
206
+ import numpy as np
207
+
208
+ device = torch.device('cuda:0')
209
+
210
+ # download https://drive.google.com/drive/folders/1scyH43eQAcrBz-5fAw44C6RNBhC3ejvX?usp=sharing and extract ./ESC50_1/test/0.tar to ./ESC50_1/test/
211
+ esc50_test_dir = './ESC50_1/test/*/'
212
+ class_index_dict_path = './class_labels/ESC50_class_labels_indices_space.json'
213
+
214
+ # Load the model
215
+ model = laion_clap.CLAP_Module(enable_fusion=False, device=device)
216
+ model.load_ckpt()
217
+
218
+ # Get the class index dict
219
+ class_index_dict = {v: k for v, k in json.load(open(class_index_dict_path)).items()}
220
+
221
+ # Get all the data
222
+ audio_files = sorted(glob.glob(esc50_test_dir + '**/*.flac', recursive=True))
223
+ json_files = sorted(glob.glob(esc50_test_dir + '**/*.json', recursive=True))
224
+ ground_truth_idx = [class_index_dict[json.load(open(jf))['tag'][0]] for jf in json_files]
225
+
226
+ with torch.no_grad():
227
+ ground_truth = torch.tensor(ground_truth_idx).view(-1, 1)
228
+
229
+ # Get text features
230
+ all_texts = ["This is a sound of " + t for t in class_index_dict.keys()]
231
+ text_embed = model.get_text_embedding(all_texts)
232
+ audio_embed = model.get_audio_embedding_from_filelist(x=audio_files)
233
+
234
+ ranking = torch.argsort(torch.tensor(audio_embed) @ torch.tensor(text_embed).t(), descending=True)
235
+ preds = torch.where(ranking == ground_truth)[1]
236
+ preds = preds.cpu().numpy()
237
+
238
+ metrics = {}
239
+ metrics[f"mean_rank"] = preds.mean() + 1
240
+ metrics[f"median_rank"] = np.floor(np.median(preds)) + 1
241
+ for k in [1, 5, 10]:
242
+ metrics[f"R@{k}"] = np.mean(preds < k)
243
+ # map@10
244
+ metrics[f"mAP@10"] = np.mean(np.where(preds < 10, 1 / (preds + 1), 0.0))
245
+
246
+ print(
247
+ f"Zeroshot Classification Results: "
248
+ + "\t".join([f"{k}: {round(v, 4):.4f}" for k, v in metrics.items()])
249
+ )
250
+ ```
251
+
252
+ For the ESC50 dataset, you can either download our processed ESC50 in webdataset format
253
+ from [here](https://drive.google.com/drive/folders/1scyH43eQAcrBz-5fAw44C6RNBhC3ejvX?usp=sharing), and extract the
254
+ `./test/0.tar` to `./test/`. Or you could download the original ESC50 dataset and
255
+ preprocess the labels into the format of `class_labels/ESC50_class_labels_indices_space.json` yourself (replace `_` with space); a minimal sketch of that conversion follows.
256
+
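+ The sketch below assumes you start from the original ESC-50 `meta/esc50.csv` (its `category` column uses underscores and `target` holds the class index) and that the JSON is a plain `{"class name with spaces": index}` mapping, which is what the zeroshot snippet above expects; treat the paths as placeholders for your own layout.
+
+ ```python
+ import csv
+ import json
+
+ labels = {}
+ with open("ESC-50-master/meta/esc50.csv") as f:
+     for row in csv.DictReader(f):
+         # "chirping_birds" -> "chirping birds"
+         labels[row["category"].replace("_", " ")] = int(row["target"])
+
+ with open("class_labels/ESC50_class_labels_indices_space.json", "w") as f:
+     json.dump(labels, f, indent=2)
+ ```
+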
257
+ The result should be the same as the following:
258
+
259
+ For `model = laion_clap.CLAP_Module(enable_fusion=True, device=device)`: `mean_rank: 1.2425 median_rank: 1.0000 R@1: 0.9050 R@5: 0.9900 R@10: 0.9925 mAP@10: 0.9407`
260
+
261
+ For `model = laion_clap.CLAP_Module(enable_fusion=False, device=device)`: `mean_rank: 1.1450 median_rank: 1.0000 R@1: 0.9275 R@5: 0.9975 R@10: 1.0000 mAP@10: 0.9556`
262
+
263
+ Note that these results are slightly higher than those reported in the paper, because we use the train + test data of ESC50 and remove the data overlap from the other training datasets (mainly Freesound).
264
+
265
+ ## Citation
266
+ If you find this project and the LAION-Audio-630K dataset useful, please cite our paper:
267
+ ```
268
+ @inproceedings{laionclap2023,
269
+ title = {Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation},
270
+ author = {Wu*, Yusong and Chen*, Ke and Zhang*, Tianyu and Hui*, Yuchen and Berg-Kirkpatrick, Taylor and Dubnov, Shlomo},
271
+ booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP},
272
+ year = {2023}
273
+ }
274
+ @inproceedings{htsatke2022,
275
+ author = {Ke Chen and Xingjian Du and Bilei Zhu and Zejun Ma and Taylor Berg-Kirkpatrick and Shlomo Dubnov},
276
+ title = {HTS-AT: A Hierarchical Token-Semantic Audio Transformer for Sound Classification and Detection},
277
+ booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP},
278
+ year = {2022}
279
+ }
280
+ ```
281
+
282
+ ## Acknowledgements
283
+
284
+ This project is a work in progress, so the codebase and model might not be perfect or bug-free.
285
+ We greatly appreciate any kind of contribution or issue raised.
286
+ If you find a bug or have any suggestions, please feel free to open an issue or contact us.
287
+ If you would like to actively contribute to this project, please join the LAION Discord.
my_laion_clap/CLAP/assets/audioclip-arch.png ADDED
my_laion_clap/CLAP/assets/clap-zeroshot.PNG ADDED
my_laion_clap/CLAP/assets/logo.PNG ADDED
my_laion_clap/CLAP/experiment_scripts/esc50_api.py ADDED
@@ -0,0 +1,48 @@
1
+ import laion_clap
2
+ import glob
3
+ import json
4
+ import torch
5
+ import numpy as np
6
+
7
+ device = torch.device('cuda:0')
8
+
9
+ # download https://drive.google.com/drive/folders/1scyH43eQAcrBz-5fAw44C6RNBhC3ejvX?usp=sharing and extract ./ESC50_1/test/0.tar to ./ESC50_1/test/
10
+ esc50_test_dir = './ESC50_1/test/*/'
11
+ class_index_dict_path = '/fsx/yusong/CLAP/class_labels/ESC50_class_labels_indices_space.json'
12
+
13
+ # Load the model
14
+ model = laion_clap.CLAP_Module(enable_fusion=False, device=device)
15
+ model.load_ckpt()
16
+
17
+ # Get the class index dict
18
+ class_index_dict = {v: k for v, k in json.load(open(class_index_dict_path)).items()}
19
+
20
+ # Get all the data
21
+ audio_files = sorted(glob.glob(esc50_test_dir + '**/*.flac', recursive=True))
22
+ json_files = sorted(glob.glob(esc50_test_dir + '**/*.json', recursive=True))
23
+ ground_truth_idx = [class_index_dict[json.load(open(jf))['tag'][0]] for jf in json_files]
24
+
25
+ with torch.no_grad():
26
+ ground_truth = torch.tensor(ground_truth_idx).view(-1, 1)
27
+
28
+ # Get text features
29
+ all_texts = ["This is a sound of " + t for t in class_index_dict.keys()]
30
+ text_embed = model.get_text_embedding(all_texts)
31
+ audio_embed = model.get_audio_embedding_from_filelist(x=audio_files)
32
+
33
+ ranking = torch.argsort(torch.tensor(audio_embed) @ torch.tensor(text_embed).t(), descending=True)
34
+ preds = torch.where(ranking == ground_truth)[1]
35
+ preds = preds.cpu().numpy()
36
+
37
+ metrics = {}
38
+ metrics[f"mean_rank"] = preds.mean() + 1
39
+ metrics[f"median_rank"] = np.floor(np.median(preds)) + 1
40
+ for k in [1, 5, 10]:
41
+ metrics[f"R@{k}"] = np.mean(preds < k)
42
+ # map@10
43
+ metrics[f"mAP@10"] = np.mean(np.where(preds < 10, 1 / (preds + 1), 0.0))
44
+
45
+ print(
46
+ f"Zeroshot Classification Results: "
47
+ + "\t".join([f"{k}: {round(v, 4):.4f}" for k, v in metrics.items()])
48
+ )
my_laion_clap/CLAP/experiment_scripts/eval_retrieval_freesound.sh ADDED
@@ -0,0 +1,63 @@
1
+ #!/bin/bash
2
+ #SBATCH --comment clap
3
+ #SBATCH --partition=g40423
4
+ #SBATCH --job-name=mclap
5
+ #SBATCH --nodes 3
6
+ #SBATCH --ntasks-per-node 8
7
+ #SBATCH --cpus-per-gpu=6
8
+ #SBATCH --exclusive
9
+ #SBATCH --output=%x_%j.out
10
+
11
+ module load openmpi
12
+ module load cuda/11.7
13
+ export NCCL_PROTO=simple
14
+ export FI_EFA_FORK_SAFE=1
15
+ export FI_LOG_LEVEL=1
16
+ export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
17
+ export NCCL_DEBUG=info
18
+ export OMPI_MCA_mtl_base_verbose=1
19
+ export FI_EFA_ENABLE_SHM_TRANSFER=0
20
+ export FI_PROVIDER=efa
21
+ export FI_EFA_TX_MIN_CREDITS=64
22
+ export NCCL_TREE_THRESHOLD=0
23
+
24
+ # sent to sub script
25
+ export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
26
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
27
+ export MASTER_PORT=12802
28
+ export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
29
+
30
+ echo go $COUNT_NODE
31
+ echo $HOSTNAMES
32
+
33
+ source /fsx/yusong/clap/bin/activate
34
+ cd /fsx/yusong/CLAP/src
35
+ export TRANSFORMERS_CACHE=/fsx/yusong/transformers_cache
36
+
37
+ srun --comment clap --cpu_bind=v --accel-bind=gn python -m evaluate.eval_retrieval_main \
38
+ --save-frequency 5 \
39
+ --save-top-performance 3 \
40
+ --save-most-recent \
41
+ --dataset-type="webdataset" \
42
+ --precision="fp32" \
43
+ --warmup 0 \
44
+ --batch-size=512 \
45
+ --wd=0.0 \
46
+ --epochs=50 \
47
+ --workers=6 \
48
+ --use-bn-sync \
49
+ --freeze-text \
50
+ --amodel HTSAT-tiny \
51
+ --tmodel roberta \
52
+ --report-to "wandb" \
53
+ --wandb-notes "10.17-freesound-dataset-4#" \
54
+ --datasetnames "freesound_no_overlap_noesc50" \
55
+ --datasetinfos "train" \
56
+ --seed 3407 \
57
+ --remotedata \
58
+ --logs /fsx/clap_logs \
59
+ --gather-with-grad \
60
+ --openai-model-cache-dir /fsx/yusong/transformers_cache \
61
+ --data-filling "repeatpad" \
62
+ --data-truncating "rand_trunc" \
63
+ --pretrained="/fsx/clap_logs/2022_10_17-02_08_21-model_HTSAT-tiny-lr_0.0001-b_96-j_6-p_fp32/checkpoints"
my_laion_clap/CLAP/experiment_scripts/finetune-esc50.sh ADDED
@@ -0,0 +1,70 @@
1
+ #!/bin/bash
2
+ #SBATCH --comment clap
3
+ #SBATCH --partition=g40423
4
+ #SBATCH --job-name=mclap
5
+ #SBATCH --nodes 3
6
+ #SBATCH --ntasks-per-node 8
7
+ #SBATCH --cpus-per-gpu=6
8
+ #SBATCH --exclusive
9
+ #SBATCH --output=%x_%j.out
10
+
11
+ module load openmpi
12
+ module load cuda/11.7
13
+ export NCCL_PROTO=simple
14
+ export FI_EFA_FORK_SAFE=1
15
+ export FI_LOG_LEVEL=1
16
+ export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
17
+ export NCCL_DEBUG=info
18
+ export OMPI_MCA_mtl_base_verbose=1
19
+ export FI_EFA_ENABLE_SHM_TRANSFER=0
20
+ export FI_PROVIDER=efa
21
+ export FI_EFA_TX_MIN_CREDITS=64
22
+ export NCCL_TREE_THRESHOLD=0
23
+
24
+ # sent to sub script
25
+ export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
26
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
27
+ export MASTER_PORT=12802
28
+ export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
29
+
30
+ echo go $COUNT_NODE
31
+ echo $HOSTNAMES
32
+
33
+ source /fsx/yusong/clap/bin/activate
34
+ cd /fsx/yusong/CLAP/src
35
+ export TRANSFORMERS_CACHE=/fsx/yusong/transformers_cache
36
+
37
+ srun --comment clap --cpu_bind=v --accel-bind=gn python -m evaluate.eval_linear_probe \
38
+ --save-frequency 50 \
39
+ --save-top-performance 3 \
40
+ --save-most-recent \
41
+ --dataset-type="webdataset" \
42
+ --precision="fp32" \
43
+ --warmup 0 \
44
+ --batch-size=160 \
45
+ --lr=1e-4 \
46
+ --wd=0.1 \
47
+ --epochs=100 \
48
+ --workers=4 \
49
+ --use-bn-sync \
50
+ --freeze-text \
51
+ --amodel PANN-14 \
52
+ --tmodel roberta \
53
+ --report-to "wandb" \
54
+ --wandb-notes "10.14-finetune-esc50" \
55
+ --datasetnames "esc50" \
56
+ --datasetinfos "train" \
57
+ --seed 3407 \
58
+ --remotedata \
59
+ --logs /fsx/clap_logs \
60
+ --gather-with-grad \
61
+ --lp-loss="ce" \
62
+ --lp-metrics="acc" \
63
+ --lp-lr=1e-4 \
64
+ --lp-mlp \
65
+ --class-label-path="../class_labels/ESC50_class_labels_indices_space.json" \
66
+ --openai-model-cache-dir /fsx/yusong/transformers_cache \
67
+ --pretrained="/fsx/clap_logs/2022_10_14-04_05_14-model_PANN-14-lr_0.0001-b_160-j_6-p_fp32/checkpoints" \
68
+ --data-filling "repeatpad" \
69
+ --data-truncating "rand_trunc" \
70
+ --optimizer "adam"
my_laion_clap/CLAP/experiment_scripts/finetune-fsd50k.sh ADDED
@@ -0,0 +1,70 @@
1
+ #!/bin/bash
2
+ #SBATCH --comment clap
3
+ #SBATCH --partition=g40423
4
+ #SBATCH --job-name=mclap
5
+ #SBATCH --nodes 3
6
+ #SBATCH --ntasks-per-node 8
7
+ #SBATCH --cpus-per-gpu=6
8
+ #SBATCH --exclusive
9
+ #SBATCH --output=%x_%j.out
10
+
11
+ module load openmpi
12
+ module load cuda/11.7
13
+ export NCCL_PROTO=simple
14
+ export FI_EFA_FORK_SAFE=1
15
+ export FI_LOG_LEVEL=1
16
+ export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
17
+ export NCCL_DEBUG=info
18
+ export OMPI_MCA_mtl_base_verbose=1
19
+ export FI_EFA_ENABLE_SHM_TRANSFER=0
20
+ export FI_PROVIDER=efa
21
+ export FI_EFA_TX_MIN_CREDITS=64
22
+ export NCCL_TREE_THRESHOLD=0
23
+
24
+ # sent to sub script
25
+ export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
26
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
27
+ export MASTER_PORT=12802
28
+ export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
29
+
30
+ echo go $COUNT_NODE
31
+ echo $HOSTNAMES
32
+
33
+ source /fsx/yusong/clap/bin/activate
34
+ cd /fsx/yusong/CLAP/src
35
+ export TRANSFORMERS_CACHE=/fsx/yusong/transformers_cache
36
+
37
+ srun --comment clap --cpu_bind=v --accel-bind=gn python -m evaluate.eval_linear_probe \
38
+ --save-frequency 50 \
39
+ --save-top-performance 3 \
40
+ --save-most-recent \
41
+ --dataset-type="webdataset" \
42
+ --precision="fp32" \
43
+ --warmup 0 \
44
+ --batch-size=160 \
45
+ --lr=1e-4 \
46
+ --wd=0.1 \
47
+ --epochs=100 \
48
+ --workers=4 \
49
+ --use-bn-sync \
50
+ --freeze-text \
51
+ --amodel PANN-14 \
52
+ --tmodel roberta \
53
+ --report-to "wandb" \
54
+ --wandb-notes "10.14-finetune-fsd50k" \
55
+ --datasetnames "fsd50k_class_label" \
56
+ --datasetinfos "train" \
57
+ --seed 3407 \
58
+ --remotedata \
59
+ --logs /fsx/clap_logs \
60
+ --gather-with-grad \
61
+ --lp-loss="bce" \
62
+ --lp-metrics="map" \
63
+ --lp-lr=1e-4 \
64
+ --lp-mlp \
65
+ --class-label-path="../class_labels/FSD50k_class_labels_indices.json" \
66
+ --openai-model-cache-dir /fsx/yusong/transformers_cache \
67
+ --pretrained="/fsx/clap_logs/2022_10_14-04_05_14-model_PANN-14-lr_0.0001-b_160-j_6-p_fp32/checkpoints" \
68
+ --data-filling "repeatpad" \
69
+ --data-truncating "rand_trunc" \
70
+ --optimizer "adam"
my_laion_clap/CLAP/experiment_scripts/htsat-roberta-large-dataset-fusion.sh ADDED
@@ -0,0 +1,70 @@
+ #!/bin/bash
+ #SBATCH --comment clap
+ #SBATCH --partition=g40423
+ #SBATCH --job-name=mclap
+ #SBATCH --nodes 3
+ #SBATCH --ntasks-per-node 8
+ #SBATCH --cpus-per-gpu=6
+ #SBATCH --exclusive
+ #SBATCH --output=%x_%j.out
+
+ module load openmpi
+ module load cuda/11.7
+ export NCCL_PROTO=simple
+ export FI_EFA_FORK_SAFE=1
+ export FI_LOG_LEVEL=1
+ export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
+ export NCCL_DEBUG=info
+ export OMPI_MCA_mtl_base_verbose=1
+ export FI_EFA_ENABLE_SHM_TRANSFER=0
+ export FI_PROVIDER=efa
+ export FI_EFA_TX_MIN_CREDITS=64
+ export NCCL_TREE_THRESHOLD=0
+
+ # sent to sub script
+ export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+ export MASTER_PORT=12802
+ export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
+
+ echo go $COUNT_NODE
+ echo $HOSTNAMES
+
+ source /fsx/yusong/clap/bin/activate
+ cd /fsx/yusong/CLAP/src
+ export TRANSFORMERS_CACHE=/fsx/yusong/transformers_cache
+
+ srun --comment clap --cpu_bind=v --accel-bind=gn python -m training.main \
+ --save-frequency 5 \
+ --save-top-performance 3 \
+ --save-most-recent \
+ --dataset-type="webdataset" \
+ --precision="fp32" \
+ --batch-size=96 \
+ --lr=1e-4 \
+ --wd=0.0 \
+ --epochs=45 \
+ --workers=6 \
+ --use-bn-sync \
+ --amodel HTSAT-tiny \
+ --tmodel roberta \
+ --warmup 3200 \
+ --report-to "wandb" \
+ --wandb-notes "10.16-clap-dataset-2#-htsat-roberta-fusion" \
+ --datasetnames "Clotho" "audiocaps" "BBCSoundEffects" "free_to_use_sounds" "paramount_motion" "sonniss_game_effects" "wesoundeffects" "freesound_no_overlap_noesc50" "audiostock" "epidemic_sound_effects" "fsd50k_class_label" "MACS" "WavText5K" \
+ --full-train-dataset "BBCSoundEffects" "free_to_use_sounds" "paramount_motion" "sonniss_game_effects" "wesoundeffects" "audiostock" "epidemic_sound_effects" "fsd50k_class_label" \
+ --exclude-eval-dataset "freesound_no_overlap_noesc50" "MACS" "WavText5K" "fsd50k_class_label" \
+ --datasetinfos "train" "unbalanced_train" \
+ --top-k-checkpoint-select-dataset="Clotho-test" \
+ --top-k-checkpoint-select-metric="mAP@10" \
+ --openai-model-cache-dir /fsx/yusong/transformers_cache \
+ --logs /fsx/clap_logs \
+ --seed 3407 \
+ --remotedata \
+ --gather-with-grad \
+ --optimizer "adam" \
+ --data-filling "repeatpad" \
+ --data-truncating "fusion" \
+ --enable-fusion \
+ --fusion-type "aff_2d" \
+ --pretrained-audio /fsx/yusong/audio_pretrained_model/HTSAT-fullset-imagenet-map=0.467.ckpt
my_laion_clap/CLAP/experiment_scripts/train-htsat-roberta.sh ADDED
@@ -0,0 +1,66 @@
+ #!/bin/bash
+ #SBATCH --comment clap
+ #SBATCH --partition=g40423
+ #SBATCH --job-name=mclap
+ #SBATCH --nodes 3
+ #SBATCH --ntasks-per-node 8
+ #SBATCH --cpus-per-gpu=6
+ #SBATCH --exclusive
+ #SBATCH --output=%x_%j.out
+
+ module load openmpi
+ module load cuda/11.7
+ export NCCL_PROTO=simple
+ export FI_EFA_FORK_SAFE=1
+ export FI_LOG_LEVEL=1
+ export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
+ export NCCL_DEBUG=info
+ export OMPI_MCA_mtl_base_verbose=1
+ export FI_EFA_ENABLE_SHM_TRANSFER=0
+ export FI_PROVIDER=efa
+ export FI_EFA_TX_MIN_CREDITS=64
+ export NCCL_TREE_THRESHOLD=0
+
+ # sent to sub script
+ export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+ export MASTER_PORT=12802
+ export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
+
+ echo go $COUNT_NODE
+ echo $HOSTNAMES
+
+ source /fsx/yusong/clap/bin/activate
+ cd /fsx/yusong/CLAP/src
+ export TRANSFORMERS_CACHE=/fsx/yusong/transformers_cache
+
+ srun --comment clap --cpu_bind=v --accel-bind=gn python -m training.main \
+ --save-frequency 5 \
+ --save-top-performance 3 \
+ --save-most-recent \
+ --dataset-type="webdataset" \
+ --precision="fp32" \
+ --batch-size=96 \
+ --lr=1e-4 \
+ --wd=0.0 \
+ --epochs=45 \
+ --workers=6 \
+ --use-bn-sync \
+ --amodel HTSAT-tiny \
+ --tmodel roberta \
+ --warmup 3200 \
+ --report-to "wandb" \
+ --wandb-notes "10.16-clap-dataset-1#-htsat-roberta" \
+ --datasetnames "Clotho" "audiocaps" \
+ --datasetinfos "train" "unbalanced_train" \
+ --top-k-checkpoint-select-dataset="Clotho-test" \
+ --top-k-checkpoint-select-metric="mAP@10" \
+ --openai-model-cache-dir /fsx/yusong/transformers_cache \
+ --logs /fsx/clap_logs \
+ --seed 3407 \
+ --remotedata \
+ --gather-with-grad \
+ --optimizer "adam" \
+ --data-filling "repeatpad" \
+ --data-truncating "rand_trunc" \
+ --pretrained-audio /fsx/yusong/audio_pretrained_model/HTSAT-fullset-imagenet-map=0.467.ckpt
my_laion_clap/CLAP/experiment_scripts/train-only-clotho.sh ADDED
@@ -0,0 +1,28 @@
+ python -m laion_clap.training.main \
+ --save-frequency 5 \
+ --save-top-performance 3 \
+ --save-most-recent \
+ --dataset-type="webdataset" \
+ --datasetpath="<to-your-directory-containing-Clotho-not-the-path-to-Clotho>" \
+ --precision="fp32" \
+ --batch-size=96 \
+ --lr=1e-4 \
+ --wd=0.0 \
+ --epochs=45 \
+ --workers=6 \
+ --use-bn-sync \
+ --amodel HTSAT-tiny \
+ --tmodel roberta \
+ --warmup 3200 \
+ --datasetnames "Clotho" \
+ --datasetinfos "train" \
+ --top-k-checkpoint-select-dataset="Clotho-test" \
+ --top-k-checkpoint-select-metric="mAP@10" \
+ --logs 'logs' \
+ --seed 3407 \
+ --gather-with-grad \
+ --optimizer "adam" \
+ --data-filling "repeatpad" \
+ --data-truncating "rand_trunc" \
+ --pretrained-audio '<path-to>/HTSAT-fullset-imagenet-map=0.467.ckpt' \
+ --prefetch-factor 2
my_laion_clap/CLAP/experiment_scripts/train-pann-roberta.sh ADDED
@@ -0,0 +1,66 @@
+ #!/bin/bash
+ #SBATCH --comment clap
+ #SBATCH --partition=g40423
+ #SBATCH --job-name=mclap
+ #SBATCH --nodes 3
+ #SBATCH --ntasks-per-node 8
+ #SBATCH --cpus-per-gpu=6
+ #SBATCH --exclusive
+ #SBATCH --output=%x_%j.out
+
+ module load openmpi
+ module load cuda/11.7
+ export NCCL_PROTO=simple
+ export FI_EFA_FORK_SAFE=1
+ export FI_LOG_LEVEL=1
+ export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
+ export NCCL_DEBUG=info
+ export OMPI_MCA_mtl_base_verbose=1
+ export FI_EFA_ENABLE_SHM_TRANSFER=0
+ export FI_PROVIDER=efa
+ export FI_EFA_TX_MIN_CREDITS=64
+ export NCCL_TREE_THRESHOLD=0
+
+ # sent to sub script
+ export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+ export MASTER_PORT=12802
+ export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
+
+ echo go $COUNT_NODE
+ echo $HOSTNAMES
+
+ source /fsx/yusong/clap/bin/activate
+ cd /fsx/yusong/CLAP/src
+ export TRANSFORMERS_CACHE=/fsx/yusong/transformers_cache
+
+ srun --comment clap --cpu_bind=v --accel-bind=gn python -m training.main \
+ --save-frequency 5 \
+ --save-top-performance 3 \
+ --save-most-recent \
+ --dataset-type="webdataset" \
+ --precision="fp32" \
+ --batch-size=96 \
+ --lr=1e-4 \
+ --wd=0.0 \
+ --epochs=45 \
+ --workers=6 \
+ --use-bn-sync \
+ --amodel PANN-14 \
+ --tmodel roberta \
+ --warmup 500 \
+ --report-to "wandb" \
+ --wandb-notes "10.16-clap-dataset-1#-pann-roberta" \
+ --datasetnames "Clotho" "audiocaps" \
+ --datasetinfos "train" "unbalanced_train" \
+ --top-k-checkpoint-select-dataset="Clotho-test" \
+ --top-k-checkpoint-select-metric="mAP@10" \
+ --openai-model-cache-dir /fsx/yusong/transformers_cache \
+ --logs /fsx/clap_logs \
+ --seed 3407 \
+ --remotedata \
+ --gather-with-grad \
+ --optimizer "adam" \
+ --data-filling "repeatpad" \
+ --data-truncating "rand_trunc" \
+ --pretrained-audio /fsx/yusong/audio_pretrained_model/PANN-fullset-map=0.439.ckpt
my_laion_clap/CLAP/experiment_scripts/zeroshot_esc50.sh ADDED
@@ -0,0 +1,19 @@
+ # run from CLAP directory
+ python -m evaluate.eval_zeroshot_classification \
+ --dataset-type="webdataset" \
+ --precision="fp32" \
+ --batch-size=512 \
+ --workers=6 \
+ --amodel HTSAT-tiny \
+ --tmodel roberta \
+ --datasetnames "esc50_no_overlap" \
+ --remotedata \
+ --datasetinfos "train" \
+ --seed 3407 \
+ --logs ./logs \
+ --data-filling "repeatpad" \
+ --data-truncating "rand_trunc" \
+ --freeze-text \
+ --class-label-path="../class_labels/ESC50_class_labels_indices_space.json" \
+ --pretrained="/fsx/clap_logs/2023_02_18-00_03_45-model_HTSAT-tiny-lr_0.0001-b_96-j_6-p_fp32/checkpoints"
+
my_laion_clap/CLAP/pyproject.toml ADDED
@@ -0,0 +1,54 @@
+ [build-system]
+ requires = ["setuptools>=61.0"]
+ build-backend = "setuptools.build_meta"
+ [project]
+ name = "laion_clap"
+ version = "1.1.4"
+ authors = [
+ { name="Ke Chen", email="knutchen@ucsd.edu" },
+ { name="Yusong Wu" },
+ { name="Tianyu Zhang" },
+ { name="Yuchen Hui" }
+ ]
+ maintainers = [
+ { name="Ke Chen", email="knutchen@ucsd.edu" },
+ { name="Yusong Wu" },
+ { name="Tianyu Zhang" },
+ { name="Yuchen Hui" }
+ ]
+ description = "Contrastive Language-Audio Pretraining Model from LAION"
+ license = {file = "LICENSE"}
+ readme = "README.md"
+ requires-python = ">=3.7"
+ dependencies = [
+ "numpy==1.23.5",
+ "soundfile",
+ "librosa",
+ "torchlibrosa",
+ "ftfy",
+ "braceexpand",
+ "webdataset",
+ "wget",
+ "wandb",
+ "llvmlite",
+ "scipy",
+ "scikit-learn",
+ "pandas",
+ "h5py",
+ "tqdm",
+ "regex",
+ "transformers",
+ "progressbar"
+ ]
+ classifiers = [
+ 'Development Status :: 3 - Alpha',
+ 'Intended Audience :: Developers',
+ 'Intended Audience :: Science/Research',
+ 'License :: OSI Approved :: Apache Software License',
+ 'Topic :: Scientific/Engineering :: Artificial Intelligence',
+ ]
+
+
+ [project.urls]
+ "Homepage" = "https://github.com/LAION-AI/CLAP"
+ "Bug Tracker" = "https://github.com/LAION-AI/CLAP/issues"
my_laion_clap/CLAP/requirements.txt ADDED
@@ -0,0 +1,16 @@
+ soundfile
+ librosa
+ torchlibrosa
+ ftfy
+ braceexpand
+ webdataset
+ wget
+ wandb
+ llvmlite
+ scipy
+ scikit-learn
+ pandas
+ h5py
+ tqdm
+ regex
+ transformers<=4.30.2
my_laion_clap/CLAP/src/laion_clap/__init__.py ADDED
@@ -0,0 +1,5 @@
+ import os
+ import sys
+ dir_path = os.path.dirname(os.path.abspath(__file__))
+ sys.path.append(dir_path)
+ from .hook import CLAP_Module
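This package __init__ only puts the package directory on sys.path and re-exports CLAP_Module from hook.py. For orientation, a minimal usage sketch of that hook API follows; it mirrors the upstream LAION-CLAP interface, and the file names, prompt strings, and reliance on the default checkpoint download are illustrative assumptions rather than anything this commit configures.

    # Minimal sketch of the CLAP_Module hook API re-exported above (upstream LAION-CLAP).
    # Audio file names are placeholders; load_ckpt() without arguments fetches a default checkpoint.
    import laion_clap  # assumes the vendored package under my_laion_clap/CLAP/src is importable

    model = laion_clap.CLAP_Module(enable_fusion=False, amodel='HTSAT-tiny', tmodel='roberta')
    model.load_ckpt()  # or pass an explicit checkpoint path

    # Joint embeddings for audio files and free-form text
    audio_embed = model.get_audio_embedding_from_filelist(x=['dog_bark.wav', 'rain.wav'], use_tensor=False)
    text_embed = model.get_text_embedding(['a dog barking', 'rain falling on a window'], use_tensor=False)
    print(audio_embed.shape, text_embed.shape)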
my_laion_clap/CLAP/src/laion_clap/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (372 Bytes).
my_laion_clap/CLAP/src/laion_clap/__pycache__/hook.cpython-38.pyc ADDED
Binary file (7.78 kB).
my_laion_clap/CLAP/src/laion_clap/clap_module/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ from .factory import list_models, create_model, create_model_and_transforms, add_model_config
2
+ from .loss import ClipLoss, gather_features, LPLoss, lp_gather_features, LPMetrics
3
+ from .model import CLAP, CLAPTextCfg, CLAPVisionCfg, CLAPAudioCfp, convert_weights_to_fp16, trace_model
4
+ from .openai import load_openai_model, list_openai_models
5
+ from .pretrained import list_pretrained, list_pretrained_tag_models, list_pretrained_model_tags,\
6
+ get_pretrained_url, download_pretrained
7
+ from .tokenizer import SimpleTokenizer, tokenize
8
+ from .transform import image_transform
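This clap_module __init__ re-exports the lower-level building blocks (factory, loss, model, openai, pretrained, tokenizer, transform). A small sketch of the factory entry point is given below; it only calls list_models(), which the import line above shows to exist, and it assumes the vendored package directory is on sys.path.

    # Sketch: enumerate the model configuration names registered by the factory module.
    # Assumes my_laion_clap/CLAP/src is importable so laion_clap.clap_module resolves.
    from laion_clap.clap_module import list_models

    print(list_models())  # configuration names such as HTSAT-tiny or PANN-14, as used in the scripts above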
my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (1.02 kB).
my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/factory.cpython-38.pyc ADDED
Binary file (6.64 kB).
my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/feature_fusion.cpython-38.pyc ADDED
Binary file (4.23 kB).
my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/htsat.cpython-38.pyc ADDED
Binary file (30.6 kB).
my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/loss.cpython-38.pyc ADDED
Binary file (7.98 kB).
my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/model.cpython-38.pyc ADDED
Binary file (23.8 kB).
my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/openai.cpython-38.pyc ADDED
Binary file (4.51 kB).
my_laion_clap/CLAP/src/laion_clap/clap_module/__pycache__/pann_model.cpython-38.pyc ADDED
Binary file (13.2 kB).