tonyswoo committed
Commit 73baeae
1 parent: 80d82df

Initial Commit

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ ckpt/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,13 +1,13 @@
  ---
  title: Enclap
- emoji: 📈
- colorFrom: gray
- colorTo: purple
+ emoji: 🔊
+ colorFrom: pink
+ colorTo: green
  sdk: gradio
- sdk_version: 4.16.0
- app_file: app.py
+ sdk_version: 3.41.0
+ app_file: gradio_app.py
  pinned: false
- license: mit
+ license: openrail
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
cfg/audiocaps/base.yaml ADDED
@@ -0,0 +1,48 @@
1
+ # Experiment Config for each experiment
2
+ output_dir: /output
3
+ logging_dir: runs/tb_log
4
+ logging_steps: 10
5
+ seed: 1115
6
+ train_file: csv/audiocaps/train.csv
7
+ validation_file: csv/audiocaps/valid.csv
8
+ encodec_base_path: /data/audiocaps/encodec
9
+ clap_base_path: /data/audiocaps/clap
10
+ tokenizer_name: facebook/bart-base
11
+ config_name_or_path: facebook/bart-base
12
+ model_name_or_path: facebook/bart-base
13
+ eval_num_captions: 5
14
+ overwrite_output_dir: False
15
+
16
+ # Basic Config
17
+ encodec_masking_prob: 0.15
18
+ encodec_masking_span: 10
19
+ num_train_epochs: 15
20
+ max_train_steps: null
21
+ gradient_accumulation_steps: 1
22
+ per_device_train_batch_size: 16
23
+ per_device_eval_batch_size: 16
24
+ split_batches: true
25
+ checkpointing_steps: epoch # 'epoch' to save for each epoch, or number of steps
26
+ resume_from_checkpoint: null
27
+
28
+ # Generation Config
29
+ max_target_length: 128
30
+ val_max_target_length: 50
31
+
32
+ # Training Hyperparameters
33
+ # "lr_schedulre_type" should be one of "linear", "cosine", "cosine_with_restarts", "polynomial",
34
+ # "constant", "constant_with_warmpup", "inverse_sqrt", "reduce_lr_on_plateau", "two_stage_inverse_sqrt"
35
+ lr_scheduler_type: inverse_sqrt
36
+ learning_rate: 6.5e-5 # peak lr
37
+ num_warmup_steps: 2000
38
+ weight_decay: 0.01
39
+ max_grad_norm: 1.0
40
+
41
+ # Others
42
+ with_tracking: true
43
+ report_to: tensorboard
44
+ ignore_pad_token_for_loss: true
45
+ preprocessing_num_workers: 32
46
+ use_slow_tokenizer: false
47
+ overwrite_cache: false
48
+ pad_to_max_length: false
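
These YAML files are flat key-value experiment configs shared by the training setups below. As a rough illustration of how such a file can be consumed, here is a minimal sketch that reads it with PyYAML into attribute-style access; the load_config helper is hypothetical and not necessarily how the repository's training script parses its arguments.

    import yaml
    from types import SimpleNamespace

    def load_config(path: str) -> SimpleNamespace:
        # Hypothetical helper: load the flat YAML config and expose keys as attributes.
        with open(path) as f:
            return SimpleNamespace(**yaml.safe_load(f))

    cfg = load_config("cfg/audiocaps/base.yaml")
    print(cfg.model_name_or_path, cfg.learning_rate, cfg.num_train_epochs)
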
cfg/audiocaps/large.yaml ADDED
@@ -0,0 +1,48 @@
1
+ # Experiment Config for each experiment
2
+ output_dir: /output
3
+ logging_dir: runs/tb_log
4
+ logging_steps: 10
5
+ seed: 1115
6
+ train_file: csv/audiocaps/train.csv
7
+ validation_file: csv/audiocaps/valid.csv
8
+ encodec_base_path: /data/audiocaps/encodec
9
+ clap_base_path: /data/audiocaps/clap
10
+ tokenizer_name: facebook/bart-large
11
+ config_name_or_path: facebook/bart-large
12
+ model_name_or_path: facebook/bart-large
13
+ eval_num_captions: 5
14
+ overwrite_output_dir: False
15
+
16
+ # Basic Config
17
+ encodec_masking_prob: 0.15
18
+ encodec_masking_span: 10
19
+ num_train_epochs: 15
20
+ max_train_steps: null
21
+ gradient_accumulation_steps: 1
22
+ per_device_train_batch_size: 64
23
+ per_device_eval_batch_size: 64
24
+ split_batches: true
25
+ checkpointing_steps: epoch # 'epoch' to save for each epoch, or number of steps
26
+ resume_from_checkpoint: null
27
+
28
+ # Generation Config
29
+ max_target_length: 128
30
+ val_max_target_length: 50
31
+
32
+ # Training Hyperparameters
33
+ # "lr_schedulre_type" should be one of "linear", "cosine", "cosine_with_restarts", "polynomial",
34
+ # "constant", "constant_with_warmpup", "inverse_sqrt", "reduce_lr_on_plateau", "two_stage_inverse_sqrt"
35
+ lr_scheduler_type: inverse_sqrt
36
+ learning_rate: 3e-5 # peak lr
37
+ num_warmup_steps: 2000
38
+ weight_decay: 0.01
39
+ max_grad_norm: 1.0
40
+
41
+ # Others
42
+ with_tracking: true
43
+ report_to: tensorboard
44
+ ignore_pad_token_for_loss: true
45
+ preprocessing_num_workers: 32
46
+ use_slow_tokenizer: false
47
+ overwrite_cache: false
48
+ pad_to_max_length: false
cfg/audiocaps_args.yaml ADDED
@@ -0,0 +1,60 @@
1
+ # Experiment Config for each experiment
2
+ output_dir: /data/jyk/aac_results/bart_large/audiocaps_3e5_gpu4_1115_2000
3
+ logging_dir: runs/tb_log
4
+ logging_steps: 10
5
+ seed: 1115
6
+ train_file: /workspace/audiobart/csv/AudioCaps/train.csv
7
+ validation_file: /workspace/audiobart/csv/AudioCaps/val.csv
8
+ test_file: /workspace/audiobart/csv/AudioCaps/test.csv
9
+ base_path: /data/jyk/aac_dataset/AudioCaps/encodec_16
10
+ clap_base_path: /data/jyk/aac_dataset/AudioCaps/clap_audio_fused
11
+ tokenizer_name: facebook/bart-large
12
+ # model_name_or_path: /workspace/audiobart/bart/model
13
+ model_name_or_path: facebook/bart-large
14
+ num_captions: 5
15
+ overwrite_output_dir: False
16
+
17
+
18
+ # Training Configs
19
+ # Basic Config
20
+ max_encodec_length: 1022
21
+ only_encoder_epochs: 0
22
+ only_encodec_epochs: 0
23
+ clap_masking_prob: -1
24
+ encodec_masking_prob: 0.15
25
+ encodec_masking_length: 10
26
+ random_sampling: true
27
+ num_train_epochs: 30
28
+ max_train_steps: null
29
+ gradient_accumulation_steps: 1
30
+ per_device_train_batch_size: 64
31
+ per_device_eval_batch_size: 64
32
+ split_batches: true
33
+ checkpointing_steps: epoch # 'epoch' to save for each epoch, or number of steps
34
+ resume_from_checkpoint: null
35
+
36
+ # Model & Generation Config
37
+ max_source_length: 1024
38
+ max_target_length: 128
39
+ val_max_target_length: 50
40
+ num_beams: null
41
+ pad_to_max_length: false
42
+ num_subsampling: 0
43
+
44
+ # Training Hyperparameters
45
+ learning_rate: 3e-5 # peak lr
46
+ # Should be one of "linear", "cosine", "cosine_with_restarts", "polynomial",
47
+ # "constant", "constant_with_warmpup", "inverse_sqrt", "reduce_lr_on_plateau", "two_stage_inverse_sqrt"
48
+ lr_scheduler_type: inverse_sqrt
49
+ # lr_scheduler_type: two_stage_inverse_sqrt
50
+ weight_decay: 0.01
51
+ num_warmup_steps: 2000
52
+ max_grad_norm: 1.0
53
+
54
+ # Do not Change
55
+ with_tracking: true
56
+ report_to: all
57
+ ignore_pad_token_for_loss: true
58
+ preprocessing_num_workers: 32
59
+ use_slow_tokenizer: false
60
+ overwrite_cache: false
cfg/clotho/base.yaml ADDED
@@ -0,0 +1,48 @@
1
+ # Experiment Config for each experiment
2
+ output_dir: /output
3
+ logging_dir: runs/tb_log
4
+ logging_steps: 10
5
+ seed: 1115
6
+ train_file: /csv/clotho/train.csv
7
+ validation_file: /csv/clotho/valid.csv
8
+ encodec_base_path: /data/clotho/encodec
9
+ clap_base_path: /data/clotho/clap
10
+ tokenizer_name: facebook/bart-base
11
+ config_name_or_path: facebook/bart-base
12
+ model_name_or_path: facebook/bart-base
13
+ eval_num_captions: 5
14
+ overwrite_output_dir: False
15
+
16
+ # Basic Config
17
+ encodec_masking_prob: 0.15
18
+ encodec_masking_span: 10
19
+ num_train_epochs: 15
20
+ max_train_steps: null
21
+ gradient_accumulation_steps: 1
22
+ per_device_train_batch_size: 64
23
+ per_device_eval_batch_size: 64
24
+ split_batches: true
25
+ checkpointing_steps: epoch # 'epoch' to save for each epoch, or number of steps
26
+ resume_from_checkpoint: null
27
+
28
+ # Generation Config
29
+ max_target_length: 128
30
+ val_max_target_length: 50
31
+
32
+ # Training Hyperparameters
33
+ # "lr_schedulre_type" should be one of "linear", "cosine", "cosine_with_restarts", "polynomial",
34
+ # "constant", "constant_with_warmpup", "inverse_sqrt", "reduce_lr_on_plateau", "two_stage_inverse_sqrt"
35
+ lr_scheduler_type: inverse_sqrt
36
+ learning_rate: 4e-5 # peak lr
37
+ num_warmup_steps: 1000
38
+ weight_decay: 0.01
39
+ max_grad_norm: 1.0
40
+
41
+ # Others
42
+ with_tracking: true
43
+ report_to: tensorboard
44
+ ignore_pad_token_for_loss: true
45
+ preprocessing_num_workers: 32
46
+ use_slow_tokenizer: false
47
+ overwrite_cache: false
48
+ pad_to_max_length: false
cfg/clotho/large.yaml ADDED
@@ -0,0 +1,48 @@
1
+ # Experiment Config for each experiment
2
+ output_dir: /output
3
+ logging_dir: runs/tb_log
4
+ logging_steps: 10
5
+ seed: 1115
6
+ train_file: /csv/clotho/train.csv
7
+ validation_file: /csv/clotho/valid.csv
8
+ encodec_base_path: /data/clotho/encodec
9
+ clap_base_path: /data/clotho/clap
10
+ tokenizer_name: facebook/bart-large
11
+ config_name_or_path: facebook/bart-large
12
+ model_name_or_path: facebook/bart-large
13
+ eval_num_captions: 5
14
+ overwrite_output_dir: False
15
+
16
+ # Basic Config
17
+ encodec_masking_prob: 0.15
18
+ encodec_masking_span: 10
19
+ num_train_epochs: 15
20
+ max_train_steps: null
21
+ gradient_accumulation_steps: 1
22
+ per_device_train_batch_size: 64
23
+ per_device_eval_batch_size: 64
24
+ split_batches: true
25
+ checkpointing_steps: epoch # 'epoch' to save for each epoch, or number of steps
26
+ resume_from_checkpoint: null
27
+
28
+ # Generation Config
29
+ max_target_length: 128
30
+ val_max_target_length: 50
31
+
32
+ # Training Hyperparameters
33
+ # "lr_schedulre_type" should be one of "linear", "cosine", "cosine_with_restarts", "polynomial",
34
+ # "constant", "constant_with_warmpup", "inverse_sqrt", "reduce_lr_on_plateau", "two_stage_inverse_sqrt"
35
+ lr_scheduler_type: inverse_sqrt
36
+ learning_rate: 2.5e-5 # peak lr
37
+ num_warmup_steps: 1000
38
+ weight_decay: 0.01
39
+ max_grad_norm: 1.0
40
+
41
+ # Others
42
+ with_tracking: true
43
+ report_to: tensorboard
44
+ ignore_pad_token_for_loss: true
45
+ preprocessing_num_workers: 32
46
+ use_slow_tokenizer: false
47
+ overwrite_cache: false
48
+ pad_to_max_length: false
cfg/clotho_finetune/base.yaml ADDED
@@ -0,0 +1,48 @@
1
+ # Experiment Config for each experiment
2
+ output_dir: /output
3
+ logging_dir: runs/tb_log
4
+ logging_steps: 10
5
+ seed: 1115
6
+ train_file: /csv/clotho/train.csv
7
+ validation_file: /csv/clotho/valid.csv
8
+ encodec_base_path: /data/clotho/encodec
9
+ clap_base_path: /data/clotho/clap
10
+ tokenizer_name: facebook/bart-base
11
+ config_name_or_path: facebook/bart-base
12
+ model_name_or_path: /data/enclap_audiocaps
13
+ eval_num_captions: 5
14
+ overwrite_output_dir: False
15
+
16
+ # Basic Config
17
+ encodec_masking_prob: 0.15
18
+ encodec_masking_span: 10
19
+ num_train_epochs: 15
20
+ max_train_steps: null
21
+ gradient_accumulation_steps: 1
22
+ per_device_train_batch_size: 64
23
+ per_device_eval_batch_size: 64
24
+ split_batches: true
25
+ checkpointing_steps: epoch # 'epoch' to save for each epoch, or number of steps
26
+ resume_from_checkpoint: null
27
+
28
+ # Generation Config
29
+ max_target_length: 128
30
+ val_max_target_length: 50
31
+
32
+ # Training Hyperparameters
33
+ # "lr_schedulre_type" should be one of "linear", "cosine", "cosine_with_restarts", "polynomial",
34
+ # "constant", "constant_with_warmpup", "inverse_sqrt", "reduce_lr_on_plateau", "two_stage_inverse_sqrt"
35
+ lr_scheduler_type: inverse_sqrt
36
+ learning_rate: 2e-5 # peak lr
37
+ num_warmup_steps: 1000
38
+ weight_decay: 0.01
39
+ max_grad_norm: 1.0
40
+
41
+ # Others
42
+ with_tracking: true
43
+ report_to: tensorboard
44
+ ignore_pad_token_for_loss: true
45
+ preprocessing_num_workers: 32
46
+ use_slow_tokenizer: false
47
+ overwrite_cache: false
48
+ pad_to_max_length: false
cfg/clotho_finetune/large.yaml ADDED
@@ -0,0 +1,48 @@
1
+ # Experiment Config for each experiment
2
+ output_dir: /output
3
+ logging_dir: runs/tb_log
4
+ logging_steps: 10
5
+ seed: 1115
6
+ train_file: /csv/clotho/train.csv
7
+ validation_file: /csv/clotho/valid.csv
8
+ encodec_base_path: /data/clotho/encodec
9
+ clap_base_path: /data/clotho/clap
10
+ tokenizer_name: facebook/bart-large
11
+ config_name_or_path: facebook/bart-large
12
+ model_name_or_path: /data/enclap_audiocaps
13
+ eval_num_captions: 5
14
+ overwrite_output_dir: False
15
+
16
+ # Basic Config
17
+ encodec_masking_prob: 0.15
18
+ encodec_masking_span: 10
19
+ num_train_epochs: 15
20
+ max_train_steps: null
21
+ gradient_accumulation_steps: 1
22
+ per_device_train_batch_size: 64
23
+ per_device_eval_batch_size: 64
24
+ split_batches: true
25
+ checkpointing_steps: epoch # 'epoch' to save for each epoch, or number of steps
26
+ resume_from_checkpoint: null
27
+
28
+ # Generation Config
29
+ max_target_length: 128
30
+ val_max_target_length: 50
31
+
32
+ # Training Hyperparameters
33
+ # "lr_schedulre_type" should be one of "linear", "cosine", "cosine_with_restarts", "polynomial",
34
+ # "constant", "constant_with_warmpup", "inverse_sqrt", "reduce_lr_on_plateau", "two_stage_inverse_sqrt"
35
+ lr_scheduler_type: inverse_sqrt
36
+ learning_rate: 1.25e-5 # peak lr
37
+ num_warmup_steps: 1000
38
+ weight_decay: 0.01
39
+ max_grad_norm: 1.0
40
+
41
+ # Others
42
+ with_tracking: true
43
+ report_to: tensorboard
44
+ ignore_pad_token_for_loss: true
45
+ preprocessing_num_workers: 32
46
+ use_slow_tokenizer: false
47
+ overwrite_cache: false
48
+ pad_to_max_length: false
ckpt/config.json ADDED
@@ -0,0 +1,75 @@
1
+ {
2
+ "_name_or_path": "facebook/bart-base",
3
+ "activation_dropout": 0.1,
4
+ "activation_function": "gelu",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "BartModel"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.1,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 768,
15
+ "decoder_attention_heads": 12,
16
+ "decoder_ffn_dim": 3072,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 6,
19
+ "decoder_start_token_id": 2,
20
+ "dropout": 0.1,
21
+ "early_stopping": true,
22
+ "encoder_attention_heads": 12,
23
+ "encoder_ffn_dim": 3072,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 2,
27
+ "forced_bos_token_id": 0,
28
+ "forced_eos_token_id": 2,
29
+ "gradient_checkpointing": false,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1",
33
+ "2": "LABEL_2"
34
+ },
35
+ "init_std": 0.02,
36
+ "is_encoder_decoder": true,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1,
40
+ "LABEL_2": 2
41
+ },
42
+ "max_position_embeddings": 1024,
43
+ "model_type": "bart",
44
+ "no_repeat_ngram_size": 3,
45
+ "normalize_before": false,
46
+ "normalize_embedding": true,
47
+ "num_beams": 4,
48
+ "num_hidden_layers": 6,
49
+ "pad_token_id": 1,
50
+ "scale_embedding": false,
51
+ "task_specific_params": {
52
+ "summarization": {
53
+ "length_penalty": 1.0,
54
+ "max_length": 128,
55
+ "min_length": 12,
56
+ "num_beams": 4
57
+ },
58
+ "summarization_cnn": {
59
+ "length_penalty": 2.0,
60
+ "max_length": 142,
61
+ "min_length": 56,
62
+ "num_beams": 4
63
+ },
64
+ "summarization_xsum": {
65
+ "length_penalty": 1.0,
66
+ "max_length": 62,
67
+ "min_length": 11,
68
+ "num_beams": 6
69
+ }
70
+ },
71
+ "torch_dtype": "float32",
72
+ "transformers_version": "4.29.0",
73
+ "use_cache": true,
74
+ "vocab_size": 50265
75
+ }
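
This is the configuration that EnClapBartConfig.from_pretrained reads from the ckpt directory in inference.py. A quick way to inspect it (EnClap-specific fields such as d_clap and num_rvq are not present in the JSON, so they fall back to the defaults defined in modeling/enclap_bart.py):

    from modeling.enclap_bart import EnClapBartConfig

    config = EnClapBartConfig.from_pretrained("./ckpt")
    # BART-base backbone dimensions plus the EnClap defaults (d_clap=512, num_rvq=16).
    print(config.d_model, config.max_position_embeddings, config.d_clap, config.num_rvq)
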
ckpt/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fef9cc30fdf47b82a8bb846d418ac3ef893b4d10e909fafbbe3ed8a1931cf23
3
+ size 663433954
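
The file above is only a Git LFS pointer; the real ~663 MB checkpoint lives in LFS storage and is what EnClapBartForConditionalGeneration.from_pretrained("./ckpt") loads in inference.py. If needed, it can also be inspected directly once the LFS object has been pulled:

    import torch

    # Assumes `git lfs pull` has materialized ckpt/pytorch_model.bin locally.
    state_dict = torch.load("ckpt/pytorch_model.bin", map_location="cpu")
    print(len(state_dict), next(iter(state_dict)))
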
csv/audiocaps/test.csv ADDED
The diff for this file is too large to render. See raw diff
 
csv/audiocaps/train.csv ADDED
The diff for this file is too large to render. See raw diff
 
csv/audiocaps/valid.csv ADDED
The diff for this file is too large to render. See raw diff
 
csv/clotho/test.csv ADDED
The diff for this file is too large to render. See raw diff
 
csv/clotho/train.csv ADDED
The diff for this file is too large to render. See raw diff
 
csv/clotho/valid.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/__init__.py ADDED
File without changes
data/collator.py ADDED
@@ -0,0 +1,61 @@
1
+ from dataclasses import dataclass
2
+
3
+ import torch
4
+ from transformers import BatchEncoding, DataCollatorForSeq2Seq
5
+
6
+
7
+ @dataclass
8
+ class DataCollatorForEnClapBart(DataCollatorForSeq2Seq):
9
+ input_pad_token_id: int = 1024
10
+ num_rvq: int = 16
11
+
12
+ def __call__(self, features, return_tensors=None):
13
+ if return_tensors is None:
14
+ return_tensors = self.return_tensors
15
+
16
+ batch_size = len(features)
17
+ # stacked_features = {k: [f[k] for f in features] for k in features[0]}
18
+ clap_embedding = torch.Tensor(
19
+ [feature["clap_embedding"] for feature in features]
20
+ )
21
+
22
+ pad_token_id = self.tokenizer.pad_token_id
23
+ self.tokenizer.pad_token_id = self.input_pad_token_id
24
+ keys = ["input_ids", "mcm_labels"]
25
+ tmp_key_map = {"input_ids": "input_ids", "mcm_labels": "labels"}
26
+ input_features = super().__call__(
27
+ [
28
+ {tmp_key_map[key]: feature[key][:, i] for key in keys}
29
+ for feature in features
30
+ for i in range(feature[keys[0]].shape[-1])
31
+ ],
32
+ return_tensors,
33
+ )
34
+
35
+ self.tokenizer.pad_token_id = 1
36
+ keys = ["encodec_mask", "attention_mask", "labels"]
37
+ tmp_key_map = {
38
+ "encodec_mask": "input_ids",
39
+ "attention_mask": "attention_mask",
40
+ "labels": "labels",
41
+ }
42
+ other_features = super().__call__(
43
+ [{tmp_key_map[key]: feature[key] for key in keys} for feature in features],
44
+ return_tensors,
45
+ )
46
+ self.tokenizer.pad_token_id = pad_token_id
47
+
48
+ return BatchEncoding(
49
+ {
50
+ "input_ids": input_features["input_ids"]
51
+ .reshape(batch_size, self.num_rvq, -1)
52
+ .transpose(1, 2),
53
+ "mcm_labels": input_features["labels"]
54
+ .reshape(batch_size, self.num_rvq, -1)
55
+ .transpose(1, 2),
56
+ "attention_mask": other_features["attention_mask"],
57
+ "encodec_mask": other_features["input_ids"],
58
+ "labels": other_features["labels"],
59
+ "clap_embedding": clap_embedding,
60
+ }
61
+ )
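
The central trick in this collator is that every RVQ stream of every item is padded as if it were its own sequence and then folded back into a (batch, time, num_rvq) tensor. A small, self-contained illustration of that reshape, with made-up shapes:

    import torch

    batch_size, num_rvq, time_steps = 2, 16, 5
    # After padding, super().__call__ returns one row per (item, codebook) pair.
    flat = torch.arange(batch_size * num_rvq * time_steps).reshape(batch_size * num_rvq, time_steps)
    # The collator folds the rows back into (batch, time, num_rvq), the layout EnClapBart expects.
    stacked = flat.reshape(batch_size, num_rvq, -1).transpose(1, 2)
    print(stacked.shape)  # torch.Size([2, 5, 16])
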
data/infer_clap.py ADDED
@@ -0,0 +1,67 @@
1
+ import argparse
2
+ from pathlib import Path
3
+
4
+ import librosa
5
+ import numpy as np
6
+ import torch
7
+ from laion_clap import CLAP_Module
8
+ from tqdm import tqdm
9
+
10
+ if __name__ == "__main__":
11
+ parser = argparse.ArgumentParser()
12
+ parser.add_argument(
13
+ "--data_path",
14
+ "-d",
15
+ required=True,
16
+ type=str,
17
+ help="Path of the original wav files",
18
+ )
19
+ parser.add_argument(
20
+ "--save_path",
21
+ "-s",
22
+ required=True,
23
+ type=str,
24
+ help="Path to save the clap audio embedding '.npy' files",
25
+ )
26
+ parser.add_argument(
27
+ "--clap_ckpt",
28
+ "-c",
29
+ required=True,
30
+ type=str,
31
+ help="Path of the pretrained clap checkpoint",
32
+ )
33
+ parser.add_argument(
34
+ "--enable_fusion",
35
+ "-e",
36
+ default=True,
37
+ type=bool,
38
+ help="Whether to enable the feature fusion of the clap model. Depends on the clap checkpoint you are using",
39
+ )
40
+ parser.add_argument(
41
+ "--audio_encoder",
42
+ "-a",
43
+ default="HTSAT-tiny",
44
+ type=str,
45
+ help="Audio encoder of the clap model. Depends on the clap checkpoint you are using",
46
+ )
47
+ args = parser.parse_args()
48
+
49
+ model = CLAP_Module(enable_fusion=args.enable_fusion, amodel=args.audio_encoder)
50
+ model.load_ckpt(args.clap_ckpt)
51
+ data_path = Path(args.data_path)
52
+ save_path = Path(args.save_path)
53
+
54
+ with torch.no_grad():
55
+ for wav_path in tqdm(
56
+ data_path.glob("**/*.wav"), dynamic_ncols=True, colour="yellow"
57
+ ):
58
+ wav, _ = librosa.load(wav_path, sr=48000)
59
+
60
+ clap_embedding = model.get_audio_embedding_from_data(
61
+ x=wav[np.newaxis], use_tensor=False
62
+ )
63
+ clap_embedding = clap_embedding.squeeze(axis=0)
64
+
65
+ out_path = save_path / wav_path.with_suffix(".npy").relative_to(data_path)
66
+ out_path.parent.mkdir(exist_ok=True)
67
+ np.save(out_path, clap_embedding)
data/infer_encodec.py ADDED
@@ -0,0 +1,41 @@
1
+ import argparse
2
+ from pathlib import Path
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torchaudio
7
+ from encodec import EncodecModel
8
+ from encodec.utils import convert_audio
9
+ from tqdm import tqdm
10
+
11
+ if __name__ == "__main__":
12
+ parser = argparse.ArgumentParser()
13
+ parser.add_argument(
14
+ "--data_path", type=str, required=True, help="Path of the original wav files"
15
+ )
16
+ parser.add_argument(
17
+ "--save_path", type=str, required=True, help="Path to save encodec .npy files"
18
+ )
19
+ args = parser.parse_args()
20
+
21
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
22
+ model = EncodecModel.encodec_model_24khz()
23
+ model.set_target_bandwidth(12.0)
24
+ model = model.to(device)
25
+
26
+ data_path = Path(args.data_path)
27
+ save_path = Path(args.save_path)
28
+
29
+ with torch.no_grad():
30
+ for wav_path in tqdm(data_path.glob("**/*.wav")):
31
+ wav, sr = torchaudio.load(wav_path)
32
+ wav = convert_audio(wav, sr, model.sample_rate, model.channels)
33
+ wav = wav.unsqueeze(0).to(device)
34
+ encoded_frames = model.encode(wav)
35
+
36
+ codes = torch.cat([codebook for codebook, _ in encoded_frames], dim=-1)
37
+ codes = codes.cpu().squeeze(0).transpose(-1, -2).detach().numpy()
38
+
39
+ out_path = save_path / wav_path.with_suffix(".npy").relative_to(data_path)
40
+ out_path.parent.mkdir(exist_ok=True)
41
+ np.save(out_path, codes)
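
These two scripts write per-clip .npy files that data/preprocess.py later loads from parallel directory trees (encodec_base_path and clap_base_path). A minimal sanity check of their outputs; the file name example.npy is a placeholder. With the 24 kHz EnCodec model at 12 kbps there are 16 codebooks per frame, and the fused CLAP audio embedding should be a 512-dimensional vector, matching d_clap in the model config:

    import numpy as np

    codes = np.load("encodec/example.npy")        # placeholder path for an infer_encodec.py output
    clap_embedding = np.load("clap/example.npy")  # placeholder path for an infer_clap.py output

    print(codes.shape)           # (num_frames, 16): time-major EnCodec RVQ indices
    print(clap_embedding.shape)  # (512,): CLAP audio embedding
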
data/preprocess.py ADDED
@@ -0,0 +1,232 @@
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+ from random import randint
4
+ from typing import Optional, Tuple
5
+
6
+ import numpy as np
7
+ import torch
8
+ from transformers import BartTokenizerFast
9
+
10
+
11
+ @dataclass
12
+ class Preprocessor:
13
+ encodec_base_path: Path
14
+ clap_base_path: Path
15
+ tokenizer: BartTokenizerFast = BartTokenizerFast.from_pretrained(
16
+ "facebook/bart-base"
17
+ )
18
+ max_length: int = 1024
19
+ mcm_masking_prob: float = 0.15
20
+ mcm_masking_span: int = 10
21
+ label_pad_token_id: int = -100
22
+ mask_token_id: int = 1024
23
+ num_eval_captions: int = 5
24
+
25
+ def __post_init__(self):
26
+ if isinstance(self.encodec_base_path, str):
27
+ self.encodec_base_path = Path(self.encodec_base_path)
28
+ if isinstance(self.clap_base_path, str):
29
+ self.clap_base_path = Path(self.clap_base_path)
30
+ if isinstance(self.tokenizer, str):
31
+ self.tokenizer = BartTokenizerFast.from_pretrained(self.tokenizer)
32
+
33
+ def preprocess_train(self, example):
34
+ path = example["file_path"]
35
+ encodec = np.load(self.encodec_base_path / path)
36
+ clap_embedding = np.load(self.clap_base_path / path)
37
+ encodec_mask = np.array(
38
+ [0, 0] + [1] * min(encodec.shape[0], self.max_length - 3) + [0]
39
+ )
40
+ attention_mask = np.ones(min(encodec.shape[0] + 3, self.max_length)).astype(
41
+ np.int64
42
+ )
43
+ target_text = self.tokenizer(text_target=example["caption"])
44
+
45
+ if encodec.shape[0] + 3 > self.max_length:
46
+ start = randint(0, encodec.shape[0] - self.max_length + 3)
47
+ encodec = encodec[start : start + self.max_length - 3]
48
+
49
+ mcm_labels = None
50
+ if self.mcm_masking_prob > 0:
51
+ num_rvq = encodec.shape[-1]
52
+ mcm_mask, _ = _compute_mask_indices(
53
+ encodec.T.shape, self.mcm_masking_prob, self.mcm_masking_span
54
+ )
55
+ mcm_mask = mcm_mask.T
56
+ mcm_labels = np.where(mcm_mask, encodec, self.label_pad_token_id)
57
+ mcm_labels = np.concatenate(
58
+ [
59
+ np.ones((2, num_rvq), dtype=np.int64) * self.label_pad_token_id,
60
+ mcm_labels,
61
+ np.ones((1, num_rvq), dtype=np.int64) * self.label_pad_token_id,
62
+ ],
63
+ axis=0,
64
+ )
65
+ encodec[mcm_mask] = self.mask_token_id
66
+
67
+ encodec = np.concatenate(
68
+ [
69
+ np.ones((2, num_rvq), dtype=np.int64) * self.tokenizer.bos_token_id,
70
+ encodec,
71
+ np.ones((1, num_rvq), dtype=np.int64) * self.tokenizer.eos_token_id,
72
+ ],
73
+ axis=0,
74
+ )
75
+
76
+ return {
77
+ "input_ids": encodec,
78
+ "clap_embedding": clap_embedding,
79
+ "encodec_mask": encodec_mask,
80
+ "attention_mask": attention_mask,
81
+ "mcm_labels": mcm_labels,
82
+ "labels": target_text["input_ids"],
83
+ }
84
+
85
+ def preprocess_eval(self, example):
86
+ path = example["file_path"]
87
+ encodec = np.load(self.encodec_base_path / path)
88
+ clap_embedding = np.load(self.clap_base_path / path)
89
+ attention_mask = np.ones(min(encodec.shape[0] + 3, self.max_length)).astype(
90
+ np.int64
91
+ )
92
+
93
+ if encodec.shape[0] + 3 > self.max_length:
94
+ encodec = encodec[: self.max_length - 3]
95
+
96
+ captions = []
97
+ for i in range(self.num_eval_captions):
98
+ captions.append(example[f"caption_{i+1}"])
99
+
100
+ return {
101
+ "input_ids": encodec,
102
+ "attention_mask": attention_mask,
103
+ "clap": clap_embedding,
104
+ "captions": captions,
105
+ }
106
+
107
+
108
+ def _compute_mask_indices(
109
+ shape: Tuple[int, int],
110
+ mask_prob: float,
111
+ mask_length: int,
112
+ attention_mask: Optional[torch.LongTensor] = None,
113
+ min_masks: int = 0,
114
+ ) -> np.ndarray:
115
+ """
116
+ Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
117
+ ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
118
+ CPU as part of the preprocessing during training.
119
+
120
+ Args:
121
+ shape: The shape for which to compute masks. This should be of a tuple of size 2 where
122
+ the first element is the batch size and the second element is the length of the axis to span.
123
+ mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
124
+ independently generated mask spans of length `mask_length` is computed by
125
+ `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
126
+ actual percentage will be smaller.
127
+ mask_length: size of the mask
128
+ min_masks: minimum number of masked spans
129
+ attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
130
+ each batch dimension.
131
+ """
132
+ batch_size, sequence_length = shape
133
+
134
+ if mask_length < 1:
135
+ raise ValueError("`mask_length` has to be bigger than 0.")
136
+
137
+ if mask_length > sequence_length:
138
+ raise ValueError(
139
+ f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
140
+ f" and `sequence_length`: {sequence_length}`"
141
+ )
142
+
143
+ # epsilon is used for probabilistic rounding
144
+ epsilon = np.random.rand(1).item()
145
+
146
+ def compute_num_masked_span(input_length):
147
+ """Given input length, compute how many spans should be masked"""
148
+ num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
149
+ num_masked_span = max(num_masked_span, min_masks)
150
+
151
+ # make sure num masked span <= sequence_length
152
+ if num_masked_span * mask_length > sequence_length:
153
+ num_masked_span = sequence_length // mask_length
154
+
155
+ # make sure num_masked span is also <= input_length - (mask_length - 1)
156
+ if input_length - (mask_length - 1) < num_masked_span:
157
+ num_masked_span = max(input_length - (mask_length - 1), 0)
158
+
159
+ return num_masked_span
160
+
161
+ # compute number of masked spans in batch
162
+ input_lengths = (
163
+ attention_mask.sum(-1).detach().tolist()
164
+ if attention_mask is not None
165
+ else [sequence_length for _ in range(batch_size)]
166
+ )
167
+
168
+ # SpecAugment mask to fill
169
+ spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
170
+ spec_aug_mask_idxs = []
171
+
172
+ max_num_masked_span = compute_num_masked_span(sequence_length)
173
+
174
+ if max_num_masked_span == 0:
175
+ return spec_aug_mask
176
+
177
+ for input_length in input_lengths:
178
+ # compute num of masked spans for this input
179
+ num_masked_span = compute_num_masked_span(input_length)
180
+
181
+ # get random indices to mask
182
+ spec_aug_mask_idx = np.random.choice(
183
+ np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
184
+ )
185
+
186
+ # pick first sampled index that will serve as a dummy index to pad vector
187
+ # to ensure same dimension for all batches due to probabilistic rounding
188
+ # Picking first sample just pads those vectors twice.
189
+ if len(spec_aug_mask_idx) == 0:
190
+ # this case can only happen if `input_length` is strictly smaller then
191
+ # `sequence_length` in which case the last token has to be a padding
192
+ # token which we can use as a dummy mask id
193
+ dummy_mask_idx = sequence_length - 1
194
+ else:
195
+ dummy_mask_idx = spec_aug_mask_idx[0]
196
+
197
+ spec_aug_mask_idx = np.concatenate(
198
+ [
199
+ spec_aug_mask_idx,
200
+ np.ones(max_num_masked_span - num_masked_span, dtype=np.int32)
201
+ * dummy_mask_idx,
202
+ ]
203
+ )
204
+ spec_aug_mask_idxs.append(spec_aug_mask_idx)
205
+
206
+ spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
207
+
208
+ # expand masked indices to masked spans
209
+ spec_aug_mask_idxs = np.broadcast_to(
210
+ spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
211
+ )
212
+ spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(
213
+ batch_size, max_num_masked_span * mask_length
214
+ )
215
+
216
+ # add offset to the starting indexes so that indexes now create a span
217
+ offsets = np.arange(mask_length)[None, None, :]
218
+ offsets = np.broadcast_to(
219
+ offsets, (batch_size, max_num_masked_span, mask_length)
220
+ ).reshape(batch_size, max_num_masked_span * mask_length)
221
+ spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
222
+
223
+ # ensure that we cannot have indices larger than sequence_length
224
+ if spec_aug_mask_idxs.max() > sequence_length - 1:
225
+ spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = (
226
+ sequence_length - 1
227
+ )
228
+
229
+ # scatter indices to mask
230
+ np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
231
+
232
+ return torch.from_numpy(spec_aug_mask), spec_aug_mask_idxs
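
As a worked example of the masked codec modeling (MCM) corruption performed in preprocess_train: spans of EnCodec frames are chosen with _compute_mask_indices, the selected positions become the MCM labels, and the corresponding inputs are overwritten with the mask token id (1024 by default, with -100 as the ignored label value). The sketch below mirrors those steps on random codes; the sequence length is arbitrary.

    import numpy as np
    from data.preprocess import _compute_mask_indices

    encodec = np.random.randint(0, 1024, size=(150, 16))  # (time, num_rvq) codes
    mask, _ = _compute_mask_indices(encodec.T.shape, mask_prob=0.15, mask_length=10)
    mask = np.asarray(mask).T                              # back to (time, num_rvq)

    mcm_labels = np.where(mask, encodec, -100)  # only masked positions contribute to the MCM loss
    corrupted = encodec.copy()
    corrupted[mask] = 1024                      # mask token id used by the preprocessor
    print(mask.mean())                          # at most roughly the requested masking probability
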
gradio_app.py ADDED
@@ -0,0 +1,60 @@
1
+ import os
2
+ from typing import Tuple
3
+
4
+ import gradio as gr
5
+ import numpy as np
6
+ import torch
7
+ from transformers import AutoProcessor
8
+
9
+ from inference import EnClap
10
+
11
+
12
+ def input_toggle(choice: str):
13
+ if choice == "file":
14
+ return gr.update(visible=True), gr.update(visible=False)
15
+ return gr.update(visible=False), gr.update(visible=True)
16
+
17
+
18
+ if __name__ == "__main__":
19
+ import logging
20
+
21
+ logging.getLogger().setLevel(logging.INFO)
22
+ ckpt_path = "./ckpt" # os.getenv("ckpt_path")
23
+ device = "cpu" # os.getenv("device")
24
+
25
+ enclap = EnClap(ckpt_path=ckpt_path, device=device)
26
+
27
+ def run_enclap(
28
+ input_type: str,
29
+ file_input: Tuple[int, np.ndarray],
30
+ mic_input: Tuple[int, np.ndarray],
31
+ seed: int,
32
+ ) -> str:
33
+ print(input_type, file_input, mic_input)
34
+ input = file_input if input_type == "file" else mic_input
35
+ if input is None:
36
+ raise gr.Error("Input audio was not provided.")
37
+ res, audio = input
38
+ torch.manual_seed(seed)
39
+ return enclap.infer_from_audio(torch.from_numpy(audio), res)[0]
40
+
41
+ with gr.Blocks() as demo:
42
+ with gr.Row():
43
+ with gr.Column():
44
+ radio = gr.Radio(
45
+ ["file", "mic"],
46
+ value="file",
47
+ label="Choose the input method of the audio.",
48
+ )
49
+ file = gr.Audio(label="Input", visible=True)
50
+ mic = gr.Mic(label="Input", visible=False)
51
+ slider = gr.Slider(minimum=0, maximum=100, label="Seed")
52
+ radio.change(fn=input_toggle, inputs=radio, outputs=[file, mic])
53
+ button = gr.Button("Run", label="run")
54
+ with gr.Column():
55
+ output = gr.Text(label="Output")
56
+ button.click(
57
+ fn=run_enclap, inputs=[radio, file, mic, slider], outputs=output
58
+ )
59
+
60
+ demo.launch()
inference.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
inference.py ADDED
@@ -0,0 +1,161 @@
1
+ from typing import Any, Dict
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torchaudio
6
+ from encodec import EncodecModel
7
+ from encodec.utils import convert_audio
8
+ from laion_clap import CLAP_Module
9
+ from transformers import AutoTokenizer
10
+
11
+ from modeling.enclap_bart import EnClapBartConfig, EnClapBartForConditionalGeneration
12
+
13
+
14
+ class EnClap:
15
+ def __init__(
16
+ self,
17
+ ckpt_path: str,
18
+ clap_audio_model: str = "HTSAT-tiny",
19
+ clap_enable_fusion = True,
20
+ device: str = "cuda",
21
+ ):
22
+ config = EnClapBartConfig.from_pretrained(ckpt_path)
23
+ self.device = device
24
+ self.tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")
25
+ self.model = (
26
+ EnClapBartForConditionalGeneration.from_pretrained(ckpt_path)
27
+ .to(self.device)
28
+ .eval()
29
+ )
30
+
31
+ self.encodec = EncodecModel.encodec_model_24khz().to(self.device)
32
+ self.encodec.set_target_bandwidth(12.0)
33
+
34
+ self.clap_model = CLAP_Module(enable_fusion=clap_enable_fusion, amodel=clap_audio_model, device=self.device)
35
+ self.clap_model.load_ckpt()
36
+
37
+ self.generation_config = {
38
+ "_from_model_config": True,
39
+ "bos_token_id": 0,
40
+ "decoder_start_token_id": 2,
41
+ "early_stopping": True,
42
+ "eos_token_id": 2,
43
+ "forced_bos_token_id": 0,
44
+ "forced_eos_token_id": 2,
45
+ "no_repeat_ngram_size": 3,
46
+ "num_beams": 4,
47
+ "pad_token_id": 1,
48
+ "max_length": 50,
49
+ }
50
+ self.scale_factor = 2**15
51
+ self.max_seq_len = config.max_position_embeddings - 3
52
+
53
+ @torch.no_grad()
54
+ def infer_from_audio_file(
55
+ self, audio_file: str, generation_config: Dict[str, Any] = None
56
+ ) -> str:
57
+ if generation_config is None:
58
+ generation_config = self.generation_config
59
+ audio, res = torchaudio.load(audio_file)
60
+ return self.infer_from_audio(audio[0], res)
61
+
62
+ @torch.no_grad()
63
+ def infer_from_audio(
64
+ self, audio: torch.Tensor, res: int, generation_config: Dict[str, Any] = None
65
+ ) -> str:
66
+ if generation_config is None:
67
+ generation_config = self.generation_config
68
+ if audio.dtype == torch.int or audio.dtype == torch.short:
69
+ audio = audio / self.scale_factor
70
+ encodec_audio = (
71
+ convert_audio(
72
+ audio.unsqueeze(0), res, self.encodec.sample_rate, self.encodec.channels
73
+ )
74
+ .unsqueeze(0)
75
+ .to(self.device)
76
+ )
77
+ encodec_frames = self.encodec.encode(encodec_audio)
78
+ encodec_frames = torch.cat(
79
+ [codebook for codebook, _ in encodec_frames], dim=-1
80
+ ).mT
81
+
82
+ clap_audio = torchaudio.transforms.Resample(res, 48000)(audio).unsqueeze(0)
83
+ clap_embedding = self.clap_model.get_audio_embedding_from_data(clap_audio, use_tensor=True)
84
+
85
+ return self._infer(encodec_frames, clap_embedding, generation_config)
86
+
87
+ @torch.no_grad()
88
+ def _infer(
89
+ self,
90
+ encodec_frames: torch.LongTensor,
91
+ clap_embedding: torch.Tensor,
92
+ generation_config: Dict[str, Any] = None,
93
+ ) -> str:
94
+ input_ids = torch.cat(
95
+ [
96
+ torch.ones(
97
+ (encodec_frames.shape[0], 2, encodec_frames.shape[-1]),
98
+ dtype=torch.long,
99
+ ).to(self.device)
100
+ * self.tokenizer.bos_token_id,
101
+ encodec_frames[:, : self.max_seq_len],
102
+ torch.ones(
103
+ (encodec_frames.shape[0], 1, encodec_frames.shape[-1]),
104
+ dtype=torch.long,
105
+ ).to(self.device)
106
+ * self.tokenizer.eos_token_id,
107
+ ],
108
+ dim=1,
109
+ )
110
+ encodec_mask = torch.LongTensor(
111
+ [[0, 0] + [1] * (input_ids.shape[1] - 3) + [0]]
112
+ ).to(self.device)
113
+
114
+ enclap_bart_inputs = {
115
+ "input_ids": input_ids,
116
+ "encodec_mask": encodec_mask,
117
+ "clap_embedding": clap_embedding,
118
+ }
119
+
120
+ results = self.model.generate(**enclap_bart_inputs, **generation_config)
121
+ caption = self.tokenizer.batch_decode(results, skip_special_tokens=True)
122
+
123
+ return caption
124
+
125
+ @torch.no_grad()
126
+ def infer_from_encodec(
127
+ self,
128
+ file_path,
129
+ clap_path: str = "clap",
130
+ generation_config: Dict[str, Any] = None,
131
+ ):
132
+ if generation_config is None:
133
+ generation_config = self.generation_config
134
+ input_ids = np.load(file_path)
135
+ if input_ids.shape[0] > self.max_encodec_length:
136
+ input_ids = input_ids[: self.max_encodec_length, :]
137
+ input_length = input_ids.shape[0]
138
+ input_ids = np.concatenate([input_ids, self.eos_padding], axis=0)
139
+ input_ids = torch.LongTensor(input_ids)
140
+ input_ids = input_ids.unsqueeze(0).to(self.device)
141
+ attention_mask = (
142
+ torch.ones(input_length + 3, dtype=torch.int64).unsqueeze(0).to(self.device)
143
+ )
144
+ eos_mask = [0] * (input_length + 3)
145
+ eos_mask[input_length + 2] = 1
146
+ eos_mask = torch.BoolTensor(eos_mask).unsqueeze(0)
147
+ # Load CLAP
148
+ clap_path = file_path.replace("encodec_16", clap_path)
149
+ clap = np.load(clap_path)
150
+ clap = torch.Tensor(clap).unsqueeze(0).to(self.device)
151
+ input = {
152
+ "input_ids": input_ids,
153
+ "clap": clap,
154
+ "attention_mask": attention_mask,
155
+ "eos_mask": eos_mask,
156
+ }
157
+
158
+ generated_ids = self.model.generate(**input, **generation_config)
159
+ text = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
160
+
161
+ return text
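
gradio_app.py drives this class from raw audio; the same can be done directly in a script. A short usage sketch with the same arguments the demo uses ("sample.wav" is a placeholder path, and the CLAP checkpoint is fetched automatically because the constructor calls load_ckpt() without arguments):

    from inference import EnClap

    enclap = EnClap(ckpt_path="./ckpt", device="cpu")
    captions = enclap.infer_from_audio_file("sample.wav")  # returns a list of decoded captions
    print(captions[0])
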
metric/__init__.py ADDED
File without changes
metric/compute_metric.py ADDED
@@ -0,0 +1,24 @@
1
+ import pandas as pd
2
+ from aac_metrics import evaluate
3
+ import copy
4
+ metric_list = ["bleu_1", "bleu_4", "rouge_l", "meteor", "spider_fl"]
5
+
6
+ if __name__=='__main__':
7
+ csv_path = "/workspace/audiobart/csv/predictions/prediction_clap.csv"
8
+ df = pd.read_csv(csv_path)
9
+
10
+ predictions = []
11
+ references = []
12
+ for idx in range(len(df)):
13
+ predictions.append(df.loc[idx]['prediction'])
14
+ reference = [df.loc[idx]['caption_1'],df.loc[idx]['caption_2'],df.loc[idx]['caption_3'],df.loc[idx]['caption_4'],df.loc[idx]['caption_5'] ]
15
+ references.append(reference)
16
+
17
+ print("> Evaluating predictions...")
18
+ result = evaluate(predictions, references, metrics=metric_list)
19
+ result = {k: v.item() for k, v in result[0].items()}
20
+ keys = list(result.keys())
21
+ for key in keys:
22
+ if "fluerr" in key:
23
+ del result[key]
24
+ print(result)
metric/compute_metric_from_scratch.py ADDED
@@ -0,0 +1,70 @@
1
+ import sys
2
+ sys.path.append('..')
3
+ sys.path.append('.')
4
+
5
+ from aac_metrics import evaluate
6
+ from inference import AudioBartInference
7
+ from tqdm import tqdm
8
+ import os
9
+ import pandas as pd
10
+
11
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
12
+ metric_list = ["bleu_1", "bleu_4", "rouge_l", "meteor", "spider_fl"]
13
+
14
+ if __name__ == "__main__":
15
+ dataset = "AudioCaps"
16
+ # dataset = "clotho"
17
+ ckpt_path = "/data/jyk/aac_results/bart_base/audiocaps_35e5_2000/checkpoints/epoch_8"
18
+
19
+ # ckpt_path = "/data/jyk/aac_results/masking/linear_scalinEg/checkpoints/epoch_14"
20
+ max_encodec_length = 1022
21
+ infer_module = AudioBartInference(ckpt_path, max_encodec_length)
22
+ from_encodec = True
23
+ csv_path = f"/workspace/audiobart/csv/{dataset}/test.csv"
24
+ base_path = f"/data/jyk/aac_dataset/{dataset}/encodec_16"
25
+ clap_name = "clap_audio_fused"
26
+ df = pd.read_csv(csv_path)
27
+
28
+ generation_config = {
29
+ "_from_model_config": True,
30
+ "bos_token_id": 0,
31
+ "decoder_start_token_id": 2,
32
+ "early_stopping": True,
33
+ "eos_token_id": 2,
34
+ "forced_bos_token_id": 0,
35
+ "forced_eos_token_id": 2,
36
+ "no_repeat_ngram_size": 3,
37
+ "num_beams": 4,
38
+ "pad_token_id": 1,
39
+ "max_length": 50
40
+ }
41
+
42
+ print(f"> Making Predictions for model {ckpt_path}...")
43
+ predictions = []
44
+ references = []
45
+ for idx in tqdm(range(len(df)), dynamic_ncols=True, colour="BLUE"):
46
+ if not from_encodec:
47
+ wav_path = df.loc[idx]['file_name']
48
+ else:
49
+ wav_path = df.loc[idx]['file_path']
50
+ wav_path = os.path.join(base_path,wav_path)
51
+ if not os.path.exists(wav_path):
52
+ continue
53
+
54
+ if not from_encodec:
55
+ prediction = infer_module.infer(wav_path)
56
+ else:
57
+ prediction = infer_module.infer_from_encodec(wav_path, clap_name, generation_config)
58
+
59
+ predictions.append(prediction[0])
60
+ reference = [df.loc[idx]['caption_1'],df.loc[idx]['caption_2'],df.loc[idx]['caption_3'],df.loc[idx]['caption_4'],df.loc[idx]['caption_5'] ]
61
+ references.append(reference)
62
+
63
+ print("> Evaluating predictions...")
64
+ result = evaluate(predictions, references, metrics=metric_list)
65
+ result = {k: round(v.item(),4) for k, v in result[0].items()}
66
+ keys = list(result.keys())
67
+ for key in keys:
68
+ if "fluerr" in key:
69
+ del result[key]
70
+ print(result)
metric/make_predictions.py ADDED
@@ -0,0 +1,41 @@
1
+ import sys
2
+ sys.path.append('..')
3
+
4
+ from inference import AudioBartInference
5
+ from tqdm import tqdm
6
+ import os
7
+ import pandas as pd
8
+ import csv
9
+ os.environ["CUDA_VISIBLE_DEVICES"] = "5"
10
+
11
+
12
+ if __name__ == "__main__":
13
+ ckpt_path = "/data/jyk/aac_results/clap/clap/checkpoints/epoch_12"
14
+ infer_module = AudioBartInference(ckpt_path)
15
+ from_encodec = True
16
+ csv_path = "/workspace/audiobart/csv/test.csv"
17
+ base_path = "/data/jyk/aac_dataset/clotho/encodec"
18
+ df = pd.read_csv(csv_path)
19
+ save_path = "/workspace/audiobart/csv/predictions/prediction_clap.csv"
20
+ f = open(save_path, 'w', newline='')
21
+ writer = csv.writer(f)
22
+ writer.writerow(['file_path', 'prediction', 'caption_1', 'caption_2', 'caption_3', 'caption_4', 'caption_5'])
23
+
24
+ print(f"> Making Predictions for model {ckpt_path}...")
25
+ for idx in tqdm(range(len(df)), dynamic_ncols=True, colour="red"):
26
+ if not from_encodec:
27
+ wav_path = df.loc[idx]['file_name']
28
+ else:
29
+ wav_path = df.loc[idx]['file_path']
30
+ wav_path = os.path.join(base_path,wav_path)
31
+ if not os.path.exists(wav_path):
32
+ continue
33
+
34
+ if not from_encodec:
35
+ prediction = infer_module.infer(wav_path)
36
+ else:
37
+ prediction = infer_module.infer_from_encodec(wav_path)
38
+ line = [wav_path, prediction[0], df.loc[idx]['caption_1'], df.loc[idx]['caption_2'],df.loc[idx]['caption_3'],df.loc[idx]['caption_4'],df.loc[idx]['caption_5']]
39
+ writer.writerow(line)
40
+
41
+ f.close()
modeling/__init__.py ADDED
File without changes
modeling/enclap_bart.py ADDED
@@ -0,0 +1,548 @@
1
+ import math
2
+ import random
3
+ from typing import List, Optional, Tuple, Union
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.nn import CrossEntropyLoss
8
+ from transformers.modeling_outputs import (
9
+ BaseModelOutput,
10
+ Seq2SeqLMOutput,
11
+ Seq2SeqModelOutput,
12
+ )
13
+ from transformers.models.bart.configuration_bart import BartConfig
14
+ from transformers.models.bart.modeling_bart import (
15
+ BartDecoder,
16
+ BartEncoderLayer,
17
+ BartForConditionalGeneration,
18
+ BartLearnedPositionalEmbedding,
19
+ BartModel,
20
+ BartPretrainedModel,
21
+ _expand_mask,
22
+ shift_tokens_right,
23
+ )
24
+ from transformers.utils import logging
25
+
26
+ from .modeling_outputs import EnClapBartOutput
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
+
31
+ class EnClapBartConfig(BartConfig):
32
+ def __init__(
33
+ self,
34
+ d_clap: int = 512,
35
+ num_rvq: int = 16,
36
+ encodec_vocab_size: int = 1024,
37
+ encodec_pad_token_id: int = 1024,
38
+ mcm_loss_scale: float = 0.7,
39
+ label_smoothing: float = 0.2,
40
+ **kwargs,
41
+ ):
42
+ super().__init__(**kwargs)
43
+ self.d_clap = d_clap
44
+ self.num_rvq = num_rvq
45
+ self.encodec_vocab_size = encodec_vocab_size
46
+ self.encodec_pad_token_id = encodec_pad_token_id
47
+ self.mcm_loss_scale = mcm_loss_scale
48
+ self.label_smoothing = label_smoothing
49
+
50
+
51
+ class EnClapBartEncoder(BartPretrainedModel):
52
+ """
53
+ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
54
+ [`BartEncoderLayer`].
55
+
56
+ Args:
57
+ config: BartConfig
58
+ embed_tokens (nn.Embedding): output embedding
59
+ """
60
+
61
+ def __init__(
62
+ self, config: EnClapBartConfig, embed_tokens: Optional[nn.Embedding] = None
63
+ ):
64
+ super().__init__(config)
65
+
66
+ self.dropout = config.dropout
67
+ self.layerdrop = config.encoder_layerdrop
68
+
69
+ clap_dim = config.d_clap
70
+ embed_dim = config.d_model
71
+ self.padding_idx = config.pad_token_id
72
+ self.max_source_positions = config.max_position_embeddings
73
+ self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
74
+
75
+ self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
76
+
77
+ if embed_tokens is not None:
78
+ self.embed_tokens.weight = embed_tokens.weight
79
+
80
+ self.embed_encodec = nn.ModuleList(
81
+ [
82
+ nn.Embedding(
83
+ math.ceil((config.encodec_vocab_size + 1) / 64) * 64,
84
+ config.d_model,
85
+ padding_idx=config.encodec_pad_token_id,
86
+ )
87
+ for _ in range(config.num_rvq)
88
+ ]
89
+ )
90
+
91
+ self.clap_projection = nn.Linear(clap_dim, embed_dim)
92
+
93
+ self.embed_positions = BartLearnedPositionalEmbedding(
94
+ config.max_position_embeddings,
95
+ embed_dim,
96
+ )
97
+ self.layers = nn.ModuleList(
98
+ [BartEncoderLayer(config) for _ in range(config.encoder_layers)]
99
+ )
100
+ self.layernorm_embedding = nn.LayerNorm(embed_dim)
101
+
102
+ self.gradient_checkpointing = False
103
+ # Initialize weights and apply final processing
104
+ self.post_init()
105
+
106
+ def get_input_embeddings(self):
107
+ return self.embed_tokens
108
+
109
+ def set_input_embeddings(self, value):
110
+ self.embed_tokens = value
111
+
112
+ def forward(
113
+ self,
114
+ input_ids: torch.LongTensor = None,
115
+ clap_embedding: Optional[torch.Tensor] = None,
116
+ encodec_mask: Optional[torch.Tensor] = None,
117
+ attention_mask: Optional[torch.Tensor] = None,
118
+ head_mask: Optional[torch.Tensor] = None,
119
+ inputs_embeds: Optional[torch.FloatTensor] = None,
120
+ output_attentions: Optional[bool] = None,
121
+ output_hidden_states: Optional[bool] = None,
122
+ return_dict: Optional[bool] = None,
123
+ ) -> Union[Tuple, BaseModelOutput]:
124
+ r"""
125
+ Args:
126
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
127
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
128
+ provide it.
129
+
130
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
131
+ [`PreTrainedTokenizer.__call__`] for details.
132
+
133
+ [What are input IDs?](../glossary#input-ids)
134
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
135
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
136
+
137
+ - 1 for tokens that are **not masked**,
138
+ - 0 for tokens that are **masked**.
139
+
140
+ [What are attention masks?](../glossary#attention-mask)
141
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
142
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
143
+
144
+ - 1 indicates the head is **not masked**,
145
+ - 0 indicates the head is **masked**.
146
+
147
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
148
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
149
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
150
+ than the model's internal embedding lookup matrix.
151
+ output_attentions (`bool`, *optional*):
152
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
153
+ returned tensors for more detail.
154
+ output_hidden_states (`bool`, *optional*):
155
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
156
+ for more detail.
157
+ return_dict (`bool`, *optional*):
158
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
159
+ """
160
+ output_attentions = (
161
+ output_attentions
162
+ if output_attentions is not None
163
+ else self.config.output_attentions
164
+ )
165
+ output_hidden_states = (
166
+ output_hidden_states
167
+ if output_hidden_states is not None
168
+ else self.config.output_hidden_states
169
+ )
170
+ return_dict = (
171
+ return_dict if return_dict is not None else self.config.use_return_dict
172
+ )
173
+
174
+ # retrieve input_ids and inputs_embeds
175
+ if input_ids is not None and inputs_embeds is not None:
176
+ raise ValueError(
177
+ "You cannot specify both input_ids and inputs_embeds at the same time"
178
+ )
179
+ elif input_ids is not None:
180
+ if input_ids.ndim == 2: # This is effectively just input = input_ids
181
+ input = input_ids
182
+ input_ids = input_ids.view(-1, input_ids.shape[-1])
183
+ elif inputs_embeds is not None:
184
+ input = inputs_embeds[:, :, -1]
185
+ else:
186
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
187
+
188
+ if inputs_embeds is None:
189
+ if input_ids.ndim == 2:
190
+ inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
191
+ elif input_ids.ndim == 3:
192
+ encodec_ids = torch.where(encodec_mask.unsqueeze(-1) > 0, input_ids, 0)
193
+ encodec_embeds = torch.zeros(
194
+ input_ids.shape[0], input_ids.shape[1], self.config.d_model
195
+ ).to(self.device)
196
+ for i, embed in enumerate(self.embed_encodec):
197
+ encodec_embeds = encodec_embeds + embed(encodec_ids[..., i])
198
+ bart_ids = torch.where(encodec_mask == 0, input_ids[..., 0], 0)
199
+ bart_embeds = self.embed_tokens(bart_ids)
200
+ input_embeds = torch.where(
201
+ encodec_mask.unsqueeze(-1) > 0, encodec_embeds, bart_embeds
202
+ )
203
+
204
+ # Get CLAP embedding
205
+ if clap_embedding is not None:
206
+ clap_embedding = self.clap_projection(clap_embedding)
207
+ input_embeds[:, 0] = clap_embedding
208
+ inputs_embeds = input_embeds.to(self.device)
209
+
210
+ batch_size = input_ids.size(0)
211
+ embed_pos = self.embed_positions(input_ids).to(self.device)
212
+ embed_pos = torch.cat(
213
+ [
214
+ torch.zeros(batch_size, 1, self.config.d_model).to(self.device),
215
+ embed_pos[:, :-1],
216
+ ],
217
+ dim=1,
218
+ )
219
+
220
+ hidden_states = inputs_embeds + embed_pos
221
+ hidden_states = self.layernorm_embedding(hidden_states)
222
+ hidden_states = nn.functional.dropout(
223
+ hidden_states, p=self.dropout, training=self.training
224
+ )
225
+
226
+ # expand attention_mask
227
+ if attention_mask is not None:
228
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
229
+ attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
230
+
231
+ encoder_states = () if output_hidden_states else None
232
+ all_attentions = () if output_attentions else None
233
+
234
+ # check if head_mask has a correct number of layers specified if desired
235
+ if head_mask is not None:
236
+ if head_mask.size()[0] != (len(self.layers)):
237
+ raise ValueError(
238
+ f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
239
+ f" {head_mask.size()[0]}."
240
+ )
241
+
242
+ for idx, encoder_layer in enumerate(self.layers):
243
+ if output_hidden_states:
244
+ encoder_states = encoder_states + (hidden_states,)
245
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
246
+ dropout_probability = random.uniform(0, 1)
247
+ if self.training and (
248
+ dropout_probability < self.layerdrop
249
+ ): # skip the layer
250
+ layer_outputs = (None, None)
251
+ else:
252
+ if self.gradient_checkpointing and self.training:
253
+
254
+ def create_custom_forward(module):
255
+ def custom_forward(*inputs):
256
+ return module(*inputs, output_attentions)
257
+
258
+ return custom_forward
259
+
260
+ layer_outputs = torch.utils.checkpoint.checkpoint(
261
+ create_custom_forward(encoder_layer),
262
+ hidden_states,
263
+ attention_mask,
264
+ (head_mask[idx] if head_mask is not None else None),
265
+ )
266
+ else:
267
+ layer_outputs = encoder_layer(
268
+ hidden_states,
269
+ attention_mask,
270
+ layer_head_mask=(
271
+ head_mask[idx] if head_mask is not None else None
272
+ ),
273
+ output_attentions=output_attentions,
274
+ )
275
+
276
+ hidden_states = layer_outputs[0]
277
+
278
+ if output_attentions:
279
+ all_attentions = all_attentions + (layer_outputs[1],)
280
+
281
+ if output_hidden_states:
282
+ encoder_states = encoder_states + (hidden_states,)
283
+
284
+ if not return_dict:
285
+ return tuple(
286
+ v
287
+ for v in [hidden_states, encoder_states, all_attentions]
288
+ if v is not None
289
+ )
290
+ return BaseModelOutput(
291
+ last_hidden_state=hidden_states,
292
+ hidden_states=encoder_states,
293
+ attentions=all_attentions,
294
+ )
295
+
296
+
297
+ class EnClapBartModel(BartModel):
298
+ def __init__(self, config: EnClapBartConfig):
299
+ super(BartModel, self).__init__(config)
300
+
301
+ padding_idx, vocab_size = config.pad_token_id, config.vocab_size
302
+ self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
303
+
304
+ self.encoder = EnClapBartEncoder(config, self.shared)
305
+ self.decoder = BartDecoder(config, self.shared)
306
+
307
+ # Initialize weights and apply final processing
308
+ self.post_init()
309
+
310
+ def forward(
311
+ self,
312
+ input_ids: torch.LongTensor = None,
313
+ clap_embedding: Optional[torch.Tensor] = None,
314
+ encodec_mask: Optional[torch.Tensor] = None,
315
+ attention_mask: Optional[torch.Tensor] = None,
316
+ decoder_input_ids: Optional[torch.LongTensor] = None,
317
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
318
+ head_mask: Optional[torch.Tensor] = None,
319
+ decoder_head_mask: Optional[torch.Tensor] = None,
320
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
321
+ encoder_outputs: Optional[List[torch.FloatTensor]] = None,
322
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
323
+ inputs_embeds: Optional[torch.FloatTensor] = None,
324
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
325
+ use_cache: Optional[bool] = None,
326
+ output_attentions: Optional[bool] = None,
327
+ output_hidden_states: Optional[bool] = None,
328
+ return_dict: Optional[bool] = None,
329
+ ) -> Union[Tuple, Seq2SeqModelOutput]:
330
+ # different to other models, Bart automatically creates decoder_input_ids from
331
+ # input_ids if no decoder_input_ids are provided
332
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
333
+ if input_ids is None:
334
+ raise ValueError(
335
+ "If no `decoder_input_ids` or `decoder_inputs_embeds` are "
336
+ "passed, `input_ids` cannot be `None`. Please pass either "
337
+ "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
338
+ )
339
+
340
+ decoder_input_ids = shift_tokens_right(
341
+ input_ids, self.config.pad_token_id, self.config.decoder_start_token_id
342
+ )
343
+
344
+ output_attentions = (
345
+ output_attentions
346
+ if output_attentions is not None
347
+ else self.config.output_attentions
348
+ )
349
+ output_hidden_states = (
350
+ output_hidden_states
351
+ if output_hidden_states is not None
352
+ else self.config.output_hidden_states
353
+ )
354
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
355
+ return_dict = (
356
+ return_dict if return_dict is not None else self.config.use_return_dict
357
+ )
358
+
359
+ if encoder_outputs is None:
360
+ encoder_outputs = self.encoder(
361
+ input_ids=input_ids,
362
+ clap_embedding=clap_embedding,
363
+ encodec_mask=encodec_mask,
364
+ attention_mask=attention_mask,
365
+ head_mask=head_mask,
366
+ inputs_embeds=inputs_embeds,
367
+ output_attentions=output_attentions,
368
+ output_hidden_states=output_hidden_states,
369
+ return_dict=return_dict,
370
+ )
371
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
372
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
373
+ encoder_outputs = BaseModelOutput(
374
+ last_hidden_state=encoder_outputs[0],
375
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
376
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
377
+ )
378
+
379
+ # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
380
+ decoder_outputs = self.decoder(
381
+ input_ids=decoder_input_ids,
382
+ attention_mask=decoder_attention_mask,
383
+ encoder_hidden_states=encoder_outputs[0],
384
+ encoder_attention_mask=attention_mask,
385
+ head_mask=decoder_head_mask,
386
+ cross_attn_head_mask=cross_attn_head_mask,
387
+ past_key_values=past_key_values,
388
+ inputs_embeds=decoder_inputs_embeds,
389
+ use_cache=use_cache,
390
+ output_attentions=output_attentions,
391
+ output_hidden_states=output_hidden_states,
392
+ return_dict=return_dict,
393
+ )
394
+
395
+ if not return_dict:
396
+ return decoder_outputs + encoder_outputs
397
+
398
+ return Seq2SeqModelOutput(
399
+ last_hidden_state=decoder_outputs.last_hidden_state,
400
+ past_key_values=decoder_outputs.past_key_values,
401
+ decoder_hidden_states=decoder_outputs.hidden_states,
402
+ decoder_attentions=decoder_outputs.attentions,
403
+ cross_attentions=decoder_outputs.cross_attentions,
404
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
405
+ encoder_hidden_states=encoder_outputs.hidden_states,
406
+ encoder_attentions=encoder_outputs.attentions,
407
+ )
408
+
409
+
410
+ class EnClapBartForConditionalGeneration(BartForConditionalGeneration):
411
+ config_class = EnClapBartConfig
412
+
413
+ def __init__(self, config: EnClapBartConfig):
414
+ super(BartForConditionalGeneration, self).__init__(config)
415
+ self.model = EnClapBartModel(config)
416
+ self.register_buffer(
417
+ "final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))
418
+ )
419
+ self.lm_head = nn.Linear(
420
+ config.d_model, self.model.shared.num_embeddings, bias=False
421
+ )
422
+ self.mcm_heads = nn.ModuleList(
423
+ [
424
+ nn.Linear(config.d_model, config.encodec_vocab_size)
425
+ for _ in range(config.num_rvq)
426
+ ]
427
+ )
428
+
429
+ # Initialize weights and apply final processing
430
+ self.post_init()
431
+
432
+ def forward(
433
+ self,
434
+ input_ids: torch.LongTensor = None,
435
+ clap_embedding: Optional[torch.Tensor] = None,
436
+ encodec_mask: Optional[torch.Tensor] = None,
437
+ attention_mask: Optional[torch.Tensor] = None,
438
+ decoder_input_ids: Optional[torch.LongTensor] = None,
439
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
440
+ head_mask: Optional[torch.Tensor] = None,
441
+ decoder_head_mask: Optional[torch.Tensor] = None,
442
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
443
+ encoder_outputs: Optional[List[torch.FloatTensor]] = None,
444
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
445
+ inputs_embeds: Optional[torch.FloatTensor] = None,
446
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
447
+ labels: Optional[torch.LongTensor] = None,
448
+ mcm_labels: Optional[List[torch.LongTensor]] = None,
449
+ use_cache: Optional[bool] = None,
450
+ output_attentions: Optional[bool] = None,
451
+ output_hidden_states: Optional[bool] = None,
452
+ return_dict: Optional[bool] = None,
453
+ ) -> Union[Tuple, Seq2SeqLMOutput]:
454
+ r"""
455
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
456
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
457
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
458
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
459
+
460
+ Returns:
461
+ """
462
+ return_dict = (
463
+ return_dict if return_dict is not None else self.config.use_return_dict
464
+ )
465
+
466
+ if labels is not None:
467
+ if use_cache:
468
+ logger.warning(
469
+ "The `use_cache` argument is changed to `False` since `labels` is provided."
470
+ )
471
+ use_cache = False
472
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
473
+ decoder_input_ids = shift_tokens_right(
474
+ labels, self.config.pad_token_id, self.config.decoder_start_token_id
475
+ )
476
+
477
+ outputs = self.model(
478
+ input_ids,
479
+ clap_embedding=clap_embedding,
480
+ encodec_mask=encodec_mask,
481
+ attention_mask=attention_mask,
482
+ decoder_input_ids=decoder_input_ids,
483
+ encoder_outputs=encoder_outputs,
484
+ decoder_attention_mask=decoder_attention_mask,
485
+ head_mask=head_mask,
486
+ decoder_head_mask=decoder_head_mask,
487
+ cross_attn_head_mask=cross_attn_head_mask,
488
+ past_key_values=past_key_values,
489
+ inputs_embeds=inputs_embeds,
490
+ decoder_inputs_embeds=decoder_inputs_embeds,
491
+ use_cache=use_cache,
492
+ output_attentions=output_attentions,
493
+ output_hidden_states=output_hidden_states,
494
+ return_dict=return_dict,
495
+ )
496
+
497
+ mcm_loss = None
498
+ if mcm_labels is not None:
499
+ mcm_loss = 0.0
500
+ loss_fct = CrossEntropyLoss()
501
+ for i, mcm_head in enumerate(self.mcm_heads):
502
+ mcm_logits = mcm_head(outputs.encoder_last_hidden_state)
503
+ loss_scale = 1 / 2 ** (i + 1)
504
+ loss = loss_fct(
505
+ mcm_logits.view(-1, self.config.encodec_vocab_size),
506
+ mcm_labels[..., i].reshape(-1),
507
+ )
508
+ mcm_loss = mcm_loss + loss * loss_scale
509
+
510
+ lm_logits = self.lm_head(outputs[0])
511
+ lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device)
512
+
513
+ masked_lm_loss = None
514
+ if labels is not None:
515
+ labels = labels.to(lm_logits.device)
516
+ loss_fct = CrossEntropyLoss(label_smoothing=self.config.label_smoothing)
517
+ masked_lm_loss = loss_fct(
518
+ lm_logits.view(-1, self.config.vocab_size), labels.view(-1)
519
+ )
520
+
521
+ loss = None
522
+ if mcm_loss is None:
523
+ loss = masked_lm_loss
524
+ elif masked_lm_loss is None:
525
+ loss = mcm_loss
526
+ else:
527
+ mcm_loss = mcm_loss * self.config.mcm_loss_scale
528
+ loss = masked_lm_loss + mcm_loss
529
+
530
+ if not return_dict:
531
+ output = (lm_logits,) + outputs[1:]
532
+ return (
533
+ ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
534
+ )
535
+
536
+ return EnClapBartOutput(
537
+ loss=loss,
538
+ lm_loss=masked_lm_loss,
539
+ mcm_loss=mcm_loss,
540
+ logits=lm_logits,
541
+ past_key_values=outputs.past_key_values,
542
+ decoder_hidden_states=outputs.decoder_hidden_states,
543
+ decoder_attentions=outputs.decoder_attentions,
544
+ cross_attentions=outputs.cross_attentions,
545
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
546
+ encoder_hidden_states=outputs.encoder_hidden_states,
547
+ encoder_attentions=outputs.encoder_attentions,
548
+ )
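
The head above optimizes two objectives at once: the usual caption cross-entropy from `lm_head`, and a masked codec modeling (MCM) loss with one classifier per RVQ codebook, where the loss of the i-th head (0-indexed) is scaled by 1/2^(i+1) before the sum is multiplied by `mcm_loss_scale`. A minimal standalone sketch of that weighting is shown below; the batch size, sequence length, `num_rvq`, and `encodec_vocab_size` values are illustrative assumptions rather than the released configuration.

import torch
import torch.nn as nn

# Illustrative sizes only (assumptions, not the released configuration).
batch, seq_len, d_model = 2, 8, 16
num_rvq, encodec_vocab_size = 4, 1024

encoder_hidden = torch.randn(batch, seq_len, d_model)
mcm_labels = torch.randint(0, encodec_vocab_size, (batch, seq_len, num_rvq))

mcm_heads = nn.ModuleList(
    nn.Linear(d_model, encodec_vocab_size) for _ in range(num_rvq)
)
loss_fct = nn.CrossEntropyLoss()

mcm_loss = 0.0
for i, head in enumerate(mcm_heads):
    logits = head(encoder_hidden)                # [batch, seq_len, vocab]
    loss = loss_fct(
        logits.view(-1, encodec_vocab_size),     # flatten token positions
        mcm_labels[..., i].reshape(-1),          # codes of the i-th codebook
    )
    mcm_loss = mcm_loss + loss / 2 ** (i + 1)    # geometric down-weighting

print(float(mcm_loss))  # later combined as lm_loss + mcm_loss_scale * mcm_loss

The geometric decay puts most of the weight on the coarse, low-index codebooks, which carry the bulk of the signal in residual vector quantization.
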
modeling/modeling_outputs.py ADDED
@@ -0,0 +1,11 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+ import torch
5
+ from transformers.modeling_outputs import Seq2SeqLMOutput
6
+
7
+
8
+ @dataclass
9
+ class EnClapBartOutput(Seq2SeqLMOutput):
10
+ mcm_loss: Optional[torch.FloatTensor] = None
11
+ lm_loss: Optional[torch.FloatTensor] = None
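
`EnClapBartOutput` simply extends `Seq2SeqLMOutput` with the two auxiliary loss fields, so the caption loss and the MCM loss stay individually addressable (e.g. for separate logging). A small self-contained sketch follows; the dataclass is re-declared locally only so the snippet runs on its own, and the loss values are made up.

from dataclasses import dataclass
from typing import Optional

import torch
from transformers.modeling_outputs import Seq2SeqLMOutput

@dataclass
class EnClapBartOutput(Seq2SeqLMOutput):  # mirrors modeling/modeling_outputs.py
    mcm_loss: Optional[torch.FloatTensor] = None
    lm_loss: Optional[torch.FloatTensor] = None

out = EnClapBartOutput(
    loss=torch.tensor(2.5),      # combined objective
    lm_loss=torch.tensor(2.0),   # caption cross-entropy
    mcm_loss=torch.tensor(0.5),  # scaled masked codec modeling loss
    logits=torch.zeros(1, 4, 10),
)
print(out.loss, out.lm_loss, out.mcm_loss)
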
port_weights.py ADDED
@@ -0,0 +1,42 @@
1
+ import argparse
2
+ from math import ceil
3
+ from pathlib import Path
4
+
5
+ import torch
6
+
7
+ if __name__ == "__main__":
8
+ parser = argparse.ArgumentParser()
9
+ parser.add_argument("--ckpt_path", "-c", type=str)
10
+ args = parser.parse_args()
11
+
12
+ weight_name_map = {
13
+ "model.encodec_embeddings": None,
14
+ "encodec_embeddings": "embed_encodec",
15
+ "encodec_mlm_head": "mcm_heads",
16
+ }
17
+
18
+ ckpt_path = Path(args.ckpt_path)
19
+ weight_file = ckpt_path / "pytorch_model.bin"
20
+ state_dict = torch.load(weight_file, map_location="cpu")
21
+ new_state_dict = {}
22
+ for key in state_dict:
23
+ new_key = key
24
+ for orig, repl in weight_name_map.items():
25
+ if repl is None:
26
+ if orig in new_key:
27
+ new_key = None
28
+ break
29
+ continue
30
+ new_key = new_key.replace(orig, repl)
31
+ if new_key:
32
+ new_state_dict[new_key] = state_dict[key]
33
+ for key in new_state_dict:
34
+ if "model.encoder.embed_encodec" in key:
35
+ dim = new_state_dict[key].shape[0]
36
+ new_weight = torch.normal(
37
+ 0, 1, (ceil(dim / 64) * 64, new_state_dict[key].shape[1])
38
+ )
39
+ new_weight[:dim] = new_state_dict[key]
40
+ new_state_dict[key] = new_weight
41
+ weight_file.rename(weight_file.with_suffix(".bin.bak"))
42
+ torch.save(new_state_dict, weight_file)
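
The script above maps legacy checkpoint keys onto the current module names (`embed_encodec`, `mcm_heads`), drops the duplicated `model.encodec_embeddings.*` entries, pads the EnCodec embedding rows up to a multiple of 64 with random values, and overwrites `pytorch_model.bin` after backing it up as `.bin.bak`. The toy run below exercises only the key-renaming rules; the keys and tensors are made up for illustration.

import torch

# Made-up legacy keys purely to exercise the renaming rules above.
state_dict = {
    "model.encoder.encodec_embeddings.0.weight": torch.zeros(4, 8),
    "model.encodec_embeddings.0.weight": torch.zeros(4, 8),   # dropped
    "encodec_mlm_head.0.weight": torch.zeros(16, 8),          # -> mcm_heads.0.weight
}

weight_name_map = {
    "model.encodec_embeddings": None,       # None means: drop the key
    "encodec_embeddings": "embed_encodec",
    "encodec_mlm_head": "mcm_heads",
}

new_state_dict = {}
for key, value in state_dict.items():
    new_key = key
    for orig, repl in weight_name_map.items():
        if repl is None:
            if orig in new_key:
                new_key = None
                break
            continue
        new_key = new_key.replace(orig, repl)
    if new_key:
        new_state_dict[new_key] = value

print(sorted(new_state_dict))
# ['mcm_heads.0.weight', 'model.encoder.embed_encodec.0.weight']
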
requirements.txt ADDED
@@ -0,0 +1,14 @@
1
+ aac-metrics==0.4.2
2
+ accelerate==0.20.3
3
+ datasets==2.13.1
4
+ encodec==0.1.1
5
+ laion-clap==1.1.4
6
+ librosa==0.10.1
7
+ markupsafe==2.0.1
8
+ omegaconf==2.3.0
9
+ soundfile==0.12.1
10
+ tensorboard==2.13.0
11
+ tokenizers==0.13.3
12
+ torch==1.13.0
13
+ torchaudio==0.13.0
14
+ transformers==4.29.0
test/bart_test.ipynb ADDED
@@ -0,0 +1,363 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/opt/conda/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from .autonotebook import tqdm as notebook_tqdm\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "from transformers import AutoTokenizer\n",
19
+ "from transformers.models.bart.modeling_bart import BartForConditionalGeneration"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 2,
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "tokenizer = AutoTokenizer.from_pretrained(\"facebook/bart-large\")"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 35,
34
+ "metadata": {},
35
+ "outputs": [
36
+ {
37
+ "data": {
38
+ "text/plain": [
39
+ "('bart/tokenizer/tokenizer_config.json',\n",
40
+ " 'bart/tokenizer/special_tokens_map.json',\n",
41
+ " 'bart/tokenizer/vocab.json',\n",
42
+ " 'bart/tokenizer/merges.txt',\n",
43
+ " 'bart/tokenizer/added_tokens.json',\n",
44
+ " 'bart/tokenizer/tokenizer.json')"
45
+ ]
46
+ },
47
+ "execution_count": 35,
48
+ "metadata": {},
49
+ "output_type": "execute_result"
50
+ }
51
+ ],
52
+ "source": [
53
+ "tokenizer.save_pretrained(\"bart/tokenizer\")"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 18,
59
+ "metadata": {},
60
+ "outputs": [],
61
+ "source": [
62
+ "model = BartForConditionalGeneration.from_pretrained(\"facebook/bart-large\", forced_bos_token_id=0)"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": 4,
68
+ "metadata": {},
69
+ "outputs": [
70
+ {
71
+ "name": "stderr",
72
+ "output_type": "stream",
73
+ "text": [
74
+ "Some weights of AudioBartForConditionalGeneration were not initialized from the model checkpoint at bart/model/ and are newly initialized: ['model.encodec_embeddings.3.weight', 'model.encodec_embeddings.4.weight', 'model.encodec_embeddings.1.weight', 'model.encodec_embeddings.0.weight', 'model.encoder.encodec_embeddings.7.weight', 'model.encodec_embeddings.2.weight', 'model.encodec_embeddings.6.weight', 'model.encoder.encodec_embeddings.0.weight', 'model.encodec_embeddings.7.weight', 'model.encoder.encodec_embeddings.4.weight', 'model.encoder.encodec_embeddings.2.weight', 'model.encoder.encodec_embeddings.3.weight', 'model.encodec_embeddings.5.weight', 'model.encoder.encodec_embeddings.5.weight', 'model.encoder.encodec_embeddings.1.weight', 'model.encoder.encodec_embeddings.6.weight']\n",
75
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
76
+ ]
77
+ }
78
+ ],
79
+ "source": [
80
+ "from modeling.audiobart import AudioBartForConditionalGeneration\n",
81
+ "model = AudioBartForConditionalGeneration.from_pretrained(\"bart/model/\")"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "execution_count": 5,
87
+ "metadata": {},
88
+ "outputs": [
89
+ {
90
+ "name": "stdout",
91
+ "output_type": "stream",
92
+ "text": [
93
+ "{'input_ids': tensor([[ 0, 31414, 127, 50264, 32440, 3807, 118, 32440, 3807, 118,\n",
94
+ " 25610, 2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n"
95
+ ]
96
+ }
97
+ ],
98
+ "source": [
99
+ "text = \"Hello my <mask> yeppi yeppi yo\"\n",
100
+ "input = tokenizer(text, return_tensors='pt')\n",
101
+ "print(input)"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 8,
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "generated_ids = model.generate(input[\"input_ids\"])"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 33,
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "ids = output.logits.detach().numpy().argmax(-1)"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": 11,
125
+ "metadata": {},
126
+ "outputs": [
127
+ {
128
+ "data": {
129
+ "text/plain": [
130
+ "['Hello my friends, yeppi yeppiiyeppiyeppii ye']"
131
+ ]
132
+ },
133
+ "execution_count": 11,
134
+ "metadata": {},
135
+ "output_type": "execute_result"
136
+ }
137
+ ],
138
+ "source": [
139
+ "tokenizer.batch_decode(generated_ids, skip_special_tokens=True)"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": 36,
145
+ "metadata": {},
146
+ "outputs": [],
147
+ "source": [
148
+ "model.save_pretrained(\"bart/model\")"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": 51,
154
+ "metadata": {},
155
+ "outputs": [
156
+ {
157
+ "name": "stdout",
158
+ "output_type": "stream",
159
+ "text": [
160
+ "{'input_ids': tensor([[ 0, 7842, 330, 506, 1536, 267, 131, 6634, 36807, 571,\n",
161
+ " 20920, 127, 766, 16, 32440, 3807, 118, 32440, 3807, 118,\n",
162
+ " 25610, 2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n"
163
+ ]
164
+ }
165
+ ],
166
+ "source": [
167
+ "text = \"adskfalsj;lsdfg Hello my name is yeppi yeppi yo\"\n",
168
+ "input = tokenizer(text, return_tensors='pt')\n",
169
+ "print(input)"
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "code",
174
+ "execution_count": 52,
175
+ "metadata": {},
176
+ "outputs": [],
177
+ "source": [
178
+ "output = model.forward(**input)"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "code",
183
+ "execution_count": 54,
184
+ "metadata": {},
185
+ "outputs": [
186
+ {
187
+ "name": "stderr",
188
+ "output_type": "stream",
189
+ "text": [
190
+ "/opt/conda/lib/python3.9/site-packages/transformers/generation/utils.py:1353: UserWarning: Using `max_length`'s default (20) to control the generation length. This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.\n",
191
+ " warnings.warn(\n"
192
+ ]
193
+ },
194
+ {
195
+ "data": {
196
+ "text/plain": [
197
+ "['</s><s>adskfalsj;lsdfg Hello my name is yeppi ye</s>']"
198
+ ]
199
+ },
200
+ "execution_count": 54,
201
+ "metadata": {},
202
+ "output_type": "execute_result"
203
+ }
204
+ ],
205
+ "source": [
206
+ "tokenizer.batch_decode(model.generate(input['input_ids']))"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "execution_count": 39,
212
+ "metadata": {},
213
+ "outputs": [
214
+ {
215
+ "data": {
216
+ "text/plain": [
217
+ "['<s>Hello my name is yeppi yeppi yo</s>']"
218
+ ]
219
+ },
220
+ "execution_count": 39,
221
+ "metadata": {},
222
+ "output_type": "execute_result"
223
+ }
224
+ ],
225
+ "source": [
226
+ "tokenizer.batch_decode(input['input_ids'])"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "code",
231
+ "execution_count": 45,
232
+ "metadata": {},
233
+ "outputs": [],
234
+ "source": [
235
+ "from transformers.models.bart.modeling_bart import shift_tokens_right"
236
+ ]
237
+ },
238
+ {
239
+ "cell_type": "code",
240
+ "execution_count": 48,
241
+ "metadata": {},
242
+ "outputs": [
243
+ {
244
+ "name": "stdout",
245
+ "output_type": "stream",
246
+ "text": [
247
+ "tensor([[ 0, 0, 31414, 127, 766, 16, 32440, 3807, 118, 32440,\n",
248
+ " 3807, 118, 25610]])\n"
249
+ ]
250
+ }
251
+ ],
252
+ "source": [
253
+ "print(shift_tokens_right(input['input_ids'], pad_token_id=1, decoder_start_token_id=0))"
254
+ ]
255
+ },
256
+ {
257
+ "cell_type": "code",
258
+ "execution_count": 55,
259
+ "metadata": {},
260
+ "outputs": [
261
+ {
262
+ "name": "stderr",
263
+ "output_type": "stream",
264
+ "text": [
265
+ "Downloading (…)lve/main/config.json: 100%|██████████| 1.58k/1.58k [00:00<00:00, 589kB/s]\n",
266
+ "Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 1.29MB/s]\n",
267
+ "Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 884kB/s]\n",
268
+ "Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 7.43MB/s]\n"
269
+ ]
270
+ }
271
+ ],
272
+ "source": [
273
+ "cnn_tokenizer = AutoTokenizer.from_pretrained(\"facebook/bart-large-cnn\")"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": 63,
279
+ "metadata": {},
280
+ "outputs": [],
281
+ "source": [
282
+ "original_text = \"ArithmeticErrorThe tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.\""
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": 68,
288
+ "metadata": {},
289
+ "outputs": [
290
+ {
291
+ "name": "stdout",
292
+ "output_type": "stream",
293
+ "text": [
294
+ "['<s>ArithmeticErrorThe tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.</s>']\n"
295
+ ]
296
+ }
297
+ ],
298
+ "source": [
299
+ "input = cnn_tokenizer(text=original_text, return_tensors='pt')\n",
300
+ "print(cnn_tokenizer.batch_decode(input['input_ids']))"
301
+ ]
302
+ },
303
+ {
304
+ "cell_type": "code",
305
+ "execution_count": 65,
306
+ "metadata": {},
307
+ "outputs": [],
308
+ "source": [
309
+ "cnn_model = BartForConditionalGeneration.from_pretrained(\"facebook/bart-large-cnn\")"
310
+ ]
311
+ },
312
+ {
313
+ "cell_type": "code",
314
+ "execution_count": 66,
315
+ "metadata": {},
316
+ "outputs": [
317
+ {
318
+ "name": "stderr",
319
+ "output_type": "stream",
320
+ "text": [
321
+ "/opt/conda/lib/python3.9/site-packages/transformers/generation/utils.py:1353: UserWarning: Using `max_length`'s default (142) to control the generation length. This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.\n",
322
+ " warnings.warn(\n"
323
+ ]
324
+ },
325
+ {
326
+ "data": {
327
+ "text/plain": [
328
+ "['</s><s>The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world. It was the first structure to reach a height of 300 metres.</s>']"
329
+ ]
330
+ },
331
+ "execution_count": 66,
332
+ "metadata": {},
333
+ "output_type": "execute_result"
334
+ }
335
+ ],
336
+ "source": [
337
+ "cnn_tokenizer.batch_decode(cnn_model.generate(input['input_ids']))"
338
+ ]
339
+ }
340
+ ],
341
+ "metadata": {
342
+ "kernelspec": {
343
+ "display_name": "base",
344
+ "language": "python",
345
+ "name": "python3"
346
+ },
347
+ "language_info": {
348
+ "codemirror_mode": {
349
+ "name": "ipython",
350
+ "version": 3
351
+ },
352
+ "file_extension": ".py",
353
+ "mimetype": "text/x-python",
354
+ "name": "python",
355
+ "nbconvert_exporter": "python",
356
+ "pygments_lexer": "ipython3",
357
+ "version": "3.9.12"
358
+ },
359
+ "orig_nbformat": 4
360
+ },
361
+ "nbformat": 4,
362
+ "nbformat_minor": 2
363
+ }
test/clap_test.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
test/dataset_test.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
test/dataset_test.py ADDED
@@ -0,0 +1,57 @@
1
+ import sys
2
+ sys.path.append(".")
3
+ sys.path.append("..")
4
+
5
+ from datasets import load_dataset
6
+ from transformers import AutoTokenizer
7
+ from modeling.audiobart import AudioBartForConditionalGeneration
8
+ from torch.utils.data import DataLoader
9
+ from data.collator import EncodecCollator
10
+
11
+ import numpy as np
12
+ import torch
13
+ import os
14
+
15
+ if __name__=="__main__":
16
+ model = AudioBartForConditionalGeneration.from_pretrained('bart/model')
17
+ base_path = "/data/jyk/aac_dataset/AudioCaps/encodec_16/"
18
+ tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large')
19
+ data_files = {"train": "csv/AudioCaps/train.csv"}
20
+ max_encodec_length = 1021
21
+ clap_base_path = "/data/jyk/aac_dataset/AudioCaps/clap"
22
+
23
+ raw_dataset = load_dataset("csv", data_files=data_files)
24
+
25
+ def preprocess_function(example):
26
+ path = example['file_path']
27
+ encodec = np.load(os.path.join(base_path, path))
28
+ if encodec.shape[0]>max_encodec_length:
29
+ encodec = encodec[:max_encodec_length, :]
30
+ clap = np.load(os.path.join(clap_base_path, path))
31
+ attention_mask = np.ones(encodec.shape[0]+3).astype(np.int64)
32
+ target_text = tokenizer(text_target=example['caption'])
33
+
34
+ return {'input_ids': encodec, 'clap': clap, 'attention_mask': attention_mask, 'labels': target_text['input_ids'], 'decoder_attention_mask': target_text['attention_mask']}
35
+
36
+ train_dataset = raw_dataset['train'].map(preprocess_function)
37
+ train_dataset.set_format("pt", columns=['input_ids', 'attention_mask', 'clap', 'labels', 'decoder_attention_mask'])
38
+
39
+ train_data_collator = EncodecCollator(
40
+ tokenizer=tokenizer,
41
+ model=model,
42
+ return_tensors="pt",
43
+ random_sampling=False,
44
+ max_length=max_encodec_length,
45
+ num_subsampling=0,
46
+ clap_masking_prob=-1,
47
+ encodec_masking_prob=0.15,
48
+ encodec_masking_length=10
49
+ )
50
+
51
+ train_dataloader = DataLoader(
52
+ train_dataset, shuffle=True, collate_fn=train_data_collator, batch_size=16)
53
+
54
+ for idx, batch in enumerate(train_dataloader):
55
+ # output = model.generate(**batch, max_length=100)
56
+ output = model(**batch)
57
+ print(output)
test/encodec_test.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
test/encodec_test.py ADDED
@@ -0,0 +1,24 @@
1
+ from encodec import EncodecModel
2
+ from encodec.utils import convert_audio
3
+
4
+ import torchaudio
5
+ import torch
6
+
7
+ # Instantiate a pretrained EnCodec model
8
+ model = EncodecModel.encodec_model_24khz()
9
+ # The number of codebooks used will be determined by the bandwidth selected.
10
+ # E.g. for a bandwidth of 6kbps, `n_q = 8` codebooks are used.
11
+ # Supported bandwidths are 1.5 kbps (n_q = 2), 3 kbps (n_q = 4), 6 kbps (n_q = 8), 12 kbps (n_q = 16), and 24 kbps (n_q = 32).
12
+ # For the 48 kHz model, only 3, 6, 12, and 24 kbps are supported. The number
13
+ # of codebooks for each is half that of the 24 kHz model as the frame rate is twice as much.
14
+ model.set_target_bandwidth(6.0)
15
+
16
+ # Load and pre-process the audio waveform
17
+ wav, sr = torchaudio.load("<PATH_TO_AUDIO_FILE>")
18
+ wav = convert_audio(wav, sr, model.sample_rate, model.channels)
19
+ wav = wav.unsqueeze(0)
20
+
21
+ # Extract discrete codes from EnCodec
22
+ with torch.no_grad():
23
+ encoded_frames = model.encode(wav)
24
+ codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1) # [B, n_q, T]
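
The dataset scripts in this repository load per-clip EnCodec codes with `np.load(...)` and slice them as `encodec[:max_len, :]`, which implies a time-major `[T, n_q]` layout on disk. One plausible way to persist the codes extracted above in that layout is sketched below; the stand-in tensor and the output path are placeholders, not part of the released preprocessing.

import numpy as np
import torch

# Stand-in for the `codes` tensor above: [B, n_q, T] with n_q = 8 at 6 kbps.
codes = torch.randint(0, 1024, (1, 8, 500))

codes_np = codes[0].transpose(0, 1).cpu().numpy()  # time-major [T, n_q]
print(codes_np.shape)  # (500, 8)

np.save("<PATH_TO_OUTPUT>.npy", codes_np)  # placeholder path
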
test/eval_dataset_test.py ADDED
@@ -0,0 +1,42 @@
1
+ from datasets import load_dataset
2
+ from transformers import AutoTokenizer
3
+ from modeling.audiobart import AudioBartForConditionalGeneration
4
+ from torch.utils.data import DataLoader
5
+ from data.collator import EncodecCollator
6
+
7
+ import numpy as np
8
+ import torch
9
+ import os
10
+
11
+ if __name__=="__main__":
12
+ model = AudioBartForConditionalGeneration.from_pretrained('bart/model')
13
+ basepath = "/data/jyk/aac_dataset/clotho/encodec/"
14
+ tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large')
15
+ data_files = {"validation": "csv/valid_allcaps.csv"}
16
+ num_captions = 5
17
+
18
+ raw_dataset = load_dataset("csv", data_files=data_files)
19
+
20
+ def preprocess_eval(example):
21
+ path = example['file_path']
22
+ encodec = np.load(os.path.join(basepath, path))
23
+ if encodec.shape[0]>1022:
24
+ encodec = encodec[:1022, :]
25
+ attention_mask = np.ones(encodec.shape[0]+2).astype(np.int64)
26
+ captions = []
27
+ for i in range(1, num_captions+1):
28
+ captions.append(example['caption_'+str(i)])
29
+
30
+ return {'input_ids': encodec, 'attention_mask': attention_mask, 'captions': captions}
31
+
32
+ train_dataset = raw_dataset['validation'].map(preprocess_eval)
33
+ train_dataset.set_format('pt', columns=['input_ids', 'attention_mask'], output_all_columns=True)
34
+ # train_dataset.remove_columns('file_path', 'caption_1', 'caption_2', 'caption_3', 'caption_4', 'caption_5')
35
+ data_collator = EncodecCollator(tokenizer=tokenizer, model=model, return_tensors="pt")
36
+
37
+ train_dataloader = DataLoader(
38
+ train_dataset, shuffle=True, collate_fn=data_collator, batch_size=16)
39
+
40
+ for idx, batch in enumerate(train_dataloader):
41
+ output = model.generate(**batch, max_length=100)
42
+ print(output)
test/masking_test.ipynb ADDED
@@ -0,0 +1,117 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/opt/conda/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from .autonotebook import tqdm as notebook_tqdm\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "import torch"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 2,
24
+ "metadata": {},
25
+ "outputs": [],
26
+ "source": [
27
+ "probability_matrix = torch.full((8, 15), 0.15)"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": 4,
33
+ "metadata": {},
34
+ "outputs": [
35
+ {
36
+ "data": {
37
+ "text/plain": [
38
+ "torch.Size([8, 15])"
39
+ ]
40
+ },
41
+ "execution_count": 4,
42
+ "metadata": {},
43
+ "output_type": "execute_result"
44
+ }
45
+ ],
46
+ "source": [
47
+ "probability_matrix.shape"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 9,
53
+ "metadata": {},
54
+ "outputs": [
55
+ {
56
+ "data": {
57
+ "text/plain": [
58
+ "tensor([[0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500,\n",
59
+ " 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500],\n",
60
+ " [0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500,\n",
61
+ " 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500],\n",
62
+ " [0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500,\n",
63
+ " 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500],\n",
64
+ " [0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500,\n",
65
+ " 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500],\n",
66
+ " [0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500,\n",
67
+ " 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500],\n",
68
+ " [0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500,\n",
69
+ " 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500],\n",
70
+ " [0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500,\n",
71
+ " 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500],\n",
72
+ " [0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500,\n",
73
+ " 0.1500, 0.1500, 0.1500, 0.1500, 0.1500, 0.1500]])"
74
+ ]
75
+ },
76
+ "execution_count": 9,
77
+ "metadata": {},
78
+ "output_type": "execute_result"
79
+ }
80
+ ],
81
+ "source": [
82
+ "probability_matrix.masked_fill_(torch.tensor(0, dtype=torch.bool), value=0.0)"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": null,
88
+ "metadata": {},
89
+ "outputs": [],
90
+ "source": [
91
+ "masked_indices = torch.bernoulli()"
92
+ ]
93
+ }
94
+ ],
95
+ "metadata": {
96
+ "kernelspec": {
97
+ "display_name": "base",
98
+ "language": "python",
99
+ "name": "python3"
100
+ },
101
+ "language_info": {
102
+ "codemirror_mode": {
103
+ "name": "ipython",
104
+ "version": 3
105
+ },
106
+ "file_extension": ".py",
107
+ "mimetype": "text/x-python",
108
+ "name": "python",
109
+ "nbconvert_exporter": "python",
110
+ "pygments_lexer": "ipython3",
111
+ "version": "3.9.12"
112
+ },
113
+ "orig_nbformat": 4
114
+ },
115
+ "nbformat": 4,
116
+ "nbformat_minor": 2
117
+ }
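
The cells above only probe the masking primitives (`torch.full`, `masked_fill_`, `torch.bernoulli`). Below is a self-contained sketch of Bernoulli-driven span masking in the spirit of the `encodec_masking_prob: 0.15` / `encodec_masking_span: 10` settings in the configs; the actual collator in `data/` may implement this differently.

import torch

torch.manual_seed(0)
batch, seq_len = 2, 40
mask_prob, span = 0.15, 10

# Sample span start positions so that roughly mask_prob of tokens end up masked.
probability_matrix = torch.full((batch, seq_len), mask_prob / span)
span_starts = torch.bernoulli(probability_matrix).bool()

# Extend every start position into a span of `span` consecutive tokens.
mask = torch.zeros(batch, seq_len, dtype=torch.bool)
for offset in range(span):
    mask[:, offset:] |= span_starts[:, : seq_len - offset]

print(mask.float().mean())  # close to mask_prob on average
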
test/metric_test.ipynb ADDED
@@ -0,0 +1,260 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 8,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import evaluate"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 9,
15
+ "metadata": {},
16
+ "outputs": [
17
+ {
18
+ "name": "stderr",
19
+ "output_type": "stream",
20
+ "text": [
21
+ "[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
22
+ "[nltk_data] Package wordnet is already up-to-date!\n",
23
+ "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
24
+ "[nltk_data] Package punkt is already up-to-date!\n",
25
+ "[nltk_data] Downloading package omw-1.4 to /root/nltk_data...\n",
26
+ "[nltk_data] Package omw-1.4 is already up-to-date!\n"
27
+ ]
28
+ }
29
+ ],
30
+ "source": [
31
+ "metric = evaluate.load(\"meteor\")"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": 6,
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "import pandas as pd\n",
41
+ "df = pd.read_csv(\"csv/predictions.csv\")"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": 7,
47
+ "metadata": {},
48
+ "outputs": [],
49
+ "source": [
50
+ "predictions = []\n",
51
+ "references = []\n",
52
+ "for idx in range(len(df)):\n",
53
+ " predictions.append(df.loc[idx]['prediction'])\n",
54
+ " reference = [df.loc[idx]['caption1'],df.loc[idx]['caption2'],df.loc[idx]['caption3'],df.loc[idx]['caption4'],df.loc[idx]['caption5'] ]\n",
55
+ " references.append(reference)"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 8,
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "from aac_metrics import evaluate\n",
65
+ "corpus_score = evaluate(predictions, references)"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 4,
71
+ "metadata": {},
72
+ "outputs": [
73
+ {
74
+ "data": {
75
+ "text/plain": [
76
+ "{'bleu_1': tensor(0.3913, dtype=torch.float64),\n",
77
+ " 'bleu_2': tensor(0.1931, dtype=torch.float64),\n",
78
+ " 'bleu_3': tensor(0.1065, dtype=torch.float64),\n",
79
+ " 'bleu_4': tensor(0.0569, dtype=torch.float64),\n",
80
+ " 'meteor': tensor(0.1197, dtype=torch.float64),\n",
81
+ " 'rouge_l': tensor(0.2745, dtype=torch.float64),\n",
82
+ " 'cider_d': tensor(0.1235, dtype=torch.float64),\n",
83
+ " 'spice': tensor(0.0670, dtype=torch.float64),\n",
84
+ " 'spider': tensor(0.0953, dtype=torch.float64)}"
85
+ ]
86
+ },
87
+ "execution_count": 4,
88
+ "metadata": {},
89
+ "output_type": "execute_result"
90
+ }
91
+ ],
92
+ "source": [
93
+ "corpus_score[0]"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": 8,
99
+ "metadata": {},
100
+ "outputs": [
101
+ {
102
+ "name": "stdout",
103
+ "output_type": "stream",
104
+ "text": [
105
+ "{'bleu_1': 0.3912776883574468, 'bleu_2': 0.19312066269135236, 'bleu_3': 0.10651188216812753, 'bleu_4': 0.05690269475018141, 'meteor': 0.11968742992878356, 'rouge_l': 0.2744644068893943, 'cider_d': 0.12347016800968286, 'spice': 0.06704068138550699, 'spider': 0.09525542469759493}\n"
106
+ ]
107
+ }
108
+ ],
109
+ "source": [
110
+ "print({k: v.item() for k, v in corpus_score[0].items()})"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 13,
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "results = metric.compute(predictions=predictions, references=references)"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": 14,
125
+ "metadata": {},
126
+ "outputs": [
127
+ {
128
+ "name": "stdout",
129
+ "output_type": "stream",
130
+ "text": [
131
+ "{'meteor': 0.26686702985116983}\n"
132
+ ]
133
+ }
134
+ ],
135
+ "source": [
136
+ "print(results)"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": 8,
142
+ "metadata": {},
143
+ "outputs": [],
144
+ "source": [
145
+ "bleu = evaluate.load(\"bleu\")"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": 9,
151
+ "metadata": {},
152
+ "outputs": [],
153
+ "source": [
154
+ "from transformers import AutoTokenizer\n",
155
+ "tokenizer = AutoTokenizer.from_pretrained(\"facebook/bart-large\")"
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "code",
160
+ "execution_count": 11,
161
+ "metadata": {},
162
+ "outputs": [],
163
+ "source": [
164
+ "bleu_result = bleu.compute(predictions=predictions, references=references, max_order=4)"
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "code",
169
+ "execution_count": 12,
170
+ "metadata": {},
171
+ "outputs": [
172
+ {
173
+ "name": "stdout",
174
+ "output_type": "stream",
175
+ "text": [
176
+ "{'bleu': 0.06128958043343902, 'precisions': [0.42544588056899413, 0.09036238675413934, 0.031210136916404455, 0.01176031360836289], 'brevity_penalty': 1.0, 'length_ratio': 1.3508583690987124, 'translation_length': 13849, 'reference_length': 10252}\n"
177
+ ]
178
+ }
179
+ ],
180
+ "source": [
181
+ "print(bleu_result)"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 5,
187
+ "metadata": {},
188
+ "outputs": [
189
+ {
190
+ "ename": "AttributeError",
191
+ "evalue": "'function' object has no attribute 'load'",
192
+ "output_type": "error",
193
+ "traceback": [
194
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
195
+ "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
196
+ "\u001b[1;32m/workspace/audiobart/metric_test.ipynb Cell 13\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> <a href='vscode-notebook-cell://attached-container%2B7b22636f6e7461696e65724e616d65223a222f617564696f62617274222c2273657474696e6773223a7b22686f7374223a227373683a2f2f3138332e3131302e36322e3639227d7d/workspace/audiobart/metric_test.ipynb#X14sdnNjb2RlLXJlbW90ZQ%3D%3D?line=0'>1</a>\u001b[0m rouge_metric \u001b[39m=\u001b[39m evaluate\u001b[39m.\u001b[39;49mload(\u001b[39m\"\u001b[39m\u001b[39mrouge\u001b[39m\u001b[39m\"\u001b[39m)\n",
197
+ "\u001b[0;31mAttributeError\u001b[0m: 'function' object has no attribute 'load'"
198
+ ]
199
+ }
200
+ ],
201
+ "source": [
202
+ "rouge_metric = evaluate.load(\"rouge\")"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": 14,
208
+ "metadata": {},
209
+ "outputs": [],
210
+ "source": [
211
+ "rouge_result = rouge_metric.compute(predictions=predictions, references=references)"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": 15,
217
+ "metadata": {},
218
+ "outputs": [
219
+ {
220
+ "data": {
221
+ "text/plain": [
222
+ "{'rouge1': 0.30500784605917763,\n",
223
+ " 'rouge2': 0.08778194034686765,\n",
224
+ " 'rougeL': 0.2707178803695874,\n",
225
+ " 'rougeLsum': 0.27045227295118685}"
226
+ ]
227
+ },
228
+ "execution_count": 15,
229
+ "metadata": {},
230
+ "output_type": "execute_result"
231
+ }
232
+ ],
233
+ "source": [
234
+ "rouge_result"
235
+ ]
236
+ }
237
+ ],
238
+ "metadata": {
239
+ "kernelspec": {
240
+ "display_name": "base",
241
+ "language": "python",
242
+ "name": "python3"
243
+ },
244
+ "language_info": {
245
+ "codemirror_mode": {
246
+ "name": "ipython",
247
+ "version": 3
248
+ },
249
+ "file_extension": ".py",
250
+ "mimetype": "text/x-python",
251
+ "name": "python",
252
+ "nbconvert_exporter": "python",
253
+ "pygments_lexer": "ipython3",
254
+ "version": "3.9.12"
255
+ },
256
+ "orig_nbformat": 4
257
+ },
258
+ "nbformat": 4,
259
+ "nbformat_minor": 2
260
+ }
test/subsample_test.ipynb ADDED
@@ -0,0 +1,121 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import sys\n",
10
+ "sys.path.append(\"..\")"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 3,
16
+ "metadata": {},
17
+ "outputs": [
18
+ {
19
+ "name": "stderr",
20
+ "output_type": "stream",
21
+ "text": [
22
+ "/opt/conda/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
23
+ " from .autonotebook import tqdm as notebook_tqdm\n"
24
+ ]
25
+ }
26
+ ],
27
+ "source": [
28
+ "from modeling.audiobart import Subsampler"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 4,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "subsampler = Subsampler(1024, 3, 2)"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 5,
43
+ "metadata": {},
44
+ "outputs": [],
45
+ "source": [
46
+ "from utils import count_parameters"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": 6,
52
+ "metadata": {},
53
+ "outputs": [
54
+ {
55
+ "data": {
56
+ "text/plain": [
57
+ "1053696"
58
+ ]
59
+ },
60
+ "execution_count": 6,
61
+ "metadata": {},
62
+ "output_type": "execute_result"
63
+ }
64
+ ],
65
+ "source": [
66
+ "count_parameters(subsampler)"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 7,
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "import torch"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 25,
81
+ "metadata": {},
82
+ "outputs": [
83
+ {
84
+ "name": "stdout",
85
+ "output_type": "stream",
86
+ "text": [
87
+ "torch.Size([8, 1023, 1024])\n"
88
+ ]
89
+ }
90
+ ],
91
+ "source": [
92
+ "input = torch.randn(8, 4095, 1024)\n",
93
+ "output = subsampler(input)\n",
94
+ "output = subsampler(output)\n",
95
+ "print(output.shape)"
96
+ ]
97
+ }
98
+ ],
99
+ "metadata": {
100
+ "kernelspec": {
101
+ "display_name": "base",
102
+ "language": "python",
103
+ "name": "python3"
104
+ },
105
+ "language_info": {
106
+ "codemirror_mode": {
107
+ "name": "ipython",
108
+ "version": 3
109
+ },
110
+ "file_extension": ".py",
111
+ "mimetype": "text/x-python",
112
+ "name": "python",
113
+ "nbconvert_exporter": "python",
114
+ "pygments_lexer": "ipython3",
115
+ "version": "3.9.12"
116
+ },
117
+ "orig_nbformat": 4
118
+ },
119
+ "nbformat": 4,
120
+ "nbformat_minor": 2
121
+ }
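
For reference, the printed parameter count (1,053,696) and the shape trace (8, 4095, 1024) → (8, 1023, 1024) over two calls are both consistent with a depthwise strided Conv1d followed by a pointwise Linear. The sketch below reproduces those numbers, but it is only a plausible reconstruction, not the actual `modeling.audiobart.Subsampler` implementation.

import torch
import torch.nn as nn

class SubsamplerSketch(nn.Module):
    # Hedged reconstruction: depthwise strided Conv1d + pointwise Linear.
    # With (d_model=1024, kernel=3, stride=2) this has 1,053,696 parameters
    # and maps 4095 -> 2047 -> 1023 frames over two calls, matching the
    # notebook outputs above.
    def __init__(self, d_model: int, kernel_size: int, stride: int):
        super().__init__()
        self.conv = nn.Conv1d(
            d_model, d_model, kernel_size, stride=stride, groups=d_model
        )
        self.proj = nn.Linear(d_model, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: [B, T, D]
        x = self.conv(x.transpose(1, 2)).transpose(1, 2)
        return self.proj(x)

sub = SubsamplerSketch(1024, 3, 2)
print(sum(p.numel() for p in sub.parameters()))  # 1053696
x = torch.randn(8, 4095, 1024)
print(sub(sub(x)).shape)                         # torch.Size([8, 1023, 1024])
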
test/train_test.ipynb ADDED
@@ -0,0 +1,261 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from datasets import load_dataset\n",
10
+ "from transformers import AutoTokenizer"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 3,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "import os\n",
20
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"4,5,6,7\""
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 4,
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "basepath = \"/data/jyk/aac_dataset/clotho/encodec/\"\n",
30
+ "tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large')"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 5,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "data_files = {\"train\": \"csv/train_short.csv\", \"validation\": \"csv/valid_short.csv\"}"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 6,
45
+ "metadata": {},
46
+ "outputs": [
47
+ {
48
+ "name": "stderr",
49
+ "output_type": "stream",
50
+ "text": [
51
+ "Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-8533483370f473b7/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)\n",
52
+ "100%|██████████| 2/2 [00:00<00:00, 923.96it/s]\n"
53
+ ]
54
+ }
55
+ ],
56
+ "source": [
57
+ "raw_dataset = load_dataset(\"csv\", data_files=data_files)"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": 7,
63
+ "metadata": {},
64
+ "outputs": [
65
+ {
66
+ "data": {
67
+ "text/plain": [
68
+ "Dataset({\n",
69
+ " features: ['file_path', 'caption'],\n",
70
+ " num_rows: 19175\n",
71
+ "})"
72
+ ]
73
+ },
74
+ "execution_count": 7,
75
+ "metadata": {},
76
+ "output_type": "execute_result"
77
+ }
78
+ ],
79
+ "source": [
80
+ "raw_dataset['train']"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": 10,
86
+ "metadata": {},
87
+ "outputs": [],
88
+ "source": [
89
+ "from data.collator import EncodecCollator\n",
90
+ "import numpy as np\n",
91
+ "import os"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": 11,
97
+ "metadata": {},
98
+ "outputs": [],
99
+ "source": [
100
+ "def preprocessing(example):\n",
101
+ " path = example['file_path']\n",
102
+ " encodec = np.load(os.path.join(basepath, path))\n",
103
+ " if encodec.shape[0]>1022:\n",
104
+ " encodec = encodec[:1022, :]\n",
105
+ " attention_mask = np.ones(encodec.shape[0]+2)\n",
106
+ " target_text = tokenizer(text_target=example['caption'])\n",
107
+ "\n",
108
+ " return {'input_ids': encodec , 'attention_mask': attention_mask, 'labels': target_text['input_ids'], 'decoder_attention_mask': target_text['attention_mask']}\n"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": 12,
114
+ "metadata": {},
115
+ "outputs": [
116
+ {
117
+ "name": "stderr",
118
+ "output_type": "stream",
119
+ "text": [
120
+ " \r"
121
+ ]
122
+ }
123
+ ],
124
+ "source": [
125
+ "train_dataset = raw_dataset['train'].map(preprocessing, num_proc=16)"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": 13,
131
+ "metadata": {},
132
+ "outputs": [],
133
+ "source": [
134
+ "train_dataset.set_format(\"np\", columns=['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'])"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": 32,
140
+ "metadata": {},
141
+ "outputs": [
142
+ {
143
+ "name": "stderr",
144
+ "output_type": "stream",
145
+ "text": [
146
+ "Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-8533483370f473b7/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-a3db71731640afd3_*_of_00016.arrow\n"
147
+ ]
148
+ }
149
+ ],
150
+ "source": [
151
+ "valid_dataset = raw_dataset['validation'].map(preprocessing, num_proc=16)\n",
152
+ "valid_dataset.set_format(\"np\", columns=['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'])"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": 16,
158
+ "metadata": {},
159
+ "outputs": [
160
+ {
161
+ "name": "stderr",
162
+ "output_type": "stream",
163
+ "text": [
164
+ "Some weights of AudioBartForConditionalGeneration were not initialized from the model checkpoint at bart/model and are newly initialized: ['model.encodec_embeddings.6.weight', 'model.encodec_embeddings.4.weight', 'model.encodec_embeddings.1.weight', 'model.encodec_embeddings.7.weight', 'model.encodec_embeddings.5.weight', 'model.encodec_embeddings.3.weight', 'model.encodec_embeddings.2.weight', 'model.encodec_embeddings.0.weight']\n",
165
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
166
+ ]
167
+ }
168
+ ],
169
+ "source": [
170
+ "from modeling.audiobart import AudioBartForConditionalGeneration\n",
171
+ "model = AudioBartForConditionalGeneration.from_pretrained('bart/model')"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": 25,
177
+ "metadata": {},
178
+ "outputs": [
179
+ {
180
+ "name": "stdout",
181
+ "output_type": "stream",
182
+ "text": [
183
+ "414688256\n"
184
+ ]
185
+ }
186
+ ],
187
+ "source": [
188
+ "from utils import count_parameters\n",
189
+ "print(count_parameters(model))"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": 17,
195
+ "metadata": {},
196
+ "outputs": [],
197
+ "source": [
198
+ "data_collator = EncodecCollator(tokenizer=tokenizer, model=model, return_tensors=\"pt\")"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": 19,
204
+ "metadata": {},
205
+ "outputs": [],
206
+ "source": [
207
+ "from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": 36,
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": [
216
+ "training_args = Seq2SeqTrainingArguments('summary_test', per_gpu_train_batch_size=16)"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "execution_count": 37,
222
+ "metadata": {},
223
+ "outputs": [
224
+ {
225
+ "name": "stderr",
226
+ "output_type": "stream",
227
+ "text": [
228
+ "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.\n"
229
+ ]
230
+ }
231
+ ],
232
+ "source": [
233
+ "trainer = Seq2SeqTrainer(\n",
234
+ " model, training_args, train_dataset=valid_dataset, eval_dataset=valid_dataset, data_collator=data_collator, tokenizer=tokenizer\n",
235
+ ")"
236
+ ]
237
+ }
238
+ ],
239
+ "metadata": {
240
+ "kernelspec": {
241
+ "display_name": "base",
242
+ "language": "python",
243
+ "name": "python3"
244
+ },
245
+ "language_info": {
246
+ "codemirror_mode": {
247
+ "name": "ipython",
248
+ "version": 3
249
+ },
250
+ "file_extension": ".py",
251
+ "mimetype": "text/x-python",
252
+ "name": "python",
253
+ "nbconvert_exporter": "python",
254
+ "pygments_lexer": "ipython3",
255
+ "version": "3.9.12"
256
+ },
257
+ "orig_nbformat": 4
258
+ },
259
+ "nbformat": 4,
260
+ "nbformat_minor": 2
261
+ }
test/train_test.py ADDED
@@ -0,0 +1,41 @@
1
+ from datasets import load_dataset
2
+ from transformers import AutoTokenizer
3
+ from modeling.audiobart import AudioBartForConditionalGeneration
4
+ from data.collator import EncodecCollator
5
+ from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
6
+
7
+ import numpy as np
8
+ import torch
9
+ import os
10
+ os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"
11
+
12
+ if __name__=="__main__":
13
+ model = AudioBartForConditionalGeneration.from_pretrained('bart/model')
14
+ basepath = "/data/jyk/aac_dataset/clotho/encodec/"
15
+ tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large')
16
+ data_files = {"train": "csv/train_short.csv", "validation": "csv/valid_short.csv"}
17
+
18
+ raw_dataset = load_dataset("csv", data_files=data_files)
19
+
20
+ def preprocessing(example):
21
+ path = example['file_path']
22
+ encodec = np.load(os.path.join(basepath, path))
23
+ if encodec.shape[0]>1022:
24
+ encodec = encodec[:1022, :]
25
+ attention_mask = np.ones(encodec.shape[0]+2)
26
+ target_text = tokenizer(text_target=example['caption'])
27
+
28
+ return {'input_ids': encodec , 'attention_mask': attention_mask, 'labels': target_text['input_ids'], 'decoder_attention_mask': target_text['attention_mask']}
29
+
30
+ train_dataset = raw_dataset['validation'].map(preprocessing)
31
+ train_dataset.set_format("pt", columns=['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'])
32
+
33
+ data_collator = EncodecCollator(tokenizer=tokenizer, model=model, return_tensors="pt")
34
+
35
+ training_args = Seq2SeqTrainingArguments('summary_test', per_gpu_train_batch_size=20)
36
+
37
+ trainer = Seq2SeqTrainer(
38
+ model, training_args, train_dataset=train_dataset, eval_dataset=train_dataset, data_collator=data_collator, tokenizer=tokenizer
39
+ )
40
+
41
+ trainer.train()
train.py ADDED
@@ -0,0 +1,527 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ import logging
17
+ import math
18
+ import os
19
+ import sys
20
+
21
+ import datasets
22
+ import numpy as np
23
+ import torch
24
+ import transformers
25
+ from aac_metrics import evaluate
26
+ from accelerate import Accelerator, DistributedDataParallelKwargs
27
+ from accelerate.logging import get_logger
28
+ from accelerate.utils import set_seed
29
+ from datasets import load_dataset
30
+ from omegaconf import OmegaConf
31
+ from torch.utils.data import DataLoader
32
+ from tqdm.auto import tqdm
33
+ from transformers import (
34
+ AutoTokenizer,
35
+ BartConfig,
36
+ get_inverse_sqrt_schedule,
37
+ get_scheduler,
38
+ )
39
+
40
+ from data.collator import DataCollatorForEnClapBart
41
+ from data.preprocess import Preprocessor
42
+ from modeling.enclap_bart import EnClapBartForConditionalGeneration
43
+
44
+ logger = get_logger(__name__)
45
+ metric_list = ["meteor", "spider"]
46
+
47
+
48
+ def main():
49
+ # Load Configuration
50
+ cfg_path = sys.argv[1]
51
+ args = OmegaConf.load(cfg_path)
52
+
53
+ # Initialize Logging
54
+ accelerator_log_kwargs = {}
55
+ ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
56
+ if args.with_tracking:
57
+ accelerator_log_kwargs["log_with"] = args.report_to
58
+ accelerator_log_kwargs["project_dir"] = args.output_dir
59
+
60
+ # Initialize Accelerator
61
+ accelerator = Accelerator(
62
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
63
+ split_batches=args.split_batches,
64
+ kwargs_handlers=[ddp_kwargs],
65
+ **accelerator_log_kwargs,
66
+ )
67
+ # Handle the repository creation
68
+ if accelerator.is_main_process:
69
+ if args.output_dir is not None:
70
+ os.makedirs(args.output_dir, exist_ok=True)
71
+ with open(os.path.join(args.output_dir, "args.yaml"), "w") as f:
72
+ OmegaConf.save(args, f)
73
+ accelerator.wait_for_everyone()
74
+
75
+ # Make one log on every process with the configuration for debugging.
76
+ logging.basicConfig(
77
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
78
+ datefmt="%m/%d/%Y %H:%M:%S",
79
+ level=logging.INFO,
80
+ )
81
+ file_handler = logging.FileHandler(os.path.join(args.output_dir, "train_log.txt"))
82
+ logger.logger.addHandler(file_handler)
83
+ logger.info(accelerator.state, main_process_only=False)
84
+ if accelerator.is_local_main_process:
85
+ datasets.utils.logging.set_verbosity_warning()
86
+ transformers.utils.logging.set_verbosity_warning()
87
+ else:
88
+ datasets.utils.logging.set_verbosity_error()
89
+ transformers.utils.logging.set_verbosity_error()
90
+
91
+ # If passed along, set the training seed now.
92
+ if args.seed is not None:
93
+ set_seed(args.seed)
94
+
95
+ # Get the datasets
96
+ data_files = {}
97
+ data_files_eval = {}
98
+ if args.train_file is not None:
99
+ data_files["train"] = args.train_file
100
+ if args.validation_file is not None:
101
+ data_files_eval["validation"] = args.validation_file
102
+
103
+ extension = args.train_file.split(".")[-1]
104
+ raw_datasets = load_dataset(extension, data_files=data_files)
105
+ raw_datasets_eval = load_dataset(extension, data_files=data_files_eval)
106
+
107
+ # Load pretrained model and tokenizer
108
+ tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
109
+ if args.config_name_or_path is not None:
110
+ config = BartConfig.from_pretrained(args.config_name_or_path)
111
+ else:
112
+ config = None
113
+
114
+ if args.model_name_or_path is not None:
115
+ if config is None:
116
+ model = EnClapBartForConditionalGeneration.from_pretrained(
117
+ args.model_name_or_path
118
+ )
119
+ else:
120
+ model = EnClapBartForConditionalGeneration.from_pretrained(
121
+ args.model_name_or_path, config=config
122
+ )
123
+ else:
124
+ model = EnClapBartForConditionalGeneration(config=config)
125
+
126
+ # Set the generation config
127
+ if args.val_max_target_length is None:
128
+ args.val_max_target_length = args.max_target_length
129
+
130
+ # Set max encodec length based on the shape of the positional encoding
131
+ max_encodec_length = model.config.max_position_embeddings - 2
132
+ label_pad_token_id = (
133
+ -100 if args.ignore_pad_token_for_loss else tokenizer.pad_token_id
134
+ )
135
+ preprocessor = Preprocessor(
136
+ args.encodec_base_path,
137
+ args.clap_base_path,
138
+ tokenizer,
139
+ model.config.max_position_embeddings,
140
+ args.encodec_masking_prob,
141
+ args.encodec_masking_span,
142
+ label_pad_token_id,
143
+ model.config.encodec_vocab_size,
144
+ args.eval_num_captions,
145
+ )
146
+
147
+ with accelerator.main_process_first():
148
+ train_dataset = raw_datasets["train"].map(
149
+ preprocessor.preprocess_train,
150
+ num_proc=args.preprocessing_num_workers,
151
+ load_from_cache_file=not args.overwrite_cache,
152
+ desc="Running tokenizer on dataset",
153
+ )
154
+ train_dataset.set_format(
155
+ "pt",
156
+ columns=[
157
+ "input_ids",
158
+ "attention_mask",
159
+ "clap",
160
+ "labels",
161
+ "decoder_attention_mask",
162
+ ],
163
+ )
164
+
165
+ # Temporarily set max_target_length for validation.
166
+ eval_dataset = raw_datasets_eval["validation"].map(
167
+ preprocessor.preprocess_eval,
168
+ num_proc=args.preprocessing_num_workers,
169
+ load_from_cache_file=not args.overwrite_cache,
170
+ desc="Running tokenizer on dataset",
171
+ )
172
+ eval_dataset.set_format(
173
+ "pt",
174
+ columns=["input_ids", "attention_mask", "clap"],
175
+ output_all_columns=True,
176
+ )
177
+
178
+ train_data_collator = DataCollatorForEnClapBart(
179
+ tokenizer=tokenizer,
180
+ model=model,
181
+ return_tensors="pt",
182
+ label_pad_token_id=label_pad_token_id,
183
+ max_length=max_encodec_length,
184
+ encodec_masking_prob=args.encodec_masking_prob,
185
+ encodec_masking_span=args.encodec_masking_span,
186
+ )
187
+ valid_data_collator = DataCollatorForEnClapBart(
188
+ tokenizer=tokenizer,
189
+ model=model,
190
+ return_tensors="pt",
191
+ label_pad_token_id=label_pad_token_id,
192
+ max_length=max_encodec_length,
193
+ )
194
+
195
+ train_dataloader = DataLoader(
196
+ train_dataset,
197
+ shuffle=True,
198
+ collate_fn=train_data_collator,
199
+ batch_size=args.per_device_train_batch_size,
200
+ )
201
+ eval_dataloader = DataLoader(
202
+ eval_dataset,
203
+ collate_fn=valid_data_collator,
204
+ batch_size=args.per_device_eval_batch_size,
205
+ )
206
+
207
+ # Optimizer
208
+ # Split weights in two groups, one with weight decay and the other not.
209
+ no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
210
+ optimizer_grouped_parameters = [
211
+ {
212
+ "params": [
213
+ p
214
+ for n, p in model.named_parameters()
215
+ if not any(nd in n for nd in no_decay)
216
+ ],
217
+ "weight_decay": args.weight_decay,
218
+ },
219
+ {
220
+ "params": [
221
+ p
222
+ for n, p in model.named_parameters()
223
+ if any(nd in n for nd in no_decay)
224
+ ],
225
+ "weight_decay": 0.0,
226
+ },
227
+ ]
228
+ optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
229
+
230
+ # Scheduler and math around the number of training steps.
231
+ overrode_max_train_steps = False
232
+ num_update_steps_per_epoch = math.ceil(
233
+ len(train_dataloader) / args.gradient_accumulation_steps
234
+ )
235
+ if args.max_train_steps is None:
236
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
237
+ overrode_max_train_steps = True
238
+
239
+ if args.lr_scheduler_type == "inverse_sqrt" and hasattr(args, "time_scale"):
240
+ lr_scheduler = get_inverse_sqrt_schedule(
241
+ optimizer=optimizer,
242
+ num_warmup_steps=args.num_warmup_steps,
243
+ timescale=args.time_scale,
244
+ )
245
+ else:
246
+ lr_scheduler = get_scheduler(
247
+ name=args.lr_scheduler_type,
248
+ optimizer=optimizer,
249
+ num_warmup_steps=args.num_warmup_steps,
250
+ num_training_steps=args.max_train_steps,
251
+ )
252
+
253
+ # Prepare everything with our `accelerator`.
254
+ (
255
+ model,
256
+ optimizer,
257
+ train_dataloader,
258
+ eval_dataloader,
259
+ lr_scheduler,
260
+ ) = accelerator.prepare(
261
+ model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
262
+ )
263
+
264
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
265
+ num_update_steps_per_epoch = math.ceil(
266
+ len(train_dataloader) / args.gradient_accumulation_steps
267
+ )
268
+ if overrode_max_train_steps:
269
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
270
+ # Afterwards we recalculate our number of training epochs
271
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
272
+
273
+ # Figure out how many steps we should save the Accelerator states
274
+ checkpointing_steps = args.checkpointing_steps
275
+ if checkpointing_steps is not None and checkpointing_steps.isdigit():
276
+ checkpointing_steps = int(checkpointing_steps)
277
+
278
+ # The trackers initialize automatically on the main process.
279
+ if args.with_tracking:
280
+ accelerator.init_trackers(args.logging_dir)
281
+
282
+ # Train!
283
+ total_batch_size = (
284
+ args.per_device_train_batch_size
285
+ * accelerator.num_processes
286
+ * args.gradient_accumulation_steps
287
+ )
288
+
289
+ if args.split_batches:
290
+ total_batch_size = int(total_batch_size / accelerator.num_processes)
291
+
292
+ logger.info("***** Running training *****")
293
+ logger.info(f" Num examples = {len(train_dataset)}")
294
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
295
+ logger.info(
296
+ f" Instantaneous batch size per device = {args.per_device_train_batch_size}"
297
+ )
298
+ logger.info(
299
+ f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
300
+ )
301
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
302
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
303
+
304
+ completed_steps = 0
305
+ starting_epoch = 0
306
+ # Potentially load in the weights and states from a previous save
307
+ if not args.overwrite_output_dir and os.path.exists(
308
+ os.path.join(args.output_dir, "checkpoints")
309
+ ):
310
+ if args.resume_from_checkpoint is not None:
311
+ accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
312
+ accelerator.load_state(args.resume_from_checkpoint)
313
+ path = os.path.basename(args.resume_from_checkpoint)
314
+ else:
315
+ # Get the most recent checkpoint
316
+ dirs = [
317
+ f
318
+ for f in os.scandir(os.path.join(args.output_dir, "checkpoints"))
319
+ if f.is_dir()
320
+ ]
321
+ dirs.sort(key=os.path.getctime)
322
+ path = dirs[
323
+ -1
324
+ ].name # Sorts folders by date modified, most recent checkpoint is the last
325
+ accelerator.print(f"Resumed from checkpoint: {dirs[-1]}")
326
+ accelerator.load_state(dirs[-1])
327
+ # Extract `epoch_{i}` or `step_{i}`
328
+ training_difference = os.path.splitext(path)[0]
329
+
330
+ if "epoch" in training_difference:
331
+ starting_epoch = int(training_difference.replace("epoch_", "")) + 1
332
+ resume_step = None
333
+ completed_steps = starting_epoch * num_update_steps_per_epoch
334
+ else:
335
+ # need to multiply `gradient_accumulation_steps` to reflect real steps
336
+ resume_step = (
337
+ int(training_difference.replace("step_", ""))
338
+ * args.gradient_accumulation_steps
339
+ )
340
+ starting_epoch = resume_step // len(train_dataloader)
341
+ resume_step -= starting_epoch * len(train_dataloader)
342
+ completed_steps = resume_step // args.gradient_accumulation_steps
343
+
344
+ # update the progress_bar if load from checkpoint
345
+ if args.with_tracking:
346
+ total_loss = 0
347
+ logging_loss = 0
348
+ before_epoch_loss = 0
349
+
350
+ if args.encodec_masking_prob > 0:
351
+ total_encodec_loss = 0
352
+ logging_encodec_loss = 0
353
+ before_epoch_encodec_loss = 0
354
+
355
+ for epoch in range(starting_epoch, args.num_train_epochs):
356
+ model.train()
357
+ if (
358
+ args.resume_from_checkpoint
359
+ and epoch == starting_epoch
360
+ and resume_step is not None
361
+ ):
362
+ # We skip the first `n` batches in the dataloader when resuming from a checkpoint
363
+ active_dataloader = accelerator.skip_first_batches(
364
+ train_dataloader, resume_step
365
+ )
366
+ else:
367
+ active_dataloader = train_dataloader
368
+ logger.info(f"***** Running epoch {epoch} *****")
369
+ epoch_iterator = tqdm(
370
+ active_dataloader,
371
+ desc="Training",
372
+ disable=not accelerator.is_local_main_process,
373
+ dynamic_ncols=True,
374
+ colour="CYAN",
375
+ )
376
+ for step, batch in enumerate(epoch_iterator):
377
+ with accelerator.accumulate(model):
378
+ outputs = model(**batch)
379
+ loss = outputs.loss
380
+ # We keep track of the loss at each epoch
381
+ if args.with_tracking:
382
+ total_loss += outputs.lm_loss.item()
383
+ if args.encodec_masking_prob > 0:
384
+ if outputs.encodec_loss is not None:
385
+ total_encodec_loss += outputs.encodec_loss.item()
386
+ accelerator.backward(loss)
387
+ if accelerator.sync_gradients:
388
+ accelerator.clip_grad_norm_(
389
+ model.parameters(), max_norm=args.max_grad_norm
390
+ )
391
+ optimizer.step()
392
+ lr_scheduler.step()
393
+ optimizer.zero_grad()
394
+
395
+ # Checks if the accelerator has performed an optimization step behind the scenes
396
+ if accelerator.sync_gradients:
397
+ completed_steps += 1
398
+ # Add loss information to tqdm
399
+ epoch_iterator.set_postfix(loss=total_loss / completed_steps)
400
+
401
+ if completed_steps % args.logging_steps == 0:
402
+ train_log = {
403
+ "train/learning_rate": lr_scheduler.get_last_lr()[0]
404
+ }
405
+ train_log["train/loss"] = (
406
+ total_loss - logging_loss
407
+ ) / args.logging_steps
408
+ logging_loss = total_loss
409
+ if args.encodec_masking_prob > 0:
410
+ train_log["train/encodec_loss"] = (
411
+ total_encodec_loss - logging_encodec_loss
412
+ ) / args.logging_steps
413
+ logging_encodec_loss = total_encodec_loss
414
+ accelerator.log(train_log, step=completed_steps)
415
+
416
+ if isinstance(checkpointing_steps, int):
417
+ if completed_steps % checkpointing_steps == 0:
418
+ output_dir = f"step_{completed_steps}"
419
+ if args.output_dir is not None:
420
+ output_dir = os.path.join(
421
+ args.output_dir, "checkpoints", output_dir
422
+ )
423
+ accelerator.save_state(output_dir)
424
+
425
+ if completed_steps >= args.max_train_steps:
426
+ break
427
+
428
+ model.eval()
429
+ gen_kwargs = {
430
+ "max_length": args.val_max_target_length,
431
+ }
432
+ predictions = []
433
+ references = []
434
+ eval_iterator = tqdm(
435
+ eval_dataloader,
436
+ desc="Validation",
437
+ disable=not accelerator.is_local_main_process,
438
+ dynamic_ncols=True,
439
+ colour="MAGENTA",
440
+ )
441
+ for step, batch in enumerate(eval_iterator):
442
+ # Drop the padded samples of the last batch of dataloader
443
+ # try:
444
+ # if accelerator.gradient_state.end_of_dataloader and accelerator.gradient_state.remainder > 0:
445
+ # batch = batch[:accelerator.gradient_state.remainder]
446
+ # except:
447
+ # pass
448
+
449
+ with torch.no_grad():
450
+ batch["input_ids"] = batch["input_ids"].cuda()
451
+ batch["clap"] = batch["clap"].cuda()
452
+ batch["attention_mask"] = batch["attention_mask"].cuda()
453
+ batch["eos_mask"] = batch["eos_mask"].cuda()
454
+
455
+ generated_tokens = accelerator.unwrap_model(model).generate(
456
+ batch["input_ids"],
457
+ clap=batch["clap"],
458
+ attention_mask=batch["attention_mask"],
459
+ eos_mask=batch["eos_mask"],
460
+ **gen_kwargs,
461
+ )
462
+
463
+ generated_tokens = accelerator.pad_across_processes(
464
+ generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
465
+ )
466
+ generated_tokens = generated_tokens.cpu().numpy()
467
+ captions = batch["captions"]
468
+
469
+ if isinstance(generated_tokens, tuple):
470
+ generated_tokens = generated_tokens[0]
471
+ decoded_preds = tokenizer.batch_decode(
472
+ generated_tokens, skip_special_tokens=True
473
+ )
474
+
475
+ predictions.extend(decoded_preds)
476
+ references.extend(captions)
477
+
478
+ logger.info("Evaluating predictions...")
479
+ result = evaluate(predictions, references, metrics=metric_list)
480
+
481
+ # Gather Result
482
+ result = {k: v.cuda() for k, v in result[0].items()}
483
+ result = accelerator.gather_for_metrics(result)
484
+ # Log the average of metrics among the processes
485
+ if accelerator.num_processes > 1:
486
+ result = {f"eval/{k}": round(v.mean().item(), 4) for k, v in result.items()}
487
+ else:
488
+ result = {f"eval/{k}": round(v.item(), 4) for k, v in result.items()}
489
+ logger.info(result)
490
+
491
+ if args.with_tracking:
492
+ result["train/epoch_train_loss"] = (total_loss - before_epoch_loss) / len(
493
+ train_dataloader
494
+ )
495
+ result["train/steps"] = completed_steps
496
+ before_epoch_loss = total_loss
497
+ if args.encodec_masking_prob > 0:
498
+ result["train/epoch_encodec_loss"] = (
499
+ total_encodec_loss - before_epoch_encodec_loss
500
+ ) / len(train_dataloader)
501
+ before_epoch_encodec_loss = total_encodec_loss
502
+ accelerator.log(result, step=epoch)
503
+
504
+ if args.checkpointing_steps == "epoch":
505
+ output_dir = f"epoch_{epoch}"
506
+ if args.output_dir is not None:
507
+ output_dir = os.path.join(args.output_dir, "checkpoints", output_dir)
508
+ accelerator.save_state(output_dir)
509
+ if accelerator.is_main_process:
510
+ unwrapped_model = accelerator.unwrap_model(model)
511
+ unwrapped_model.config.save_pretrained(output_dir)
512
+
513
+ if args.output_dir is not None:
514
+ save_dir = os.path.join(args.output_dir, "final")
515
+ accelerator.wait_for_everyone()
516
+ unwrapped_model = accelerator.unwrap_model(model)
517
+ unwrapped_model.save_pretrained(
518
+ save_dir,
519
+ is_main_process=accelerator.is_main_process,
520
+ save_function=accelerator.save,
521
+ )
522
+ if accelerator.is_main_process:
523
+ tokenizer.save_pretrained(save_dir)
524
+
525
+
526
+ if __name__ == "__main__":
527
+ main()
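For reference, the validation loop in train.py above scores the generated captions with the aac-metrics package (METEOR and SPIDEr, per metric_list). A minimal standalone sketch of that call, using made-up captions purely for illustration, might look like this:

    from aac_metrics import evaluate

    # Hypothetical captions, for illustration only.
    candidates = ["a dog barks while cars pass by"]
    mult_references = [["a dog is barking", "a dog barks near a busy road"]]

    # evaluate() returns (corpus_scores, sentence_scores); train.py keeps only the corpus-level dict.
    corpus_scores, _ = evaluate(candidates, mult_references, metrics=["meteor", "spider"])
    print(corpus_scores)  # tensor values keyed by metric name, e.g. "meteor", "spider"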
train.sh ADDED
@@ -0,0 +1,2 @@
1
+ CFG_PATH="cfg/clotho/base.yaml"
2
+ accelerate launch --multi_gpu --main_process_port=1200 train.py $CFG_PATH
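Note: train.sh assumes a multi-GPU machine. For a single-GPU sanity run, dropping the --multi_gpu flag (e.g. accelerate launch --num_processes=1 train.py $CFG_PATH) should be sufficient, and CFG_PATH can point at cfg/audiocaps/base.yaml instead of the Clotho config when training on AudioCaps.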