voidful committed
Commit e7affe4 · verified · 1 Parent(s): b2a8a9e

Training in progress, step 200
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:94fd27588ea0cf8a350c88bacfecdf465ab06e718fe81d15c5d103708821ef5c
+oid sha256:4c2110db2c404990f87047aec54bead5bd1b1220df35153c4e554f8cea41c78c
 size 4988522632
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2c2834790fc01798e9e85481469cc95cb32a7ad752c03f32a7d3129dd4fcefea
+oid sha256:87db4707734910285045cd2b90da00c932ea7b3879f6e19bee3b2fb131e77575
 size 2795955204
train_conv_slurm_full.py ADDED
@@ -0,0 +1,241 @@
+import os
+import logging
+
+import datasets
+from datasets import load_dataset
+import torch
+from torch.utils.data import Dataset
+from transformers import Trainer, TrainingArguments
+import wandb
+
+from mmlm.model_full import MMLMConfig, MMLM
+from mmlm.utility import load_audio_to_tensor
+import numpy as np
+
+# ========================
+# Global Configuration
+# ========================
+WANDB_PROJECT_NAME = "mmlm-conv-full"
+# Read the WandB key from the environment; credentials should never be
+# hardcoded in a committed script.
+WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "")
+DATASET = load_dataset("voidful/all_conv_data_filtered_small")['train']
+# DATASET = datasets.load_from_disk("/mnt/home/ntuspeechlabtaipei1/anthony/Soundon-TTS-preprocessing/hf_dialogue_chinese_llama31_70B_user_long_2_with_silence")
+LM_MODEL_NAME = "voidful/Llama-3.2-8B-Whisper"
+OUTPUT_DIR = "/mnt/home/ntuspeechlabtaipei1/mmlm-conv-training-full"
+MODEL_SAVE_PATH = "/mnt/home/ntuspeechlabtaipei1/mmlm-conv-model-full"
+TRAIN_TEST_SPLIT_RATIO = 0.1  # currently unused: the split below is commented out
+EPOCHS = 300
+BATCH_SIZE = 1
+LEARNING_RATE = 8e-4
+GRADIENT_ACCUMULATION_STEPS = 2
+USE_BF16 = True
+USE_FP16 = False
+LOGGING_STEPS = 1
+SAVE_TOTAL_LIMIT = 10
+GRADIENT_CHECKPOINTING = True
+PAD_VALUE = 0.0
+MAX_LENGTH = 8000  # currently unused in this script
+
+# Set up logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
+
+def initialize_wandb():
+    """Initialize Weights and Biases for experiment tracking."""
+    wandb.login(key=WANDB_API_KEY)
+    wandb.init(
+        project=WANDB_PROJECT_NAME,
+        config={
+            "epochs": EPOCHS,
+            "batch_size": BATCH_SIZE,
+            "learning_rate": LEARNING_RATE,
+        },
+        group="mmlm",
+    )
+
+
+class CustomDataset(Dataset):
+    """Custom dataset for paired audio-text conversation data."""
+
+    def __init__(self, data, tokenizer):
+        self.data = data
+        self.tokenizer = tokenizer
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        entry = self.data[idx]
+        # The dataset stores the decoded waveform directly, not a file path
+        audio_array = torch.tensor(entry["user_audio_path"]['array'])
+        audio_tensor = load_audio_to_tensor(audio_array)[0]
+        x_vector = entry["x-vector"]
+        text_with_pad = entry["text_with_pad"]
+        # Prepend a pad marker to the user text; for the machine text, strip
+        # the first 5 characters (presumably a leading "[PAD]") and pad the
+        # end instead
+        user_text_with_pad = "[PAD]" + text_with_pad[0]
+        machine_text_with_pad = text_with_pad[1][5:] + "[PAD]"
+        # Append one unit frame of silence (80 ms at 24 kHz = 1920 samples)
+        audio_tensor = torch.cat([audio_tensor[0], torch.zeros(int(24000 * 0.08 * 1))], dim=0).unsqueeze(dim=0)
+        audio_unit = np.array(entry["machine_unit"])
+
+        # Find spans (in samples) where the machine stream is active, i.e.
+        # where the first codebook row of the machine units is nonzero. One
+        # unit frame covers 80 ms at 24 kHz (24000 * 0.08 = 1920 samples).
+        active_spans = []
+        start = None
+        for i, value in enumerate(audio_unit[0]):
+            if value != 0 and start is None:
+                start = i  # start of a nonzero run
+            elif value == 0 and start is not None:
+                active_spans.append((start * 24000 * 0.08, (i - 1) * 24000 * 0.08))
+                start = None
+
+        # Handle a run that extends to the last frame
+        if start is not None:
+            active_spans.append((start * 24000 * 0.08, (len(audio_unit[0]) - 1) * 24000 * 0.08))
+
+        # Silence the input waveform wherever the machine is speaking
+        for span in active_spans:
+            start, end = int(span[0]), int(span[1])
+            end = min(end, audio_tensor.size(1))
+            audio_tensor[0, start:end] = 0.0
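+
+        # Example (illustrative values, not from the dataset): first-codebook
+        # units [0, 3, 5, 0] yield one active run from frame 1 to frame 2,
+        # recorded as the sample span (1 * 1920, 2 * 1920) and silenced above.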
+
+        padding_token = 0
+        bos_token_id = 0
+        eos_token_id = 0
+
+        # Append one padding frame, then delay every codebook row after the
+        # first by one step so the codebooks are staggered in time.
+        audio_unit = np.hstack((audio_unit, np.zeros((audio_unit.shape[0], 1), dtype=int)))
+        for i in range(1, audio_unit.shape[0]):
+            audio_unit[i, 1:] = audio_unit[i, :-1]
+            audio_unit[i, 0] = padding_token
+
+        # Add BOS/EOS columns and build teacher-forcing input/target pairs
+        # offset by one position.
+        matrix_with_bos = np.hstack((np.full((audio_unit.shape[0], 1), bos_token_id), audio_unit))
+        matrix_with_bos_eos = np.hstack((matrix_with_bos, np.full((matrix_with_bos.shape[0], 1), eos_token_id)))
+        input_audio_unit = matrix_with_bos_eos[:, :-1]
+        target_audio_unit = matrix_with_bos_eos[:, 1:]
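+
+        # Worked example (illustrative values): with two codebooks,
+        #   audio_unit = [[1, 2, 3], [4, 5, 6]]
+        # the padding column plus one-step delay give
+        #   [[1, 2, 3, 0], [0, 4, 5, 6]]
+        # and after the BOS/EOS columns:
+        #   input_audio_unit  = [[0, 1, 2, 3, 0], [0, 0, 4, 5, 6]]
+        #   target_audio_unit = [[1, 2, 3, 0, 0], [0, 4, 5, 6, 0]]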
+
+        return {
+            "input_values": audio_tensor,  # already a tensor; avoid re-wrapping
+            "speaker_codecs": torch.tensor(input_audio_unit),
+            "speaker_codec_labels": torch.tensor(target_audio_unit),
+            "speaker_embs": torch.tensor(x_vector[1]),
+            "speaker_texts": self.tokenizer(machine_text_with_pad, add_special_tokens=False,
+                                            return_tensors="pt")["input_ids"],
+            "listener_texts": self.tokenizer(user_text_with_pad, add_special_tokens=False,
+                                             return_tensors="pt")["input_ids"],
+        }
+
+
+class CustomDataCollator:
+    """Custom data collator for batching audio and text inputs."""
+
+    def __init__(self, text_pad_value, audio_pad_value=PAD_VALUE):
+        self.text_pad_value = text_pad_value
+        self.audio_pad_value = audio_pad_value
+
+    def __call__(self, batch):
+        # Plain concatenation along dim 0 only works because BATCH_SIZE is 1;
+        # larger batches would need real padding with the pad values above.
+        return {
+            "input_values": torch.cat([item["input_values"] for item in batch]),
+            "speaker_codecs": torch.cat([item["speaker_codecs"] for item in batch]),
+            "speaker_codec_labels": torch.cat([item["speaker_codec_labels"] for item in batch]),
+            "speaker_embs": torch.cat([item["speaker_embs"] for item in batch]),
+            "speaker_texts": torch.cat([item["speaker_texts"] for item in batch]),
+            "listener_texts": torch.cat([item["listener_texts"] for item in batch]),
+        }
+
+
+def compute_metrics(pred):
+    """Compute cross-entropy loss as a metric.
+
+    Note: with prediction_loss_only=True and do_eval=False below, the Trainer
+    never actually calls this.
+    """
+    pred_logits = pred.predictions
+    labels = pred.label_ids
+    loss_fn = torch.nn.CrossEntropyLoss()
+    return {"loss": loss_fn(torch.tensor(pred_logits), torch.tensor(labels)).item()}
+
+
+def main():
+    # Initialize WandB on the main process only
+    if int(os.environ.get("LOCAL_RANK", "-1")) == 0:
+        initialize_wandb()
+
+    # Load model and tokenizer
+    config = MMLMConfig(lm_model_name=LM_MODEL_NAME)
+    model = MMLM(config)
+    tokenizer = model.tokenizer
+    logger.info("Model and tokenizer loaded.")
+
+    # Load and filter dataset
+    data = DATASET
+    logger.info(f"Loaded {len(data)} samples from dataset.")
+    data = data.filter(lambda x: x["not_aligned_percentage"] < 0.5)
+    logger.info(f"Filtered dataset to {len(data)} samples.")
+
+    # Shuffle and take a small fixed subset (no train/test split is done here)
+    # data = data.train_test_split(test_size=0.5, seed=42)
+    data = data.shuffle(seed=42)
+    subset_size = 100
+    data = data.select(range(subset_size))
+    train_dataset = CustomDataset(data, tokenizer)
+    # eval_dataset = CustomDataset(data['test'], tokenizer)
+    # train_dataset = CustomDataset(data.select([0, 1, 2, 3, 4]), tokenizer)
+    # eval_dataset = CustomDataset(data.select([0, 1, 2, 3, 4]), tokenizer)
+
+    # Data collator
+    data_collator = CustomDataCollator(tokenizer.pad_token_id)
+
+    # Training arguments (eval is disabled; checkpoints every 200 steps)
+    training_args = TrainingArguments(
+        output_dir=OUTPUT_DIR,
+        eval_strategy="no",
+        logging_strategy="steps",
+        logging_steps=LOGGING_STEPS,
+        save_strategy="steps",
+        save_steps=200,
+        save_total_limit=SAVE_TOTAL_LIMIT,
+        num_train_epochs=EPOCHS,
+        per_device_train_batch_size=BATCH_SIZE,
+        per_device_eval_batch_size=BATCH_SIZE,
+        learning_rate=LEARNING_RATE,
+        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
+        bf16=USE_BF16,
+        fp16=USE_FP16,
+        do_eval=False,
+        max_grad_norm=1.0,
+        report_to="wandb",
+        lr_scheduler_type="linear",
+        warmup_steps=100,
+        eval_accumulation_steps=1,
+        run_name=f"{WANDB_PROJECT_NAME}-training",
+        load_best_model_at_end=False,
+        gradient_checkpointing=GRADIENT_CHECKPOINTING,
+        label_names=["listener_text_labels", "speaker_text_labels"],
+        prediction_loss_only=True,
+        remove_unused_columns=False,
+        push_to_hub=True,
+    )
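+
+    # With the accompanying Slurm script (13 nodes x 8 GPUs = 104 processes),
+    # the effective batch is 1 (per device) x 2 (grad accum) x 104 = 208
+    # sequences per optimizer step.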
+
+    # Initialize Trainer
+    trainer = Trainer(
+        model=model,
+        processing_class=tokenizer,
+        args=training_args,
+        train_dataset=train_dataset,
+        data_collator=data_collator,
+        compute_metrics=compute_metrics,
+    )
+
+    # Train (optionally pass resume_from_checkpoint to continue a run)
+    # resume_from_checkpoint = '/mnt/home/ntuspeechlabtaipei1/mmlm-conv-training-fixed-10k/checkpoint-2000/'
+    trainer.train()
+
+    # Save model
+    trainer.save_model(MODEL_SAVE_PATH)
+    logger.info(f"Model and tokenizer saved to '{MODEL_SAVE_PATH}'.")
+
+    # Finalize WandB
+    wandb.finish()
+
+
+if __name__ == "__main__":
+    main()
train_conv_slurm_full.sh ADDED
@@ -0,0 +1,63 @@
+#!/bin/bash
+#SBATCH -N 13
+#SBATCH -p tp1-user
+#SBATCH --exclusive
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=200
+#SBATCH --mem=200G
+#SBATCH --gres=gpu:8
+#SBATCH --time=30-00:00:00
+#SBATCH --output=/mnt/home/ntuspeechlabtaipei1/eric/result/%j-slurm.out
+#SBATCH --exclude=cnode3-004,cnode3-019
+
+module purge
+module load slurm
+
+source /mnt/home/ntuspeechlabtaipei1/miniconda3/etc/profile.d/conda.sh
+conda activate base
+
+CONTAINER_IMAGE="./eric/trl.sqsh"
+GPUS_PER_NODE=8
+echo "SLURM_NNODES=${SLURM_NNODES}"
+echo "NODELIST=${SLURM_JOB_NODELIST}"
+echo "SLURM_NODEID=${SLURM_NODEID}"
+echo "SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID}"
+export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+export MASTER_PORT=12345
+export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
+# CUDA_LAUNCH_BLOCKING serializes kernel launches; useful for debugging but
+# it slows training and can be dropped for production runs.
+export CUDA_LAUNCH_BLOCKING=1
+
+export LD_LIBRARY_PATH=/mnt/home/ntuspeechlabtaipei1/miniconda3/lib64:/mnt/home/ntuspeechlabtaipei1/local/lib:/mnt/home/ntuspeechlabtaipei1/miniconda3/envs/whisper/lib:/usr/local/cuda/lib64:/usr/local/cuda/compat/lib.real:/usr/local/lib/python3.10/dist-packages/torch/lib:/usr/local/lib/python3.10/dist-packages/torch_tensorrt/lib:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+
+SRUN_ARGS=" \
+    --wait=60 \
+    --kill-on-bad-exit=1 \
+    --mpi=pmix \
+    --container-image=${CONTAINER_IMAGE} \
+    --container-writable \
+    --container-mounts=/mnt/home/ntuspeechlabtaipei1/:/mnt/home/ntuspeechlabtaipei1/,/mnt/home/ntuspeechlabtaipei1/.cache:/root/.cache \
+    "
+
+PRE_LAUNCH="export TORCH_DISTRIBUTED_TIMEOUT=7200; source /mnt/home/ntuspeechlabtaipei1/miniconda3/etc/profile.d/conda.sh; conda activate base;"
+
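+# \${SLURM_NODEID} below is escaped so that it expands on each node inside the
+# srun `bash -c` shell, giving every machine its own rank; --num_processes
+# resolves to 13 nodes x 8 GPUs = 104.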
+LAUNCHER="accelerate launch \
+    --num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \
+    --num_machines $SLURM_NNODES \
+    --machine_rank \${SLURM_NODEID} \
+    --rdzv_backend c10d \
+    --main_process_ip $MASTER_ADDR \
+    --main_process_port $MASTER_PORT \
+    --deepspeed_config_file /mnt/home/ntuspeechlabtaipei1/ds_config.json \
+    --deepspeed_hostfile /mnt/home/ntuspeechlabtaipei1/eric/hostfile \
+    --deepspeed_multinode_launcher standard \
+    --dynamo_backend no \
+    --use_deepspeed \
+    --mixed_precision bf16 \
+    "
+
+CMD="/mnt/home/ntuspeechlabtaipei1/train_conv_slurm_full.py"
+
+clear; srun $SRUN_ARGS bash -c "$PRE_LAUNCH$LAUNCHER $CMD"
+echo "END TIME: $(date)"
+
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d9701fce783f2c198703e90b589d200d16c6ffa646d130d05bc0cdc13adb039e
+oid sha256:3caa3fd1c46e285f96aeb9d09d1de977f5879a40bd5efb283b0bbc50d1873349
 size 7672