# Point Hugging Face caches at the shared storage mount.
# HF_HOME must be set before importing datasets/transformers for it to take effect.
import os
os.environ['HF_HOME'] = '/mnt/jeff/huggingface'

import datasets
datasets.config.DOWNLOADED_DATASETS_PATH = "/mnt/jeff/huggingface/data"

import argparse
import json
from pathlib import Path

import numpy as np
import torch
import sacrebleu

from datasets import load_dataset, Audio
from torch.utils.data import Dataset, ConcatDataset
from tqdm import tqdm
from transformers import (
    AutoProcessor,
    AutoModel,
    BatchFeature,
    Trainer,
    TrainingArguments,
    StoppingCriteria,
    StoppingCriteriaList,
)
from collections import defaultdict

import soundfile as sf
import random

# Local module: provides MultiturnAudioDataset and covost_collate_fn used below.
from ASRDataset import *

def count_parameters_by_module(model):
    """Print total vs. trainable parameter counts, grouped by top-level module."""
    module_params = defaultdict(lambda: {"total": 0, "trainable": 0})

    total_params = 0
    total_trainable_params = 0

    # Recover the boolean masks registered on embedding weights so that only
    # the unmasked (trainable) rows are counted as trainable parameters.
    embedding_masks = {}
    for name, param in model.named_parameters():
        if 'embed_tokens.weight' in name and hasattr(param, '_backward_hooks') and param._backward_hooks:
            for hook_id, hook_fn in param._backward_hooks.items():
                if hook_fn.__code__.co_name == 'embedding_grad_mask_hook':
                    for cell in hook_fn.__closure__ or []:
                        if isinstance(cell.cell_contents, torch.Tensor) and cell.cell_contents.dtype == torch.bool:
                            # The hook masks out frozen rows, so invert it to get the trainable rows.
                            embedding_masks[name] = ~cell.cell_contents

    for name, param in model.named_parameters():
        module_name = name.split('.')[0]
        param_count = param.numel()

        module_params[module_name]["total"] += param_count
        total_params += param_count

        if param.requires_grad:
            if name in embedding_masks:
                trainable_count = embedding_masks[name].sum().item()
                module_params[module_name]["trainable"] += trainable_count
                total_trainable_params += trainable_count
            else:
                module_params[module_name]["trainable"] += param_count
                total_trainable_params += param_count

    print(f"All Params: {total_params:,}")
    print(f"Trainable Params: {total_trainable_params:,} ({total_trainable_params/total_params*100:.2f}%)")
    print("\nParams by Module:")

    for module_name, counts in sorted(module_params.items()):
        trainable_percentage = counts["trainable"] / counts["total"] * 100 if counts["total"] > 0 else 0
        total_percentage = counts["total"] / total_params * 100

        print(f"- {module_name}:")
        print(f"  Total: {counts['total']:,} ({total_percentage:.2f}% of model)")
        print(f"  Trainable: {counts['trainable']:,} ({trainable_percentage:.2f}% of module)")

    return module_params

def create_model(model_name_or_path, revision="main", use_flash_attention=False):
    model = AutoModel.from_pretrained(
        model_name_or_path,
        revision=revision,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        attn_implementation="flash_attention_2" if use_flash_attention else "eager",
        trust_remote_code=True,
    )

    # KV caching is unnecessary during training and conflicts with gradient checkpointing.
    model.config.use_cache = False

    # Freeze the full model, then enable only the speech LoRA adapter.
    for param in model.parameters():
        param.requires_grad = False

    model.set_lora_adapter('speech')

    model.to(torch.bfloat16)

    # Additionally train the embedding rows of the two new special tokens,
    # keeping every other embedding row frozen via a gradient mask.
    train_embed = True
    if train_embed:
        embed_tokens = model.language_model.model.model.embed_tokens

        trainable_token_ids = [256001, 256002]

        embed_tokens.weight.requires_grad = True
        # True for every row except the ones we want to train.
        mask = torch.ones_like(embed_tokens.weight, dtype=torch.bool)
        mask[trainable_token_ids] = False

        def embedding_grad_mask_hook(grad):
            # Zero out gradients for all frozen rows.
            return grad.masked_fill(mask, 0)

        embed_tokens.weight.register_hook(embedding_grad_mask_hook)

        model.language_model.model.model.embed_tokens = embed_tokens

    count_parameters_by_module(model)

    return model

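# Optional sanity check (illustrative sketch only, not part of the training flow):
# after one forward/backward pass, only the embedding rows selected in create_model
# (token ids 256001 and 256002) should carry non-zero gradients, e.g.:
#   grad = model.language_model.model.model.embed_tokens.weight.grad
#   nonzero_rows = grad.abs().sum(dim=1).nonzero().squeeze(-1).tolist()
#   assert set(nonzero_rows) <= {256001, 256002}
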
ANSWER_SUFFIX = "<end_of_turn>"
_IGNORE_INDEX = -100

model_name_or_path = '/mnt/jeff/gemma-3-4b-it-omni'
use_flash_attention = False

output_dir = '../gemma_tmp14_audio_and_text_speechlora'
batch_size = 16
batch_size_per_gpu = 1
learning_rate = 5.0e-5
wd = 0.01
num_train_epochs = 10

revision = "main"

processor = AutoProcessor.from_pretrained(
    model_name_or_path,
    revision=revision,
    trust_remote_code=True,
)

model = create_model(
    model_name_or_path,
    revision=revision,
    use_flash_attention=use_flash_attention,
)

train_datasets = []

# Text-only view of the pickup multi-turn data.
pickup_text_dataset = MultiturnAudioDataset(
    processor=processor,
    text_only=True,
    json_path='/mnt/jeff/InCar/data/multiturn_data/pickup_processed.json',
)
train_datasets.append(pickup_text_dataset)

# Audio view of the same multi-turn data.
pickup_audio_dataset = MultiturnAudioDataset(
    processor=processor,
    json_path='/mnt/jeff/InCar/data/multiturn_data/pickup_processed.json',
)
train_datasets.append(pickup_audio_dataset)

print("Number of datasets:", len(train_datasets))
print([len(dataset) for dataset in train_datasets])

# Merge all datasets into a single training set.
train_dataset = ConcatDataset(train_datasets) if len(train_datasets) > 1 else train_datasets[0]
print("Total training examples:", len(train_dataset))

num_gpus = torch.cuda.device_count()
print(f'training on {num_gpus} GPUs')

assert (
    batch_size % (num_gpus * batch_size_per_gpu) == 0
), 'Batch size must be divisible by num_gpus * batch_size_per_gpu'
gradient_accumulation_steps = batch_size // (num_gpus * batch_size_per_gpu)
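# Example: with batch_size=16 and batch_size_per_gpu=1 on 2 GPUs (hypothetical count),
# gradient_accumulation_steps = 16 // (2 * 1) = 8.
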
# DeepSpeed config (currently unused: TrainingArguments below passes deepspeed=None).
dp_config = {
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": True,
        "allgather_bucket_size": 5e8,
        "overlap_comm": False,
        "reduce_scatter": True,
        "reduce_bucket_size": 5e8,
        "contiguous_gradients": True,
        "cpu_offload": True
    },
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },
    "gradient_clipping": 1.0
}

training_args = TrainingArguments(
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size_per_gpu,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': False},
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim='adamw_torch',
    adam_beta1=0.9,
    adam_beta2=0.95,
    adam_epsilon=1e-7,
    learning_rate=learning_rate,
    weight_decay=wd,
    max_grad_norm=1.0,
    lr_scheduler_type='cosine',
    warmup_steps=50,
    logging_steps=10,
    output_dir=output_dir,
    save_total_limit=10,
    save_only_model=True,
    bf16=True,
    fp16=False,
    remove_unused_columns=False,
    report_to='none',
    deepspeed=None,
    disable_tqdm=False,
    dataloader_num_workers=16,
    save_strategy='epoch',
    ddp_find_unused_parameters=True,
)

out_path = Path(training_args.output_dir)
out_path.mkdir(parents=True, exist_ok=True)

# Optimize only the parameters left trainable in create_model
# (the speech LoRA adapter and the gradient-masked embedding matrix).
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=learning_rate,
    weight_decay=wd,
    betas=(0.9, 0.95),
    eps=1e-7,
)

# covost_collate_fn comes from the ASRDataset module imported above.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=covost_collate_fn,
    train_dataset=train_dataset,
    optimizers=(optimizer, None),
)

trainer.train()

# Save the language-model component, then the full model, to the output directory.
model.language_model.model.save_pretrained(output_dir)

model.save_pretrained(output_dir)