File size: 1,228 Bytes
ae81e0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# Dataset used for the run: source, chunking, and on-disk cache locations,
# plus the pretrained model referenced alongside the data.
dataset:
  name: alpaca_clean
  dataset_config:
    name: default
    # NOTE(review): looks like a Hugging Face Hub dataset id — confirm
    path: yahma/alpaca-cleaned
    chunk_size: 1024  # sequence length for distilling
    concat_data: true
    cache_dir: 'data/alpaca'  # Change this to where you want to save
  pretrained_model_config:  # will be updated based on model_config
    pretrained_model_name_or_path: 'meta-llama/Meta-Llama-3.1-8B'
    # NOTE(review): absolute path that looks specific to one machine;
    # point it at a directory that exists on your host before running.
    cache_dir: '/data_persistent2/sim_data/llama-3_1-8b/'
  preprocess_config: null

# Batch-loading settings. The key names mirror keyword arguments of
# torch.utils.data.DataLoader — presumably forwarded to it; confirm in the
# code that consumes this section.
dataloader:
  batch_size: 1
  num_workers: 2
  drop_last: false
  pin_memory: true

# Optimizer selection and hyperparameters.
optimizer:
  # NOTE(review): the name suggests PyTorch's fused AdamW; confirm how the
  # optimizer factory resolves this string.
  optim: adamw_torch_fused
  lr: 0.01
  weight_decay: 0.0

# Learning-rate schedule. The key names mirror the arguments of
# torch.optim.lr_scheduler.ReduceLROnPlateau — presumably forwarded to it;
# confirm in the scheduler factory. The per-key notes below assume that.
lr_scheduler:
  lr_scheduler_type: reduce_lr_on_plateau
  mode: min  # the monitored quantity is expected to decrease
  factor: 0.1  # multiplier applied to the lr on each reduction
  patience: 10  # scheduler steps without improvement before reducing
  min_lr: 0.00001  # lower bound on the lr

# Training-loop settings for the distillation run.
trainer:  # HuggingFace Trainer-like arguments
  name: distill_attention_xent_mse
  reverse_kl: false
  # NOTE(review): presumably the weights of the MSE and cross-entropy loss
  # terms named in `name` above — confirm against the trainer code.
  mse_factor: 1000
  xent_factor: 1

  bf16: true
  train_split: train
  val_split: validation
  num_train_epochs: 2
  gradient_accumulation_steps: 8
  seed: 42
  # NOTE(review): same value as dataloader.batch_size; keep the two in sync
  # unless the trainer is known to ignore one of them.
  batch_size: 1
  load_best_model_at_end: true
  # The tracked metric is a loss, so lower is better.
  greater_is_better: false
  metric_for_best_model: distill/eval/loss
  logging_steps: 100
  evaluation_strategy: steps
  # NOTE(review): -1 presumably means "no step cap" so num_train_epochs
  # governs the run length (HuggingFace Trainer convention) — confirm.
  max_steps: -1
  eval_steps: 100
  max_eval_batches: null