checkpoints:
  checkpoint_interval: 1000
  checkpoints_path: /fsx/phuc/new_workspace/experiments/exp57_8b_llama_1024_ctx_length_and_64_segment_length_and_100k_bs_and_global_lr_1.0e-5_and_balance_factor_lr_0.01_and_balance_factor_0_weight_decay/checkpoints
  checkpoints_path_is_shared_file_system: true
  resume_checkpoint_path: /fsx/phuc/new_workspace/experiments/infini_attention_8b_llama/exp57_8b_llama_1024_ctx_length_and_64_segment_length_and_100k_bs_and_global_lr_1.0e-5_and_balance_factor_lr_0.01_and_balance_factor_0_weight_decay/checkpoints
  save_initial_state: false
data:
  dataset:
    dataloader_type: single
    dataset_max_tokens: null
    dataset_weights:
    - 0.3
    - 0.3
    - 0.45
    - 0.15
    - 0.08
    - 0.02
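    # The six weights above map positionally onto the six datasets listed below
    # (stack long/short, fineweb long/short, project-gutenberg, OpenHermes-2-5).
    # They sum to 1.3, so they are presumably renormalized by the dataloader.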
    datasets:
    - dtype: uint32
      filename_pattern: .*.ds
      folder: s3://huggingface-llm-datasets/stack_full_v21-8k/tokenized-llama3/long/
      skip_tokens: 0
    - dtype: uint32
      filename_pattern: .*.ds
      folder: s3://huggingface-llm-datasets/stack_full_v21-8k/tokenized-llama3/short/
      skip_tokens: 0
    - dtype: uint32
      filename_pattern: .*.ds
      folder: s3://huggingface-llm-datasets/fineweb-v1-8k/tokenized-llama3/long/CC-MAIN-2024-10
      skip_tokens: 0
    - dtype: uint32
      filename_pattern: .*.ds
      folder: s3://huggingface-llm-datasets/fineweb-v1-8k/tokenized-llama3/short/CC-MAIN-2024-10
      skip_tokens: 0
    - dtype: uint32
      filename_pattern: .*.ds
      folder: s3://huggingface-llm-datasets/project-gutenberg/tokenized-llama3/
      skip_tokens: 0
    - dtype: uint32
      filename_pattern: .*.ds
      folder: s3://huggingface-llm-datasets/OpenHermes-2-5/tokenized-llama3
      skip_tokens: 0
    pad_samples_to_global_batch_size: false
    skip_in_stream: true
  num_loading_workers: 0
  seed: 42
data_stages: null
experiment_logger:
  tensorboard_logger:
    flush_secs: 30
    tensorboard_dir: /fsx/phuc/project_data/infini_attention/tb_logs
  wandb_logger:
    wandb_entity: null
    wandb_project: infini_attention_8b_llama
general:
  benchmark_csv_path: null
  consumed_train_samples: 1920000
  ignore_sanity_checks: true
  project: infini_attention_8b_llama
  run: exp57_8b_llama_1024_ctx_length_and_64_segment_length_and_100k_bs_and_global_lr_1.0e-5_and_balance_factor_lr_0.01_and_balance_factor_0_weight_decay
  seed: 42
  step: 20000
infini_attention:
  balance_act_type: orig_sigmoid
  balance_factor_lr: 0.01
  balance_factor_weight_decay: 0.0
  balance_init_type: zeros
  log_grad: false
  log_segment_acts: false
  logging: true
  logging_interval: 250
  segment_length: 64
  turn_on_memory: true
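  # With tokens.sequence_length = 1024 (see the tokens block at the end of this
  # file), segment_length = 64 presumably splits each sequence into
  # 1024 / 64 = 16 segments that pass through the compressive memory in turn.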
kill_switch_path: null
lighteval: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    path: /fsx/phuc/projects/infini-attention/llama3-ckps/haojun-8b-llama-nanotron-ckp/NanotronLlama3-8B
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 128000
    eos_token_id: 128001
    hidden_act: silu
    hidden_size: 4096
    initializer_range: 0.02
    intermediate_size: 14336
    is_llama_config: true
    max_position_embeddings: 8192
    num_attention_heads: 32
    num_hidden_layers: 32
    num_key_value_heads: 8
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_interleaved: false
    rope_scaling: null
    rope_theta: 500000.0
    tie_word_embeddings: false
    use_cache: true
    vocab_size: 128256
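    # These dimensions match the standard Llama-3-8B architecture (32 layers,
    # hidden size 4096, GQA with 32 query heads and 8 key/value heads,
    # 128,256-entry vocabulary); weights are presumably initialized from the
    # NanotronLlama3-8B checkpoint given in init_method.path above.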
optimizer:
  accumulate_grad_in_fp32: false
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_eps: 1.0e-08
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 1.0e-05
    lr_decay_starting_step: null
    lr_decay_steps: 23500
    lr_decay_style: cosine
    lr_warmup_steps: 1500
    lr_warmup_style: linear
    min_decay_lr: 1.0e-06
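    # Warmup is linear over the first 1,500 steps; with lr_decay_starting_step
    # null, the cosine decay presumably begins right after warmup and runs for
    # 23,500 steps, i.e. 1,500 + 23,500 = 25,000 = train_steps, bottoming out
    # at min_decay_lr = 1.0e-06.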
  torch_adam_is_fused: true
  weight_decay: 0.1
  zero_stage: 0
parallelism:
  dp: 6
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  tp: 4
  tp_linear_async_communication: false
  tp_mode: ALL_REDUCE
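  # World size is presumably dp * tp * pp = 6 * 4 * 1 = 24 processes
  # (e.g. 3 nodes of 8 GPUs), with tensor parallelism in plain ALL_REDUCE mode
  # and no pipeline parallelism.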
profiler: null
s3_upload:
  remove_after_upload: true
  s5cmd_concurrency: 5
  s5cmd_numworkers: 16
  s5cmd_path: null
  upload_s3_path: s3://phuc-experiments/infini-attention/8b-llama/exp57_8b_llama_1024_ctx_length_and_64_segment_length_and_100k_bs_and_global_lr_1.0e-5_and_balance_factor_lr_0.01_and_balance_factor_0_weight_decay
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: /fsx/haojun/lighteval_evaluation_model/NanotronLlama3-8B
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 1
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 16
  sequence_length: 1024
  train_steps: 25000
  val_check_interval: -1
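  # Global batch per step is presumably
  # micro_batch_size * batch_accumulation_per_replica * dp = 16 * 1 * 6 = 96
  # sequences of 1,024 tokens, i.e. 98,304 tokens per step (the "100k_bs" in
  # the run name). This is consistent with general.consumed_train_samples =
  # 1,920,000 at general.step = 20,000 (1,920,000 / 20,000 = 96 samples/step).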