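# nanotron DoReMi training configuration.
# Judging by the run and checkpoint names below, this run trains the 2.8B
# LLaMA-style reference model on The Pile with domain weights tuned by a
# 100k-step proxy run.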
checkpoints:
  checkpoint_interval: 5000
  checkpoints_path: /fsx/phuc/checkpoints/doremi/big-run-02/reference-2.8b-llama-tuned-weights_with_100k_proxy
  checkpoints_path_is_shared_file_system: true
  resume_checkpoint_path: null
  save_initial_state: false
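# With tokens.train_steps = 70000 (below), saving every 5000 steps yields 14 checkpoints.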
data:
  dataset:
    dataset_overwrite_cache: false
    dataset_processing_num_proc_per_process: 1
    hf_dataset_config_name: null
    hf_dataset_or_datasets: /fsx/phuc/project_data/doremi/datasets/the_pile_raw/tokenized_data/train
    hf_dataset_splits: train
    text_column_name: text
  num_loading_workers: 1
  seed: 42
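# The dataset path above points at a pre-tokenized copy of The Pile on the
# cluster's shared filesystem, so no on-the-fly tokenization is configured here.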
doremi:
  domain_names:
  - Pile-CC
  - Github
  - OpenWebText2
  - StackExchange
  - Wikipedia (en)
  - PubMed Abstracts
  - USPTO Backgrounds
  - FreeLaw
  - PubMed Central
  - Enron Emails
  - HackerNews
  - NIH ExPorter
  - Books3
  - ArXiv
  - DM Mathematics
  - OpenSubtitles
  - Gutenberg (PG-19)
  - Ubuntu IRC
  - BookCorpus2
  - EuroParl
  - YoutubeSubtitles
  - PhilPapers
  domain_weights:
  - 0.2333
  - 0.07
  - 0.1154
  - 0.0528
  - 0.0665
  - 0.067
  - 0.0366
  - 0.0571
  - 0.0451
  - 0.0036
  - 0.0087
  - 0.0078
  - 0.0708
  - 0.0656
  - 0.0034
  - 0.0048
  - 0.0222
  - 0.0084
  - 0.0038
  - 0.0186
  - 0.0149
  - 0.0235
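  # The 22 weights above map positionally onto domain_names (Pile-CC -> 0.2333,
  # Github -> 0.07, and so on) and sum to 0.9999, i.e. 1.0 up to rounding.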
  ref_model_checkpoint_path: null
  ref_model_resume_checkpoint_path: null
general:
  benchmark_csv_path: null
  consumed_train_samples: 35840000
  ignore_sanity_checks: true
  project: nanotron
  run: train_tuned_2.8b_model
  seed: 42
  step: 70000
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 120
  dtype: bfloat16
  init_method:
    std: 0.025
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 1
    eos_token_id: 2
    hidden_act: silu
    hidden_size: 4096
    initializer_range: 0.02
    intermediate_size: 24576
    is_llama_config: true
    max_position_embeddings: 1024
    num_attention_heads: 32
    num_hidden_layers: 6
    num_key_value_heads: 16
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 49152
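  # 32 query heads over 16 key/value heads gives grouped-query attention with
  # 2 query heads per KV head; head_dim = hidden_size / num_attention_heads
  # = 4096 / 32 = 128. Input and output embeddings are tied.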
optimizer:
  accumulate_grad_in_fp32: true
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_eps: 1.0e-08
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.0003
    lr_decay_steps: 8
    lr_decay_style: cosine
    lr_warmup_steps: 2
    lr_warmup_style: linear
    min_decay_lr: 1.0e-05
  torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
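  # Note (unverified): lr_warmup_steps (2) and lr_decay_steps (8) are literal
  # step counts, which is unusually short for a 70000-step run; this may be a
  # leftover from a smaller debug configuration.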
parallelism:
  dp: 8
  pp: 1
  pp_engine: 1f1b
  recompute_granularity: SELECTIVE
  tp: 8
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
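  # World size = dp * tp * pp = 8 * 8 * 1 = 64 GPUs; with pp = 1 the 1f1b
  # pipeline schedule runs as a single stage.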
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: gpt2
  tokenizer_revision: null
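  # Assumption: model.model_config.vocab_size (49152) differs from the stock
  # gpt2 vocabulary (50257), so this path presumably resolves to a custom
  # tokenizer rather than the upstream gpt2 one.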
tokens:
  batch_accumulation_per_replica: 1
  limit_test_batches: 0
  limit_val_batches: 8
  micro_batch_size: 64
  sequence_length: 1024
  train_steps: 70000
  val_check_interval: -1
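  # Derived batch sizes: global batch = micro_batch_size *
  # batch_accumulation_per_replica * dp = 64 * 1 * 8 = 512 samples/step,
  # i.e. 512 * 1024 = 524288 tokens/step. Over 70000 steps that is
  # 35840000 samples, matching general.consumed_train_samples, and roughly
  # 36.7B tokens in total.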