defaults:
  - .@model.encoder: megatron_model_ul2base_config
  - .@model.decoder: megatron_model_ul2base_config

name: megatron_ul2
restore_from_path: null # used when starting from a .nemo file

trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  precision: 16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
  max_steps: 524288 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
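  # Rough sample budget, a back-of-the-envelope check assuming each step consumes the full
  # global_batch_size of 2080 set under model below: 524288 steps * 2080 samples/step ~= 1.09e9 samples.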
  log_every_n_steps: 100
  val_check_interval: 1000
  limit_val_batches: 30
  limit_test_batches: 500
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0

exp_manager:
  explicit_log_dir: null
  exp_dir: /project/scratch/p200097/nemo_experiments/
  name: megatron.ul2-base-nl36.unigram-64k-pretok-small_data.all-clean
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False # saves a .nemo file during validation; not implemented for model-parallel training
    filename: '${name}--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
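    # With tensor_model_parallel_size 1 and pipeline_model_parallel_size 1 (set under model below),
    # the multiply resolver above evaluates to model_parallel_size = 1 * 1 = 1.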

model:
  # model parallelism 
  micro_batch_size: 10
  # Sizing example (these numbers do not match the current micro_batch_size / global_batch_size below):
  #   4 GPUs * 24 nodes = 96 GPUs
  #   96 GPUs * micro_batch_size 7 = 672 samples per step
  #   672 * 3 = 2016 global batch size
  global_batch_size: 2080 # will use more micro batches to reach global batch size
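  # Consistency note (the data-parallel world size is not fixed by this file): global_batch_size
  # should be a multiple of micro_batch_size * data_parallel_size. With micro_batch_size 10,
  # 2080 / 10 = 208 micro batches per global step, split between data-parallel ranks and gradient
  # accumulation (for example 104 data-parallel GPUs * 2 accumulation steps, purely hypothetical).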
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  resume_from_checkpoint: null # manually set the checkpoint file to load from
  pipeline_model_parallel_split_rank: 0 # rank at which decoder starts.

  # model architecture
  make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.

  megatron_amp_O2: False # use O2-style mixed precision instead of native AMP's on-the-fly weight autocasting
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
  gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

  seq_length: 512
  max_position_embeddings: ${.seq_length}
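  # ${.seq_length} is a relative OmegaConf interpolation to model.seq_length above, so
  # max_position_embeddings resolves to 512 here.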


  tokenizer:
    library: 'huggingface'
    type: 'KBLab/unigram-64k-pretok-small_data-tokenizer'
    model: null
    vocab_file: null
    merge_file: null
    num_sentinel_tokens: 256
    sentencepiece_legacy: True # Legacy=True allows you to add special tokens to sentencepiece tokenizers.
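    # Informal note: num_sentinel_tokens adds T5-style sentinel/mask tokens (used as span-corruption
    # targets) on top of the Hugging Face tokenizer's vocabulary.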

  # tokenizer:
  #   library: 'megatron'
  #   type: 'BertWordPieceCase'
  #   model: null
  #   vocab_file: null
  #   merge_file: null
  #   num_sentinel_tokens: 100
  #   sentencepiece_legacy: True # Legacy=True allows you to add special tokens to sentencepiece tokenizers.

  # weight init
  embedding_init_method_std: 0.02 # Standard deviation of the zero-mean normal distribution used for weight initialization.

  # embedding dropout
  embedding_dropout: 0.1

  # embedding sharing
  share_token_embeddings: True # If True share encoder/decoder embeddings
  share_decoder_tokens_head_embeddings: True # If True share decoder embeddings and decoder projection to logits

  # token head
  tokens_head_bias: False

  # precision
  native_amp_init_scale: 4294967296 # 2 ** 32
  native_amp_growth_interval: 1000
  fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

  # miscellaneous
  seed: 1234
  use_cpu_initialization: False # Init weights on the CPU (slow for large models)
  apex_transformer_log_level: 30 # Python logging level; only messages with severity greater than or equal to this value are shown

  data:
    # Path to data must be specified by the user.
    # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-t5_00_text_document,.5,/raid/data/pile/my-t5_01_text_document]",
    # Or see example below:
    # data_prefix: 
    #   - .5
    #   - /raid/data/pile/my-t5_00_text_document
    #   - .5
    #   - /raid/data/pile/my-t5_01_text_document
    data_prefix:
       - 0.005
       - /project/scratch/p200097/data/unigram-64k-pretok-small_data/wikipedia-unigram-64k-pretok-small_data_text_sentence 
       - 0.035
       - /project/scratch/p200097/data/unigram-64k-pretok-small_data/edepos_html-unigram-64k-pretok-small_data_text_sentence 
       - 0.030
       - /project/scratch/p200097/data/unigram-64k-pretok-small_data/oscar-unigram-64k-pretok-small_data_text_sentence 
       - 0.105
       - /project/scratch/p200097/data/unigram-64k-pretok-small_data/kw3-2017-unigram-64k-pretok-small_data_text_sentence 
       - 0.177
       - /project/scratch/p200097/data/unigram-64k-pretok-small_data/issues-unigram-64k-pretok-small_data_text_sentence 
       - 0.648
       - /project/scratch/p200097/data/unigram-64k-pretok-small_data/mc4-unigram-64k-pretok-small_data_text_sentence 
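    # The sampling weights above sum to 1.0 (0.005 + 0.035 + 0.030 + 0.105 + 0.177 + 0.648 = 1.000).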
    index_mapping_dir: /project/scratch/p200097/data/unigram-64k-pretok-small_data/npy_files_ul2/ # path to save index mapping .npy files, by default will save in the same location as data_prefix
    data_impl: mmap
    # data_impl_kwargs: # currently used only for text_mmap, csv_mmap (should be data_impl dependant)
    #     # defaults for text_memmap
    #     newline_int: 10 # byte-value of newline (Use ord('\n') to get value)
    #     header_lines: 0 # skip first N header lines
    #     workers: null # number of workers when creating missing index files (null defaults to cpu_num // 2)
    #     sort_dataset_paths: False # if True datasets will be sorted by name
    #     # defaults for csv_memmap
    #     newline_int: 10 # byte-value of newline
    #     header_lines: 1 # skip first N header lines
    #     workers: null # number of workers when creating missing index files (null defaults to cpu_num // 2)
    #     sort_dataset_paths: False # if True datasets will be sorted by name
    #     data_col: 1 # column to use for data
    #     data_sep: ',' # string to split text into columns
    splits_string: 996,2,2
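    # The split weights 996,2,2 are relative, i.e. roughly 99.6% train, 0.2% validation, 0.2% test.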
    seq_length: ${model.seq_length}
    seq_length_dec: ${model.seq_length}
    skip_warmup: True
    num_workers: 32
    dataloader_type: single # cyclic
    masked_lm_prob: 0.15
    extreme_masked_lm_prob: 0.5
    dataset_type: 'ul2'
    short_seq_prob: 0.0
    max_ngram_size: 10
    extreme_max_ngram_size: 128
    extreme_min_ngram_size: 32
    extreme_mean_ngram_size: 64
    ngram_span_length_distribution: 'geometric'
    extreme_ngram_span_length_distribution: 'truncated_normal'
    prefix_lm_pivot_mean: 0.25
    mean_ngram_size: 3
    permutation: False
    whole_word_masking: True
    favor_longer_ngrams: False
    respect_document_boundaries: True # If true, a single training example cannot cross document boundaries, which increases the fraction of <pad> tokens within a batch.
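    # Informal mapping to the UL2 denoiser mix: masked_lm_prob / mean_ngram_size / max_ngram_size
    # parameterize the regular span-corruption (R) denoising, the extreme_* settings the extreme (X)
    # denoising, and prefix_lm_pivot_mean the sequential prefix-LM (S) denoising.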

  optim:
    name: fused_adam
    lr: 0.001
    weight_decay: 0.01 
    betas: 
    - 0.9
    - 0.999
    eps: 1e-8
    sched:
      name: CosineAnnealing
      warmup_steps: 1600
      constant_steps: 30000 #40000
      min_lr: 5e-6
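    # Resulting schedule (sketch, assuming NeMo's CosineAnnealing semantics): linear warmup from 0 to
    # lr 0.001 over the first 1600 steps, cosine decay towards min_lr 5e-6, with the final
    # constant_steps of training held at min_lr.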

  # optim:
  #   name: fused_adam
  #   lr: 0.0001
  #   betas:
  #     - 0.9
  #     - 0.999
  #   eps: 1e-8
  #   weight_decay: 0.01
  #   sched:
  #     name: WarmupAnnealing
  #     min_lr: 0.00001
  #     last_epoch: -1
  #     warmup_ratio: 0.005