pere committed
Commit: 00a518f
1 Parent(s): a318b46

Saving weights and logs of step 2500

.create_assets.py.un~ ADDED
Binary file (523 Bytes)
 
.run_mlm_flax.py.un~ ADDED
Binary file (2.47 kB)
 
.run_train.sh.un~ ADDED
Binary file (11.5 kB)
 
config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "_name_or_path": "./",
+   "architectures": [
+     "BertForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.47.0",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 50000
+ }
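This configuration describes a BERT-large-sized encoder (24 layers, hidden size 1024, 16 attention heads, intermediate size 4096) with a 50,000-token vocabulary. As a rough sketch, assuming a local clone of this repository at "./" (the path is an assumption, not part of the commit), it can be instantiated from the config alone; loading the trained checkpoint instead goes through from_pretrained on the full repository:

from transformers import AutoConfig, FlaxAutoModelForMaskedLM

# Assumed local path to a clone of this repository.
config = AutoConfig.from_pretrained("./")
# Build a freshly initialized Flax model from the config alone (random weights).
model = FlaxAutoModelForMaskedLM.from_config(config)
print(config.num_hidden_layers, config.hidden_size, config.vocab_size)  # 24 1024 50000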
create_assets.py ADDED
@@ -0,0 +1,21 @@
+ import argparse
+ from transformers import AutoConfig, AutoTokenizer
+ 
+ def save_model_assets(model_name, output_dir):
+     # Load the configuration
+     config = AutoConfig.from_pretrained(model_name)
+     config.save_pretrained(output_dir)
+ 
+     # Load the tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     tokenizer.save_pretrained(output_dir)
+ 
+     print(f"Configuration and tokenizer saved to {output_dir}")
+ 
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Save model config and tokenizer locally")
+     parser.add_argument("--model_name", type=str, required=True, help="Name of the model to load (e.g., NbAiLab/nb-bert-large)")
+     parser.add_argument("--output_dir", type=str, required=True, help="Directory to save the config and tokenizer")
+ 
+     args = parser.parse_args()
+     save_model_assets(args.model_name, args.output_dir)
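A possible invocation of the helper above, mirroring the model name suggested in its --model_name help text (the output directory "./" is an arbitrary choice):

# Shell equivalent: python create_assets.py --model_name NbAiLab/nb-bert-large --output_dir ./
from create_assets import save_model_assets

save_model_assets("NbAiLab/nb-bert-large", "./")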
events.out.tfevents.1734602192.t1v-n-53cd541d-w-35.1553625.0.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:da3145db309994fabc86f86df1bbb9a22a6559e0afa8adba4c64853ff5764699
+ size 78
events.out.tfevents.1734605149.t1v-n-53cd541d-w-35.1557406.0.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f9b6a6062b34228172b8e9f381780e3311166e41a356ad3a66a76cb6639c5acd
+ size 294241
events.out.tfevents.1734610378.t1v-n-53cd541d-w-35.1564529.0.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:58bcb913e35d434556fc2419996aac101e8bb551fb75652beb71d58a46fae63b
+ size 294241
events.out.tfevents.1734629510.t1v-n-53cd541d-w-35.1584436.0.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:359bf9c89711824e941e0a89509fbfe49e83241c58974256f5a9d718bf606e09
+ size 294241
events.out.tfevents.1734648786.t1v-n-53cd541d-w-35.1603548.0.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d58033531407f7903cc63667e3b0d47163abb8a54244aebf9a8de5ec83bea898
+ size 294241
events.out.tfevents.1734685611.t1v-n-53cd541d-w-35.1631410.0.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f26c3dd650772a3f6f43fc874febe037f9567ac0d5c2e5b69a140de38bc5502c
+ size 78
events.out.tfevents.1734728676.t1v-n-53cd541d-w-35.1662130.0.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a71708bc299bd26899f35a8d0898a5986c254151eed5cc751880a1f28b7a3d42
+ size 78
events.out.tfevents.1734729693.t1v-n-53cd541d-w-35.1664337.0.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8b5263c6d542fe9d958c7ec3d803b89f8d87e1ea6c31c769e78b839d079c7c16
+ size 367810
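The tfevents entries above (and flax_model.msgpack below) are stored through Git LFS, so the repository itself only tracks a three-line pointer: the spec version, the SHA-256 of the payload, and its size in bytes. A small sketch of reading such a pointer, assuming the pointer text rather than the resolved binary is what is checked out:

def parse_lfs_pointer(path):
    # Pointer files are plain text: "version ...", "oid sha256:<hex>", "size <bytes>".
    fields = dict(line.split(" ", 1) for line in open(path).read().splitlines() if line)
    return fields["oid"].removeprefix("sha256:"), int(fields["size"])

oid, size = parse_lfs_pointer("events.out.tfevents.1734729693.t1v-n-53cd541d-w-35.1664337.0.v2")
print(oid[:8], size)  # 8b5263c6 367810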
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cdc1e8a384bd49a0e2a36ddf26c4fd2b5f93d9270a0dc413d19302e40840c0d7
+ size 1420571729
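flax_model.msgpack (about 1.4 GB) holds the Flax weights written by model.save_pretrained in the training script below. A loading sketch, assuming a local clone at "./" with the LFS file resolved; the on-the-fly Flax-to-PyTorch conversion via from_flax=True requires both flax and torch to be installed:

from transformers import BertForMaskedLM, FlaxBertForMaskedLM

flax_model = FlaxBertForMaskedLM.from_pretrained("./")            # load the Flax checkpoint directly
pt_model = BertForMaskedLM.from_pretrained("./", from_flax=True)  # or convert to PyTorch weights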
run_mlm_flax.py ADDED
@@ -0,0 +1,935 @@
+ #!/usr/bin/env python
+ # coding=utf-8
+ # Copyright 2021 The HuggingFace Team All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """
+ Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) with whole word masking on a
+ text file or a dataset.
+
+ Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
+ https://huggingface.co/models?filter=fill-mask
+ """
+
+ import json
+ import logging
+ import math
+ import os
+ import sys
+ import time
+ from dataclasses import asdict, dataclass, field
+ from enum import Enum
+ from itertools import chain
+
+ # You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
+ from pathlib import Path
+ from typing import Dict, List, Optional, Tuple
+
+ import flax
+ import jax
+ import jax.numpy as jnp
+ import numpy as np
+ import optax
+ from datasets import load_dataset
+ from flax import jax_utils, traverse_util
+ from flax.jax_utils import pad_shard_unpad
+ from flax.training import train_state
+ from flax.training.common_utils import get_metrics, onehot, shard
+ from huggingface_hub import HfApi
+ from tqdm import tqdm
+
+ from transformers import (
+     CONFIG_MAPPING,
+     FLAX_MODEL_FOR_MASKED_LM_MAPPING,
+     AutoConfig,
+     AutoTokenizer,
+     FlaxAutoModelForMaskedLM,
+     HfArgumentParser,
+     PreTrainedTokenizerBase,
+     TensorType,
+     is_tensorboard_available,
+     set_seed,
+ )
+ from transformers.utils import send_example_telemetry
+
+
+ MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
+ MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+ @dataclass
+ class TrainingArguments:
+     output_dir: str = field(
+         metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
+     )
+     overwrite_output_dir: bool = field(
+         default=False,
+         metadata={
+             "help": (
+                 "Overwrite the content of the output directory. "
+                 "Use this to continue training if output_dir points to a checkpoint directory."
+             )
+         },
+     )
+     do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
+     do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
+     per_device_train_batch_size: int = field(
+         default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."}
+     )
+     per_device_eval_batch_size: int = field(
+         default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
+     )
+     learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."})
+     weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."})
+     adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"})
+     adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"})
+     adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."})
+     adafactor: bool = field(default=False, metadata={"help": "Whether or not to replace AdamW by Adafactor."})
+     num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."})
+     warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
+     logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
+     save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
+     eval_steps: int = field(default=None, metadata={"help": "Run an evaluation every X steps."})
+     seed: int = field(default=42, metadata={"help": "Random seed that will be set at the beginning of training."})
+     push_to_hub: bool = field(
+         default=False, metadata={"help": "Whether or not to upload the trained model to the model hub after training."}
+     )
+     hub_model_id: str = field(
+         default=None, metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."}
+     )
+     hub_token: str = field(default=None, metadata={"help": "The token to use to push to the Model Hub."})
+     gradient_checkpointing: bool = field(
+         default=False,
+         metadata={
+             "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
+         },
+     )
+
+     def __post_init__(self):
+         if self.output_dir is not None:
+             self.output_dir = os.path.expanduser(self.output_dir)
+
+     def to_dict(self):
+         """
+         Serializes this instance while replace `Enum` by their values (for JSON serialization support). It obfuscates
+         the token values by removing their value.
+         """
+         d = asdict(self)
+         for k, v in d.items():
+             if isinstance(v, Enum):
+                 d[k] = v.value
+             if isinstance(v, list) and len(v) > 0 and isinstance(v[0], Enum):
+                 d[k] = [x.value for x in v]
+             if k.endswith("_token"):
+                 d[k] = f"<{k.upper()}>"
+         return d
+
+
+ @dataclass
+ class ModelArguments:
+     """
+     Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
+     """
+
+     model_name_or_path: Optional[str] = field(
+         default=None,
+         metadata={
+             "help": (
+                 "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch."
+             )
+         },
+     )
+     model_type: Optional[str] = field(
+         default=None,
+         metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+     )
+     config_name: Optional[str] = field(
+         default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+     )
+     tokenizer_name: Optional[str] = field(
+         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+     )
+     cache_dir: Optional[str] = field(
+         default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+     )
+     use_fast_tokenizer: bool = field(
+         default=True,
+         metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+     )
+     dtype: Optional[str] = field(
+         default="float32",
+         metadata={
+             "help": (
+                 "Floating-point format in which the model weights should be initialized and trained. Choose one of"
+                 " `[float32, float16, bfloat16]`."
+             )
+         },
+     )
+     token: str = field(
+         default=None,
+         metadata={
+             "help": (
+                 "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
+                 "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
+             )
+         },
+     )
+     trust_remote_code: bool = field(
+         default=False,
+         metadata={
+             "help": (
+                 "Whether to trust the execution of code from datasets/models defined on the Hub."
+                 " This option should only be set to `True` for repositories you trust and in which you have read the"
+                 " code, as it will execute code present on the Hub on your local machine."
+             )
+         },
+     )
+
+
+ @dataclass
+ class DataTrainingArguments:
+     """
+     Arguments pertaining to what data we are going to input our model for training and eval.
+     """
+
+     dataset_name: Optional[str] = field(
+         default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
+     )
+     dataset_config_name: Optional[str] = field(
+         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+     )
+     train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
+     validation_file: Optional[str] = field(
+         default=None,
+         metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+     )
+     train_ref_file: Optional[str] = field(
+         default=None,
+         metadata={"help": "An optional input train ref data file for whole word masking in Chinese."},
+     )
+     validation_ref_file: Optional[str] = field(
+         default=None,
+         metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
+     )
+     overwrite_cache: bool = field(
+         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+     )
+     validation_split_percentage: Optional[int] = field(
+         default=5,
+         metadata={
+             "help": "The percentage of the train set used as validation set in case there's no validation split"
+         },
+     )
+     max_seq_length: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": (
+                 "The maximum total input sequence length after tokenization. Sequences longer "
+                 "than this will be truncated. Default to the max input length of the model."
+             )
+         },
+     )
+     preprocessing_num_workers: Optional[int] = field(
+         default=None,
+         metadata={"help": "The number of processes to use for the preprocessing."},
+     )
+     mlm_probability: float = field(
+         default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
+     )
+     pad_to_max_length: bool = field(
+         default=False,
+         metadata={
+             "help": (
+                 "Whether to pad all samples to `max_seq_length`. "
+                 "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+             )
+         },
+     )
+     line_by_line: bool = field(
+         default=False,
+         metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
+     )
+
+     def __post_init__(self):
+         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
+             raise ValueError("Need either a dataset name or a training/validation file.")
+         else:
+             if self.train_file is not None:
+                 extension = self.train_file.split(".")[-1]
+                 assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
+             if self.validation_file is not None:
+                 extension = self.validation_file.split(".")[-1]
+                 assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
+
+
+ @flax.struct.dataclass
+ class FlaxDataCollatorForLanguageModeling:
+     """
+     Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they
+     are not all of the same length.
+
+     Args:
+         tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+             The tokenizer used for encoding the data.
+         mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
+             The probability with which to (randomly) mask tokens in the input.
+
+     .. note::
+
+         For best performance, this data collator should be used with a dataset having items that are dictionaries or
+         BatchEncoding, with the :obj:`"special_tokens_mask"` key, as returned by a
+         :class:`~transformers.PreTrainedTokenizer` or a :class:`~transformers.PreTrainedTokenizerFast` with the
+         argument :obj:`return_special_tokens_mask=True`.
+     """
+
+     tokenizer: PreTrainedTokenizerBase
+     mlm_probability: float = 0.15
+
+     def __post_init__(self):
+         if self.tokenizer.mask_token is None:
+             raise ValueError(
+                 "This tokenizer does not have a mask token which is necessary for masked language modeling. "
+                 "You should pass `mlm=False` to train on causal language modeling instead."
+             )
+
+     def __call__(self, examples: List[Dict[str, np.ndarray]], pad_to_multiple_of: int) -> Dict[str, np.ndarray]:
+         # Handle dict or lists with proper padding and conversion to tensor.
+         batch = self.tokenizer.pad(examples, pad_to_multiple_of=pad_to_multiple_of, return_tensors=TensorType.NUMPY)
+
+         # If special token mask has been preprocessed, pop it from the dict.
+         special_tokens_mask = batch.pop("special_tokens_mask", None)
+
+         batch["input_ids"], batch["labels"] = self.mask_tokens(
+             batch["input_ids"], special_tokens_mask=special_tokens_mask
+         )
+         return batch
+
+     def mask_tokens(
+         self, inputs: np.ndarray, special_tokens_mask: Optional[np.ndarray]
+     ) -> Tuple[np.ndarray, np.ndarray]:
+         """
+         Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
+         """
+         labels = inputs.copy()
+         # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
+         probability_matrix = np.full(labels.shape, self.mlm_probability)
+         special_tokens_mask = special_tokens_mask.astype("bool")
+
+         probability_matrix[special_tokens_mask] = 0.0
+         masked_indices = np.random.binomial(1, probability_matrix).astype("bool")
+         labels[~masked_indices] = -100  # We only compute loss on masked tokens
+
+         # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
+         indices_replaced = np.random.binomial(1, np.full(labels.shape, 0.8)).astype("bool") & masked_indices
+         inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
+
+         # 10% of the time, we replace masked input tokens with random word
+         indices_random = np.random.binomial(1, np.full(labels.shape, 0.5)).astype("bool")
+         indices_random &= masked_indices & ~indices_replaced
+
+         random_words = np.random.randint(self.tokenizer.vocab_size, size=labels.shape, dtype="i4")
+         inputs[indices_random] = random_words[indices_random]
+
+         # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+         return inputs, labels
+
+
+ def generate_batch_splits(samples_idx: np.ndarray, batch_size: int, drop_last=True) -> np.ndarray:
+     """Generate batches of data for a specified batch size from sample indices. If the dataset size is not divisible by
+     the batch size and `drop_last` is `True`, the last incomplete batch is dropped. Else, it is returned."""
+     num_samples = len(samples_idx)
+     if drop_last:
+         samples_to_remove = num_samples % batch_size
+         if samples_to_remove != 0:
+             samples_idx = samples_idx[:-samples_to_remove]
+         sections_split = num_samples // batch_size
+         samples_idx = samples_idx.reshape((sections_split, batch_size))
+     else:
+         sections_split = math.ceil(num_samples / batch_size)
+         samples_idx = np.array_split(samples_idx, sections_split)
+     return samples_idx
+
+
+ def write_train_metric(summary_writer, train_metrics, train_time, step):
+     summary_writer.scalar("train_time", train_time, step)
+
+     train_metrics = get_metrics(train_metrics)
+     for key, vals in train_metrics.items():
+         tag = f"train_{key}"
+         for i, val in enumerate(vals):
+             summary_writer.scalar(tag, val, step - len(vals) + i + 1)
+
+
+ def write_eval_metric(summary_writer, eval_metrics, step):
+     for metric_name, value in eval_metrics.items():
+         summary_writer.scalar(f"eval_{metric_name}", value, step)
+
+
+ def main():
+     # See all possible arguments in src/transformers/training_args.py
+     # or by passing the --help flag to this script.
+     # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+         # If we pass only one argument to the script and it's the path to a json file,
+         # let's parse it to get our arguments.
+         model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+     else:
+         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+     # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
+     # information sent is the one passed as arguments along with your Python/PyTorch versions.
+     send_example_telemetry("run_mlm", model_args, data_args, framework="flax")
+
+     if (
+         os.path.exists(training_args.output_dir)
+         and os.listdir(training_args.output_dir)
+         and training_args.do_train
+         and not training_args.overwrite_output_dir
+     ):
+         raise ValueError(
+             f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+             "Use --overwrite_output_dir to overcome."
+         )
+
+     # Setup logging
+     logging.basicConfig(
+         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+         level=logging.INFO,
+         datefmt="[%X]",
+     )
+
+     # Log on each process the small summary:
+     logger = logging.getLogger(__name__)
+
+     # Set the verbosity to info of the Transformers logger (on main process only):
+     logger.info(f"Training/evaluation parameters {training_args}")
+
+     # Set seed before initializing model.
+     set_seed(training_args.seed)
+
+     # Handle the repository creation
+     if training_args.push_to_hub:
+         # Retrieve of infer repo_name
+         repo_name = training_args.hub_model_id
+         if repo_name is None:
+             repo_name = Path(training_args.output_dir).absolute().name
+         # Create repo and retrieve repo_id
+         api = HfApi()
+         repo_id = api.create_repo(repo_name, exist_ok=True, token=training_args.hub_token).repo_id
+
+     # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+     # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+     # (the dataset will be downloaded automatically from the datasets Hub).
+     #
+     # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+     # 'text' is found. You can easily tweak this behavior (see below).
+     #
+     # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+     # download the dataset.
+     if data_args.dataset_name is not None:
+         # Downloading and loading a dataset from the hub.
+         datasets = load_dataset(
+             data_args.dataset_name,
+             data_args.dataset_config_name,
+             cache_dir=model_args.cache_dir,
+             token=model_args.token,
+             num_proc=data_args.preprocessing_num_workers,
+             trust_remote_code=model_args.trust_remote_code,
+         )
+
+         if "validation" not in datasets.keys():
+             datasets["validation"] = load_dataset(
+                 data_args.dataset_name,
+                 data_args.dataset_config_name,
+                 split=f"train[:{data_args.validation_split_percentage}%]",
+                 cache_dir=model_args.cache_dir,
+                 token=model_args.token,
+                 num_proc=data_args.preprocessing_num_workers,
+                 trust_remote_code=model_args.trust_remote_code,
+             )
+             datasets["train"] = load_dataset(
+                 data_args.dataset_name,
+                 data_args.dataset_config_name,
+                 split=f"train[{data_args.validation_split_percentage}%:]",
+                 cache_dir=model_args.cache_dir,
+                 token=model_args.token,
+                 num_proc=data_args.preprocessing_num_workers,
+                 trust_remote_code=model_args.trust_remote_code,
+             )
+     else:
+         data_files = {}
+         if data_args.train_file is not None:
+             data_files["train"] = data_args.train_file
+             extension = data_args.train_file.split(".")[-1]
+         if data_args.validation_file is not None:
+             data_files["validation"] = data_args.validation_file
+             extension = data_args.validation_file.split(".")[-1]
+         if extension == "txt":
+             extension = "text"
+         datasets = load_dataset(
+             extension,
+             data_files=data_files,
+             cache_dir=model_args.cache_dir,
+             token=model_args.token,
+             num_proc=data_args.preprocessing_num_workers,
+         )
+
+         if "validation" not in datasets.keys():
+             datasets["validation"] = load_dataset(
+                 extension,
+                 data_files=data_files,
+                 split=f"train[:{data_args.validation_split_percentage}%]",
+                 cache_dir=model_args.cache_dir,
+                 token=model_args.token,
+                 num_proc=data_args.preprocessing_num_workers,
+             )
+             datasets["train"] = load_dataset(
+                 extension,
+                 data_files=data_files,
+                 split=f"train[{data_args.validation_split_percentage}%:]",
+                 cache_dir=model_args.cache_dir,
+                 token=model_args.token,
+                 num_proc=data_args.preprocessing_num_workers,
+             )
+     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+     # https://huggingface.co/docs/datasets/loading_datasets.
+
+     # Load pretrained model and tokenizer
+
+     # Distributed training:
+     # The .from_pretrained methods guarantee that only one local process can concurrently
+     # download model & vocab.
+     if model_args.config_name:
+         config = AutoConfig.from_pretrained(
+             model_args.config_name,
+             cache_dir=model_args.cache_dir,
+             token=model_args.token,
+             trust_remote_code=model_args.trust_remote_code,
+         )
+     elif model_args.model_name_or_path:
+         config = AutoConfig.from_pretrained(
+             model_args.model_name_or_path,
+             cache_dir=model_args.cache_dir,
+             token=model_args.token,
+             trust_remote_code=model_args.trust_remote_code,
+         )
+     else:
+         config = CONFIG_MAPPING[model_args.model_type]()
+         logger.warning("You are instantiating a new config instance from scratch.")
+
+     if model_args.tokenizer_name:
+         tokenizer = AutoTokenizer.from_pretrained(
+             model_args.tokenizer_name,
+             cache_dir=model_args.cache_dir,
+             use_fast=model_args.use_fast_tokenizer,
+             token=model_args.token,
+             trust_remote_code=model_args.trust_remote_code,
+         )
+     elif model_args.model_name_or_path:
+         tokenizer = AutoTokenizer.from_pretrained(
+             model_args.model_name_or_path,
+             cache_dir=model_args.cache_dir,
+             use_fast=model_args.use_fast_tokenizer,
+             token=model_args.token,
+             trust_remote_code=model_args.trust_remote_code,
+         )
+     else:
+         raise ValueError(
+             "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
+             "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+         )
+
+     # Preprocessing the datasets.
+     # First we tokenize all the texts.
+     if training_args.do_train:
+         column_names = datasets["train"].column_names
+     else:
+         column_names = datasets["validation"].column_names
+     text_column_name = "text" if "text" in column_names else column_names[0]
+
+     max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+     if data_args.line_by_line:
+         # When using line_by_line, we just tokenize each nonempty line.
+         padding = "max_length" if data_args.pad_to_max_length else False
+
+         def tokenize_function(examples):
+             # Remove empty lines
+             examples = [line for line in examples if len(line) > 0 and not line.isspace()]
+             return tokenizer(
+                 examples,
+                 return_special_tokens_mask=True,
+                 padding=padding,
+                 truncation=True,
+                 max_length=max_seq_length,
+             )
+
+         tokenized_datasets = datasets.map(
+             tokenize_function,
+             input_columns=[text_column_name],
+             batched=True,
+             num_proc=data_args.preprocessing_num_workers,
+             remove_columns=column_names,
+             load_from_cache_file=not data_args.overwrite_cache,
+         )
+
+     else:
+         # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+         # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
+         # efficient when it receives the `special_tokens_mask`.
+         def tokenize_function(examples):
+             return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
+
+         tokenized_datasets = datasets.map(
+             tokenize_function,
+             batched=True,
+             num_proc=data_args.preprocessing_num_workers,
+             remove_columns=column_names,
+             load_from_cache_file=not data_args.overwrite_cache,
+         )
+
+         # Main data processing function that will concatenate all texts from our dataset and generate chunks of
+         # max_seq_length.
+         def group_texts(examples):
+             # Concatenate all texts.
+             concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+             total_length = len(concatenated_examples[list(examples.keys())[0]])
+             # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+             # customize this part to your needs.
+             if total_length >= max_seq_length:
+                 total_length = (total_length // max_seq_length) * max_seq_length
+             # Split by chunks of max_len.
+             result = {
+                 k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
+                 for k, t in concatenated_examples.items()
+             }
+             return result
+
+         # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+         # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+         # might be slower to preprocess.
+         #
+         # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+         # https://huggingface.co/docs/datasets/process#map
+         tokenized_datasets = tokenized_datasets.map(
+             group_texts,
+             batched=True,
+             num_proc=data_args.preprocessing_num_workers,
+             load_from_cache_file=not data_args.overwrite_cache,
+         )
+
+     # Enable tensorboard only on the master node
+     has_tensorboard = is_tensorboard_available()
+     if has_tensorboard and jax.process_index() == 0:
+         try:
+             from flax.metrics.tensorboard import SummaryWriter
+
+             summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
+         except ImportError as ie:
+             has_tensorboard = False
+             logger.warning(
+                 f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
+             )
+     else:
+         logger.warning(
+             "Unable to display metrics through TensorBoard because the package is not installed: "
+             "Please run pip install tensorboard to enable."
+         )
+
+     # Data collator
+     # This one will take care of randomly masking the tokens.
+     data_collator = FlaxDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
+
+     # Initialize our training
+     rng = jax.random.PRNGKey(training_args.seed)
+     dropout_rngs = jax.random.split(rng, jax.local_device_count())
+
+     if model_args.model_name_or_path:
+         model = FlaxAutoModelForMaskedLM.from_pretrained(
+             model_args.model_name_or_path,
+             config=config,
+             seed=training_args.seed,
+             dtype=getattr(jnp, model_args.dtype),
+             token=model_args.token,
+             trust_remote_code=model_args.trust_remote_code,
+         )
+     else:
+         model = FlaxAutoModelForMaskedLM.from_config(
+             config,
+             seed=training_args.seed,
+             dtype=getattr(jnp, model_args.dtype),
+             trust_remote_code=model_args.trust_remote_code,
+         )
+
+     if training_args.gradient_checkpointing:
+         model.enable_gradient_checkpointing()
+
+     # Store some constant
+     num_epochs = int(training_args.num_train_epochs)
+
+     # Use local_device_count for per-process batch size
+     local_device_count = jax.local_device_count()
+
+     # Each process handles per_device_train_batch_size * local_device_count
+     train_batch_size = training_args.per_device_train_batch_size * local_device_count
+     per_device_eval_batch_size = training_args.per_device_eval_batch_size
+     eval_batch_size = per_device_eval_batch_size * local_device_count
+
+     num_train_steps = (len(tokenized_datasets["train"]) // (train_batch_size * jax.process_count())) * num_epochs
+
+     # Create learning rate schedule
+     warmup_fn = optax.linear_schedule(
+         init_value=0.0, end_value=training_args.learning_rate, transition_steps=training_args.warmup_steps
+     )
+     decay_fn = optax.linear_schedule(
+         init_value=training_args.learning_rate,
+         end_value=0,
+         transition_steps=num_train_steps - training_args.warmup_steps,
+     )
+     linear_decay_lr_schedule_fn = optax.join_schedules(
+         schedules=[warmup_fn, decay_fn], boundaries=[training_args.warmup_steps]
+     )
+
+     # We use Optax's "masking" functionality to not apply weight decay
+     # to bias and LayerNorm scale parameters. decay_mask_fn returns a
+     # mask boolean with the same structure as the parameters.
+     # The mask is True for parameters that should be decayed.
+     def decay_mask_fn(params):
+         flat_params = traverse_util.flatten_dict(params)
+         # find out all LayerNorm parameters
+         layer_norm_candidates = ["layernorm", "layer_norm", "ln"]
+         layer_norm_named_params = {
+             layer[-2:]
+             for layer_norm_name in layer_norm_candidates
+             for layer in flat_params.keys()
+             if layer_norm_name in "".join(layer).lower()
+         }
+         flat_mask = {path: (path[-1] != "bias" and path[-2:] not in layer_norm_named_params) for path in flat_params}
+         return traverse_util.unflatten_dict(flat_mask)
+
+     # create adam optimizer
+     if training_args.adafactor:
+         # We use the default parameters here to initialize adafactor,
+         # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
+         optimizer = optax.adafactor(
+             learning_rate=linear_decay_lr_schedule_fn,
+         )
+     else:
+         optimizer = optax.adamw(
+             learning_rate=linear_decay_lr_schedule_fn,
+             b1=training_args.adam_beta1,
+             b2=training_args.adam_beta2,
+             eps=training_args.adam_epsilon,
+             weight_decay=training_args.weight_decay,
+             mask=decay_mask_fn,
+         )
+
+     # Setup train state
+     state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer)
+
+     # Define gradient update step fn
+     def train_step(state, batch, dropout_rng):
+         dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
+
+         def loss_fn(params):
+             labels = batch.pop("labels")
+
+             logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
+
+             # compute loss, ignore padded input tokens
+             label_mask = jnp.where(labels > 0, 1.0, 0.0)
+             loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
+
+             # take average
+             loss = loss.sum()
+             num_labels = label_mask.sum()
+
+             return loss, num_labels
+
+         grad_fn = jax.value_and_grad(loss_fn, has_aux=True)
+         (loss, num_labels), grad = grad_fn(state.params)
+         num_labels = jax.lax.psum(num_labels, "batch")
+
+         # true loss = total loss / total samples
+         loss = jax.lax.psum(loss, "batch")
+         loss = jax.tree_util.tree_map(lambda x: x / num_labels, loss)
+
+         # true grad = total grad / total samples
+         grad = jax.lax.psum(grad, "batch")
+         grad = jax.tree_util.tree_map(lambda x: x / num_labels, grad)
+         new_state = state.apply_gradients(grads=grad)
+
+         metrics = {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}
+
+         return new_state, metrics, new_dropout_rng
+
+     # Create parallel version of the train step
+     p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
+
+     # Define eval fn
+     def eval_step(params, batch):
+         labels = batch.pop("labels")
+
+         logits = model(**batch, params=params, train=False)[0]
+
+         # compute loss, ignore padded input tokens
+         label_mask = jnp.where(labels > 0, 1.0, 0.0)
+         loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
+
+         # compute accuracy
+         accuracy = jnp.equal(jnp.argmax(logits, axis=-1), labels) * label_mask
+
+         # summarize metrics
+         metrics = {"loss": loss.sum(), "accuracy": accuracy.sum(), "normalizer": label_mask.sum()}
+         metrics = jax.lax.psum(metrics, axis_name="batch")
+
+         return metrics
+
+     p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0,))
+
+     # Replicate the train state on each device
+     state = jax_utils.replicate(state)
+
+     train_time = 0
+     epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+     for epoch in epochs:
+         # ======================== Training ================================
+         train_start = time.time()
+         train_metrics = []
+
+         # Create sampling rng
+         rng, input_rng = jax.random.split(rng)
+
+         # Generate an epoch by shuffling sampling indices from the train dataset
+         num_train_samples = len(tokenized_datasets["train"])
+
+         train_samples_idx = np.arange(num_train_samples)
+         train_samples_idx = np.random.permutation(train_samples_idx)
+         # Split the training indices across processes
+         train_samples_idx = np.array_split(train_samples_idx, jax.process_count())[jax.process_index()]
+         train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size, drop_last=True)
+
+         # Gather the indexes for creating the batch and do a training step
+         for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
+             samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
+             model_inputs = data_collator(samples, pad_to_multiple_of=16)
+
+             # Model forward
+             model_inputs = shard(model_inputs.data)
+             state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
+             train_metrics.append(train_metric)
+
+             cur_step = epoch * (num_train_samples // (train_batch_size * jax.process_count())) + step
+
+             if cur_step % training_args.logging_steps == 0 and cur_step > 0:
+                 # Save metrics
+                 train_metric = jax_utils.unreplicate(train_metric)
+                 train_time += time.time() - train_start
+                 if has_tensorboard and jax.process_index() == 0:
+                     write_train_metric(summary_writer, train_metrics, train_time, cur_step)
+
+                 epochs.write(
+                     f"Step... ({cur_step} | Loss: {train_metric['loss']}, Learning Rate:"
+                     f" {train_metric['learning_rate']})"
+                 )
+
+                 train_metrics = []
+
+             if cur_step % training_args.eval_steps == 0 and cur_step > 0:
+                 # ======================== Evaluating ==============================
+                 num_eval_samples = len(tokenized_datasets["validation"])
+                 # Avoid using jax.numpy here in case of TPU training
+                 eval_samples_idx = np.arange(num_eval_samples)
+                 # Shard the eval dataset by process as well
+                 eval_samples_idx = np.array_split(eval_samples_idx, jax.process_count())[jax.process_index()]
+                 eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size, drop_last=False)
+
+                 eval_metrics = []
+                 for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+                     samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
+                     model_inputs = data_collator(samples, pad_to_multiple_of=16)
+
+                     # Model forward
+                     metrics = pad_shard_unpad(p_eval_step, static_return=True)(
+                         state.params, model_inputs.data, min_device_batch=per_device_eval_batch_size
+                     )
+                     eval_metrics.append(metrics)
+
+                 # normalize eval metrics
+                 eval_metrics = get_metrics(eval_metrics)
+                 eval_metrics = jax.tree_util.tree_map(jnp.sum, eval_metrics)
+                 eval_normalizer = eval_metrics.pop("normalizer")
+                 eval_metrics = jax.tree_util.tree_map(lambda x: x / eval_normalizer, eval_metrics)
+
+                 # Update progress bar
+                 epochs.desc = f"Step... ({cur_step} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})"
+
+                 # Save metrics
+                 if has_tensorboard and jax.process_index() == 0:
+                     write_eval_metric(summary_writer, eval_metrics, cur_step)
+
+             if cur_step % training_args.save_steps == 0 and cur_step > 0:
+                 # save checkpoint after each epoch and push checkpoint to the hub
+                 if jax.process_index() == 0:
+                     params = jax.device_get(jax.tree_util.tree_map(lambda x: x[0], state.params))
+                     model.save_pretrained(training_args.output_dir, params=params)
+                     tokenizer.save_pretrained(training_args.output_dir)
+                     if training_args.push_to_hub:
+                         api.upload_folder(
+                             commit_message=f"Saving weights and logs of step {cur_step}",
+                             folder_path=training_args.output_dir,
+                             repo_id=repo_id,
+                             repo_type="model",
+                             token=training_args.hub_token,
+                         )
+     # Eval after training
+     if training_args.do_eval:
+         num_eval_samples = len(tokenized_datasets["validation"])
+         # Avoid using jax.numpy here in case of TPU training
+         eval_samples_idx = np.arange(num_eval_samples)
+         eval_samples_idx = np.array_split(eval_samples_idx, jax.process_count())[jax.process_index()]
+         eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size, drop_last=False)
+
+         eval_metrics = []
+         for _, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+             samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
+             model_inputs = data_collator(samples, pad_to_multiple_of=16)
+
+             # Model forward
+             metrics = pad_shard_unpad(p_eval_step, static_return=True)(
+                 state.params, model_inputs.data, min_device_batch=per_device_eval_batch_size
+             )
+             eval_metrics.append(metrics)
+
+         # normalize eval metrics
+         eval_metrics = get_metrics(eval_metrics)
+         eval_metrics = jax.tree_util.tree_map(lambda metric: jnp.sum(metric).item(), eval_metrics)
+         eval_normalizer = eval_metrics.pop("normalizer")
+         eval_metrics = jax.tree_util.tree_map(lambda x: x / eval_normalizer, eval_metrics)
+
+         try:
+             perplexity = math.exp(eval_metrics["loss"])
+         except OverflowError:
+             perplexity = float("inf")
+         eval_metrics["perplexity"] = perplexity
+
+         if jax.process_index() == 0:
+             eval_metrics = {f"eval_{metric_name}": value for metric_name, value in eval_metrics.items()}
+             path = os.path.join(training_args.output_dir, "eval_results.json")
+             with open(path, "w") as f:
+                 json.dump(eval_metrics, f, indent=4, sort_keys=True)
+
+
+ if __name__ == "__main__":
+     main()
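For reference, the masking rule implemented by mask_tokens above selects roughly mlm_probability of the non-special positions, replaces 80% of them with [MASK], 10% with a random token, and leaves 10% unchanged; the loss is computed only at the selected positions. A standalone NumPy sketch of that logic on made-up token ids (the mask id 4 and the id range 5-99 are assumptions, not this repository's real tokenizer):

import numpy as np

rng = np.random.default_rng(0)
mlm_probability = 0.15

inputs = rng.integers(5, 100, size=(2, 8)).astype("i4")  # toy token ids
special = np.zeros_like(inputs, dtype=bool)              # pretend no special tokens
labels = inputs.copy()

masked = rng.binomial(1, np.full(labels.shape, mlm_probability)).astype(bool) & ~special
labels[~masked] = -100                                   # loss is only computed on masked positions

replaced = rng.binomial(1, np.full(labels.shape, 0.8)).astype(bool) & masked
inputs[replaced] = 4                                     # stand-in for tokenizer.mask_token_id

randomized = rng.binomial(1, np.full(labels.shape, 0.5)).astype(bool) & masked & ~replaced
inputs[randomized] = rng.integers(5, 100, size=labels.shape)[randomized]
# the remaining masked positions keep their original token (the "10% unchanged" case)
print(inputs)
print(labels)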
run_train.sh ADDED
@@ -0,0 +1,24 @@
+ python run_mlm_flax.py \
+     --output_dir="./" \
+     --model_type="bert" \
+     --hub_model_id="norwegian-bert-large-newcode" \
+     --config_name="./" \
+     --tokenizer_name="./" \
+     --dataset_name="oscar" \
+     --dataset_config_name="unshuffled_deduplicated_no" \
+     --max_seq_length="128" \
+     --weight_decay="0.01" \
+     --per_device_train_batch_size="8" \
+     --per_device_eval_batch_size="2" \
+     --learning_rate="3e-4" \
+     --warmup_steps="1000" \
+     --overwrite_output_dir \
+     --num_train_epochs="20" \
+     --adam_beta1="0.9" \
+     --adam_beta2="0.98" \
+     --logging_steps="500" \
+     --save_steps="2500" \
+     --eval_steps="100000000" \
+     --dtype="bfloat16" \
+     --trust_remote_code \
+     --push_to_hub
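With these flags, each host processes per_device_train_batch_size times its local device count examples per step (64 with 8 local devices per host, which the TPU worker names in the event files suggest but the script does not record), the learning rate warms up linearly over 1,000 steps and then decays linearly to zero, a checkpoint is pushed every 2,500 steps (matching this commit's message), and --eval_steps is set high enough that in-training evaluation effectively never runs. A small sketch of the resulting schedule, using the same optax calls as the training script; num_train_steps is a placeholder, since the script derives it from the tokenized dataset size:

import optax

learning_rate, warmup_steps = 3e-4, 1_000
num_train_steps = 100_000  # placeholder value

schedule = optax.join_schedules(
    schedules=[
        optax.linear_schedule(0.0, learning_rate, warmup_steps),
        optax.linear_schedule(learning_rate, 0.0, num_train_steps - warmup_steps),
    ],
    boundaries=[warmup_steps],
)
print(schedule(0), schedule(warmup_steps), schedule(num_train_steps))  # 0.0 -> 3e-4 -> 0.0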
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
+ {
+   "added_tokens_decoder": {
+     "501": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "502": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "503": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "504": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "505": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": false,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
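Together with special_tokens_map.json above, this declares a lower-casing BertTokenizer whose special tokens sit at ids 501-505 of the 50,000-entry vocabulary. A quick inspection sketch, assuming a local clone of the repository at "./"; the sample sentence is an arbitrary Norwegian example:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./")
enc = tok("Dette er en test.")
print(enc.input_ids)                                          # starts with 501 ([CLS]) and ends with 504 ([SEP])
print(tok.mask_token_id, tok.pad_token_id, tok.unk_token_id)  # 502 503 505, per added_tokens_decoder above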
vocab.txt ADDED
The diff for this file is too large to render. See raw diff