pere committed on
Commit
5bedc03
1 Parent(s): a25a352

main scripts ready for training. No tokenizer yet.

__pycache__/t5_tokenizer_model.cpython-38.pyc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5fe110f60beb470a8651b6f08ad0fa70ed76f09433a9952520db00965485dbc
+ size 3425
config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "_name_or_path": "/home/patrick/hugging_face/t5/t5-v1_1-base",
+   "architectures": [
+     "T5ForConditionalGeneration"
+   ],
+   "d_ff": 2048,
+   "d_kv": 64,
+   "d_model": 768,
+   "decoder_start_token_id": 0,
+   "dropout_rate": 0.1,
+   "eos_token_id": 1,
+   "feed_forward_proj": "gated-gelu",
+   "gradient_checkpointing": false,
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "t5",
+   "num_decoder_layers": 12,
+   "num_heads": 12,
+   "num_layers": 12,
+   "output_past": true,
+   "pad_token_id": 0,
+   "relative_attention_num_buckets": 32,
+   "tie_word_embeddings": false,
+   "transformers_version": "4.10.0.dev0",
+   "use_cache": true,
+   "vocab_size": 50000
+ }
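
The config mirrors the t5-v1_1-base architecture (gated-GeLU feed-forward, untied embeddings), with vocab_size set to 50000 to match the tokenizer trained below. A minimal sketch of sanity-checking it with the transformers Python API, assuming the repository root is the working directory (as the "./" paths in run.sh imply); this is not part of the commit:

# Sketch only: load the committed config and instantiate a freshly
# initialized model from it (no pretrained weights involved).
from transformers import T5Config, T5ForConditionalGeneration

config = T5Config.from_pretrained("./")      # reads ./config.json
model = T5ForConditionalGeneration(config)   # random init, as expected for pretraining from scratch

print(config.vocab_size)  # 50000, must match the trained tokenizer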
run.sh ADDED
@@ -0,0 +1,25 @@
+ ./run_t5_mlm_flax.py \
+     --output_dir="./" \
+     --model_type="t5" \
+     --config_name="./" \
+     --tokenizer_name="./" \
+     --train_file /mnt/disks/flaxdisk/corpus/norwegian_colossal_corpus_train.json \
+     --validation_file /mnt/disks/flaxdisk/corpus/norwegian_colossal_corpus_validation.json \
+     --max_seq_length="128" \
+     --weight_decay="0.01" \
+     --per_device_train_batch_size="128" \
+     --per_device_eval_batch_size="128" \
+     --learning_rate="8e-3" \
+     --warmup_steps="2000" \
+     --overwrite_output_dir \
+     --cache_dir /mnt/disks/flaxdisk/cache/ \
+     --num_train_epochs="3" \
+     --adam_beta1="0.9" \
+     --adam_beta2="0.98" \
+     --logging_steps="100" \
+     --save_steps="2500" \
+     --eval_steps="2500" \
+     --preprocessing_num_workers 96 \
+     --adafactor \
+     --push_to_hub
+
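
With --adafactor set, the example script is expected to use optax's Adafactor rather than AdamW, driven by a warmup-plus-decay learning-rate schedule built from --learning_rate and --warmup_steps. A rough optax sketch of that setup (not part of this commit; total_train_steps is a made-up placeholder, since the real value depends on corpus size and batch size):

# Rough sketch of the optimizer implied by the flags above:
# linear warmup to 8e-3 over 2000 steps, then linear decay to zero.
import optax

learning_rate = 8e-3
warmup_steps = 2_000
total_train_steps = 100_000  # placeholder, not from the commit

warmup_fn = optax.linear_schedule(
    init_value=0.0, end_value=learning_rate, transition_steps=warmup_steps)
decay_fn = optax.linear_schedule(
    init_value=learning_rate, end_value=0.0,
    transition_steps=total_train_steps - warmup_steps)
schedule_fn = optax.join_schedules(
    schedules=[warmup_fn, decay_fn], boundaries=[warmup_steps])

optimizer = optax.adafactor(learning_rate=schedule_fn)  # selected by the --adafactor flag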
t5_tokenizer_model.py ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b4fa09755fb0bc80e4fdd49742cd0d5aed816ceb6cd91fe796420b32752b521
+ size 3883
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
train_tokenizer.py ADDED
@@ -0,0 +1,34 @@
+ from datasets import load_dataset, concatenate_datasets
+ from tokenizers import trainers, Tokenizer, normalizers
+ from t5_tokenizer_model import SentencePieceUnigramTokenizer
+
+
+ vocab_size = 50_000
+ input_sentence_size = None
+ model_dir = "./"  # ${MODEL_DIR}
+
+ # Initialize a dataset
+ dataset = load_dataset("json", data_files=["/mnt/disks/flaxdisk/corpus/norwegian_colossal_corpus_validation.json", "/mnt/disks/flaxdisk/corpus/special_chars.json"], split='train')
+
+ tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")
+
+
+ # Build an iterator over this dataset
+ def batch_iterator(input_sentence_size=None):
+     if input_sentence_size is None:
+         input_sentence_size = len(dataset)
+     batch_length = 100
+     for i in range(0, input_sentence_size, batch_length):
+         yield dataset[i: i + batch_length]["text"]
+
+
+ # Train tokenizer
+ tokenizer.train_from_iterator(
+     iterator=batch_iterator(input_sentence_size=input_sentence_size),
+     vocab_size=vocab_size,
+     show_progress=True,
+ )
+
+
+ # Save files to disk
+ tokenizer.save(f"{model_dir}/tokenizer.json")