ibraheemmoosa committed
Commit: c43ef85
Parent(s): 80fbf50

Add tokenizer and model training script

Files changed:
- t5-train.sh (+22, -0)
- train_unigram_tokenizer_for_t5.py (+20, -0)
t5-train.sh ADDED
@@ -0,0 +1,22 @@
#!/bin/bash
# Cap the datasets cache held in RAM (bytes); exported so it reaches the Python process.
export HF_DATASETS_IN_MEMORY_MAX_SIZE=200000000
MODEL_DIR=bengali-t5-on-mc4
# run_t5_mlm_flax.py comes from the transformers Flax language-modeling examples.
python run_t5_mlm_flax.py \
    --output_dir="${MODEL_DIR}" \
    --model_type="t5" \
    --config_name="${MODEL_DIR}" \
    --tokenizer_name="${MODEL_DIR}" \
    --dataset_name="mc4" \
    --dataset_config_name="bn" \
    --max_seq_length="512" \
    --per_device_train_batch_size="128" \
    --per_device_eval_batch_size="128" \
    --learning_rate="1e-3" \
    --weight_decay="0.001" \
    --warmup_steps="500" \
    --overwrite_output_dir \
    --num_train_epochs="10" \
    --logging_steps="500" \
    --save_steps="500" \
    --eval_steps="500"
    # --push_to_hub  (re-add the trailing backslash above to enable Hub uploads)
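The script points both --config_name and --tokenizer_name at ${MODEL_DIR}, so bengali-t5-on-mc4 must already contain a config.json and a tokenizer before training starts. The commit does not show how the config was created; the following is a minimal sketch of that step, and the choice of google/t5-v1_1-base as the base architecture is an assumption, not something recorded in the commit.

from transformers import T5Config

# Assumption: base T5 v1.1 architecture; the commit does not say which size was used.
# vocab_size is overridden to match the 32k Unigram tokenizer trained below.
config = T5Config.from_pretrained('google/t5-v1_1-base', vocab_size=32000)
config.save_pretrained('bengali-t5-on-mc4')  # writes config.json into MODEL_DIR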
train_unigram_tokenizer_for_t5.py ADDED
@@ -0,0 +1,20 @@
from tokenizers import Tokenizer
from tokenizers.models import Unigram
from tokenizers.trainers import UnigramTrainer
from tokenizers.normalizers import NFKC
from tokenizers.pre_tokenizers import UnicodeScripts, Digits, Sequence
from datasets import load_dataset

# Bengali split of the multilingual C4 corpus.
mc4 = load_dataset('mc4', 'bn', split='train')

def batch_iterator(dataset, batch_size=1000):
    # Stream the corpus in batches so the trainer never holds the full dataset in memory.
    for i in range(0, len(dataset), batch_size):
        yield dataset[i: i + batch_size]["text"]

tokenizer = Tokenizer(Unigram())
# NFKC-normalize text, split at Unicode script boundaries, and split each digit into its own token.
tokenizer.normalizer = NFKC()
tokenizer.pre_tokenizer = Sequence([UnicodeScripts(), Digits(individual_digits=True)])
trainer = UnigramTrainer(vocab_size=32000, special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])
tokenizer.train_from_iterator(batch_iterator(mc4), trainer=trainer, length=len(mc4))
tokenizer.save('tokenizer-mc4-unigram.json')
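The saved tokenizer-mc4-unigram.json is a raw tokenizers file, while t5-train.sh loads the tokenizer from ${MODEL_DIR} through transformers. A minimal sketch of the missing wrapping step, assuming a plain PreTrainedTokenizerFast with the same special tokens is sufficient:

from transformers import PreTrainedTokenizerFast

# Wrap the raw tokenizers JSON so the training script can load it from MODEL_DIR.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file='tokenizer-mc4-unigram.json',
    bos_token='<s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
)
tokenizer.save_pretrained('bengali-t5-on-mc4')

Note that T5's span-corruption objective also relies on sentinel tokens (<extra_id_0>, <extra_id_1>, ...); whether these must be added here or are handled by run_t5_mlm_flax.py depends on the script version, so this sketch may need extending.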