ibraheemmoosa committed
Commit
c43ef85
Parent: 80fbf50

Add tokenizer and model training script

Files changed (2)
  1. t5-train.sh +22 -0
  2. train_unigram_tokenizer_for_t5.py +20 -0
t5-train.sh ADDED
@@ -0,0 +1,22 @@
+ #!/bin/bash
+ export HF_DATASETS_IN_MEMORY_MAX_SIZE=200000000 # exported so the training script inherits the ~200 MB datasets cache cap
+ MODEL_DIR=bengali-t5-on-mc4
+ python run_t5_mlm_flax.py \
+ --output_dir="${MODEL_DIR}" \
+ --model_type="t5" \
+ --config_name="${MODEL_DIR}" \
+ --tokenizer_name="${MODEL_DIR}" \
+ --dataset_name="mc4" \
+ --dataset_config_name="bn" \
+ --max_seq_length="512" \
+ --per_device_train_batch_size="128" \
+ --per_device_eval_batch_size="128" \
+ --learning_rate="1e-3" \
+ --weight_decay="0.001" \
+ --warmup_steps="500" \
+ --overwrite_output_dir \
+ --num_train_epochs="10" \
+ --logging_steps="500" \
+ --save_steps="500" \
+ --eval_steps="500"
+ #--push_to_hub
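
Note: --config_name and --tokenizer_name both point at ${MODEL_DIR}, so bengali-t5-on-mc4/ must already contain a model config and tokenizer before this script runs. A minimal preparation sketch, assuming the tokenizer JSON produced by train_unigram_tokenizer_for_t5.py below and google/t5-v1_1-base as the starting config (both choices are assumptions, not part of this commit):

# Sketch: populate bengali-t5-on-mc4/ before launching t5-train.sh.
# Assumes tokenizer-mc4-unigram.json (from the script below) and a
# google/t5-v1_1-base base config; both are assumptions for illustration.
from transformers import PreTrainedTokenizerFast, T5Config

model_dir = "bengali-t5-on-mc4"

# Wrap the raw tokenizers JSON so run_t5_mlm_flax.py can load it from the directory.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer-mc4-unigram.json",
    bos_token="<s>",
    eos_token="</s>",
    pad_token="<pad>",
    unk_token="<unk>",
    mask_token="<mask>",
)
tokenizer.save_pretrained(model_dir)

# Reuse a standard T5 config, resized to the 32k vocabulary trained here.
config = T5Config.from_pretrained("google/t5-v1_1-base", vocab_size=len(tokenizer))
config.save_pretrained(model_dir)
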
train_unigram_tokenizer_for_t5.py ADDED
@@ -0,0 +1,20 @@
+ from tokenizers import Tokenizer
+ from tokenizers.models import Unigram
+ from tokenizers.trainers import UnigramTrainer
+ from tokenizers.normalizers import NFKC
+ from tokenizers.pre_tokenizers import UnicodeScripts, Digits, Sequence
+ from datasets import load_dataset
+
+ mc4 = load_dataset('mc4', 'bn', split='train')  # Bengali portion of multilingual C4
+
+ def batch_iterator(dataset, batch_size=1000):  # stream texts so the corpus is never fully materialized
+     for i in range(0, len(dataset), batch_size):
+         yield dataset[i: i + batch_size]["text"]
+
+ tokenizer = Tokenizer(Unigram())
+ tokenizer.normalizer = NFKC()  # Unicode compatibility normalization
+ tokenizer.pre_tokenizer = Sequence([UnicodeScripts(), Digits(individual_digits=True)])  # split on script changes; digits as single tokens
+ trainer = UnigramTrainer(vocab_size=32000, special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'], unk_token='<unk>')  # unk_token keeps unseen characters encodable
+ tokenizer.train_from_iterator(batch_iterator(mc4), trainer=trainer, length=len(mc4))
+ tokenizer.save('tokenizer-mc4-unigram.json')
+
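
After the script finishes, the saved JSON can be reloaded for a quick sanity check. A small sketch; the sample sentence is an arbitrary Bengali phrase, not from this commit:

# Sketch: reload the trained tokenizer and inspect one encoding.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file('tokenizer-mc4-unigram.json')
encoding = tokenizer.encode('আমি বাংলায় গান গাই')  # arbitrary Bengali sample sentence
print(encoding.tokens)  # subword pieces chosen by the Unigram model
print(encoding.ids)     # corresponding vocabulary ids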