sakares committed on
Commit
5ad5b72
1 Parent(s): e76044d

Create Thai Tokenizer script

Files changed (1)
  1. train_tokenizer.py +43 -0
train_tokenizer.py ADDED
@@ -0,0 +1,43 @@
+ # pip install -U pythainlp
+ 
+ from datasets import load_dataset
+ from tokenizers import ByteLevelBPETokenizer
+ from transformers import AutoConfig
+ from pythainlp.tokenize import word_tokenize
+ 
+ 
+ language = "th"
+ model_config = "roberta-base"
+ model_dir = model_config + f"-pretrained-{language}"
+ config = AutoConfig.from_pretrained(model_config)
+ config.save_pretrained(f"{model_dir}")
+ 
+ # Load the dataset
+ # Only the train split is needed for tokenizer training
+ raw_dataset = load_dataset("oscar", f"unshuffled_deduplicated_{language}")
+ 
+ # Instantiate tokenizer
+ tokenizer = ByteLevelBPETokenizer()
+ 
+ 
+ ## For the Thai NLP library, see https://pythainlp.github.io/docs/2.3/api/tokenize.html
+ def th_tokenize(text):
+     result = " ".join(word_tokenize(text, engine="newmm", keep_whitespace=False))
+     return result
+ 
+ 
+ def batch_iterator(batch_size=1000):
+     for i in range(0, len(raw_dataset["train"]), batch_size):
+         yield [th_tokenize(text) for text in raw_dataset["train"][i: i + batch_size]["text"]]
+ 
+ 
+ # Customized training
+ tokenizer.train_from_iterator(
+     batch_iterator(),
+     vocab_size=50265,
+     min_frequency=2,
+     special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
+ )
+ 
+ # Save files to disk
+ tokenizer.save("./tokenizer.json")
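
A minimal sketch of how the saved tokenizer.json might be loaded back for inference or preprocessing; wrapping it in PreTrainedTokenizerFast and the sample sentence are assumptions, not part of the commit, and input text would need the same th_tokenize-style pre-tokenization used during training.

# Sketch (assumption): load the trained tokenizer back with transformers.
# Assumes the script above has been run and produced ./tokenizer.json.
from transformers import PreTrainedTokenizerFast
from pythainlp.tokenize import word_tokenize

loaded_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="./tokenizer.json",
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)

# Pre-tokenize Thai text with the same newmm word segmentation as during training.
# The example sentence here is illustrative only.
text = " ".join(word_tokenize("สวัสดีครับ ยินดีต้อนรับ", engine="newmm", keep_whitespace=False))
print(loaded_tokenizer(text)["input_ids"])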