sakares committed
Commit 13d02d7
1 Parent(s): 70b401c

change config and fix training tokenizer script

config.json CHANGED
@@ -21,5 +21,5 @@
  "transformers_version": "4.9.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
- "vocab_size": 21153
+ "vocab_size": 50265
  }
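The new value matches the vocab_size=50265 used when training the tokenizer in train_tokenizer.py below (the RoBERTa-base default vocabulary size). A minimal sketch of keeping config.json in sync, assuming the same AutoConfig workflow as the training script; the explicit assignment and literal paths here are illustrative, not part of the commit:

from transformers import AutoConfig

# Assumed sketch: keep config.json's vocab_size in sync with the vocabulary
# actually trained by the tokenizer (vocab_size=50265 in train_tokenizer.py).
config = AutoConfig.from_pretrained("roberta-base")
config.vocab_size = 50265
config.save_pretrained("roberta-base-pretrained-th")  # model_dir as built in the script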
events.out.tfevents.1626329391.t1v-n-bf8aeee7-w-0.4713.3.v2 DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0306cd10b128d1cf1aa7395239854fb0fea8ac8263aebc218c879dc141dbece3
- size 73496
train_tokenizer.py CHANGED
@@ -1,11 +1,8 @@
- #pip install -U pythainlp
-
  from datasets import load_dataset, concatenate_datasets
  from tokenizers import ByteLevelBPETokenizer
  from transformers import AutoConfig
  from pythainlp.tokenize import word_tokenize

-
  language = "th"
  model_config = "roberta-base"
  model_dir = model_config + f"-pretrained-{language}"
@@ -14,21 +11,22 @@ config.save_pretrained(f"{model_dir}")

  # load dataset
  # only the train subset for tokenizing purposes
- raw_dataset = load_dataset("oscar", f"unshuffled_deduplicated_{language}")
- raw_dataset = load_dataset("oscar", f"unshuffled_deduplicated_th")
+ raw_dataset = load_dataset(
+     "oscar", f"unshuffled_deduplicated_{language}", split="train"
+ )

  # Instantiate tokenizer
  tokenizer = ByteLevelBPETokenizer()

-
  ## For Thai NLP Library, please feel free to check https://pythainlp.github.io/docs/2.3/api/tokenize.html
  def th_tokenize(text):
      result = " ".join(word_tokenize(text, engine="newmm", keep_whitespace=False))
      return result

+
  def batch_iterator(batch_size=1000):
      for i in range(0, len(raw_dataset), batch_size):
-         yield [th_tokenize(text) for text in raw_dataset["train"][i: i + batch_size]["text"]]
+         yield [th_tokenize(text) for text in raw_dataset[i : i + batch_size]["text"]]


  # Customized training
@@ -36,7 +34,7 @@ tokenizer.train_from_iterator(
      batch_iterator(),
      vocab_size=50265,
      min_frequency=2,
-     special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", ],
+     special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>",],
  )

  # Save files to disk
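The core fix is loading only the train split: previously raw_dataset was a DatasetDict, so len(raw_dataset) counted splits rather than examples and the slice had to go through raw_dataset["train"]; with split="train" both the length and the slice operate on the same Dataset. The diff stops at the "# Save files to disk" comment, so the save step itself is not shown. A minimal sketch of how it could look, assuming the tokenizers save_model API and an illustrative reload via RobertaTokenizerFast; neither line is part of this commit:

from transformers import RobertaTokenizerFast

# Assumed continuation of the script: ByteLevelBPETokenizer.save_model
# writes vocab.json and merges.txt into the model directory.
tokenizer.save_model(model_dir)

# Illustrative reload as a fast tokenizer for downstream pretraining.
hf_tokenizer = RobertaTokenizerFast.from_pretrained(model_dir)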