yhavinga committed on
Commit
a37447a
1 Parent(s): d9993eb

Retrain tokenizer for case sensitivity

Browse files
Files changed (2) hide show
  1. tokenizer.json +0 -0
  2. train_tokenizer.py +1 -3
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
train_tokenizer.py CHANGED
@@ -18,7 +18,7 @@ def train_val_files():
18
  print(f"Number of files {len(data_files)} after adding {path}")
19
 
20
  # add_jsonlines_dir(f"{data_dir}/oscar_nl_cleaned")
21
- add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*47*.gz")
22
  add_jsonlines_dir(f"{data_dir}/nrc_uniq_cleaned_20210223", "*.gz")
23
  add_jsonlines_dir(f"{data_dir}/nu_uniq_cleaned_20210225", "*.gz")
24
  random.Random(SEED).shuffle(data_files)
@@ -42,8 +42,6 @@ train, val = train_val_files()
42
 
43
  dataset = load_dataset('json', data_files={'train': train, 'validation': val}, split='train')
44
 
45
- model_dir = "/t5-small-dutch" # ${MODEL_DIR}
46
-
47
  vocab_size = 32000
48
  input_sentence_size = None
49
  tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")
 
18
  print(f"Number of files {len(data_files)} after adding {path}")
19
 
20
  # add_jsonlines_dir(f"{data_dir}/oscar_nl_cleaned")
21
+ add_jsonlines_dir(f"{data_dir}/c4_cleaned2", "*47*.gz")
22
  add_jsonlines_dir(f"{data_dir}/nrc_uniq_cleaned_20210223", "*.gz")
23
  add_jsonlines_dir(f"{data_dir}/nu_uniq_cleaned_20210225", "*.gz")
24
  random.Random(SEED).shuffle(data_files)
 
42
 
43
  dataset = load_dataset('json', data_files={'train': train, 'validation': val}, split='train')
44
 
 
 
45
  vocab_size = 32000
46
  input_sentence_size = None
47
  tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")