fgaim committed
Commit 4d48c1f
1 parent: be50205

New tokenizer with cleaned data

Files changed (2):
  1. tokenizer.json +0 -0
  2. train_tokenizer.py +1 -1
tokenizer.json CHANGED
The diff for this file is too large to render.
 
train_tokenizer.py CHANGED
@@ -3,7 +3,7 @@ from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer
 
 # load dataset
 # dataset = load_dataset("mc4", "sw", split="train")
-dataset = load_dataset("text", "sw", split="train", data_files={"train": ["/home/shared/clean_swahili/train.txt"]})
+dataset = load_dataset("text", "sw", split="train", data_files={"train": ["/home/shared/clean_swahili/train_v1.4.txt"]})
 
 # Instantiate tokenizer
 tokenizer = ByteLevelBPETokenizer()
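
The hunk shows only the top of train_tokenizer.py; the rest of the script, which regenerates the committed tokenizer.json, is not rendered on this page. A minimal sketch of how a ByteLevelBPETokenizer is typically trained on such a text corpus, assuming batched streaming from the dataset (the batch size, vocab_size, min_frequency, and special-token list below are illustrative assumptions, not values from this commit):

from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer

# Load the cleaned Swahili corpus, as in the committed script.
dataset = load_dataset(
    "text", "sw", split="train",
    data_files={"train": ["/home/shared/clean_swahili/train_v1.4.txt"]},
)

# Instantiate a byte-level BPE tokenizer, as in the committed script.
tokenizer = ByteLevelBPETokenizer()

# Stream the corpus to the trainer in batches (batch size is illustrative).
def batch_iterator(batch_size=1_000):
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["text"]

# Train with assumed hyperparameters (vocab_size, min_frequency, and the
# special tokens are not taken from the commit).
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50_257,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

# Serialize to tokenizer.json, the other file updated in this commit.
tokenizer.save("tokenizer.json")

Retraining on the cleaned train_v1.4.txt corpus is what produces a new tokenizer.json, which is why both files change together in this commit.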