fgaim committed on
Commit
e5bd3ab
1 Parent(s): 517faed

set new dataset in train_tokenizer

Browse files
README.md CHANGED
File without changes
config.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "architectures": [
3
  "RobertaForMaskedLM"
4
  ],
@@ -18,6 +19,7 @@
18
  "num_hidden_layers": 12,
19
  "pad_token_id": 1,
20
  "position_embedding_type": "absolute",
 
21
  "transformers_version": "4.9.0.dev0",
22
  "type_vocab_size": 1,
23
  "use_cache": true,
 
1
  {
2
+ "_name_or_path": "./",
3
  "architectures": [
4
  "RobertaForMaskedLM"
5
  ],
 
19
  "num_hidden_layers": 12,
20
  "pad_token_id": 1,
21
  "position_embedding_type": "absolute",
22
+ "torch_dtype": "float32",
23
  "transformers_version": "4.9.0.dev0",
24
  "type_vocab_size": 1,
25
  "use_cache": true,
events.out.tfevents.1625831062.t1v-n-6a2ff29b-w-0.1152929.3.v2 CHANGED
File without changes
events.out.tfevents.1625850549.t1v-n-6a2ff29b-w-0.1178206.3.v2 CHANGED
File without changes
events.out.tfevents.1625996487.t1v-n-6a2ff29b-w-0.1982849.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac60639757fe4b60f9e4a84623140d1dae79d4ded1896534e2ae43a2a58e404d
3
+ size 10516780
flax_model.msgpack CHANGED
File without changes
flax_to_torch.py CHANGED
File without changes
train_tokenizer.py CHANGED
@@ -2,7 +2,8 @@ from datasets import load_dataset
2
  from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer
3
 
4
  # load dataset
5
- dataset = load_dataset("mc4", "sw", split="train")
 
6
 
7
  # Instantiate tokenizer
8
  tokenizer = ByteLevelBPETokenizer()
 
2
  from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer
3
 
4
  # load dataset
5
+ # dataset = load_dataset("mc4", "sw", split="train")
6
+ dataset = load_dataset("text", "sw", split="train", data_files={"train": ["/home/shared/clean_swahili/train.txt"]})
7
 
8
  # Instantiate tokenizer
9
  tokenizer = ByteLevelBPETokenizer()