Doron Adler committed
Commit 56f66a3
1 Parent(s): 9b5ecc1

Refreshed metadata, configs, and tokenizer files

added_tokens.json CHANGED
@@ -1 +1,5 @@
-{"<|unknown|>": 50259, "<|startoftext|>": 50258, "<|endoftext|>": 50257}
+{
+  "<|endoftext|>": 50257,
+  "<|startoftext|>": 50258,
+  "<|unknown|>": 50259
+}
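
The added_tokens.json rewrite keeps the same token-to-ID mapping and only re-serializes it as pretty-printed JSON sorted by ID. Below is a minimal sketch of checking that mapping after loading the tokenizer with transformers; the repo id Norod78/hebrew-gpt_neo-xl is an assumption (the diff itself only shows the local path ./hebrew-gpt_neo-xl in tokenizer_config.json), so substitute your own checkout.

    # Sketch: confirm the special tokens still resolve to the IDs listed in added_tokens.json.
    # "Norod78/hebrew-gpt_neo-xl" is an assumed repo id; a local path works the same way.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("Norod78/hebrew-gpt_neo-xl")
    expected = {"<|endoftext|>": 50257, "<|startoftext|>": 50258, "<|unknown|>": 50259}
    for token, token_id in expected.items():
        assert tokenizer.convert_tokens_to_ids(token) == token_id, token
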
config.json CHANGED
@@ -59,8 +59,7 @@
   "summary_type": "cls_index",
   "summary_use_proj": true,
   "torch_dtype": "float32",
-  "transformers_version": "4.9.0.dev0",
-  "use_cache": true,
+  "transformers_version": "4.21.0.dev0",
   "vocab_size": 50257,
   "window_size": 256
 }
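
Removing "use_cache": true from config.json does not change behaviour, since use_cache defaults to True for GPT-Neo configs in transformers; in effect only the recorded transformers_version is bumped. A small sketch of verifying this, under the same assumed repo id as above:

    # Sketch: an omitted "use_cache" key falls back to the config default (True).
    from transformers import AutoConfig

    config = AutoConfig.from_pretrained("Norod78/hebrew-gpt_neo-xl")
    print(config.use_cache)             # True even though the key is no longer serialized
    print(config.transformers_version)  # the transformers version recorded at save time
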
merges.txt CHANGED
@@ -1,4 +1,4 @@
-#version: 0.2
+#version: 0.2 - Trained by `huggingface/tokenizers`
 Ġ ×
 × Ļ
 × ķ
special_tokens_map.json CHANGED
@@ -2,4 +2,4 @@
   "bos_token": "<s>",
   "eos_token": "</s>",
   "unk_token": "<unk>"
-}
+}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
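
With tokenizer.json checked in, transformers can instantiate the Rust-backed fast tokenizer directly instead of converting vocab.json and merges.txt at load time. A quick sketch, again assuming the repo id:

    # Sketch: a repo containing tokenizer.json yields a fast (Rust-backed) tokenizer.
    from transformers import AutoTokenizer

    fast_tok = AutoTokenizer.from_pretrained("Norod78/hebrew-gpt_neo-xl", use_fast=True)
    print(fast_tok.is_fast)  # True once tokenizer.json is available
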
tokenizer_config.json CHANGED
@@ -1 +1,36 @@
-{"do_lower_case": false, "max_len": 1024, "bos_token": "<|startoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>", "special_tokens_map_file": "special_tokens_map.json", "full_tokenizer_file": null}
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "do_lower_case": false,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "full_tokenizer_file": null,
+  "max_len": 1024,
+  "name_or_path": "./hebrew-gpt_neo-xl",
+  "pad_token": null,
+  "special_tokens_map_file": "special_tokens_map.json",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
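
The expanded tokenizer_config.json stores bos_token, eos_token, and unk_token as AddedToken entries and records tokenizer_class, so AutoTokenizer can rebuild the tokenizer without extra arguments. A minimal sketch of inspecting the result (same assumed repo id as above):

    # Sketch: the AddedToken entries surface as the usual special-token attributes.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("Norod78/hebrew-gpt_neo-xl")
    print(type(tok).__name__)  # a GPT2 tokenizer class, per "tokenizer_class"
    print(tok.bos_token)       # "<|startoftext|>"
    print(tok.eos_token)       # "<|endoftext|>"
    print(tok.unk_token)       # "<|endoftext|>" (unk shares its content with eos here)
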
vocab.json CHANGED
The diff for this file is too large to render. See raw diff