arshiaafshani committed
Commit 8765337 · verified · 1 Parent(s): 21e4682

Upload tokenizer

Files changed (3)
  1. tokenizer.json +10 -18
  2. tokenizer_config.json +0 -1
  3. vocab.json +0 -0
tokenizer.json CHANGED
@@ -879,25 +879,17 @@
   ],
   "normalizer": null,
   "pre_tokenizer": {
-    "type": "Sequence",
-    "pretokenizers": [
-      {
-        "type": "Split",
-        "pattern": {
-          "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
-        },
-        "behavior": "Removed",
-        "invert": true
-      },
-      {
-        "type": "ByteLevel",
-        "add_prefix_space": false,
-        "trim_offsets": true,
-        "use_regex": false
-      }
-    ]
+    "type": "ByteLevel",
+    "add_prefix_space": false,
+    "trim_offsets": true,
+    "use_regex": true
+  },
+  "post_processor": {
+    "type": "ByteLevel",
+    "add_prefix_space": true,
+    "trim_offsets": false,
+    "use_regex": true
   },
-  "post_processor": null,
   "decoder": {
     "type": "ByteLevel",
     "add_prefix_space": true,
tokenizer_config.json CHANGED
@@ -785,7 +785,6 @@
   "eos_token": "<|im_end|>",
   "errors": "replace",
   "extra_special_tokens": {},
-  "from_slow": true,
   "legacy": false,
   "model_max_length": 16384,
   "pad_token": "<|dummy_87|>",
vocab.json CHANGED
The diff for this file is too large to render. See raw diff