Xenova HF staff committed on
Commit
1e4d773
1 Parent(s): c9238fc
Files changed (1) hide show
  1. tokenizer.json +18 -10
tokenizer.json CHANGED
@@ -51,17 +51,25 @@
51
  ],
52
  "normalizer": null,
53
  "pre_tokenizer": {
54
- "type": "ByteLevel",
55
- "add_prefix_space": false,
56
- "trim_offsets": true,
57
- "use_regex": true
58
- },
59
- "post_processor": {
60
- "type": "ByteLevel",
61
- "add_prefix_space": true,
62
- "trim_offsets": false,
63
- "use_regex": true
 
 
 
 
 
 
 
64
  },
 
65
  "decoder": {
66
  "type": "ByteLevel",
67
  "add_prefix_space": true,
 
51
  ],
52
  "normalizer": null,
53
  "pre_tokenizer": {
54
+ "type": "Sequence",
55
+ "pretokenizers": [
56
+ {
57
+ "type": "Split",
58
+ "pattern": {
59
+ "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
60
+ },
61
+ "behavior": "Removed",
62
+ "invert": true
63
+ },
64
+ {
65
+ "type": "ByteLevel",
66
+ "add_prefix_space": false,
67
+ "trim_offsets": true,
68
+ "use_regex": false
69
+ }
70
+ ]
71
  },
72
+ "post_processor": null,
73
  "decoder": {
74
  "type": "ByteLevel",
75
  "add_prefix_space": true,