RaymondLi committed on
Commit
a80ab01
1 Parent(s): c9238fc

adjust pretokenizer regex

Browse files
Files changed (1) hide show
  1. tokenizer.json +17 -4
tokenizer.json CHANGED
@@ -51,10 +51,23 @@
51
  ],
52
  "normalizer": null,
53
  "pre_tokenizer": {
54
- "type": "ByteLevel",
55
- "add_prefix_space": false,
56
- "trim_offsets": true,
57
- "use_regex": true
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  },
59
  "post_processor": {
60
  "type": "ByteLevel",
 
51
  ],
52
  "normalizer": null,
53
  "pre_tokenizer": {
54
+ "type": "Sequence",
55
+ "pretokenizers": [
56
+ {
57
+ "type": "Split",
58
+ "pattern": {
59
+ "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
60
+ },
61
+ "behavior": "Removed",
62
+ "invert": true
63
+ },
64
+ {
65
+ "type": "ByteLevel",
66
+ "add_prefix_space": false,
67
+ "trim_offsets": true,
68
+ "use_regex": false
69
+ }
70
+ ]
71
  },
72
  "post_processor": {
73
  "type": "ByteLevel",