Shaltiel committed on
Commit
0a9e2c4
1 Parent(s): 847a490

Added missing normalizer to tokenizer

Browse files
Files changed (1) hide show
  1. tokenizer.json +7 -0
tokenizer.json CHANGED
@@ -76,6 +76,13 @@
76
  "String": "<foreign>"
77
  },
78
  "content": "UNK"
 
 
 
 
 
 
 
79
  }
80
  ]
81
  },
 
76
  "String": "<foreign>"
77
  },
78
  "content": "UNK"
79
+ },
80
+ {
81
+ "type": "Replace",
82
+ "pattern": {
83
+ "Regex": "[^֐-׿\u0000-\u007F\u200C-‿₠-₿∀-⋿⅐-↋ﬀ-ﭏ]+"
84
+ },
85
+ "content": "UNK"
86
  }
87
  ]
88
  },