diff --git "a/tokenizer.json" "b/tokenizer.json" new file mode 100644--- /dev/null +++ "b/tokenizer.json" @@ -0,0 +1,128150 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "[PAD]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "[UNK]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "[CLS]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "[SEP]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 4, + "content": "[MASK]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "BertNormalizer", + "clean_text": true, + "handle_chinese_chars": true, + "strip_accents": null, + "lowercase": true + }, + "pre_tokenizer": { + "type": "BertPreTokenizer" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "[CLS]", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "[SEP]", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "[CLS]", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "[SEP]", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + }, + { + "SpecialToken": { + "id": "[SEP]", + "type_id": 1 + } + } + ], + "special_tokens": { + "[CLS]": { + "id": "[CLS]", + "ids": [ + 2 + ], + "tokens": [ + "[CLS]" + ] + }, + "[SEP]": { + "id": "[SEP]", + "ids": [ + 3 + ], + "tokens": [ + "[SEP]" + ] + } + } + }, + "decoder": { + "type": "WordPiece", + "prefix": "##", + "cleanup": true + }, + "model": { + "type": "WordPiece", + "unk_token": "[UNK]", + "continuing_subword_prefix": "##", + "max_input_chars_per_word": 100, + "vocab": { + "[PAD]": 0, + "[UNK]": 1, + "[CLS]": 2, + "[SEP]": 3, + "[MASK]": 4, + "##>": 5, + "<": 6, + "##hashtag": 7, + "##user": 8, + "": 127748, + "[UNUSED_1]": 127749, + "[UNUSED_2]": 127750, + "[UNUSED_3]": 127751, + "[UNUSED_4]": 127752, + "[UNUSED_5]": 127753, + "[UNUSED_6]": 127754, + "[UNUSED_7]": 127755, + "[UNUSED_8]": 127756, + "[UNUSED_9]": 127757, + "[UNUSED_10]": 127758, + "[UNUSED_11]": 127759, + "[UNUSED_12]": 127760, + "[UNUSED_13]": 127761, + "[UNUSED_14]": 127762, + "[UNUSED_15]": 127763, + "[UNUSED_16]": 127764, + "[UNUSED_17]": 127765, + "[UNUSED_18]": 127766, + "[UNUSED_19]": 127767, + "[UNUSED_20]": 127768, + "[UNUSED_21]": 127769, + "[UNUSED_22]": 127770, + "[UNUSED_23]": 127771, + "[UNUSED_24]": 127772, + "[UNUSED_25]": 127773, + "[UNUSED_26]": 127774, + "[UNUSED_27]": 127775, + "[UNUSED_28]": 127776, + "[UNUSED_29]": 127777, + "[UNUSED_30]": 127778, + "[UNUSED_31]": 127779, + "[UNUSED_32]": 127780, + "[UNUSED_33]": 127781, + "[UNUSED_34]": 127782, + "[UNUSED_35]": 127783, + "[UNUSED_36]": 127784, + "[UNUSED_37]": 127785, + "[UNUSED_38]": 127786, + "[UNUSED_39]": 127787, + "[UNUSED_40]": 127788, + "[UNUSED_41]": 127789, + "[UNUSED_42]": 127790, + "[UNUSED_43]": 127791, + "[UNUSED_44]": 127792, + "[UNUSED_45]": 127793, + "[UNUSED_46]": 127794, + "[UNUSED_47]": 127795, + "[UNUSED_48]": 127796, + "[UNUSED_49]": 127797, + "[UNUSED_50]": 127798, + "[UNUSED_51]": 127799, + "[UNUSED_52]": 127800, + "[UNUSED_53]": 127801, + "[UNUSED_54]": 127802, + "[UNUSED_55]": 127803, + "[UNUSED_56]": 127804, + "[UNUSED_57]": 127805, + "[UNUSED_58]": 127806, + "[UNUSED_59]": 127807, + "[UNUSED_60]": 127808, + "[UNUSED_61]": 127809, + "[UNUSED_62]": 127810, + "[UNUSED_63]": 127811, + "[UNUSED_64]": 127812, + "[UNUSED_65]": 127813, + "[UNUSED_66]": 127814, + "[UNUSED_67]": 127815, + "[UNUSED_68]": 127816, + "[UNUSED_69]": 127817, + "[UNUSED_70]": 127818, + "[UNUSED_71]": 127819, + "[UNUSED_72]": 127820, + "[UNUSED_73]": 127821, + "[UNUSED_74]": 127822, + "[UNUSED_75]": 127823, + "[UNUSED_76]": 127824, + "[UNUSED_77]": 127825, + "[UNUSED_78]": 127826, + "[UNUSED_79]": 127827, + "[UNUSED_80]": 127828, + "[UNUSED_81]": 127829, + "[UNUSED_82]": 127830, + "[UNUSED_83]": 127831, + "[UNUSED_84]": 127832, + "[UNUSED_85]": 127833, + "[UNUSED_86]": 127834, + "[UNUSED_87]": 127835, + "[UNUSED_88]": 127836, + "[UNUSED_89]": 127837, + "[UNUSED_90]": 127838, + "[UNUSED_91]": 127839, + "[UNUSED_92]": 127840, + "[UNUSED_93]": 127841, + "[UNUSED_94]": 127842, + "[UNUSED_95]": 127843, + "[UNUSED_96]": 127844, + "[UNUSED_97]": 127845, + "[UNUSED_98]": 127846, + "[UNUSED_99]": 127847, + "[UNUSED_100]": 127848, + "[UNUSED_101]": 127849, + "[UNUSED_102]": 127850, + "[UNUSED_103]": 127851, + "[UNUSED_104]": 127852, + "[UNUSED_105]": 127853, + "[UNUSED_106]": 127854, + "[UNUSED_107]": 127855, + "[UNUSED_108]": 127856, + "[UNUSED_109]": 127857, + "[UNUSED_110]": 127858, + "[UNUSED_111]": 127859, + "[UNUSED_112]": 127860, + "[UNUSED_113]": 127861, + "[UNUSED_114]": 127862, + "[UNUSED_115]": 127863, + "[UNUSED_116]": 127864, + "[UNUSED_117]": 127865, + "[UNUSED_118]": 127866, + "[UNUSED_119]": 127867, + "[UNUSED_120]": 127868, + "[UNUSED_121]": 127869, + "[UNUSED_122]": 127870, + "[UNUSED_123]": 127871, + "[UNUSED_124]": 127872, + "[UNUSED_125]": 127873, + "[UNUSED_126]": 127874, + "[UNUSED_127]": 127875, + "[UNUSED_128]": 127876, + "[UNUSED_129]": 127877, + "[UNUSED_130]": 127878, + "[UNUSED_131]": 127879, + "[UNUSED_132]": 127880, + "[UNUSED_133]": 127881, + "[UNUSED_134]": 127882, + "[UNUSED_135]": 127883, + "[UNUSED_136]": 127884, + "[UNUSED_137]": 127885, + "[UNUSED_138]": 127886, + "[UNUSED_139]": 127887, + "[UNUSED_140]": 127888, + "[UNUSED_141]": 127889, + "[UNUSED_142]": 127890, + "[UNUSED_143]": 127891, + "[UNUSED_144]": 127892, + "[UNUSED_145]": 127893, + "[UNUSED_146]": 127894, + "[UNUSED_147]": 127895, + "[UNUSED_148]": 127896, + "[UNUSED_149]": 127897, + "[UNUSED_150]": 127898, + "[UNUSED_151]": 127899, + "[UNUSED_152]": 127900, + "[UNUSED_153]": 127901, + "[UNUSED_154]": 127902, + "[UNUSED_155]": 127903, + "[UNUSED_156]": 127904, + "[UNUSED_157]": 127905, + "[UNUSED_158]": 127906, + "[UNUSED_159]": 127907, + "[UNUSED_160]": 127908, + "[UNUSED_161]": 127909, + "[UNUSED_162]": 127910, + "[UNUSED_163]": 127911, + "[UNUSED_164]": 127912, + "[UNUSED_165]": 127913, + "[UNUSED_166]": 127914, + "[UNUSED_167]": 127915, + "[UNUSED_168]": 127916, + "[UNUSED_169]": 127917, + "[UNUSED_170]": 127918, + "[UNUSED_171]": 127919, + "[UNUSED_172]": 127920, + "[UNUSED_173]": 127921, + "[UNUSED_174]": 127922, + "[UNUSED_175]": 127923, + "[UNUSED_176]": 127924, + "[UNUSED_177]": 127925, + "[UNUSED_178]": 127926, + "[UNUSED_179]": 127927, + "[UNUSED_180]": 127928, + "[UNUSED_181]": 127929, + "[UNUSED_182]": 127930, + "[UNUSED_183]": 127931, + "[UNUSED_184]": 127932, + "[UNUSED_185]": 127933, + "[UNUSED_186]": 127934, + "[UNUSED_187]": 127935, + "[UNUSED_188]": 127936, + "[UNUSED_189]": 127937, + "[UNUSED_190]": 127938, + "[UNUSED_191]": 127939, + "[UNUSED_192]": 127940, + "[UNUSED_193]": 127941, + "[UNUSED_194]": 127942, + "[UNUSED_195]": 127943, + "[UNUSED_196]": 127944, + "[UNUSED_197]": 127945, + "[UNUSED_198]": 127946, + "[UNUSED_199]": 127947, + "[UNUSED_200]": 127948, + "[UNUSED_201]": 127949, + "[UNUSED_202]": 127950, + "[UNUSED_203]": 127951, + "[UNUSED_204]": 127952, + "[UNUSED_205]": 127953, + "[UNUSED_206]": 127954, + "[UNUSED_207]": 127955, + "[UNUSED_208]": 127956, + "[UNUSED_209]": 127957, + "[UNUSED_210]": 127958, + "[UNUSED_211]": 127959, + "[UNUSED_212]": 127960, + "[UNUSED_213]": 127961, + "[UNUSED_214]": 127962, + "[UNUSED_215]": 127963, + "[UNUSED_216]": 127964, + "[UNUSED_217]": 127965, + "[UNUSED_218]": 127966, + "[UNUSED_219]": 127967, + "[UNUSED_220]": 127968, + "[UNUSED_221]": 127969, + "[UNUSED_222]": 127970, + "[UNUSED_223]": 127971, + "[UNUSED_224]": 127972, + "[UNUSED_225]": 127973, + "[UNUSED_226]": 127974, + "[UNUSED_227]": 127975, + "[UNUSED_228]": 127976, + "[UNUSED_229]": 127977, + "[UNUSED_230]": 127978, + "[UNUSED_231]": 127979, + "[UNUSED_232]": 127980, + "[UNUSED_233]": 127981, + "[UNUSED_234]": 127982, + "[UNUSED_235]": 127983, + "[UNUSED_236]": 127984, + "[UNUSED_237]": 127985, + "[UNUSED_238]": 127986, + "[UNUSED_239]": 127987, + "[UNUSED_240]": 127988, + "[UNUSED_241]": 127989, + "[UNUSED_242]": 127990, + "[UNUSED_243]": 127991, + "[UNUSED_244]": 127992, + "[UNUSED_245]": 127993, + "[UNUSED_246]": 127994, + "[UNUSED_247]": 127995, + "[UNUSED_248]": 127996, + "[UNUSED_249]": 127997, + "[UNUSED_250]": 127998, + "[UNUSED_251]": 127999 + } + } +} \ No newline at end of file