{
  "num_threads": 224,
  "split_by_whitespace": true,
  "model_type": "unigram",
  "vocab_size": 250680,
  "character_coverage": 0.9999,
  "byte_fallback": true,
  "split_by_number": true,
  "split_digits": true,
  "normalization_rule_name": "nfkc",
  "max_sentence_length": 4096,
  "shuffle_input_sentence": true,
  "input_sentence_size": 0,
  "train_extremely_large_corpus": true,
  "allow_whitespace_only_pieces": true,
  "required_chars": "",
  "remove_extra_whitespaces": false,
  "user_defined_symbols": [
    "<s>",
    "</s>",
    "<pad>",
    "<eod>",
    "<placeholder_tok_0>",
    "<placeholder_tok_1>",
    "<placeholder_tok_2>",
    "<placeholder_tok_3>",
    "<placeholder_tok_4>",
    "<placeholder_tok_5>",
    "<placeholder_tok_6>",
    "<placeholder_tok_7>",
    "<placeholder_tok_8>",
    "<placeholder_tok_9>",
    "<placeholder_tok_10>",
    "<placeholder_tok_11>",
    "<placeholder_tok_12>",
    "<placeholder_tok_13>",
    "<placeholder_tok_14>",
    "<placeholder_tok_15>",
    "<placeholder_tok_16>",
    "<placeholder_tok_17>",
    "<placeholder_tok_18>",
    "<placeholder_tok_19>",
    "<placeholder_tok_20>",
    "<placeholder_tok_21>",
    "<placeholder_tok_22>",
    "<placeholder_tok_23>",
    "<placeholder_tok_24>",
    "<placeholder_tok_25>",
    "<placeholder_tok_26>",
    "<placeholder_tok_27>",
    "<placeholder_tok_28>",
    "<placeholder_tok_29>",
    "<placeholder_tok_30>",
    "<placeholder_tok_31>",
    "<placeholder_tok_32>",
    "<placeholder_tok_33>",
    "<placeholder_tok_34>",
    "<placeholder_tok_35>",
    "<placeholder_tok_36>",
    "<placeholder_tok_37>",
    "<placeholder_tok_38>",
    "<placeholder_tok_39>",
    "<placeholder_tok_40>",
    "<placeholder_tok_41>",
    "<placeholder_tok_42>",
    "<placeholder_tok_43>",
    "<placeholder_tok_44>",
    "<placeholder_tok_45>",
    "<placeholder_tok_46>",
    "<placeholder_tok_47>",
    "<placeholder_tok_48>",
    "<placeholder_tok_49>",
    "<placeholder_tok_50>",
    "<placeholder_tok_51>",
    "<placeholder_tok_52>",
    "<placeholder_tok_53>",
    "<placeholder_tok_54>",
    "<placeholder_tok_55>",
    "<placeholder_tok_56>",
    "<placeholder_tok_57>",
    "<placeholder_tok_58>",
    "<placeholder_tok_59>",
    "<placeholder_tok_60>",
    "<placeholder_tok_61>",
    "<placeholder_tok_62>",
    "<placeholder_tok_63>",
    "<placeholder_tok_64>",
    "<placeholder_tok_65>",
    "<placeholder_tok_66>",
    "<placeholder_tok_67>",
    "<placeholder_tok_68>",
    "<placeholder_tok_69>",
    "<placeholder_tok_70>",
    "<placeholder_tok_71>",
    "<placeholder_tok_72>",
    "<placeholder_tok_73>",
    "<placeholder_tok_74>",
    "<placeholder_tok_75>",
    "<placeholder_tok_76>",
    "<placeholder_tok_77>",
    "<placeholder_tok_78>",
    "<placeholder_tok_79>",
    "<placeholder_tok_80>",
    "<placeholder_tok_81>",
    "<placeholder_tok_82>",
    "<placeholder_tok_83>",
    "<placeholder_tok_84>",
    "<placeholder_tok_85>",
    "<placeholder_tok_86>",
    "<placeholder_tok_87>",
    "<placeholder_tok_88>",
    "<placeholder_tok_89>",
    "<placeholder_tok_90>",
    "<placeholder_tok_91>",
    "<placeholder_tok_92>",
    "<placeholder_tok_93>",
    "<placeholder_tok_94>",
    "<placeholder_tok_95>",
    "<placeholder_tok_96>",
    "<placeholder_tok_97>",
    "<placeholder_tok_98>",
    "<placeholder_tok_99>",
    "<placeholder_tok_100>",
    "<placeholder_tok_101>",
    "<placeholder_tok_102>",
    "<placeholder_tok_103>",
    "<placeholder_tok_104>",
    "<placeholder_tok_105>",
    "<placeholder_tok_106>",
    "<placeholder_tok_107>",
    "<placeholder_tok_108>",
    "<placeholder_tok_109>",
    "<placeholder_tok_110>",
    "<placeholder_tok_111>",
    "<placeholder_tok_112>",
    "<placeholder_tok_113>",
    "<placeholder_tok_114>",
    "<placeholder_tok_115>",
    "<placeholder_tok_116>",
    "<placeholder_tok_117>",
    "<placeholder_tok_118>",
    "<placeholder_tok_119>",
    "<placeholder_tok_120>",
    "<placeholder_tok_121>",
    "<placeholder_tok_122>",
    "<placeholder_tok_123>",
    "<placeholder_tok_124>",
    "<placeholder_tok_125>",
    "<placeholder_tok_126>",
    "<placeholder_tok_127>",
    "<placeholder_tok_128>",
    "<placeholder_tok_129>",
    "<placeholder_tok_130>",
    "<placeholder_tok_131>",
    "<placeholder_tok_132>",
    "<placeholder_tok_133>",
    "<placeholder_tok_134>",
    "<placeholder_tok_135>",
    "<placeholder_tok_136>",
    "<placeholder_tok_137>",
    "<placeholder_tok_138>",
    "<placeholder_tok_139>",
    "<placeholder_tok_140>",
    "<placeholder_tok_141>",
    "<placeholder_tok_142>",
    "<placeholder_tok_143>",
    "<placeholder_tok_144>",
    "<placeholder_tok_145>",
    "<placeholder_tok_146>",
    "<placeholder_tok_147>",
    "<placeholder_tok_148>",
    "<placeholder_tok_149>",
    "<placeholder_tok_150>",
    "<placeholder_tok_151>",
    "<placeholder_tok_152>",
    "<placeholder_tok_153>",
    "<placeholder_tok_154>",
    "<placeholder_tok_155>",
    "<placeholder_tok_156>",
    "<placeholder_tok_157>",
    "<placeholder_tok_158>",
    "<placeholder_tok_159>",
    "<placeholder_tok_160>",
    "<placeholder_tok_161>",
    "<placeholder_tok_162>",
    "<placeholder_tok_163>",
    "<placeholder_tok_164>",
    "<placeholder_tok_165>",
    "<placeholder_tok_166>",
    "<placeholder_tok_167>",
    "<placeholder_tok_168>",
    "<placeholder_tok_169>",
    "<placeholder_tok_170>",
    "<placeholder_tok_171>",
    "<placeholder_tok_172>",
    "<placeholder_tok_173>",
    "<placeholder_tok_174>",
    "<placeholder_tok_175>",
    "<placeholder_tok_176>",
    "<placeholder_tok_177>",
    "<placeholder_tok_178>",
    "<placeholder_tok_179>",
    "<placeholder_tok_180>",
    "<placeholder_tok_181>",
    "<placeholder_tok_182>",
    "<placeholder_tok_183>",
    "<placeholder_tok_184>",
    "<placeholder_tok_185>",
    "<placeholder_tok_186>",
    "<placeholder_tok_187>",
    "<placeholder_tok_188>",
    "<placeholder_tok_189>",
    "<placeholder_tok_190>",
    "<placeholder_tok_191>",
    "<placeholder_tok_192>",
    "<placeholder_tok_193>",
    "<placeholder_tok_194>",
    "<placeholder_tok_195>",
    "<placeholder_tok_196>",
    "<placeholder_tok_197>",
    "<placeholder_tok_198>",
    "<placeholder_tok_199>",
    "<placeholder_tok_200>",
    "<placeholder_tok_201>",
    "<placeholder_tok_202>",
    "<placeholder_tok_203>",
    "<placeholder_tok_204>",
    "<placeholder_tok_205>",
    "<placeholder_tok_206>",
    "<placeholder_tok_207>",
    "<placeholder_tok_208>",
    "<placeholder_tok_209>",
    "<placeholder_tok_210>",
    "<placeholder_tok_211>",
    "<placeholder_tok_212>",
    "<placeholder_tok_213>",
    "<placeholder_tok_214>",
    "<placeholder_tok_215>",
    "<placeholder_tok_216>",
    "<placeholder_tok_217>",
    "<placeholder_tok_218>",
    "<placeholder_tok_219>",
    "<placeholder_tok_220>",
    "<placeholder_tok_221>",
    "<placeholder_tok_222>",
    "<placeholder_tok_223>",
    "<placeholder_tok_224>",
    "<placeholder_tok_225>",
    "<placeholder_tok_226>",
    "<placeholder_tok_227>",
    "<placeholder_tok_228>",
    "<placeholder_tok_229>",
    "<placeholder_tok_230>",
    "<placeholder_tok_231>",
    "<placeholder_tok_232>",
    "<placeholder_tok_233>",
    "<placeholder_tok_234>",
    "<placeholder_tok_235>",
    "<placeholder_tok_236>",
    "<placeholder_tok_237>",
    "<placeholder_tok_238>",
    "<placeholder_tok_239>",
    "<placeholder_tok_240>",
    "<placeholder_tok_241>",
    "<placeholder_tok_242>",
    "<placeholder_tok_243>",
    "<placeholder_tok_244>",
    "<placeholder_tok_245>",
    "<placeholder_tok_246>",
    "<placeholder_tok_247>",
    "<placeholder_tok_248>",
    "<placeholder_tok_249>",
    "<placeholder_tok_250>",
    "<placeholder_tok_251>",
    "<placeholder_tok_252>",
    "<placeholder_tok_253>",
    "<placeholder_tok_254>",
    "<placeholder_tok_255>"
  ],
  "datasets_dir": "/home/fhgiais/gptx_ablations/bias_analysis/data/tokenizer/temp/",
  "save_dir": "/home/fhgiais/gptx_ablations/bias_analysis/tokenizer/24",
  "text_key": "text",
  "cache_dir": "/home/fhgiais/gptx_ablations/bias_analysis/tokenizer/24/cache",
  "library": "sentencepiece",
  "auto_map": {
    "AutoTokenizer": [
      "gptx_tokenizer.SPTokenizer",
      null
    ]
  },
  "tokenizer_class": "SPTokenizer"
}