name: wordpiece_tokenizer
config_type: preprocessor
pretrained_path: checkpoints/5
max_length: 512
truncation_strategy: longest_first
truncation_direction: right
stride: 0
padding_strategy: longest
padding_direction: right
pad_to_multiple_of: 0
pad_token_id: 0
pad_token: '[PAD]'
pad_token_type_id: 0
unk_token: '[UNK]'
special_tokens:
  - '[UNK]'
  - '[SEP]'
  - '[CLS]'
  - '[PAD]'
  - '[MASK]'
wordpieces_prefix: '##'
vocab_size: 42000
min_frequency: 2
limit_alphabet: 1000
initial_alphabet: []
show_progress: true
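
# A minimal sketch (kept as comments so this file remains valid YAML) of how the
# fields above could map onto a WordPiece trainer. Assumptions: the Hugging Face
# `tokenizers` library is used for illustration (the framework that actually
# consumes this config is not named here), the whitespace pre-tokenizer and the
# corpus path are hypothetical.
#
#   from tokenizers import Tokenizer, models, pre_tokenizers, trainers
#
#   # Training-time fields: vocab_size, min_frequency, limit_alphabet,
#   # initial_alphabet, special_tokens, wordpieces_prefix, show_progress.
#   tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
#   tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()  # assumed pre-tokenizer
#   trainer = trainers.WordPieceTrainer(
#       vocab_size=42000,
#       min_frequency=2,
#       limit_alphabet=1000,
#       initial_alphabet=[],
#       special_tokens=["[UNK]", "[SEP]", "[CLS]", "[PAD]", "[MASK]"],
#       continuing_subword_prefix="##",
#       show_progress=True,
#   )
#   tokenizer.train(files=["corpus.txt"], trainer=trainer)  # hypothetical corpus file
#
#   # Runtime fields: padding_strategy/pad_token*, max_length, stride,
#   # truncation_strategy, truncation_direction. Omitting a fixed length pads
#   # each batch to its longest sequence ("longest" strategy).
#   tokenizer.enable_padding(direction="right", pad_id=0,
#                            pad_type_id=0, pad_token="[PAD]")
#   tokenizer.enable_truncation(max_length=512, stride=0,
#                               strategy="longest_first", direction="right")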