arxyzan committed on
Commit
c457969
1 Parent(s): 975e75b

Hezar: Upload model and config

Browse files
Files changed (1) hide show
  1. preprocessor/tokenizer_config.yaml +21 -0
preprocessor/tokenizer_config.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: wordpiece_tokenizer
2
+ config_type: preprocessor
3
+ max_length: 512
4
+ truncation_strategy: longest_first
5
+ truncation_direction: right
6
+ stride: 0
7
+ padding_strategy: longest
8
+ padding_direction: right
9
+ pad_to_multiple_of: 0
10
+ pad_token_type_id: 0
11
+ unk_token: '[UNK]'
12
+ sep_token: '[SEP]'
13
+ pad_token: '[PAD]'
14
+ cls_token: '[CLS]'
15
+ mask_token: '[MASK]'
16
+ wordpieces_prefix: '##'
17
+ vocab_size: 42000
18
+ min_frequency: 2
19
+ limit_alphabet: 1000
20
+ initial_alphabet: []
21
+ show_progress: true