arxyzan committed on
Commit
0bd1a18
1 Parent(s): d0b5a35

Upload preprocessor with huggingface_hub

preprocessor/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
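Even though the diff is too large to render, the uploaded file can be loaded directly. A minimal sketch, assuming a local clone of this repo, that opens tokenizer.json with the Hugging Face `tokenizers` library and applies the truncation/padding settings from the tokenizer_config.yaml shown below:

```python
# Hedged sketch: load the uploaded tokenizer.json with the `tokenizers`
# library. The local path assumes a clone of this repo; adjust as needed.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("preprocessor/tokenizer.json")

# Mirror max_length / truncation_* / padding_* from tokenizer_config.yaml below.
tokenizer.enable_truncation(max_length=512, stride=0, strategy="longest_first")
tokenizer.enable_padding(direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]")

encoding = tokenizer.encode("این یک جمله آزمایشی است")  # "This is a test sentence" in Persian
print(encoding.tokens, encoding.ids)
```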
 
preprocessor/tokenizer_config.yaml ADDED
@@ -0,0 +1,29 @@
+ name: wordpiece_tokenizer
+ config_type: preprocessor
+ pretrained_path: hezar-ai/distilbert-fa
+ max_length: 512
+ truncation_strategy: longest_first
+ truncation_direction: right
+ stride: 0
+ padding_strategy: longest
+ padding_direction: right
+ pad_to_multiple_of: 0
+ pad_token_id: 0
+ pad_token: '[PAD]'
+ pad_token_type_id: 0
+ unk_token: '[UNK]'
+ special_tokens:
+ - '[UNK]'
+ - '[SEP]'
+ - '[CLS]'
+ - '[PAD]'
+ - '[MASK]'
+ wordpieces_prefix: '##'
+ train_config:
+   name: wordpiece_tokenizer
+   config_type: preprocessor
+   vocab_size: 30000
+   min_frequency: 2
+   limit_alphabet: 1000
+   initial_alphabet: []
+   show_progress: true
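
The `train_config` block records the settings the tokenizer was trained with. A minimal sketch of reproducing an equivalent WordPiece training run with the `tokenizers` library; `corpus.txt` is a placeholder path, not a file in this repo, and Hezar's own training entry point may differ:

```python
# Hedged sketch: retrain a WordPiece tokenizer with the same settings as
# train_config above, using the `tokenizers` library directly.
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer

tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

trainer = WordPieceTrainer(
    vocab_size=30000,                # train_config.vocab_size
    min_frequency=2,                 # train_config.min_frequency
    limit_alphabet=1000,             # train_config.limit_alphabet
    initial_alphabet=[],             # train_config.initial_alphabet
    show_progress=True,              # train_config.show_progress
    special_tokens=["[UNK]", "[SEP]", "[CLS]", "[PAD]", "[MASK]"],
    continuing_subword_prefix="##",  # wordpieces_prefix
)

# "corpus.txt" is a placeholder for the actual training corpus.
tokenizer.train(files=["corpus.txt"], trainer=trainer)
tokenizer.save("preprocessor/tokenizer.json")
```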