arxyzan committed on
Commit
9bee4db
1 Parent(s): 4000e9d

Hezar: Upload tokenizer_config.yaml

Browse files
Files changed (1) hide show
  1. preprocessor/tokenizer_config.yaml +11 -10
preprocessor/tokenizer_config.yaml CHANGED
@@ -1,6 +1,5 @@
1
  name: whisper_bpe_tokenizer
2
  config_type: preprocessor
3
- pretrained_path: hezarai/whisper-small
4
  max_length: 512
5
  truncation_strategy: longest_first
6
  truncation_direction: right
@@ -8,11 +7,15 @@ stride: 0
8
  padding_strategy: longest
9
  padding_direction: right
10
  pad_to_multiple_of: 0
11
- pad_token_id: 0
12
- pad_token: <pad>
13
  pad_token_type_id: 0
 
 
14
  unk_token: <|endoftext|>
15
- special_tokens:
 
 
 
 
16
  - <|endoftext|>
17
  - <|endoftext|>
18
  - <|startoftranscript|>
@@ -124,16 +127,14 @@ special_tokens:
124
  continuing_subword_prefix: ''
125
  end_of_word_suffix: ''
126
  fuse_unk: false
127
- vocab_size: 50364
128
  min_frequency: 2
129
  limit_alphabet: 1000
130
  initial_alphabet: []
131
  show_progress: true
132
- unk_token_id: 50257
133
- bos_token: <|startoftranscript|>
134
- bos_token_id: 50257
135
- eos_token: <|endoftext|>
136
- eos_token_id: 50257
137
  add_prefix_space: false
138
  add_bos_token: false
139
  model_max_length: 1024
 
1
  name: whisper_bpe_tokenizer
2
  config_type: preprocessor
 
3
  max_length: 512
4
  truncation_strategy: longest_first
5
  truncation_direction: right
 
7
  padding_strategy: longest
8
  padding_direction: right
9
  pad_to_multiple_of: 0
 
 
10
  pad_token_type_id: 0
11
+ bos_token: <|startoftranscript|>
12
+ eos_token: <|endoftext|>
13
  unk_token: <|endoftext|>
14
+ sep_token: <sep>
15
+ pad_token: <pad>
16
+ cls_token: <cls>
17
+ mask_token: <mask>
18
+ additional_special_tokens:
19
  - <|endoftext|>
20
  - <|endoftext|>
21
  - <|startoftranscript|>
 
127
  continuing_subword_prefix: ''
128
  end_of_word_suffix: ''
129
  fuse_unk: false
130
+ vocab_size: 30000
131
  min_frequency: 2
132
  limit_alphabet: 1000
133
  initial_alphabet: []
134
  show_progress: true
135
+ translate_token: <|translate|>
136
+ transcribe_token: <|transcribe|>
137
+ notimestamps_token: <|notimestamps|>
 
 
138
  add_prefix_space: false
139
  add_bos_token: false
140
  model_max_length: 1024