arxyzan committed on
Commit
021ecaa
1 Parent(s): afa05ca

Hezar: Upload tokenizer and config

Browse files
Files changed (1) hide show
  1. preprocessor/tokenizer_config.yaml +4 -4
preprocessor/tokenizer_config.yaml CHANGED
@@ -1,6 +1,6 @@
1
  name: whisper_bpe_tokenizer
2
  config_type: preprocessor
3
- max_length: 512
4
  truncation_strategy: longest_first
5
  truncation_direction: right
6
  stride: 0
@@ -8,11 +8,11 @@ padding_strategy: longest
8
  padding_direction: right
9
  pad_to_multiple_of: 0
10
  pad_token_type_id: 0
11
- bos_token: <|startoftranscript|>
12
  eos_token: <|endoftext|>
13
  unk_token: <|endoftext|>
14
  sep_token: <sep>
15
- pad_token: <pad>
16
  cls_token: <cls>
17
  mask_token: <mask>
18
  additional_special_tokens:
@@ -127,7 +127,7 @@ additional_special_tokens:
127
  continuing_subword_prefix: ''
128
  end_of_word_suffix: ''
129
  fuse_unk: false
130
- vocab_size: 30000
131
  min_frequency: 2
132
  limit_alphabet: 1000
133
  initial_alphabet: []
 
1
  name: whisper_bpe_tokenizer
2
  config_type: preprocessor
3
+ max_length: 448
4
  truncation_strategy: longest_first
5
  truncation_direction: right
6
  stride: 0
 
8
  padding_direction: right
9
  pad_to_multiple_of: 0
10
  pad_token_type_id: 0
11
+ bos_token: <|endoftext|>
12
  eos_token: <|endoftext|>
13
  unk_token: <|endoftext|>
14
  sep_token: <sep>
15
+ pad_token: <|endoftext|>
16
  cls_token: <cls>
17
  mask_token: <mask>
18
  additional_special_tokens:
 
127
  continuing_subword_prefix: ''
128
  end_of_word_suffix: ''
129
  fuse_unk: false
130
+ vocab_size: 50364
131
  min_frequency: 2
132
  limit_alphabet: 1000
133
  initial_alphabet: []