explcre committed on
Commit
5bb1c56
·
verified ·
1 Parent(s): 402c33f

v15 unified+mdlm+aux T1 final ckpt: final/tokenizer_config.json

Browse files
runs/exp_t1_validation_unified_mdlm_bug14_fixed_20260429_h100/final/tokenizer_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "audio_bos_token": "<|audio_start|>",
4
+ "audio_eos_token": "<|audio_end|>",
5
+ "audio_token": "<|audio_pad|>",
6
+ "backend": "tokenizers",
7
+ "bos_token": null,
8
+ "clean_up_tokenization_spaces": false,
9
+ "eos_token": "<|im_end|>",
10
+ "errors": "replace",
11
+ "extra_special_tokens": [
12
+ "<text_mask>"
13
+ ],
14
+ "image_token": "<|image_pad|>",
15
+ "is_local": false,
16
+ "local_files_only": false,
17
+ "model_max_length": 262144,
18
+ "model_specific_special_tokens": {
19
+ "audio_bos_token": "<|audio_start|>",
20
+ "audio_eos_token": "<|audio_end|>",
21
+ "audio_token": "<|audio_pad|>",
22
+ "image_token": "<|image_pad|>",
23
+ "video_token": "<|video_pad|>",
24
+ "vision_bos_token": "<|vision_start|>",
25
+ "vision_eos_token": "<|vision_end|>"
26
+ },
27
+ "pad_token": "<|endoftext|>",
28
+ "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
29
+ "split_special_tokens": false,
30
+ "tokenizer_class": "Qwen2Tokenizer",
31
+ "unk_token": null,
32
+ "video_token": "<|video_pad|>",
33
+ "vision_bos_token": "<|vision_start|>",
34
+ "vision_eos_token": "<|vision_end|>"
35
+ }