|
|
|
|
|
## 简介 |
|
|
|
``` |
|
num_image_tokens = 20000    # image_tokenizer 的 token 数 |
|
num_text_tokens = 130000    # text_tokenizer 的 token 数 |
|
``` |
|
|
|
词表一共 150000 个 token(20000 个图像 token + 130000 个文本 token)。 |
|
|
|
## text_tokenizer |
|
|
|
``` |
|
tokenizer.sp_tokenizer.text_tokenizer.proto.pieces[0] 对应 20000+0 |
|
piece: "<unk>" |
|
score: 0.0 |
|
type: UNKNOWN |
|
|
|
tokenizer.sp_tokenizer.text_tokenizer.proto.pieces[1] 对应 20000+1 |
|
piece: "<s>" |
|
score: 0.0 |
|
type: CONTROL |
|
|
|
tokenizer.sp_tokenizer.text_tokenizer.proto.pieces[2] 对应 20000+2 |
|
piece: "</s>" |
|
score: 0.0 |
|
type: CONTROL |
|
|
|
tokenizer.sp_tokenizer.text_tokenizer.proto.pieces[3] 对应 20000+3 |
|
piece: "<pad>" |
|
score: 0.0 |
|
type: CONTROL |
|
|
|
tokenizer.sp_tokenizer.text_tokenizer.proto.pieces[4] 对应 20000+4 |
|
piece: "<n>" |
|
score: 0.0 |
|
type: USER_DEFINED |
|
|
|
tokenizer.sp_tokenizer.text_tokenizer.proto.pieces[5] 对应 20000+5 |
|
piece: "\342\226\201" |
|
score: -2.6171817779541016 |
|
type: NORMAL |
|
|
|
tokenizer.sp_tokenizer.text_tokenizer.proto.pieces[6] |
|
piece: "," |
|
score: -3.151700019836426 |
|
type: NORMAL |
|
|
|
|
|
tokenizer.sp_tokenizer.text_tokenizer.proto.pieces[50] |
|
piece: "{" |
|
score: -7.532660961151123 |
|
type: NORMAL |
|
|
|
tokenizer.sp_tokenizer.text_tokenizer.proto.pieces[100] |
|
piece: "\342\226\201the" # "\342\226\201" 是 U+2581 "▁" 的 UTF-8 字节(E2 96 81)的八进制转义;SentencePiece 用 "▁" 表示空格(词首标记) |
|
score: -3.922896385192871 |
|
type: NORMAL |
|
|
|
tokenizer.sp_tokenizer.text_tokenizer.proto.pieces[200] |
|
piece: "\342\226\201This" |
|
score: -7.821105480194092 |
|
type: NORMAL |
|
|
|
|
|
tokenizer.sp_tokenizer.text_tokenizer.proto.pieces[128293] |
|
piece: "\342\226\201pa\303\255ses" |
|
score: -14.182646751403809 |
|
type: NORMAL |
|
``` |
|
|
|
|