update

- examples.py  +2 -2
- util.py      +1 -1

examples.py CHANGED
@@ -1,6 +1,6 @@
 examples = {
     "en": [
-        ["spaces: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "
+        ["spaces: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm2_6b"],  # chatglm has blank_n
         # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
         ["punctuations: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
         ["symbols: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
@@ -8,7 +8,7 @@ examples = {
     ]
     ,
     "zh": [
-        ["空格测试: 2个空格 8个空格", "llama", "
+        ["空格测试: 2个空格 8个空格", "llama", "chatglm2_6b"],  # chatglm has blank_n
         ["标点测试:,。!?;", "baichuan_7b", "llama"],
         ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
         ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
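Each entry pairs a test string with the tokenizer types whose behavior it is meant to contrast; the new chatglm2_6b entries exercise its blank_n whitespace tokens, as the inline comment notes. A hypothetical driver loop over this structure (not part of this commit; it only relies on the tokenize() signature visible in util.py below and ignores its return value):

    from examples import examples
    from util import tokenize  # tokenize(text, tokenizer_type, color_num=5), per util.py

    # Run every example text through each tokenizer listed for it.
    for lang, cases in examples.items():          # "en" / "zh"
        for text, *tokenizer_types in cases:      # [text, tokenizer_type, tokenizer_type, ...]
            for tokenizer_type in tokenizer_types:
                print(f"[{lang}] {tokenizer_type}: {text!r}")
                tokenize(text, tokenizer_type)    # return value not used in this sketch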
util.py CHANGED
@@ -31,7 +31,7 @@ def tokenize(text, tokenizer_type, color_num=5):
         token_str = token.decode("utf-8")
     except:
         token_str = token.decode("utf-8", errors="ignore")
-        logger.error("decode_error: " + json.dumps(
+        logger.error("decode_error: " + json.dumps(  # gpt_35_turbo tokens often fail to decode; log them here
             {"tokenizer_type": tokenizer_type, "token": str(token), "token_str": token_str},
             ensure_ascii=False))

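The changed line sits inside a decode fallback: token bytes that are not valid UTF-8 on their own (common with gpt_35_turbo's byte-level tokens, per the added comment) are re-decoded with errors="ignore" and the failure is logged. A standalone sketch of that pattern (the logger setup and function name here are assumptions, not the repo's code):

    import json
    import logging

    logger = logging.getLogger("decode_fallback_demo")
    logging.basicConfig(level=logging.ERROR)

    def decode_token(token: bytes, tokenizer_type: str) -> str:
        """Decode token bytes, falling back to errors="ignore" and logging the failure."""
        try:
            return token.decode("utf-8")
        except UnicodeDecodeError:
            token_str = token.decode("utf-8", errors="ignore")
            logger.error("decode_error: " + json.dumps(
                {"tokenizer_type": tokenizer_type, "token": str(token), "token_str": token_str},
                ensure_ascii=False))
            return token_str

    # The first byte of a multi-byte UTF-8 character cannot be decoded on its own:
    decode_token("🦙".encode("utf-8")[:1], "gpt_35_turbo")  # logs decode_error, returns ""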