"""
https://huggingface.co/ClueAI/ChatYuan-large-v2

Supports \n and \t.

- English text is encoded poorly.

Why not encode \n and \t directly, instead of going through a separate
pre-processing and post-processing step?
"""
|
import json

from transformers import AutoTokenizer
|
|
|
|
|
|
|
def preprocess(text):
    """Escape newlines and tabs as visible backslash sequences.

    Every newline character is replaced by the two characters backslash + ``n``
    and every tab character by backslash + ``t``. The text is printed before
    and after the conversion so the effect can be inspected on the console.

    NOTE(review): the original docstring here only said "in the vocabulary";
    presumably the escaped forms are what the tokenizer vocabulary contains --
    confirm against the model card.

    Args:
        text: The raw string to escape.

    Returns:
        The string with newline and tab characters replaced by their escaped
        two-character forms.
    """
    print("原文本: ", text)
    # Newlines first, then tabs; neither replacement introduces the other's
    # source character, so the order does not affect the result.
    text = text.replace("\n", "\\n").replace("\t", "\\t")
    print("预处理后文本: ", text)
    return text
|
|
|
|
|
def postprocess(text):
    """Turn escaped sequences in decoded text back into real characters.

    Applies three replacements in order: backslash + ``n`` becomes a newline,
    backslash + ``t`` becomes a tab, and the literal ``%20`` becomes a space.

    Args:
        text: The string to unescape.

    Returns:
        The string with the three escaped forms replaced.
    """
    unescaped = text.replace("\\n", "\n")
    unescaped = unescaped.replace("\\t", "\t")
    return unescaped.replace('%20', ' ')
|
|
|
|
|
# Tokenizer location handed to AutoTokenizer.from_pretrained.
# NOTE(review): presumably a local directory next to this script -- confirm.
model_dir = 'ChatYuan-large-v2'
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Sample mixing Chinese, English, a newline and a tab.
text = "中国\nabcde jump \tnice"

# Tokenize the raw text first, then the escaped text, so the two outputs can
# be compared side by side.
tokens = tokenizer.tokenize(text)
print(tokens)
print(tokenizer.tokenize(preprocess(text)))

# Hard-coded token-id sequences used for the decoding experiment below.
# NOTE(review): this first list is overwritten by the next assignment before
# it is ever read, so only the second list is decoded. It is kept so the
# script's output stays unchanged; swap the two lists to inspect this one.
tokens = [12, 623, 5, 13409, 7, 51, 158, 5, 864, 93,
          3, 1329, 14965, 3402, 188, 4, 7, 623, 5, 56,
          4464, 4, 7, 51, 158, 5, 1526, 158, 617, 1456,
          84, 1607, 10, 11442, 1456, 9938, 9, 12, 14, 38,
          6582, 2945, 2861, 3, 11779, 1074, 712, 1036, 167, 6,
          7, 623, 5, 9898, 513, 79, 26455, 489, 3, 34,
          12029, 22, 7, 51, 158, 5, 1]

tokens = [0, 12, 14381, 10, 19849, 3, 7, 7, 34, 313,
          1344, 9017, 3, 276, 26455, 2258, 3, 578, 864, 529,
          2771, 874, 26455, 1442, 6, 7, 7, 26455, 9220, 19849,
          937, 16, 11726, 33, 11726, 52, 6, 7, 12, 7,
          7, 8353, 1036, 8093, 67, 276, 1036, 3338, 3, 480,
          4490, 30, 34, 1325, 6, 7, 2200, 53, 7321, 2187,
          648, 78, 7321, 2899, 25823, 6, 7, 2964, 3402, 1203,
          13, 537, 6, 7, 1660, 2795, 3402, 1203, 6, 7,
          407, 1802, 7, 7, 3095, 1477, 37, 7, 7, 19849,
          7, 7, 11726, 16, 11726, 7893, 42, 1]

# Decode the ids back to text to see how the model's output looks before any
# post-processing is applied.
print(tokenizer.decode(tokens))
|
|