"""
https://huggingface.co/ClueAI/ChatYuan-large-v2
支持\n \t
- 英文编码很烂
为什么不直接编码\n \t,反而要过一套前处理和后处理?
"""
import json

from transformers import AutoTokenizer


def preprocess(text):
    """Escape \n and \t before tokenizing: the escaped forms "\\n" and "\\t" are
    vocabulary entries, while the raw characters are dropped by the tokenizer."""
    print("original text: ", text)
    text = text.replace("\n", "\\n").replace("\t", "\\t")
    print("preprocessed text: ", text)
    return text


def postprocess(text):
    """Undo the escaping on decoded text."""
    return text.replace("\\n", "\n").replace("\\t", "\t").replace('%20', ' ')


model_dir = 'ChatYuan-large-v2'
tokenizer = AutoTokenizer.from_pretrained(model_dir)
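
# Quick sanity check (sketch, not part of the original script): the escaped forms
# "\\n" and "\\t" should appear as single vocabulary entries, which is why
# preprocess() escapes the raw characters before tokenizing.
vocab = tokenizer.get_vocab()
print("\\n" in vocab, "\\t" in vocab)  # expected to print: True True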
text = "中国\nabcde jump \tnice"
tokens = tokenizer.tokenize(text)
print(tokens)
# ['▁中国', '▁', 'ab', 'c', 'de', '▁', 'j', 'ump', '▁n', 'ice']
print(tokenizer.tokenize(preprocess(text)))
# ['▁中国', '\\n', 'ab', 'c', 'de', '▁', 'j', 'ump', '▁', '\\t', 'n', 'ice']
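
# Round-trip sketch (assumption: decoding the preprocessed ids and then applying
# postprocess() should restore real \n and \t, though surrounding spaces may differ).
ids = tokenizer.encode(preprocess(text))
decoded = tokenizer.decode(ids, skip_special_tokens=True)
print(repr(postprocess(decoded)))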
# A first batch of pre-computed token ids.
tokens = [12, 623, 5, 13409, 7, 51, 158, 5, 864, 93,
          3, 1329, 14965, 3402, 188, 4, 7, 623, 5, 56,
          4464, 4, 7, 51, 158, 5, 1526, 158, 617, 1456,
          84, 1607, 10, 11442, 1456, 9938, 9, 12, 14, 38,
          6582, 2945, 2861, 3, 11779, 1074, 712, 1036, 167, 6,
          7, 623, 5, 9898, 513, 79, 26455, 489, 3, 34,
          12029, 22, 7, 51, 158, 5, 1]
print(tokenizer.decode(tokens))

# A second batch of pre-computed token ids.
tokens = [0, 12, 14381, 10, 19849, 3, 7, 7, 34, 313,
          1344, 9017, 3, 276, 26455, 2258, 3, 578, 864, 529,
          2771, 874, 26455, 1442, 6, 7, 7, 26455, 9220, 19849,
          937, 16, 11726, 33, 11726, 52, 6, 7, 12, 7,
          7, 8353, 1036, 8093, 67, 276, 1036, 3338, 3, 480,
          4490, 30, 34, 1325, 6, 7, 2200, 53, 7321, 2187,
          648, 78, 7321, 2899, 25823, 6, 7, 2964, 3402, 1203,
          13, 537, 6, 7, 1660, 2795, 3402, 1203, 6, 7,
          407, 1802, 7, 7, 3095, 1477, 37, 7, 7, 19849,
          7, 7, 11726, 16, 11726, 7893, 42, 1]
print(tokenizer.decode(tokens))
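
# To recover real newlines and tabs (and spaces encoded as "%20") from the decoded
# string, run it through postprocess() as well (sketch, not in the original script).
print(postprocess(tokenizer.decode(tokens, skip_special_tokens=True)))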