""" https://huggingface.co/ClueAI/ChatYuan-large-v2 支持\n \t - 英文编码很烂 为什么不直接编码\n \t,反而要过一套前处理和后处理? """ import json from transformers import AutoTokenizer def preprocess(text): """ 词典里 """ print("原文本: ", text) text = text.replace("\n", "\\n").replace("\t", "\\t") print("预处理后文本: ", text) return text def postprocess(text): return text.replace("\\n", "\n").replace("\\t", "\t").replace('%20', ' ') model_dir = 'ChatYuan-large-v2' tokenizer = AutoTokenizer.from_pretrained(model_dir) text = "中国\nabcde jump \tnice" tokens = tokenizer.tokenize(text) print(tokens) # ['▁中国', '▁', 'ab', 'c', 'de', '▁', 'j', 'ump', '▁n', 'ice'] print(tokenizer.tokenize(preprocess(text))) # ['▁中国', '\\n', 'ab', 'c', 'de', '▁', 'j', 'ump', '▁', '\\t', 'n', 'ice'] tokens = [12, 623, 5, 13409, 7, 51, 158, 5, 864, 93, 3, 1329, 14965, 3402, 188, 4, 7, 623, 5, 56, 4464, 4, 7, 51, 158, 5, 1526, 158, 617, 1456, 84, 1607, 10, 11442, 1456, 9938, 9, 12, 14, 38, 6582, 2945, 2861, 3, 11779, 1074, 712, 1036, 167, 6, 7, 623, 5, 9898, 513, 79, 26455, 489, 3, 34, 12029, 22, 7, 51, 158, 5, 1] tokens = [0, 12, 14381, 10, 19849, 3, 7, 7, 34, 313, 1344, 9017, 3, 276, 26455, 2258, 3, 578, 864, 529, 2771, 874, 26455, 1442, 6, 7, 7, 26455, 9220, 19849, 937, 16, 11726, 33, 11726, 52, 6, 7, 12, 7, 7, 8353, 1036, 8093, 67, 276, 1036, 3338, 3, 480, 4490, 30, 34, 1325, 6, 7, 2200, 53, 7321, 2187, 648, 78, 7321, 2899, 25823, 6, 7, 2964, 3402, 1203, 13, 537, 6, 7, 1660, 2795, 3402, 1203, 6, 7, 407, 1802, 7, 7, 3095, 1477, 37, 7, 7, 19849, 7, 7, 11726, 16, 11726, 7893, 42, 1] print(tokenizer.decode(tokens))