{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "d5c3dff6-bd21-4e6a-8f8b-a83dc6895e08",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B', trust_remote_code=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "549ca852-199a-4d55-9e29-17837dd1f975",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "QWenTokenizer(name_or_path='Qwen/Qwen-7B', vocab_size=151851, model_max_length=8192, is_fast=False, padding_side='right', truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=True)"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "fe6c9cf7-fbc6-4073-81e9-5dfaefe88fdf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "151851"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "4b583802-0e69-40d2-a32e-745ea50ede63",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[14990, 5562]\n",
      "text : hello dog\n"
     ]
    }
   ],
   "source": [
    "#tokenizer('hello dog')['input_ids']\n",
    "#tokenizer('hello dog').tokens()\n",
    "tks = tokenizer.encode('hello dog')\n",
    "print(tks)\n",
    "text = tokenizer.decode(tks)\n",
    "print(\"text :\", text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "9b360ba8-692d-4e3c-b9c9-fae3969c3617",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "' �'"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode([51461])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "85b41d25-acbd-4818-a8e9-5020a3009b1d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "' 根'"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode([51461, 117])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "abb10266-3da5-4586-ab6e-8812dc83ce3d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{b'!': 0,\n",
       " b'\"': 1,\n",
       " b'#': 2,\n",
       " b'$': 3,\n",
       " b'%': 4,\n",
       " b'&': 5,\n",
       " b\"'\": 6,\n",
       " b'(': 7,\n",
       " b')': 8,\n",
       " b'*': 9,\n",
       " b'+': 10,\n",
       " b',': 11,\n",
       " b'-': 12,\n",
       " b'.': 13,\n",
       " b'/': 14,\n",
       " b'0': 15,\n",
       " b'1': 16,\n",
       " b'2': 17,\n",
       " b'3': 18,\n",
       " b'4': 19,\n",
       " b'5': 20,\n",
       " b'6': 21,\n",
       " b'7': 22,\n",
       " b'8': 23,\n",
       " b'9': 24,\n",
       " b':': 25,\n",
       " b';': 26,\n",
       " b'<': 27,\n",
       " b'=': 28,\n",
       " b'>': 29,\n",
       " b'?': 30,\n",
       " b'@': 31,\n",
       " b'A': 32,\n",
       " b'B': 33,\n",
       " b'C': 34,\n",
       " b'D': 35,\n",
       " b'E': 36,\n",
       " b'F': 37,\n",
       " b'G': 38,\n",
       " b'H': 39,\n",
       " b'I': 40,\n",
       " b'J': 41,\n",
       " b'K': 42,\n",
       " b'L': 43,\n",
       " b'M': 44,\n",
       " b'N': 45,\n",
       " b'O': 46,\n",
       " b'P': 47,\n",
       " b'Q': 48,\n",
       " b'R': 49,\n",
       " b'S': 50,\n",
       " b'T': 51,\n",
       " b'U': 52,\n",
       " b'V': 53,\n",
       " b'W': 54,\n",
       " b'X': 55,\n",
       " b'Y': 56,\n",
       " b'Z': 57,\n",
       " b'[': 58,\n",
       " b'\\\\': 59,\n",
       " b']': 60,\n",
       " b'^': 61,\n",
       " b'_': 62,\n",
       " b'`': 63,\n",
       " b'a': 64,\n",
       " b'b': 65,\n",
       " b'c': 66,\n",
       " b'd': 67,\n",
       " b'e': 68,\n",
       " b'f': 69,\n",
       " b'g': 70,\n",
       " b'h': 71,\n",
       " b'i': 72,\n",
       " b'j': 73,\n",
       " b'k': 74,\n",
       " b'l': 75,\n",
       " b'm': 76,\n",
       " b'n': 77,\n",
       " b'o': 78,\n",
       " b'p': 79,\n",
       " b'q': 80,\n",
       " b'r': 81,\n",
       " b's': 82,\n",
       " b't': 83,\n",
       " b'u': 84,\n",
       " b'v': 85,\n",
       " b'w': 86,\n",
       " b'x': 87,\n",
       " b'y': 88,\n",
       " b'z': 89,\n",
       " b'{': 90,\n",
       " b'|': 91,\n",
       " b'}': 92,\n",
       " b'~': 93,\n",
       " b'\\xa1': 94,\n",
       " b'\\xa2': 95,\n",
       " b'\\xa3': 96,\n",
       " b'\\xa4': 97,\n",
       " b'\\xa5': 98,\n",
       " b'\\xa6': 99,\n",
       " b'\\xa7': 100,\n",
       " b'\\xa8': 101,\n",
       " b'\\xa9': 102,\n",
       " b'\\xaa': 103,\n",
       " b'\\xab': 104,\n",
       " b'\\xac': 105,\n",
       " b'\\xae': 106,\n",
       " b'\\xaf': 107,\n",
       " b'\\xb0': 108,\n",
       " b'\\xb1': 109,\n",
       " b'\\xb2': 110,\n",
       " b'\\xb3': 111,\n",
       " b'\\xb4': 112,\n",
       " b'\\xb5': 113,\n",
       " b'\\xb6': 114,\n",
       " b'\\xb7': 115,\n",
       " b'\\xb8': 116,\n",
       " b'\\xb9': 117,\n",
       " b'\\xba': 118,\n",
       " b'\\xbb': 119,\n",
       " b'\\xbc': 120,\n",
       " b'\\xbd': 121,\n",
       " b'\\xbe': 122,\n",
       " b'\\xbf': 123,\n",
       " b'\\xc0': 124,\n",
       " b'\\xc1': 125,\n",
       " b'\\xc2': 126,\n",
       " b'\\xc3': 127,\n",
       " b'\\xc4': 128,\n",
       " b'\\xc5': 129,\n",
       " b'\\xc6': 130,\n",
       " b'\\xc7': 131,\n",
       " b'\\xc8': 132,\n",
       " b'\\xc9': 133,\n",
       " b'\\xca': 134,\n",
       " b'\\xcb': 135,\n",
       " b'\\xcc': 136,\n",
       " b'\\xcd': 137,\n",
       " b'\\xce': 138,\n",
       " b'\\xcf': 139,\n",
       " b'\\xd0': 140,\n",
       " b'\\xd1': 141,\n",
       " b'\\xd2': 142,\n",
       " b'\\xd3': 143,\n",
       " b'\\xd4': 144,\n",
       " b'\\xd5': 145,\n",
       " b'\\xd6': 146,\n",
       " b'\\xd7': 147,\n",
       " b'\\xd8': 148,\n",
       " b'\\xd9': 149,\n",
       " b'\\xda': 150,\n",
       " b'\\xdb': 151,\n",
       " b'\\xdc': 152,\n",
       " b'\\xdd': 153,\n",
       " b'\\xde': 154,\n",
       " b'\\xdf': 155,\n",
       " b'\\xe0': 156,\n",
       " b'\\xe1': 157,\n",
       " b'\\xe2': 158,\n",
       " b'\\xe3': 159,\n",
       " b'\\xe4': 160,\n",
       " b'\\xe5': 161,\n",
       " b'\\xe6': 162,\n",
       " b'\\xe7': 163,\n",
       " b'\\xe8': 164,\n",
       " b'\\xe9': 165,\n",
       " b'\\xea': 166,\n",
       " b'\\xeb': 167,\n",
       " b'\\xec': 168,\n",
       " b'\\xed': 169,\n",
       " b'\\xee': 170,\n",
       " b'\\xef': 171,\n",
       " b'\\xf0': 172,\n",
       " b'\\xf1': 173,\n",
       " b'\\xf2': 174,\n",
       " b'\\xf3': 175,\n",
       " b'\\xf4': 176,\n",
       " b'\\xf5': 177,\n",
       " b'\\xf6': 178,\n",
       " b'\\xf7': 179,\n",
       " b'\\xf8': 180,\n",
       " b'\\xf9': 181,\n",
       " b'\\xfa': 182,\n",
       " b'\\xfb': 183,\n",
       " b'\\xfc': 184,\n",
       " b'\\xfd': 185,\n",
       " b'\\xfe': 186,\n",
       " b'\\xff': 187,\n",
       " b'\\x00': 188,\n",
       " b'\\x01': 189,\n",
       " b'\\x02': 190,\n",
       " b'\\x03': 191,\n",
       " b'\\x04': 192,\n",
       " b'\\x05': 193,\n",
       " b'\\x06': 194,\n",
       " b'\\x07': 195,\n",
       " b'\\x08': 196,\n",
       " b'\\t': 197,\n",
       " b'\\n': 198,\n",
       " b'\\x0b': 199,\n",
       " b'\\x0c': 200,\n",
       " b'\\r': 201,\n",
       " b'\\x0e': 202,\n",
       " b'\\x0f': 203,\n",
       " b'\\x10': 204,\n",
       " b'\\x11': 205,\n",
       " b'\\x12': 206,\n",
       " b'\\x13': 207,\n",
       " b'\\x14': 208,\n",
       " b'\\x15': 209,\n",
       " b'\\x16': 210,\n",
       " b'\\x17': 211,\n",
       " b'\\x18': 212,\n",
       " b'\\x19': 213,\n",
       " b'\\x1a': 214,\n",
       " b'\\x1b': 215,\n",
       " b'\\x1c': 216,\n",
       " b'\\x1d': 217,\n",
       " b'\\x1e': 218,\n",
       " b'\\x1f': 219,\n",
       " b' ': 220,\n",
       " b'\\x7f': 221,\n",
       " b'\\x80': 222,\n",
       " b'\\x81': 223,\n",
       " b'\\x82': 224,\n",
       " b'\\x83': 225,\n",
       " b'\\x84': 226,\n",
       " b'\\x85': 227,\n",
       " b'\\x86': 228,\n",
       " b'\\x87': 229,\n",
       " b'\\x88': 230,\n",
       " b'\\x89': 231,\n",
       " b'\\x8a': 232,\n",
       " b'\\x8b': 233,\n",
       " b'\\x8c': 234,\n",
       " b'\\x8d': 235,\n",
       " b'\\x8e': 236,\n",
       " b'\\x8f': 237,\n",
       " b'\\x90': 238,\n",
       " b'\\x91': 239,\n",
       " b'\\x92': 240,\n",
       " b'\\x93': 241,\n",
       " b'\\x94': 242,\n",
       " b'\\x95': 243,\n",
       " b'\\x96': 244,\n",
       " b'\\x97': 245,\n",
       " b'\\x98': 246,\n",
       " b'\\x99': 247,\n",
       " b'\\x9a': 248,\n",
       " b'\\x9b': 249,\n",
       " b'\\x9c': 250,\n",
       " b'\\x9d': 251,\n",
       " b'\\x9e': 252,\n",
       " b'\\x9f': 253,\n",
       " b'\\xa0': 254,\n",
       " b'\\xad': 255,\n",
       " b'  ': 256,\n",
       " b'    ': 257,\n",
       " b'in': 258,\n",
       " b' t': 259,\n",
       " b'        ': 260,\n",
       " b'er': 261,\n",
       " b'   ': 262,\n",
       " b'on': 263,\n",
       " b' a': 264,\n",
       " b're': 265,\n",
       " b'at': 266,\n",
       " b'st': 267,\n",
       " b'en': 268,\n",
       " b'or': 269,\n",
       " b' th': 270,\n",
       " b'\\n\\n': 271,\n",
       " b' c': 272,\n",
       " b'le': 273,\n",
       " b' s': 274,\n",
       " b'it': 275,\n",
       " b'an': 276,\n",
       " b'ar': 277,\n",
       " b'al': 278,\n",
       " b' the': 279,\n",
       " b';\\n': 280,\n",
       " b' p': 281,\n",
       " b' f': 282,\n",
       " b'ou': 283,\n",
       " b' =': 284,\n",
       " b'is': 285,\n",
       " b'       ': 286,\n",
       " b'ing': 287,\n",
       " b'es': 288,\n",
       " b' w': 289,\n",
       " b'ion': 290,\n",
       " b'ed': 291,\n",
       " b'ic': 292,\n",
       " b' b': 293,\n",
       " b' d': 294,\n",
       " b'et': 295,\n",
       " b' m': 296,\n",
       " b' o': 297,\n",
       " b'\\t\\t': 298,\n",
       " b'ro': 299,\n",
       " b'as': 300,\n",
       " b'el': 301,\n",
       " b'ct': 302,\n",
       " b'nd': 303,\n",
       " b' in': 304,\n",
       " b' h': 305,\n",
       " b'ent': 306,\n",
       " b'id': 307,\n",
       " b' n': 308,\n",
       " b'am': 309,\n",
       " b'           ': 310,\n",
       " b' to': 311,\n",
       " b' re': 312,\n",
       " b'--': 313,\n",
       " b' {': 314,\n",
       " b' of': 315,\n",
       " b'om': 316,\n",
       " b');\\n': 317,\n",
       " b'im': 318,\n",
       " b'\\r\\n': 319,\n",
       " b' (': 320,\n",
       " b'il': 321,\n",
       " b'//': 322,\n",
       " b' and': 323,\n",
       " b'ur': 324,\n",
       " b'se': 325,\n",
       " b' l': 326,\n",
       " b'ex': 327,\n",
       " b' S': 328,\n",
       " b'ad': 329,\n",
       " b' \"': 330,\n",
       " b'ch': 331,\n",
       " b'ut': 332,\n",
       " b'if': 333,\n",
       " b'**': 334,\n",
       " b' }': 335,\n",
       " b'em': 336,\n",
       " b'ol': 337,\n",
       " b'                ': 338,\n",
       " b'th': 339,\n",
       " b')\\n': 340,\n",
       " b' {\\n': 341,\n",
       " b' g': 342,\n",
       " b'ig': 343,\n",
       " b'iv': 344,\n",
       " b',\\n': 345,\n",
       " b'ce': 346,\n",
       " b'od': 347,\n",
       " b' v': 348,\n",
       " b'ate': 349,\n",
       " b' T': 350,\n",
       " b'ag': 351,\n",
       " b'ay': 352,\n",
       " b' *': 353,\n",
       " b'ot': 354,\n",
       " b'us': 355,\n",
       " b' C': 356,\n",
       " b' st': 357,\n",
       " b' I': 358,\n",
       " b'un': 359,\n",
       " b'ul': 360,\n",
       " b'ue': 361,\n",
       " b' A': 362,\n",
       " b'ow': 363,\n",
       " b\" '\": 364,\n",
       " b'ew': 365,\n",
       " b' <': 366,\n",
       " b'ation': 367,\n",
       " b'()': 368,\n",
       " b' for': 369,\n",
       " b'ab': 370,\n",
       " b'ort': 371,\n",
       " b'um': 372,\n",
       " b'ame': 373,\n",
       " b' is': 374,\n",
       " b'pe': 375,\n",
       " b'tr': 376,\n",
       " b'ck': 377,\n",
       " b'\\xe2\\x80': 378,\n",
       " b' y': 379,\n",
       " b'ist': 380,\n",
       " b'----': 381,\n",
       " b'.\\n\\n': 382,\n",
       " b'he': 383,\n",
       " b' e': 384,\n",
       " b'lo': 385,\n",
       " b' M': 386,\n",
       " b' be': 387,\n",
       " b'ers': 388,\n",
       " b' on': 389,\n",
       " b' con': 390,\n",
       " b'ap': 391,\n",
       " b'ub': 392,\n",
       " b' P': 393,\n",
       " b'               ': 394,\n",
       " b'ass': 395,\n",
       " b'int': 396,\n",
       " b'>\\n': 397,\n",
       " b'ly': 398,\n",
       " b'urn': 399,\n",
       " b' $': 400,\n",
       " b';\\n\\n': 401,\n",
       " b'av': 402,\n",
       " b'port': 403,\n",
       " b'ir': 404,\n",
       " b'->': 405,\n",
       " b'nt': 406,\n",
       " b'ction': 407,\n",
       " b'end': 408,\n",
       " b' de': 409,\n",
       " b'ith': 410,\n",
       " b'out': 411,\n",
       " b'turn': 412,\n",
       " b'our': 413,\n",
       " b'     ': 414,\n",
       " b'lic': 415,\n",
       " b'res': 416,\n",
       " b'pt': 417,\n",
       " b'==': 418,\n",
       " b' this': 419,\n",
       " b' wh': 420,\n",
       " b' if': 421,\n",
       " b' D': 422,\n",
       " b'ver': 423,\n",
       " b'age': 424,\n",
       " b' B': 425,\n",
       " b'ht': 426,\n",
       " b'ext': 427,\n",
       " b'=\"': 428,\n",
       " b' that': 429,\n",
       " b'****': 430,\n",
       " b' R': 431,\n",
       " b' it': 432,\n",
       " b'ess': 433,\n",
       " b' F': 434,\n",
       " b' r': 435,\n",
       " b'os': 436,\n",
       " b'and': 437,\n",
       " b' as': 438,\n",
       " b'ect': 439,\n",
       " b'ke': 440,\n",
       " b'rom': 441,\n",
       " b' //': 442,\n",
       " b'con': 443,\n",
       " b' L': 444,\n",
       " b'(\"': 445,\n",
       " b'qu': 446,\n",
       " b'lass': 447,\n",
       " b' with': 448,\n",
       " b'iz': 449,\n",
       " b'de': 450,\n",
       " b' N': 451,\n",
       " b' al': 452,\n",
       " b'op': 453,\n",
       " b'up': 454,\n",
       " b'get': 455,\n",
       " b' }\\n': 456,\n",
       " b'ile': 457,\n",
       " b' an': 458,\n",
       " b'ata': 459,\n",
       " b'ore': 460,\n",
       " b'ri': 461,\n",
       " b' pro': 462,\n",
       " b';\\r\\n': 463,\n",
       " b'\\t\\t\\t\\t': 464,\n",
       " b'ter': 465,\n",
       " b'ain': 466,\n",
       " b' W': 467,\n",
       " b' E': 468,\n",
       " b' com': 469,\n",
       " b' return': 470,\n",
       " b'art': 471,\n",
       " b' H': 472,\n",
       " b'ack': 473,\n",
       " b'import': 474,\n",
       " b'ublic': 475,\n",
       " b' or': 476,\n",
       " b'est': 477,\n",
       " b'ment': 478,\n",
       " b' G': 479,\n",
       " b'able': 480,\n",
       " b' -': 481,\n",
       " b'ine': 482,\n",
       " b'ill': 483,\n",
       " b'ind': 484,\n",
       " b'ere': 485,\n",
       " b'::': 486,\n",
       " b'ity': 487,\n",
       " b' +': 488,\n",
       " b' tr': 489,\n",
       " b'elf': 490,\n",
       " b'ight': 491,\n",
       " b\"('\": 492,\n",
       " b'orm': 493,\n",
       " b'ult': 494,\n",
       " b'str': 495,\n",
       " b'..': 496,\n",
       " b'\",': 497,\n",
       " b' you': 498,\n",
       " b'ype': 499,\n",
       " b'pl': 500,\n",
       " b' new': 501,\n",
       " b' j': 502,\n",
       " b'                   ': 503,\n",
       " b' from': 504,\n",
       " b' ex': 505,\n",
       " b' O': 506,\n",
       " b'ld': 507,\n",
       " b' [': 508,\n",
       " b'oc': 509,\n",
       " b':\\n': 510,\n",
       " b' se': 511,\n",
       " b' le': 512,\n",
       " b'--------': 513,\n",
       " b'.s': 514,\n",
       " b'{\\n': 515,\n",
       " b\"',\": 516,\n",
       " b'ant': 517,\n",
       " b' at': 518,\n",
       " b'ase': 519,\n",
       " b'.c': 520,\n",
       " b' ch': 521,\n",
       " b'</': 522,\n",
       " b'ave': 523,\n",
       " b'ang': 524,\n",
       " b' are': 525,\n",
       " b' int': 526,\n",
       " b'\\xe2\\x80\\x99': 527,\n",
       " b'_t': 528,\n",
       " b'ert': 529,\n",
       " b'ial': 530,\n",
       " b'act': 531,\n",
       " b'}\\n': 532,\n",
       " b'ive': 533,\n",
       " b'ode': 534,\n",
       " b'ost': 535,\n",
       " b' class': 536,\n",
       " b' not': 537,\n",
       " b'og': 538,\n",
       " b'ord': 539,\n",
       " b'alue': 540,\n",
       " b'all': 541,\n",
       " b'ff': 542,\n",
       " b'();\\n': 543,\n",
       " b'ont': 544,\n",
       " b'ime': 545,\n",
       " b'are': 546,\n",
       " b' U': 547,\n",
       " b' pr': 548,\n",
       " b' :': 549,\n",
       " b'ies': 550,\n",
       " b'ize': 551,\n",
       " b'ure': 552,\n",
       " b' by': 553,\n",
       " b'ire': 554,\n",
       " b' }\\n\\n': 555,\n",
       " b'.p': 556,\n",
       " b' sh': 557,\n",
       " b'ice': 558,\n",
       " b'ast': 559,\n",
       " b'ption': 560,\n",
       " b'tring': 561,\n",
       " b'ok': 562,\n",
       " b'__': 563,\n",
       " b'cl': 564,\n",
       " b'##': 565,\n",
       " b' he': 566,\n",
       " b'ard': 567,\n",
       " b').': 568,\n",
       " b' @': 569,\n",
       " b'iew': 570,\n",
       " b'\\t\\t\\t': 571,\n",
       " b' was': 572,\n",
       " b'ip': 573,\n",
       " b'this': 574,\n",
       " b' u': 575,\n",
       " b' The': 576,\n",
       " b'ide': 577,\n",
       " b'ace': 578,\n",
       " b'ib': 579,\n",
       " b'ac': 580,\n",
       " b'rou': 581,\n",
       " b' we': 582,\n",
       " b'ject': 583,\n",
       " b' public': 584,\n",
       " b'ak': 585,\n",
       " b've': 586,\n",
       " b'ath': 587,\n",
       " b'oid': 588,\n",
       " b' =>': 589,\n",
       " b'ust': 590,\n",
       " b'que': 591,\n",
       " b' res': 592,\n",
       " b'))': 593,\n",
       " b\"'s\": 594,\n",
       " b' k': 595,\n",
       " b'ans': 596,\n",
       " b'yst': 597,\n",
       " b'unction': 598,\n",
       " b'********': 599,\n",
       " b' i': 600,\n",
       " b' us': 601,\n",
       " b'pp': 602,\n",
       " b'one': 603,\n",
       " b'ail': 604,\n",
       " b'====': 605,\n",
       " b'name': 606,\n",
       " b' str': 607,\n",
       " b' /': 608,\n",
       " b' &': 609,\n",
       " b'ach': 610,\n",
       " b'div': 611,\n",
       " b'ystem': 612,\n",
       " b'ell': 613,\n",
       " b' have': 614,\n",
       " b'err': 615,\n",
       " b'ould': 616,\n",
       " b'ull': 617,\n",
       " b'pon': 618,\n",
       " b' J': 619,\n",
       " b'_p': 620,\n",
       " b' ==': 621,\n",
       " b'ign': 622,\n",
       " b'St': 623,\n",
       " b'.\\n': 624,\n",
       " b' pl': 625,\n",
       " b');\\n\\n': 626,\n",
       " b'form': 627,\n",
       " b'put': 628,\n",
       " b'ount': 629,\n",
       " b'}\\n\\n': 630,\n",
       " b'dd': 631,\n",
       " b'ite': 632,\n",
       " b' get': 633,\n",
       " b'rr': 634,\n",
       " b'ome': 635,\n",
       " b' \\xe2\\x80': 636,\n",
       " b'aram': 637,\n",
       " b'cc': 638,\n",
       " b' */': 639,\n",
       " b'ER': 640,\n",
       " b'In': 641,\n",
       " b'les': 642,\n",
       " b'_s': 643,\n",
       " b'ong': 644,\n",
       " b'ie': 645,\n",
       " b' can': 646,\n",
       " b' V': 647,\n",
       " b'erv': 648,\n",
       " b'pr': 649,\n",
       " b' un': 650,\n",
       " b'row': 651,\n",
       " b'ber': 652,\n",
       " b' do': 653,\n",
       " b'll': 654,\n",
       " b' el': 655,\n",
       " b' self': 656,\n",
       " b'ated': 657,\n",
       " b'ary': 658,\n",
       " b' .': 659,\n",
       " b\"']\": 660,\n",
       " b'ud': 661,\n",
       " b' en': 662,\n",
       " b' Th': 663,\n",
       " b'                       ': 664,\n",
       " b'te': 665,\n",
       " b'_c': 666,\n",
       " b'uct': 667,\n",
       " b' ab': 668,\n",
       " b'ork': 669,\n",
       " b'.get': 670,\n",
       " b' #': 671,\n",
       " b'aw': 672,\n",
       " b'ress': 673,\n",
       " b'ob': 674,\n",
       " b'Name': 675,\n",
       " b'app': 676,\n",
       " b\"['\": 677,\n",
       " b' all': 678,\n",
       " b'ory': 679,\n",
       " b'ition': 680,\n",
       " b'ance': 681,\n",
       " b'ear': 682,\n",
       " b' cont': 683,\n",
       " b'vent': 684,\n",
       " b'ia': 685,\n",
       " b' will': 686,\n",
       " b'IN': 687,\n",
       " b'         ': 688,\n",
       " b'return': 689,\n",
       " b' </': 690,\n",
       " b'data': 691,\n",
       " b')\\n\\n': 692,\n",
       " b'Re': 693,\n",
       " b'ple': 694,\n",
       " b'ild': 695,\n",
       " b'ther': 696,\n",
       " b' your': 697,\n",
       " b'\"\\n': 698,\n",
       " b'($': 699,\n",
       " b' out': 700,\n",
       " b'),': 701,\n",
       " b' has': 702,\n",
       " b'String': 703,\n",
       " b'so': 704,\n",
       " b' up': 705,\n",
       " b'ax': 706,\n",
       " b' def': 707,\n",
       " b' bo': 708,\n",
       " b'ge': 709,\n",
       " b'alse': 710,\n",
       " b'ON': 711,\n",
       " b'per': 712,\n",
       " b'ich': 713,\n",
       " b' but': 714,\n",
       " b' \\n': 715,\n",
       " b' _': 716,\n",
       " b'_m': 717,\n",
       " b'add': 718,\n",
       " b'quest': 719,\n",
       " b'odel': 720,\n",
       " b'self': 721,\n",
       " b'ery': 722,\n",
       " b'ft': 723,\n",
       " b'ens': 724,\n",
       " b'////': 725,\n",
       " b'ake': 726,\n",
       " b'.C': 727,\n",
       " b' go': 728,\n",
       " b' function': 729,\n",
       " b' K': 730,\n",
       " b'ivate': 731,\n",
       " b' im': 732,\n",
       " b' const': 733,\n",
       " b'.t': 734,\n",
       " b' */\\n': 735,\n",
       " b');\\r\\n': 736,\n",
       " b' void': 737,\n",
       " b' set': 738,\n",
       " b' System': 739,\n",
       " b'cri': 740,\n",
       " b'()\\n': 741,\n",
       " b'li': 742,\n",
       " b'\\tif': 743,\n",
       " b'.m': 744,\n",
       " b'ally': 745,\n",
       " b'set': 746,\n",
       " b'ep': 747,\n",
       " b'\\xe2\\x80\\x99s': 748,\n",
       " b'bo': 749,\n",
       " b'def': 750,\n",
       " b\"',\\n\": 751,\n",
       " b' me': 752,\n",
       " b' !': 753,\n",
       " b'atch': 754,\n",
       " b'\">': 755,\n",
       " b'\",\\n': 756,\n",
       " b'ec': 757,\n",
       " b' In': 758,\n",
       " b'ph': 759,\n",
       " b' |': 760,\n",
       " b'_f': 761,\n",
       " b' var': 762,\n",
       " b'ence': 763,\n",
       " b'Id': 764,\n",
       " b'ree': 765,\n",
       " b'ink': 766,\n",
       " b'lect': 767,\n",
       " b'ug': 768,\n",
       " b'eth': 769,\n",
       " b' else': 770,\n",
       " b'----------------': 771,\n",
       " b'cont': 772,\n",
       " b' so': 773,\n",
       " b'atic': 774,\n",
       " b' lo': 775,\n",
       " b'pro': 776,\n",
       " b'ton': 777,\n",
       " b'ss': 778,\n",
       " b'own': 779,\n",
       " b'abel': 780,\n",
       " b'oint': 781,\n",
       " b'ous': 782,\n",
       " b'eld': 783,\n",
       " b'ST': 784,\n",
       " b'The': 785,\n",
       " b'                                ': 786,\n",
       " b'RE': 787,\n",
       " b'\":': 788,\n",
       " b'olor': 789,\n",
       " b'tp': 790,\n",
       " b'eg': 791,\n",
       " b'key': 792,\n",
       " b'ude': 793,\n",
       " b' St': 794,\n",
       " b'ound': 795,\n",
       " b' ar': 796,\n",
       " b'\");\\n': 797,\n",
       " b'ener': 798,\n",
       " b'ser': 799,\n",
       " b'bject': 800,\n",
       " b'essage': 801,\n",
       " b'fer': 802,\n",
       " b' more': 803,\n",
       " b'ations': 804,\n",
       " b'ents': 805,\n",
       " b' his': 806,\n",
       " b' they': 807,\n",
       " b'.S': 808,\n",
       " b' Y': 809,\n",
       " b'use': 810,\n",
       " b'ne': 811,\n",
       " b'ish': 812,\n",
       " b'old': 813,\n",
       " b'_d': 814,\n",
       " b'io': 815,\n",
       " b'ield': 816,\n",
       " b' per': 817,\n",
       " b'Cont': 818,\n",
       " b'ings': 819,\n",
       " b'####': 820,\n",
       " b' data': 821,\n",
       " b' sa': 822,\n",
       " b'ef': 823,\n",
       " b'fo': 824,\n",
       " b' one': 825,\n",
       " b'eng': 826,\n",
       " b' dis': 827,\n",
       " b'AT': 828,\n",
       " b' name': 829,\n",
       " b' true': 830,\n",
       " b'val': 831,\n",
       " b'led': 832,\n",
       " b'.f': 833,\n",
       " b' ne': 834,\n",
       " b' end': 835,\n",
       " b'.T': 836,\n",
       " b'cre': 837,\n",
       " b'ark': 838,\n",
       " b'log': 839,\n",
       " b'Ex': 840,\n",
       " b'error': 841,\n",
       " b'_id': 842,\n",
       " b'urre': 843,\n",
       " b'ange': 844,\n",
       " b' null': 845,\n",
       " b'rray': 846,\n",
       " b' my': 847,\n",
       " b'pan': 848,\n",
       " b'ict': 849,\n",
       " b'ator': 850,\n",
       " b'View': 851,\n",
       " b'List': 852,\n",
       " b'\\treturn': 853,\n",
       " b'\\xe2\\x80\\x9d': 854,\n",
       " b' pre': 855,\n",
       " b' x': 856,\n",
       " b'clude': 857,\n",
       " b'arg': 858,\n",
       " b'ov': 859,\n",
       " b'.h': 860,\n",
       " b' >': 861,\n",
       " b' their': 862,\n",
       " b\"')\": 863,\n",
       " b'irst': 864,\n",
       " b'ick': 865,\n",
       " b'gh': 866,\n",
       " b'LE': 867,\n",
       " b'OR': 868,\n",
       " b' private': 869,\n",
       " b'tem': 870,\n",
       " b'\\r\\n\\r\\n': 871,\n",
       " b'user': 872,\n",
       " b' )': 873,\n",
       " b'com': 874,\n",
       " b'.A': 875,\n",
       " b'\";\\n': 876,\n",
       " b' id': 877,\n",
       " b'read': 878,\n",
       " b' who': 879,\n",
       " b'_b': 880,\n",
       " b'\">\\n': 881,\n",
       " b' time': 882,\n",
       " b' man': 883,\n",
       " b'ry': 884,\n",
       " b'========': 885,\n",
       " b'roup': 886,\n",
       " b'rop': 887,\n",
       " b'public': 888,\n",
       " b'vel': 889,\n",
       " b'umber': 890,\n",
       " b'ble': 891,\n",
       " b' which': 892,\n",
       " b'****************': 893,\n",
       " b' any': 894,\n",
       " b' false': 895,\n",
       " b'we': 896,\n",
       " b' value': 897,\n",
       " b' li': 898,\n",
       " b'\")': 899,\n",
       " b'nder': 900,\n",
       " b'gr': 901,\n",
       " b' no': 902,\n",
       " b'param': 903,\n",
       " b'fig': 904,\n",
       " b'.com': 905,\n",
       " b' app': 906,\n",
       " b'_l': 907,\n",
       " b'ions': 908,\n",
       " b'.D': 909,\n",
       " b' Ch': 910,\n",
       " b' about': 911,\n",
       " b' add': 912,\n",
       " b' su': 913,\n",
       " b' string': 914,\n",
       " b'ID': 915,\n",
       " b' over': 916,\n",
       " b'string': 917,\n",
       " b'.l': 918,\n",
       " b'ource': 919,\n",
       " b'_C': 920,\n",
       " b']\\n': 921,\n",
       " b' qu': 922,\n",
       " b' String': 923,\n",
       " b'ca': 924,\n",
       " b'SE': 925,\n",
       " b' ro': 926,\n",
       " b'sh': 927,\n",
       " b'ual': 928,\n",
       " b'Type': 929,\n",
       " b'son': 930,\n",
       " b'new': 931,\n",
       " b'ern': 932,\n",
       " b' ag': 933,\n",
       " b'AR': 934,\n",
       " b'];\\n': 935,\n",
       " b'].': 936,\n",
       " b' ?': 937,\n",
       " b'ical': 938,\n",
       " b' des': 939,\n",
       " b'uth': 940,\n",
       " b'ix': 941,\n",
       " b'ays': 942,\n",
       " b' type': 943,\n",
       " b\"'t\": 944,\n",
       " b'ault': 945,\n",
       " b' inter': 946,\n",
       " b'var': 947,\n",
       " b'.b': 948,\n",
       " b' part': 949,\n",
       " b'.d': 950,\n",
       " b'urrent': 951,\n",
       " b'IT': 952,\n",
       " b'EN': 953,\n",
       " b'enc': 954,\n",
       " b'(f': 955,\n",
       " b'ra': 956,\n",
       " b'value': 957,\n",
       " b'cho': 958,\n",
       " b'utton': 959,\n",
       " b'ose': 960,\n",
       " b' !=': 961,\n",
       " b'ater': 962,\n",
       " b'\\xc3\\xa9': 963,\n",
       " b'reate': 964,\n",
       " b'oll': 965,\n",
       " b'pos': 966,\n",
       " b'yle': 967,\n",
       " b'ng': 968,\n",
       " b'AL': 969,\n",
       " b'using': 970,\n",
       " b'ames': 971,\n",
       " b' {\\r\\n': 972,\n",
       " b'ates': 973,\n",
       " b'ely': 974,\n",
       " b' work': 975,\n",
       " b' em': 976,\n",
       " b'inal': 977,\n",
       " b' sp': 978,\n",
       " b' when': 979,\n",
       " b'.set': 980,\n",
       " b'      ': 981,\n",
       " b'):\\n': 982,\n",
       " b'to': 983,\n",
       " b'quire': 984,\n",
       " b'indow': 985,\n",
       " b'lement': 986,\n",
       " b'pect': 987,\n",
       " b'ash': 988,\n",
       " b'[i': 989,\n",
       " b' use': 990,\n",
       " b'.F': 991,\n",
       " b'pec': 992,\n",
       " b' ad': 993,\n",
       " b'ove': 994,\n",
       " b'ception': 995,\n",
       " b'ength': 996,\n",
       " b'include': 997,\n",
       " b'ader': 998,\n",
       " b'                           ': 999,\n",
       " ...}"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.get_vocab()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "8ea45811-b04f-461e-980e-7f7c6aa8e93c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1350, 492, 151643, 863, 151643]"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.encode(\"print('<|endoftext|>')<|endoftext|>\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "58f6bd7f-2162-4d76-8994-eea7c0a9b367",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'print'"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode([1350])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "b655549f-f089-49c7-abff-6335a48ca117",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"print('<|endoftext|>')<|endoftext|>\""
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode([1350, 492, 151643, 863, 151643])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "11e44fcd-f707-4cb6-aa50-0d8d6d69c7e9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "151643"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.eod_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "69277c19-171b-4d69-902c-738b3176aed6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643]"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.encode(\"print('<|endoftext|>')\", allowed_special=set(), disallowed_special=()) + [tokenizer.eod_id]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "692bd4b6-8098-4596-b2bb-81bfb04f1fa0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'<|endoftext|>'"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode([151643])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "49e51cb0-8242-489b-ace2-fc955e75844f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'|endoftext|'"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode([91, 8691, 723, 427, 91])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "fd732a3d-046c-42a5-9edc-636f324a44aa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"('<\""
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode([11146])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f3cab603-279b-48e6-b8ac-7ee82da543a5",
   "metadata": {},
   "source": [
    "**disallow emerge special tokens**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "72c2b0c0-38b0-41b0-9c4f-5246de72b07f",
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "Encountered text corresponding to disallowed special token '<|endoftext|>'.\nIf you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.\nIf you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.\nTo disable this check for all special tokens, pass `disallowed_special=()`.\n",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[51], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mprint(\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m<|endoftext|>\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m)\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mallowed_special\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mset\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdisallowed_special\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mall\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;241m+\u001b[39m [tokenizer\u001b[38;5;241m.\u001b[39meod_id]\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2373\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.encode\u001b[0;34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, return_tensors, **kwargs)\u001b[0m\n\u001b[1;32m   2336\u001b[0m \u001b[38;5;129m@add_end_docstrings\u001b[39m(\n\u001b[1;32m   2337\u001b[0m     ENCODE_KWARGS_DOCSTRING,\n\u001b[1;32m   2338\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   2356\u001b[0m     \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m   2357\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[\u001b[38;5;28mint\u001b[39m]:\n\u001b[1;32m   2358\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m   2359\u001b[0m \u001b[38;5;124;03m    Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.\u001b[39;00m\n\u001b[1;32m   2360\u001b[0m \n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   2371\u001b[0m \u001b[38;5;124;03m            method).\u001b[39;00m\n\u001b[1;32m   2372\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m-> 2373\u001b[0m     encoded_inputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode_plus\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   2374\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2375\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtext_pair\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtext_pair\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2376\u001b[0m \u001b[43m        \u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2377\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpadding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpadding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2378\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtruncation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtruncation\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2379\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2380\u001b[0m \u001b[43m        \u001b[49m\u001b[43mstride\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2381\u001b[0m \u001b[43m        \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2382\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2383\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2385\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m encoded_inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2781\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.encode_plus\u001b[0;34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[0m\n\u001b[1;32m   2771\u001b[0m \u001b[38;5;66;03m# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'\u001b[39;00m\n\u001b[1;32m   2772\u001b[0m padding_strategy, truncation_strategy, max_length, kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_padding_truncation_strategies(\n\u001b[1;32m   2773\u001b[0m     padding\u001b[38;5;241m=\u001b[39mpadding,\n\u001b[1;32m   2774\u001b[0m     truncation\u001b[38;5;241m=\u001b[39mtruncation,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   2778\u001b[0m     \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m   2779\u001b[0m )\n\u001b[0;32m-> 2781\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_encode_plus\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   2782\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtext\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2783\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtext_pair\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtext_pair\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2784\u001b[0m \u001b[43m    \u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2785\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpadding_strategy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpadding_strategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2786\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtruncation_strategy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtruncation_strategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2787\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2788\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstride\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2789\u001b[0m \u001b[43m    \u001b[49m\u001b[43mis_split_into_words\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_split_into_words\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2790\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2791\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2792\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2793\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2794\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2795\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2796\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2797\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2798\u001b[0m \u001b[43m    \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2799\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2800\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/tokenization_utils.py:656\u001b[0m, in \u001b[0;36mPreTrainedTokenizer._encode_plus\u001b[0;34m(self, text, text_pair, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[0m\n\u001b[1;32m    647\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m return_offsets_mapping:\n\u001b[1;32m    648\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\n\u001b[1;32m    649\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mreturn_offset_mapping is not available when using Python tokenizers. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    650\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTo use this feature, change your tokenizer to one deriving from \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    653\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhttps://github.com/huggingface/transformers/pull/2674\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    654\u001b[0m     )\n\u001b[0;32m--> 656\u001b[0m first_ids \u001b[38;5;241m=\u001b[39m \u001b[43mget_input_ids\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    657\u001b[0m second_ids \u001b[38;5;241m=\u001b[39m get_input_ids(text_pair) \u001b[38;5;28;01mif\u001b[39;00m text_pair \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    659\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprepare_for_model(\n\u001b[1;32m    660\u001b[0m     first_ids,\n\u001b[1;32m    661\u001b[0m     pair_ids\u001b[38;5;241m=\u001b[39msecond_ids,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    675\u001b[0m     verbose\u001b[38;5;241m=\u001b[39mverbose,\n\u001b[1;32m    676\u001b[0m )\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/tokenization_utils.py:623\u001b[0m, in \u001b[0;36mPreTrainedTokenizer._encode_plus.<locals>.get_input_ids\u001b[0;34m(text)\u001b[0m\n\u001b[1;32m    621\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_input_ids\u001b[39m(text):\n\u001b[1;32m    622\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(text, \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m--> 623\u001b[0m         tokens \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtokenize\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    624\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconvert_tokens_to_ids(tokens)\n\u001b[1;32m    625\u001b[0m     \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(text, (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m)) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(text) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(text[\u001b[38;5;241m0\u001b[39m], \u001b[38;5;28mstr\u001b[39m):\n",
      "File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/Qwen/Qwen-7B/4792686b9af1b3663a02f39bc44f37326f4f30f4/tokenization_qwen.py:182\u001b[0m, in \u001b[0;36mQWenTokenizer.tokenize\u001b[0;34m(self, text, allowed_special, disallowed_special, **kwargs)\u001b[0m\n\u001b[1;32m    179\u001b[0m text \u001b[38;5;241m=\u001b[39m unicodedata\u001b[38;5;241m.\u001b[39mnormalize(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNFC\u001b[39m\u001b[38;5;124m\"\u001b[39m, text)\n\u001b[1;32m    181\u001b[0m \u001b[38;5;66;03m# this implementation takes a detour: text -> token id -> token surface forms\u001b[39;00m\n\u001b[0;32m--> 182\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m t \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    183\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mallowed_special\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mallowed_special\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdisallowed_special\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdisallowed_special\u001b[49m\n\u001b[1;32m    184\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m    185\u001b[0m     tokens\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdecoder[t])\n\u001b[1;32m    186\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tokens\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/tiktoken/core.py:117\u001b[0m, in \u001b[0;36mEncoding.encode\u001b[0;34m(self, text, allowed_special, disallowed_special)\u001b[0m\n\u001b[1;32m    115\u001b[0m         disallowed_special \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mfrozenset\u001b[39m(disallowed_special)\n\u001b[1;32m    116\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m match \u001b[38;5;241m:=\u001b[39m _special_token_regex(disallowed_special)\u001b[38;5;241m.\u001b[39msearch(text):\n\u001b[0;32m--> 117\u001b[0m         \u001b[43mraise_disallowed_special_token\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmatch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    119\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m    120\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_core_bpe\u001b[38;5;241m.\u001b[39mencode(text, allowed_special)\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/tiktoken/core.py:351\u001b[0m, in \u001b[0;36mraise_disallowed_special_token\u001b[0;34m(token)\u001b[0m\n\u001b[1;32m    350\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mraise_disallowed_special_token\u001b[39m(token: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m NoReturn:\n\u001b[0;32m--> 351\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m    352\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEncountered text corresponding to disallowed special token \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtoken\u001b[38;5;132;01m!r}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    353\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIf you want this text to be encoded as a special token, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    354\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpass it to `allowed_special`, e.g. `allowed_special=\u001b[39m\u001b[38;5;130;01m{{\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mtoken\u001b[38;5;132;01m!r}\u001b[39;00m\u001b[38;5;124m, ...\u001b[39m\u001b[38;5;130;01m}}\u001b[39;00m\u001b[38;5;124m`.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    355\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIf you want this text to be encoded as normal text, disable the check for this token \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    356\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mby passing `disallowed_special=(enc.special_tokens_set - \u001b[39m\u001b[38;5;130;01m{{\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mtoken\u001b[38;5;132;01m!r}\u001b[39;00m\u001b[38;5;130;01m}}\u001b[39;00m\u001b[38;5;124m)`.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    357\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTo disable this check for all special tokens, pass `disallowed_special=()`.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    358\u001b[0m     )\n",
      "\u001b[0;31mValueError\u001b[0m: Encountered text corresponding to disallowed special token '<|endoftext|>'.\nIf you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.\nIf you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.\nTo disable this check for all special tokens, pass `disallowed_special=()`.\n"
     ]
    }
   ],
   "source": [
    "tokenizer.encode(\"print('<|endoftext|>')\", allowed_special=set(), disallowed_special='all') + [tokenizer.eod_id]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "d730bd02-d07f-42e8-8bf1-d311e6337061",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using unk_token, but it is not set yet.\n"
     ]
    }
   ],
   "source": [
    "tokenizer.unk_token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "e1d658dd-03d4-4bb2-8947-e6d84f654d6f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "NoneType"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(tokenizer.pad_token_id)\n",
    "#tokenizer._convert_id_to_token(tokenizer.pad_token_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "139d66d9-017a-40d4-8b2d-919a78439ddf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "151646"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.special_tokens['<|extra_0|>']\n",
    "#tokenizer.special_tokens['<|extra_204|>']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "15efb8c0-073e-4f9a-974c-9709f705e5f3",
   "metadata": {},
   "source": [
    "#abc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b714951c-745e-41dd-86d2-89e28862e21c",
   "metadata": {},
   "outputs": [],
   "source": [
    "ids = [1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643]\n",
    "tokenizer.convert_ids_to_tokens(ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "0c217322-b24b-4803-bb4d-fb9c675fef5f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[b' ']"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ids = tokenizer.encode(\" \")\n",
    "tokenizer.convert_ids_to_tokens(ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "id": "1ac63768-bb7c-45de-ba95-ca918db5f806",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ids: [151644, 1350, 492, 35946, 99639, 91680, 100472, 151646, 1305, 2, 116198, 116198, 116198, 13, 10236, 226, 114, 151645, 151643]\n"
     ]
    }
   ],
   "source": [
    "ids = tokenizer.encode(\"<|im_start|>print('我是一只猫<|extra_0|>')\\n#喵喵喵. 然<|im_end|>\", \n",
    "                       allowed_special={'<|im_start|>', '<|im_end|>', '<|extra_0|>'}, \n",
    "                       disallowed_special={'<|endoftext|>'}) + [tokenizer.eod_id]\n",
    "print(\"ids:\", ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "id": "47530127-7f66-4b2c-9396-10b672ef9c4c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['<|im_start|>',\n",
       " b'print',\n",
       " b\"('\",\n",
       " b'\\xe6\\x88\\x91',\n",
       " b'\\xe6\\x98\\xaf\\xe4\\xb8\\x80',\n",
       " b'\\xe5\\x8f\\xaa',\n",
       " b'\\xe7\\x8c\\xab',\n",
       " '<|extra_0|>',\n",
       " b\"')\\n\",\n",
       " b'#',\n",
       " b'\\xe5\\x96\\xb5',\n",
       " b'\\xe5\\x96\\xb5',\n",
       " b'\\xe5\\x96\\xb5',\n",
       " b'.',\n",
       " b' \\xe7',\n",
       " b'\\x84',\n",
       " b'\\xb6',\n",
       " '<|im_end|>',\n",
       " '<|endoftext|>']"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.convert_ids_to_tokens(ids)\n",
    "##bytes to string\n",
    "\n",
    "# for t in tokens:\n",
    "#     if isinstance(t, bytes):\n",
    "#         try:\n",
    "#             t = t.decode('utf-8')\n",
    "#         except:\n",
    "#             print(\"*\", t)\n",
    "#             t = t.decode('iso-8859-1')\n",
    "#     print(t)\n",
    "  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9a655c8-cf35-4209-9815-dd077fdfe6d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "26664e1a-192f-499f-9411-d6639d302b73",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'是一'"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bbytes = b'\\xe6\\x98\\xaf\\xe4\\xb8\\x80'\n",
    "bbytes.decode('utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a50927d-0a26-4083-976a-3ec600405f7b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}