{ "cells": [ { "cell_type": "code", "execution_count": 76, "id": "d5c3dff6-bd21-4e6a-8f8b-a83dc6895e08", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B', trust_remote_code=True)" ] }, { "cell_type": "code", "execution_count": 36, "id": "549ca852-199a-4d55-9e29-17837dd1f975", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "QWenTokenizer(name_or_path='Qwen/Qwen-7B', vocab_size=151851, model_max_length=8192, is_fast=False, padding_side='right', truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=True)" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer" ] }, { "cell_type": "code", "execution_count": 37, "id": "fe6c9cf7-fbc6-4073-81e9-5dfaefe88fdf", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "151851" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(tokenizer)" ] }, { "cell_type": "code", "execution_count": 38, "id": "4b583802-0e69-40d2-a32e-745ea50ede63", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[14990, 5562]\n", "text : hello dog\n" ] } ], "source": [ "#tokenizer('hello dog')['input_ids']\n", "#tokenizer('hello dog').tokens()\n", "tks = tokenizer.encode('hello dog')\n", "print(tks)\n", "text = tokenizer.decode(tks)\n", "print(\"text :\", text)" ] }, { "cell_type": "code", "execution_count": 39, "id": "9b360ba8-692d-4e3c-b9c9-fae3969c3617", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' �'" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode([51461])" ] }, { "cell_type": "code", "execution_count": 40, "id": "85b41d25-acbd-4818-a8e9-5020a3009b1d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' 根'" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode([51461, 117])" ] }, { "cell_type": "code", "execution_count": 41, "id": "abb10266-3da5-4586-ab6e-8812dc83ce3d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{b'!': 0,\n", " b'\"': 1,\n", " b'#': 2,\n", " b'$': 3,\n", " b'%': 4,\n", " b'&': 5,\n", " b\"'\": 6,\n", " b'(': 7,\n", " b')': 8,\n", " b'*': 9,\n", " b'+': 10,\n", " b',': 11,\n", " b'-': 12,\n", " b'.': 13,\n", " b'/': 14,\n", " b'0': 15,\n", " b'1': 16,\n", " b'2': 17,\n", " b'3': 18,\n", " b'4': 19,\n", " b'5': 20,\n", " b'6': 21,\n", " b'7': 22,\n", " b'8': 23,\n", " b'9': 24,\n", " b':': 25,\n", " b';': 26,\n", " b'<': 27,\n", " b'=': 28,\n", " b'>': 29,\n", " b'?': 30,\n", " b'@': 31,\n", " b'A': 32,\n", " b'B': 33,\n", " b'C': 34,\n", " b'D': 35,\n", " b'E': 36,\n", " b'F': 37,\n", " b'G': 38,\n", " b'H': 39,\n", " b'I': 40,\n", " b'J': 41,\n", " b'K': 42,\n", " b'L': 43,\n", " b'M': 44,\n", " b'N': 45,\n", " b'O': 46,\n", " b'P': 47,\n", " b'Q': 48,\n", " b'R': 49,\n", " b'S': 50,\n", " b'T': 51,\n", " b'U': 52,\n", " b'V': 53,\n", " b'W': 54,\n", " b'X': 55,\n", " b'Y': 56,\n", " b'Z': 57,\n", " b'[': 58,\n", " b'\\\\': 59,\n", " b']': 60,\n", " b'^': 61,\n", " b'_': 62,\n", " b'`': 63,\n", " b'a': 64,\n", " b'b': 65,\n", " b'c': 66,\n", " b'd': 67,\n", " b'e': 68,\n", " b'f': 69,\n", " b'g': 70,\n", " b'h': 71,\n", " b'i': 72,\n", " b'j': 73,\n", " b'k': 74,\n", " b'l': 75,\n", " b'm': 76,\n", " b'n': 77,\n", " b'o': 78,\n", " b'p': 79,\n", " b'q': 80,\n", " b'r': 81,\n", " b's': 82,\n", " b't': 83,\n", " b'u': 84,\n", " b'v': 85,\n", " b'w': 
86,\n", " b'x': 87,\n", " b'y': 88,\n", " b'z': 89,\n", " b'{': 90,\n", " b'|': 91,\n", " b'}': 92,\n", " b'~': 93,\n", " b'\\xa1': 94,\n", " b'\\xa2': 95,\n", " b'\\xa3': 96,\n", " b'\\xa4': 97,\n", " b'\\xa5': 98,\n", " b'\\xa6': 99,\n", " b'\\xa7': 100,\n", " b'\\xa8': 101,\n", " b'\\xa9': 102,\n", " b'\\xaa': 103,\n", " b'\\xab': 104,\n", " b'\\xac': 105,\n", " b'\\xae': 106,\n", " b'\\xaf': 107,\n", " b'\\xb0': 108,\n", " b'\\xb1': 109,\n", " b'\\xb2': 110,\n", " b'\\xb3': 111,\n", " b'\\xb4': 112,\n", " b'\\xb5': 113,\n", " b'\\xb6': 114,\n", " b'\\xb7': 115,\n", " b'\\xb8': 116,\n", " b'\\xb9': 117,\n", " b'\\xba': 118,\n", " b'\\xbb': 119,\n", " b'\\xbc': 120,\n", " b'\\xbd': 121,\n", " b'\\xbe': 122,\n", " b'\\xbf': 123,\n", " b'\\xc0': 124,\n", " b'\\xc1': 125,\n", " b'\\xc2': 126,\n", " b'\\xc3': 127,\n", " b'\\xc4': 128,\n", " b'\\xc5': 129,\n", " b'\\xc6': 130,\n", " b'\\xc7': 131,\n", " b'\\xc8': 132,\n", " b'\\xc9': 133,\n", " b'\\xca': 134,\n", " b'\\xcb': 135,\n", " b'\\xcc': 136,\n", " b'\\xcd': 137,\n", " b'\\xce': 138,\n", " b'\\xcf': 139,\n", " b'\\xd0': 140,\n", " b'\\xd1': 141,\n", " b'\\xd2': 142,\n", " b'\\xd3': 143,\n", " b'\\xd4': 144,\n", " b'\\xd5': 145,\n", " b'\\xd6': 146,\n", " b'\\xd7': 147,\n", " b'\\xd8': 148,\n", " b'\\xd9': 149,\n", " b'\\xda': 150,\n", " b'\\xdb': 151,\n", " b'\\xdc': 152,\n", " b'\\xdd': 153,\n", " b'\\xde': 154,\n", " b'\\xdf': 155,\n", " b'\\xe0': 156,\n", " b'\\xe1': 157,\n", " b'\\xe2': 158,\n", " b'\\xe3': 159,\n", " b'\\xe4': 160,\n", " b'\\xe5': 161,\n", " b'\\xe6': 162,\n", " b'\\xe7': 163,\n", " b'\\xe8': 164,\n", " b'\\xe9': 165,\n", " b'\\xea': 166,\n", " b'\\xeb': 167,\n", " b'\\xec': 168,\n", " b'\\xed': 169,\n", " b'\\xee': 170,\n", " b'\\xef': 171,\n", " b'\\xf0': 172,\n", " b'\\xf1': 173,\n", " b'\\xf2': 174,\n", " b'\\xf3': 175,\n", " b'\\xf4': 176,\n", " b'\\xf5': 177,\n", " b'\\xf6': 178,\n", " b'\\xf7': 179,\n", " b'\\xf8': 180,\n", " b'\\xf9': 181,\n", " b'\\xfa': 182,\n", " b'\\xfb': 183,\n", " b'\\xfc': 184,\n", " b'\\xfd': 185,\n", " b'\\xfe': 186,\n", " b'\\xff': 187,\n", " b'\\x00': 188,\n", " b'\\x01': 189,\n", " b'\\x02': 190,\n", " b'\\x03': 191,\n", " b'\\x04': 192,\n", " b'\\x05': 193,\n", " b'\\x06': 194,\n", " b'\\x07': 195,\n", " b'\\x08': 196,\n", " b'\\t': 197,\n", " b'\\n': 198,\n", " b'\\x0b': 199,\n", " b'\\x0c': 200,\n", " b'\\r': 201,\n", " b'\\x0e': 202,\n", " b'\\x0f': 203,\n", " b'\\x10': 204,\n", " b'\\x11': 205,\n", " b'\\x12': 206,\n", " b'\\x13': 207,\n", " b'\\x14': 208,\n", " b'\\x15': 209,\n", " b'\\x16': 210,\n", " b'\\x17': 211,\n", " b'\\x18': 212,\n", " b'\\x19': 213,\n", " b'\\x1a': 214,\n", " b'\\x1b': 215,\n", " b'\\x1c': 216,\n", " b'\\x1d': 217,\n", " b'\\x1e': 218,\n", " b'\\x1f': 219,\n", " b' ': 220,\n", " b'\\x7f': 221,\n", " b'\\x80': 222,\n", " b'\\x81': 223,\n", " b'\\x82': 224,\n", " b'\\x83': 225,\n", " b'\\x84': 226,\n", " b'\\x85': 227,\n", " b'\\x86': 228,\n", " b'\\x87': 229,\n", " b'\\x88': 230,\n", " b'\\x89': 231,\n", " b'\\x8a': 232,\n", " b'\\x8b': 233,\n", " b'\\x8c': 234,\n", " b'\\x8d': 235,\n", " b'\\x8e': 236,\n", " b'\\x8f': 237,\n", " b'\\x90': 238,\n", " b'\\x91': 239,\n", " b'\\x92': 240,\n", " b'\\x93': 241,\n", " b'\\x94': 242,\n", " b'\\x95': 243,\n", " b'\\x96': 244,\n", " b'\\x97': 245,\n", " b'\\x98': 246,\n", " b'\\x99': 247,\n", " b'\\x9a': 248,\n", " b'\\x9b': 249,\n", " b'\\x9c': 250,\n", " b'\\x9d': 251,\n", " b'\\x9e': 252,\n", " b'\\x9f': 253,\n", " b'\\xa0': 254,\n", " b'\\xad': 255,\n", " b' ': 256,\n", " b' ': 257,\n", " b'in': 
258,\n", " b' t': 259,\n", " b' ': 260,\n", " b'er': 261,\n", " b' ': 262,\n", " b'on': 263,\n", " b' a': 264,\n", " b're': 265,\n", " b'at': 266,\n", " b'st': 267,\n", " b'en': 268,\n", " b'or': 269,\n", " b' th': 270,\n", " b'\\n\\n': 271,\n", " b' c': 272,\n", " b'le': 273,\n", " b' s': 274,\n", " b'it': 275,\n", " b'an': 276,\n", " b'ar': 277,\n", " b'al': 278,\n", " b' the': 279,\n", " b';\\n': 280,\n", " b' p': 281,\n", " b' f': 282,\n", " b'ou': 283,\n", " b' =': 284,\n", " b'is': 285,\n", " b' ': 286,\n", " b'ing': 287,\n", " b'es': 288,\n", " b' w': 289,\n", " b'ion': 290,\n", " b'ed': 291,\n", " b'ic': 292,\n", " b' b': 293,\n", " b' d': 294,\n", " b'et': 295,\n", " b' m': 296,\n", " b' o': 297,\n", " b'\\t\\t': 298,\n", " b'ro': 299,\n", " b'as': 300,\n", " b'el': 301,\n", " b'ct': 302,\n", " b'nd': 303,\n", " b' in': 304,\n", " b' h': 305,\n", " b'ent': 306,\n", " b'id': 307,\n", " b' n': 308,\n", " b'am': 309,\n", " b' ': 310,\n", " b' to': 311,\n", " b' re': 312,\n", " b'--': 313,\n", " b' {': 314,\n", " b' of': 315,\n", " b'om': 316,\n", " b');\\n': 317,\n", " b'im': 318,\n", " b'\\r\\n': 319,\n", " b' (': 320,\n", " b'il': 321,\n", " b'//': 322,\n", " b' and': 323,\n", " b'ur': 324,\n", " b'se': 325,\n", " b' l': 326,\n", " b'ex': 327,\n", " b' S': 328,\n", " b'ad': 329,\n", " b' \"': 330,\n", " b'ch': 331,\n", " b'ut': 332,\n", " b'if': 333,\n", " b'**': 334,\n", " b' }': 335,\n", " b'em': 336,\n", " b'ol': 337,\n", " b' ': 338,\n", " b'th': 339,\n", " b')\\n': 340,\n", " b' {\\n': 341,\n", " b' g': 342,\n", " b'ig': 343,\n", " b'iv': 344,\n", " b',\\n': 345,\n", " b'ce': 346,\n", " b'od': 347,\n", " b' v': 348,\n", " b'ate': 349,\n", " b' T': 350,\n", " b'ag': 351,\n", " b'ay': 352,\n", " b' *': 353,\n", " b'ot': 354,\n", " b'us': 355,\n", " b' C': 356,\n", " b' st': 357,\n", " b' I': 358,\n", " b'un': 359,\n", " b'ul': 360,\n", " b'ue': 361,\n", " b' A': 362,\n", " b'ow': 363,\n", " b\" '\": 364,\n", " b'ew': 365,\n", " b' <': 366,\n", " b'ation': 367,\n", " b'()': 368,\n", " b' for': 369,\n", " b'ab': 370,\n", " b'ort': 371,\n", " b'um': 372,\n", " b'ame': 373,\n", " b' is': 374,\n", " b'pe': 375,\n", " b'tr': 376,\n", " b'ck': 377,\n", " b'\\xe2\\x80': 378,\n", " b' y': 379,\n", " b'ist': 380,\n", " b'----': 381,\n", " b'.\\n\\n': 382,\n", " b'he': 383,\n", " b' e': 384,\n", " b'lo': 385,\n", " b' M': 386,\n", " b' be': 387,\n", " b'ers': 388,\n", " b' on': 389,\n", " b' con': 390,\n", " b'ap': 391,\n", " b'ub': 392,\n", " b' P': 393,\n", " b' ': 394,\n", " b'ass': 395,\n", " b'int': 396,\n", " b'>\\n': 397,\n", " b'ly': 398,\n", " b'urn': 399,\n", " b' $': 400,\n", " b';\\n\\n': 401,\n", " b'av': 402,\n", " b'port': 403,\n", " b'ir': 404,\n", " b'->': 405,\n", " b'nt': 406,\n", " b'ction': 407,\n", " b'end': 408,\n", " b' de': 409,\n", " b'ith': 410,\n", " b'out': 411,\n", " b'turn': 412,\n", " b'our': 413,\n", " b' ': 414,\n", " b'lic': 415,\n", " b'res': 416,\n", " b'pt': 417,\n", " b'==': 418,\n", " b' this': 419,\n", " b' wh': 420,\n", " b' if': 421,\n", " b' D': 422,\n", " b'ver': 423,\n", " b'age': 424,\n", " b' B': 425,\n", " b'ht': 426,\n", " b'ext': 427,\n", " b'=\"': 428,\n", " b' that': 429,\n", " b'****': 430,\n", " b' R': 431,\n", " b' it': 432,\n", " b'ess': 433,\n", " b' F': 434,\n", " b' r': 435,\n", " b'os': 436,\n", " b'and': 437,\n", " b' as': 438,\n", " b'ect': 439,\n", " b'ke': 440,\n", " b'rom': 441,\n", " b' //': 442,\n", " b'con': 443,\n", " b' L': 444,\n", " b'(\"': 445,\n", " b'qu': 446,\n", " b'lass': 447,\n", " b' with': 448,\n", " b'iz': 
449,\n", " b'de': 450,\n", " b' N': 451,\n", " b' al': 452,\n", " b'op': 453,\n", " b'up': 454,\n", " b'get': 455,\n", " b' }\\n': 456,\n", " b'ile': 457,\n", " b' an': 458,\n", " b'ata': 459,\n", " b'ore': 460,\n", " b'ri': 461,\n", " b' pro': 462,\n", " b';\\r\\n': 463,\n", " b'\\t\\t\\t\\t': 464,\n", " b'ter': 465,\n", " b'ain': 466,\n", " b' W': 467,\n", " b' E': 468,\n", " b' com': 469,\n", " b' return': 470,\n", " b'art': 471,\n", " b' H': 472,\n", " b'ack': 473,\n", " b'import': 474,\n", " b'ublic': 475,\n", " b' or': 476,\n", " b'est': 477,\n", " b'ment': 478,\n", " b' G': 479,\n", " b'able': 480,\n", " b' -': 481,\n", " b'ine': 482,\n", " b'ill': 483,\n", " b'ind': 484,\n", " b'ere': 485,\n", " b'::': 486,\n", " b'ity': 487,\n", " b' +': 488,\n", " b' tr': 489,\n", " b'elf': 490,\n", " b'ight': 491,\n", " b\"('\": 492,\n", " b'orm': 493,\n", " b'ult': 494,\n", " b'str': 495,\n", " b'..': 496,\n", " b'\",': 497,\n", " b' you': 498,\n", " b'ype': 499,\n", " b'pl': 500,\n", " b' new': 501,\n", " b' j': 502,\n", " b' ': 503,\n", " b' from': 504,\n", " b' ex': 505,\n", " b' O': 506,\n", " b'ld': 507,\n", " b' [': 508,\n", " b'oc': 509,\n", " b':\\n': 510,\n", " b' se': 511,\n", " b' le': 512,\n", " b'--------': 513,\n", " b'.s': 514,\n", " b'{\\n': 515,\n", " b\"',\": 516,\n", " b'ant': 517,\n", " b' at': 518,\n", " b'ase': 519,\n", " b'.c': 520,\n", " b' ch': 521,\n", " b'': 589,\n", " b'ust': 590,\n", " b'que': 591,\n", " b' res': 592,\n", " b'))': 593,\n", " b\"'s\": 594,\n", " b' k': 595,\n", " b'ans': 596,\n", " b'yst': 597,\n", " b'unction': 598,\n", " b'********': 599,\n", " b' i': 600,\n", " b' us': 601,\n", " b'pp': 602,\n", " b'one': 603,\n", " b'ail': 604,\n", " b'====': 605,\n", " b'name': 606,\n", " b' str': 607,\n", " b' /': 608,\n", " b' &': 609,\n", " b'ach': 610,\n", " b'div': 611,\n", " b'ystem': 612,\n", " b'ell': 613,\n", " b' have': 614,\n", " b'err': 615,\n", " b'ould': 616,\n", " b'ull': 617,\n", " b'pon': 618,\n", " b' J': 619,\n", " b'_p': 620,\n", " b' ==': 621,\n", " b'ign': 622,\n", " b'St': 623,\n", " b'.\\n': 624,\n", " b' pl': 625,\n", " b');\\n\\n': 626,\n", " b'form': 627,\n", " b'put': 628,\n", " b'ount': 629,\n", " b'}\\n\\n': 630,\n", " b'dd': 631,\n", " b'ite': 632,\n", " b' get': 633,\n", " b'rr': 634,\n", " b'ome': 635,\n", " b' \\xe2\\x80': 636,\n", " b'aram': 637,\n", " b'cc': 638,\n", " b' */': 639,\n", " b'ER': 640,\n", " b'In': 641,\n", " b'les': 642,\n", " b'_s': 643,\n", " b'ong': 644,\n", " b'ie': 645,\n", " b' can': 646,\n", " b' V': 647,\n", " b'erv': 648,\n", " b'pr': 649,\n", " b' un': 650,\n", " b'row': 651,\n", " b'ber': 652,\n", " b' do': 653,\n", " b'll': 654,\n", " b' el': 655,\n", " b' self': 656,\n", " b'ated': 657,\n", " b'ary': 658,\n", " b' .': 659,\n", " b\"']\": 660,\n", " b'ud': 661,\n", " b' en': 662,\n", " b' Th': 663,\n", " b' ': 664,\n", " b'te': 665,\n", " b'_c': 666,\n", " b'uct': 667,\n", " b' ab': 668,\n", " b'ork': 669,\n", " b'.get': 670,\n", " b' #': 671,\n", " b'aw': 672,\n", " b'ress': 673,\n", " b'ob': 674,\n", " b'Name': 675,\n", " b'app': 676,\n", " b\"['\": 677,\n", " b' all': 678,\n", " b'ory': 679,\n", " b'ition': 680,\n", " b'ance': 681,\n", " b'ear': 682,\n", " b' cont': 683,\n", " b'vent': 684,\n", " b'ia': 685,\n", " b' will': 686,\n", " b'IN': 687,\n", " b' ': 688,\n", " b'return': 689,\n", " b' ': 755,\n", " b'\",\\n': 756,\n", " b'ec': 757,\n", " b' In': 758,\n", " b'ph': 759,\n", " b' |': 760,\n", " b'_f': 761,\n", " b' var': 762,\n", " b'ence': 763,\n", " b'Id': 764,\n", " b'ree': 765,\n", " 
b'ink': 766,\n", " b'lect': 767,\n", " b'ug': 768,\n", " b'eth': 769,\n", " b' else': 770,\n", " b'----------------': 771,\n", " b'cont': 772,\n", " b' so': 773,\n", " b'atic': 774,\n", " b' lo': 775,\n", " b'pro': 776,\n", " b'ton': 777,\n", " b'ss': 778,\n", " b'own': 779,\n", " b'abel': 780,\n", " b'oint': 781,\n", " b'ous': 782,\n", " b'eld': 783,\n", " b'ST': 784,\n", " b'The': 785,\n", " b' ': 786,\n", " b'RE': 787,\n", " b'\":': 788,\n", " b'olor': 789,\n", " b'tp': 790,\n", " b'eg': 791,\n", " b'key': 792,\n", " b'ude': 793,\n", " b' St': 794,\n", " b'ound': 795,\n", " b' ar': 796,\n", " b'\");\\n': 797,\n", " b'ener': 798,\n", " b'ser': 799,\n", " b'bject': 800,\n", " b'essage': 801,\n", " b'fer': 802,\n", " b' more': 803,\n", " b'ations': 804,\n", " b'ents': 805,\n", " b' his': 806,\n", " b' they': 807,\n", " b'.S': 808,\n", " b' Y': 809,\n", " b'use': 810,\n", " b'ne': 811,\n", " b'ish': 812,\n", " b'old': 813,\n", " b'_d': 814,\n", " b'io': 815,\n", " b'ield': 816,\n", " b' per': 817,\n", " b'Cont': 818,\n", " b'ings': 819,\n", " b'####': 820,\n", " b' data': 821,\n", " b' sa': 822,\n", " b'ef': 823,\n", " b'fo': 824,\n", " b' one': 825,\n", " b'eng': 826,\n", " b' dis': 827,\n", " b'AT': 828,\n", " b' name': 829,\n", " b' true': 830,\n", " b'val': 831,\n", " b'led': 832,\n", " b'.f': 833,\n", " b' ne': 834,\n", " b' end': 835,\n", " b'.T': 836,\n", " b'cre': 837,\n", " b'ark': 838,\n", " b'log': 839,\n", " b'Ex': 840,\n", " b'error': 841,\n", " b'_id': 842,\n", " b'urre': 843,\n", " b'ange': 844,\n", " b' null': 845,\n", " b'rray': 846,\n", " b' my': 847,\n", " b'pan': 848,\n", " b'ict': 849,\n", " b'ator': 850,\n", " b'View': 851,\n", " b'List': 852,\n", " b'\\treturn': 853,\n", " b'\\xe2\\x80\\x9d': 854,\n", " b' pre': 855,\n", " b' x': 856,\n", " b'clude': 857,\n", " b'arg': 858,\n", " b'ov': 859,\n", " b'.h': 860,\n", " b' >': 861,\n", " b' their': 862,\n", " b\"')\": 863,\n", " b'irst': 864,\n", " b'ick': 865,\n", " b'gh': 866,\n", " b'LE': 867,\n", " b'OR': 868,\n", " b' private': 869,\n", " b'tem': 870,\n", " b'\\r\\n\\r\\n': 871,\n", " b'user': 872,\n", " b' )': 873,\n", " b'com': 874,\n", " b'.A': 875,\n", " b'\";\\n': 876,\n", " b' id': 877,\n", " b'read': 878,\n", " b' who': 879,\n", " b'_b': 880,\n", " b'\">\\n': 881,\n", " b' time': 882,\n", " b' man': 883,\n", " b'ry': 884,\n", " b'========': 885,\n", " b'roup': 886,\n", " b'rop': 887,\n", " b'public': 888,\n", " b'vel': 889,\n", " b'umber': 890,\n", " b'ble': 891,\n", " b' which': 892,\n", " b'****************': 893,\n", " b' any': 894,\n", " b' false': 895,\n", " b'we': 896,\n", " b' value': 897,\n", " b' li': 898,\n", " b'\")': 899,\n", " b'nder': 900,\n", " b'gr': 901,\n", " b' no': 902,\n", " b'param': 903,\n", " b'fig': 904,\n", " b'.com': 905,\n", " b' app': 906,\n", " b'_l': 907,\n", " b'ions': 908,\n", " b'.D': 909,\n", " b' Ch': 910,\n", " b' about': 911,\n", " b' add': 912,\n", " b' su': 913,\n", " b' string': 914,\n", " b'ID': 915,\n", " b' over': 916,\n", " b'string': 917,\n", " b'.l': 918,\n", " b'ource': 919,\n", " b'_C': 920,\n", " b']\\n': 921,\n", " b' qu': 922,\n", " b' String': 923,\n", " b'ca': 924,\n", " b'SE': 925,\n", " b' ro': 926,\n", " b'sh': 927,\n", " b'ual': 928,\n", " b'Type': 929,\n", " b'son': 930,\n", " b'new': 931,\n", " b'ern': 932,\n", " b' ag': 933,\n", " b'AR': 934,\n", " b'];\\n': 935,\n", " b'].': 936,\n", " b' ?': 937,\n", " b'ical': 938,\n", " b' des': 939,\n", " b'uth': 940,\n", " b'ix': 941,\n", " b'ays': 942,\n", " b' type': 943,\n", " b\"'t\": 944,\n", " b'ault': 
945,\n", " b' inter': 946,\n", " b'var': 947,\n", " b'.b': 948,\n", " b' part': 949,\n", " b'.d': 950,\n", " b'urrent': 951,\n", " b'IT': 952,\n", " b'EN': 953,\n", " b'enc': 954,\n", " b'(f': 955,\n", " b'ra': 956,\n", " b'value': 957,\n", " b'cho': 958,\n", " b'utton': 959,\n", " b'ose': 960,\n", " b' !=': 961,\n", " b'ater': 962,\n", " b'\\xc3\\xa9': 963,\n", " b'reate': 964,\n", " b'oll': 965,\n", " b'pos': 966,\n", " b'yle': 967,\n", " b'ng': 968,\n", " b'AL': 969,\n", " b'using': 970,\n", " b'ames': 971,\n", " b' {\\r\\n': 972,\n", " b'ates': 973,\n", " b'ely': 974,\n", " b' work': 975,\n", " b' em': 976,\n", " b'inal': 977,\n", " b' sp': 978,\n", " b' when': 979,\n", " b'.set': 980,\n", " b' ': 981,\n", " b'):\\n': 982,\n", " b'to': 983,\n", " b'quire': 984,\n", " b'indow': 985,\n", " b'lement': 986,\n", " b'pect': 987,\n", " b'ash': 988,\n", " b'[i': 989,\n", " b' use': 990,\n", " b'.F': 991,\n", " b'pec': 992,\n", " b' ad': 993,\n", " b'ove': 994,\n", " b'ception': 995,\n", " b'ength': 996,\n", " b'include': 997,\n", " b'ader': 998,\n", " b' ': 999,\n", " ...}" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.get_vocab()" ] }, { "cell_type": "code", "execution_count": 42, "id": "8ea45811-b04f-461e-980e-7f7c6aa8e93c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[1350, 492, 151643, 863, 151643]" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.encode(\"print('<|endoftext|>')<|endoftext|>\")" ] }, { "cell_type": "code", "execution_count": 43, "id": "58f6bd7f-2162-4d76-8994-eea7c0a9b367", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'print'" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode([1350])" ] }, { "cell_type": "code", "execution_count": 44, "id": "b655549f-f089-49c7-abff-6335a48ca117", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"print('<|endoftext|>')<|endoftext|>\"" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode([1350, 492, 151643, 863, 151643])" ] }, { "cell_type": "code", "execution_count": 45, "id": "11e44fcd-f707-4cb6-aa50-0d8d6d69c7e9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "151643" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.eod_id" ] }, { "cell_type": "code", "execution_count": 46, "id": "69277c19-171b-4d69-902c-738b3176aed6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643]" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.encode(\"print('<|endoftext|>')\", allowed_special=set(), disallowed_special=()) + [tokenizer.eod_id]" ] }, { "cell_type": "code", "execution_count": 47, "id": "692bd4b6-8098-4596-b2bb-81bfb04f1fa0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'<|endoftext|>'" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode([151643])" ] }, { "cell_type": "code", "execution_count": 48, "id": "49e51cb0-8242-489b-ace2-fc955e75844f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'|endoftext|'" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode([91, 8691, 723, 427, 91])" ] }, { "cell_type": "code", "execution_count": 49, "id": 
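"fd732a3d-046c-42a5-9edc-636f324a44aa", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"('<\"" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode([11146])" ] }, { "cell_type": "markdown", "id": "added-roundtrip-md", "metadata": {}, "source": [ "A small added round-trip sketch (assumes only the `encode`/`decode` behavior shown above): when `<|endoftext|>` is encoded as plain text rather than as a special token, decoding the ids should reproduce the input string exactly, because the byte-level BPE pieces concatenate back to the original bytes." ] }, { "cell_type": "code", "execution_count": null, "id": "added-roundtrip-code", "metadata": {}, "outputs": [], "source": [ "# sketch: treat '<|endoftext|>' inside the text as plain text, then round-trip\n", "s = \"print('<|endoftext|>')\"\n", "ids_plain = tokenizer.encode(s, allowed_special=set(), disallowed_special=())\n", "tokenizer.decode(ids_plain) == s  # expected: True" ] },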
"fd732a3d-046c-42a5-9edc-636f324a44aa", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"('<\"" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode([11146])" ] }, { "cell_type": "markdown", "id": "f3cab603-279b-48e6-b8ac-7ee82da543a5", "metadata": {}, "source": [ "**disallow emerge special tokens**" ] }, { "cell_type": "code", "execution_count": 51, "id": "72c2b0c0-38b0-41b0-9c4f-5246de72b07f", "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "Encountered text corresponding to disallowed special token '<|endoftext|>'.\nIf you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.\nIf you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.\nTo disable this check for all special tokens, pass `disallowed_special=()`.\n", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[51], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mprint(\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m<|endoftext|>\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m)\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mallowed_special\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mset\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdisallowed_special\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mall\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;241m+\u001b[39m [tokenizer\u001b[38;5;241m.\u001b[39meod_id]\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2373\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.encode\u001b[0;34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, return_tensors, **kwargs)\u001b[0m\n\u001b[1;32m 2336\u001b[0m \u001b[38;5;129m@add_end_docstrings\u001b[39m(\n\u001b[1;32m 2337\u001b[0m ENCODE_KWARGS_DOCSTRING,\n\u001b[1;32m 2338\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 2356\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 2357\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[\u001b[38;5;28mint\u001b[39m]:\n\u001b[1;32m 2358\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 2359\u001b[0m \u001b[38;5;124;03m Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.\u001b[39;00m\n\u001b[1;32m 2360\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 2371\u001b[0m \u001b[38;5;124;03m method).\u001b[39;00m\n\u001b[1;32m 2372\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 2373\u001b[0m encoded_inputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode_plus\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2374\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2375\u001b[0m \u001b[43m \u001b[49m\u001b[43mtext_pair\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtext_pair\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2376\u001b[0m \u001b[43m \u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2377\u001b[0m \u001b[43m \u001b[49m\u001b[43mpadding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpadding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2378\u001b[0m \u001b[43m \u001b[49m\u001b[43mtruncation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtruncation\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2379\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2380\u001b[0m \u001b[43m \u001b[49m\u001b[43mstride\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2381\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2382\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2383\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2385\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m encoded_inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2781\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.encode_plus\u001b[0;34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[0m\n\u001b[1;32m 2771\u001b[0m \u001b[38;5;66;03m# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'\u001b[39;00m\n\u001b[1;32m 2772\u001b[0m padding_strategy, truncation_strategy, max_length, kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_padding_truncation_strategies(\n\u001b[1;32m 2773\u001b[0m padding\u001b[38;5;241m=\u001b[39mpadding,\n\u001b[1;32m 2774\u001b[0m truncation\u001b[38;5;241m=\u001b[39mtruncation,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 2778\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 2779\u001b[0m )\n\u001b[0;32m-> 2781\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_encode_plus\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2782\u001b[0m \u001b[43m \u001b[49m\u001b[43mtext\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2783\u001b[0m \u001b[43m \u001b[49m\u001b[43mtext_pair\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtext_pair\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2784\u001b[0m \u001b[43m \u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2785\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mpadding_strategy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpadding_strategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2786\u001b[0m \u001b[43m \u001b[49m\u001b[43mtruncation_strategy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtruncation_strategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2787\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2788\u001b[0m \u001b[43m \u001b[49m\u001b[43mstride\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2789\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_split_into_words\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_split_into_words\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2790\u001b[0m \u001b[43m \u001b[49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2791\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2792\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2793\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2794\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2795\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2796\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2797\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2798\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2799\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2800\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/tokenization_utils.py:656\u001b[0m, in \u001b[0;36mPreTrainedTokenizer._encode_plus\u001b[0;34m(self, text, text_pair, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[0m\n\u001b[1;32m 647\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m return_offsets_mapping:\n\u001b[1;32m 648\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\n\u001b[1;32m 649\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mreturn_offset_mapping is not available when using Python tokenizers. 
\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 650\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTo use this feature, change your tokenizer to one deriving from \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 653\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhttps://github.com/huggingface/transformers/pull/2674\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 654\u001b[0m )\n\u001b[0;32m--> 656\u001b[0m first_ids \u001b[38;5;241m=\u001b[39m \u001b[43mget_input_ids\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 657\u001b[0m second_ids \u001b[38;5;241m=\u001b[39m get_input_ids(text_pair) \u001b[38;5;28;01mif\u001b[39;00m text_pair \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 659\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprepare_for_model(\n\u001b[1;32m 660\u001b[0m first_ids,\n\u001b[1;32m 661\u001b[0m pair_ids\u001b[38;5;241m=\u001b[39msecond_ids,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 675\u001b[0m verbose\u001b[38;5;241m=\u001b[39mverbose,\n\u001b[1;32m 676\u001b[0m )\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/tokenization_utils.py:623\u001b[0m, in \u001b[0;36mPreTrainedTokenizer._encode_plus..get_input_ids\u001b[0;34m(text)\u001b[0m\n\u001b[1;32m 621\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_input_ids\u001b[39m(text):\n\u001b[1;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(text, \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m--> 623\u001b[0m tokens \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtokenize\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 624\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconvert_tokens_to_ids(tokens)\n\u001b[1;32m 625\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(text, (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m)) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(text) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(text[\u001b[38;5;241m0\u001b[39m], \u001b[38;5;28mstr\u001b[39m):\n", "File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/Qwen/Qwen-7B/4792686b9af1b3663a02f39bc44f37326f4f30f4/tokenization_qwen.py:182\u001b[0m, in \u001b[0;36mQWenTokenizer.tokenize\u001b[0;34m(self, text, allowed_special, disallowed_special, **kwargs)\u001b[0m\n\u001b[1;32m 179\u001b[0m text \u001b[38;5;241m=\u001b[39m unicodedata\u001b[38;5;241m.\u001b[39mnormalize(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNFC\u001b[39m\u001b[38;5;124m\"\u001b[39m, text)\n\u001b[1;32m 181\u001b[0m \u001b[38;5;66;03m# this implementation takes a detour: text -> token id -> token surface forms\u001b[39;00m\n\u001b[0;32m--> 182\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m t \u001b[38;5;129;01min\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 183\u001b[0m \u001b[43m \u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mallowed_special\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mallowed_special\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdisallowed_special\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdisallowed_special\u001b[49m\n\u001b[1;32m 184\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 185\u001b[0m tokens\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdecoder[t])\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m tokens\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/tiktoken/core.py:117\u001b[0m, in \u001b[0;36mEncoding.encode\u001b[0;34m(self, text, allowed_special, disallowed_special)\u001b[0m\n\u001b[1;32m 115\u001b[0m disallowed_special \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mfrozenset\u001b[39m(disallowed_special)\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m match \u001b[38;5;241m:=\u001b[39m _special_token_regex(disallowed_special)\u001b[38;5;241m.\u001b[39msearch(text):\n\u001b[0;32m--> 117\u001b[0m \u001b[43mraise_disallowed_special_token\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmatch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 119\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 120\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_core_bpe\u001b[38;5;241m.\u001b[39mencode(text, allowed_special)\n", "File \u001b[0;32m~/.local/lib/python3.10/site-packages/tiktoken/core.py:351\u001b[0m, in \u001b[0;36mraise_disallowed_special_token\u001b[0;34m(token)\u001b[0m\n\u001b[1;32m 350\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mraise_disallowed_special_token\u001b[39m(token: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m NoReturn:\n\u001b[0;32m--> 351\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 352\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEncountered text corresponding to disallowed special token \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtoken\u001b[38;5;132;01m!r}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 353\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIf you want this text to be encoded as a special token, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 354\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpass it to `allowed_special`, e.g. 
`allowed_special=\u001b[39m\u001b[38;5;130;01m{{\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mtoken\u001b[38;5;132;01m!r}\u001b[39;00m\u001b[38;5;124m, ...\u001b[39m\u001b[38;5;130;01m}}\u001b[39;00m\u001b[38;5;124m`.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 355\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIf you want this text to be encoded as normal text, disable the check for this token \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 356\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mby passing `disallowed_special=(enc.special_tokens_set - \u001b[39m\u001b[38;5;130;01m{{\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mtoken\u001b[38;5;132;01m!r}\u001b[39;00m\u001b[38;5;130;01m}}\u001b[39;00m\u001b[38;5;124m)`.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 357\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTo disable this check for all special tokens, pass `disallowed_special=()`.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 358\u001b[0m )\n", "\u001b[0;31mValueError\u001b[0m: Encountered text corresponding to disallowed special token '<|endoftext|>'.\nIf you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.\nIf you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.\nTo disable this check for all special tokens, pass `disallowed_special=()`.\n" ] } ], "source": [ "tokenizer.encode(\"print('<|endoftext|>')\", allowed_special=set(), disallowed_special='all') + [tokenizer.eod_id]" ] }, { "cell_type": "code", "execution_count": 52, "id": "d730bd02-d07f-42e8-8bf1-d311e6337061", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using unk_token, but it is not set yet.\n" ] } ], "source": [ "tokenizer.unk_token  # not set for Qwen: a warning is printed and None is returned" ] }, { "cell_type": "code", "execution_count": 58, "id": "e1d658dd-03d4-4bb2-8947-e6d84f654d6f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "NoneType" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(tokenizer.pad_token_id)  # the pad token is not set either\n", "#tokenizer._convert_id_to_token(tokenizer.pad_token_id)  # commented out: the id is None" ] }, { "cell_type": "code", "execution_count": 54, "id": "139d66d9-017a-40d4-8b2d-919a78439ddf", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "151646" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.special_tokens['<|extra_0|>']  # Qwen reserves <|extra_0|> through <|extra_204|>\n", "#tokenizer.special_tokens['<|extra_204|>']" ] }, { "cell_type": "markdown", "id": "15efb8c0-073e-4f9a-974c-9709f705e5f3", "metadata": {}, "source": [ "**Converting ids back to token surface forms**" ] }, { "cell_type": "code", "execution_count": null, "id": "b714951c-745e-41dd-86d2-89e28862e21c", "metadata": {}, "outputs": [], "source": [ "ids = [1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643]\n", "tokenizer.convert_ids_to_tokens(ids)" ] }, { "cell_type": "code", "execution_count": 78, "id": "0c217322-b24b-4803-bb4d-fb9c675fef5f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[b' ']" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ids = tokenizer.encode(\" \")\n", "tokenizer.convert_ids_to_tokens(ids)" ] },
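{ "cell_type": "markdown", "id": "added-untrusted-md", "metadata": {}, "source": [ "A minimal added sketch (assumes only the `encode`/`eod_id` API used above; `encode_untrusted` is a hypothetical helper, not part of the tokenizer): encode untrusted text so any embedded special-token strings are treated as plain text, then append the real end-of-document id explicitly. This avoids the `ValueError` shown earlier while still terminating the sequence." ] }, { "cell_type": "code", "execution_count": null, "id": "added-untrusted-code", "metadata": {}, "outputs": [], "source": [ "def encode_untrusted(text):\n", "    # hypothetical helper: special-token strings inside `text` become ordinary tokens\n", "    ids = tokenizer.encode(text, allowed_special=set(), disallowed_special=())\n", "    return ids + [tokenizer.eod_id]\n", "\n", "encode_untrusted(\"print('<|endoftext|>')\")" ] }, { "cell_type": "code", "execution_count": 101, "id": "1ac63768-bb7c-45de-ba95-ca918db5f806", "metadata": {},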
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ids: [151644, 1350, 492, 35946, 99639, 91680, 100472, 151646, 1305, 2, 116198, 116198, 116198, 13, 10236, 226, 114, 151645, 151643]\n" ] } ], "source": [ "ids = tokenizer.encode(\"<|im_start|>print('我是一只猫<|extra_0|>')\\n#喵喵喵. 然<|im_end|>\", \n", " allowed_special={'<|im_start|>', '<|im_end|>', '<|extra_0|>'}, \n", " disallowed_special={'<|endoftext|>'}) + [tokenizer.eod_id]\n", "print(\"ids:\", ids)" ] }, { "cell_type": "code", "execution_count": 102, "id": "47530127-7f66-4b2c-9396-10b672ef9c4c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['<|im_start|>',\n", " b'print',\n", " b\"('\",\n", " b'\\xe6\\x88\\x91',\n", " b'\\xe6\\x98\\xaf\\xe4\\xb8\\x80',\n", " b'\\xe5\\x8f\\xaa',\n", " b'\\xe7\\x8c\\xab',\n", " '<|extra_0|>',\n", " b\"')\\n\",\n", " b'#',\n", " b'\\xe5\\x96\\xb5',\n", " b'\\xe5\\x96\\xb5',\n", " b'\\xe5\\x96\\xb5',\n", " b'.',\n", " b' \\xe7',\n", " b'\\x84',\n", " b'\\xb6',\n", " '<|im_end|>',\n", " '<|endoftext|>']" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.convert_ids_to_tokens(ids)\n", "##bytes to string\n", "\n", "# for t in tokens:\n", "# if isinstance(t, bytes):\n", "# try:\n", "# t = t.decode('utf-8')\n", "# except:\n", "# print(\"*\", t)\n", "# t = t.decode('iso-8859-1')\n", "# print(t)\n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "c9a655c8-cf35-4209-9815-dd077fdfe6d7", "metadata": {}, "outputs": [], "source": [ "tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids))" ] }, { "cell_type": "code", "execution_count": 64, "id": "26664e1a-192f-499f-9411-d6639d302b73", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'是一'" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bbytes = b'\\xe6\\x98\\xaf\\xe4\\xb8\\x80'\n", "bbytes.decode('utf-8')" ] }, { "cell_type": "code", "execution_count": null, "id": "0a50927d-0a26-4083-976a-3ec600405f7b", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" } }, "nbformat": 4, "nbformat_minor": 5 }