{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[97, 121, 121, 121, 121, 32, 119, 104, 97, 116, 115, 32, 117, 112, 32, 128075]" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[ord(x) for x in \"ayyyy whats up 👋\"]" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[97,\n", " 121,\n", " 121,\n", " 121,\n", " 121,\n", " 32,\n", " 119,\n", " 104,\n", " 97,\n", " 116,\n", " 115,\n", " 32,\n", " 117,\n", " 112,\n", " 32,\n", " 240,\n", " 159,\n", " 145,\n", " 139]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(\"ayyyy whats up 👋\".encode('utf-8'))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def get_counts(text):\n", " counts = {}\n", " for pairs in zip(text, text[1:]):\n", " counts[pairs] = counts.get(pairs, 0) + 1\n", " return counts\n", "\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(('y', 'y'), 3)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "counts = get_counts(\"ayyyy whats up 👋\")\n", "top_pair = max(counts, key=counts.get)\n", "top_pair, counts[top_pair]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((121, 121), 3)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "counts = get_counts(\"ayyyy whats up 👋\".encode('utf-8'))\n", "top_pair = max(counts, key=counts.get)\n", "top_pair, counts[top_pair]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[97,\n", " 69420,\n", " 69420,\n", " 32,\n", " 119,\n", " 104,\n", " 97,\n", " 116,\n", " 115,\n", " 32,\n", " 117,\n", " 112,\n", " 32,\n", " 240,\n", " 159,\n", " 145,\n", " 139]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def merge_token(token_pattern, text, symbol):\n", " i = 0\n", " new_text = []\n", " while i < len(text):\n", " if i + 1 < len(text) and text[i] == token_pattern[0] and text[i+1] == token_pattern[1]:\n", " new_text.append(symbol)\n", " i += 2\n", " else:\n", " new_text.append(text[i])\n", " i += 1\n", " return new_text\n", "\n", "new_text = merge_token(top_pair, \"ayyyy whats up 👋\".encode('utf-8'), 69420)\n", "\n", "new_text\n", " " ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "I’ve been on the low\n", "\n", "I been taking my time\n", "I feel\n", "[73, 226, 128, 153, 118, 101, 32, 98, 101, 101, 110, 32, 111, 110, 32, 116, 104, 101, 32, 108, 111, 119, 10, 10, 73, 32, 98, 101, 101, 110, 32, 116, 97, 107, 105, 110, 103, 32, 109, 121, 32, 116, 105, 109, 101, 10, 73, 32, 102, 101]\n", "merged (101, 32) to 256\n", "merged (116, 32) to 257\n", "merged (116, 104) to 258\n", "merged (105, 110) to 259\n", "merged (44, 32) to 260\n", "merged (115, 32) to 261\n", "merged (111, 117) to 262\n", "merged (121, 32) to 263\n", "merged (101, 114) to 264\n", "merged (100, 32) to 265\n", "merged (97, 110) to 266\n", "merged (73, 32) to 267\n", "merged (32, 258) to 268\n", "merged (97, 32) to 269\n", "merged (111, 110) to 270\n", "merged (259, 39) to 271\n", "merged (111, 119) to 272\n", "merged (111, 32) to 273\n", "merged (108, 105) to 274\n", "merged (108, 108) to 275\n", "merged (99, 107) to 276\n", "merged (101, 97) to 277\n", "merged (121, 262) to 278\n", "merged (111, 114) to 279\n", "merged (101, 110) to 280\n", "merged (268, 256) to 281\n", "merged (271, 32) to 282\n", "merged (109, 32) to 283\n", "merged (101, 10) to 284\n", "merged (32, 109) to 285\n", "merged (104, 97) to 286\n", "merged (105, 116) to 287\n", "merged (73, 39) to 288\n", "merged (107, 256) to 289\n", "merged (259, 103) to 290\n", "merged (117, 115) to 291\n", "merged (101, 101) to 292\n", "merged (226, 128) to 293\n", "merged (118, 264) to 294\n", "merged (110, 272) to 295\n", "merged (105, 103) to 296\n", "merged (105, 261) to 297\n", "merged (278, 32) to 298\n", "merged (280, 32) to 299\n", "merged (105, 257) to 300\n", "merged (116, 273) to 301\n", "merged (115, 116) to 302\n", "merged (115, 104) to 303\n", "merged (274, 289) to 304\n", "merged (288, 283) to 305\n", "merged (99, 104) to 306\n", "merged (118, 256) to 307\n", "merged (108, 97) to 308\n", "merged (293, 153) to 309\n", "merged (266, 265) to 310\n", "merged (111, 100) to 311\n", "merged (10, 267) to 312\n", "merged (117, 276) to 313\n", "merged (258, 256) to 314\n", "merged (114, 97) to 315\n", "merged (101, 115) to 316\n", "merged (116, 105) to 317\n", "merged (102, 279) to 318\n", "merged (107, 295) to 319\n", "merged (109, 256) to 320\n", "merged (101, 294) to 321\n", "merged (97, 257) to 322\n", "merged (98, 101) to 323\n", "merged (97, 275) to 324\n", "merged (101, 257) to 325\n", "merged (117, 257) to 326\n", "merged (114, 111) to 327\n", "merged (41, 10) to 328\n", "merged (102, 313) to 329\n", "merged (119, 104) to 330\n", "merged (296, 104) to 331\n", "merged (290, 32) to 332\n", "merged (119, 105) to 333\n", "merged (100, 270) to 334\n", "merged (101, 260) to 335\n", "merged (103, 111) to 336\n", "merged (264, 32) to 337\n", "merged (109, 263) to 338\n", "merged (105, 99) to 339\n", "merged (97, 114) to 340\n", "merged (39, 261) to 341\n", "merged (111, 102) to 342\n", "merged (110, 269) to 343\n", "merged (117, 112) to 344\n", "merged (285, 263) to 345\n", "merged (97, 116) to 346\n", "merged (97, 261) to 347\n", "merged (266, 32) to 348\n", "merged (260, 267) to 349\n", "merged (115, 10) to 350\n", "merged (101, 265) to 351\n", "merged (108, 32) to 352\n", "merged (119, 266) to 353\n", "merged (110, 265) to 354\n", "merged (333, 258) to 355\n", "merged (108, 111) to 356\n", "merged (276, 32) to 357\n", "merged (100, 10) to 358\n", "merged (272, 32) to 359\n", "merged (114, 32) to 360\n", "merged (102, 292) to 361\n", "merged (98, 256) to 362\n", "merged (111, 109) to 363\n", "merged (39, 257) to 364\n", "merged (101, 263) to 365\n", "merged (319, 32) to 366\n", "merged (98, 311) to 367\n", "merged (291, 257) to 368\n", "merged (116, 10) to 369\n", "merged (98, 97) to 370\n", "merged (323, 299) to 371\n", "merged (367, 263) to 372\n", "merged (114, 277) to 373\n", "merged (108, 256) to 374\n", "merged (97, 108) to 375\n", "merged (110, 111) to 376\n", "merged (32, 269) to 377\n", "merged (97, 109) to 378\n", "merged (353, 343) to 379\n", "merged (93, 10) to 380\n", "merged (277, 104) to 381\n", "merged (260, 98) to 382\n", "merged (10, 66) to 383\n", "merged (10, 91) to 384\n", "merged (101, 261) to 385\n", "merged (87, 104) to 386\n", "merged (100, 105) to 387\n", "merged (65, 354) to 388\n", "merged (286, 116) to 389\n", "merged (111, 112) to 390\n", "merged (106, 368) to 391\n", "merged (275, 32) to 392\n", "merged (89, 262) to 393\n", "merged (108, 265) to 394\n", "merged (309, 257) to 395\n", "merged (111, 258) to 396\n", "merged (116, 116) to 397\n", "merged (321, 121) to 398\n", "merged (315, 112) to 399\n", "merged (117, 110) to 400\n", "merged (262, 257) to 401\n", "merged (97, 121) to 402\n", "merged (73, 83) to 403\n", "merged (109, 259) to 404\n", "merged (105, 100) to 405\n", "merged (111, 103) to 406\n", "merged (105, 115) to 407\n", "merged (114, 101) to 408\n", "merged (10, 84) to 409\n", "merged (103, 104) to 410\n", "merged (97, 115) to 411\n", "merged (258, 322) to 412\n", "merged (116, 101) to 413\n", "merged (117, 114) to 414\n", "merged (331, 257) to 415\n", "merged (103, 325) to 416\n", "merged (321, 32) to 417\n", "merged (104, 256) to 418\n", "merged (73, 309) to 419\n", "merged (274, 118) to 420\n", "merged (97, 263) to 421\n", "merged (116, 111) to 422\n", "merged (271, 257) to 423\n", "merged (334, 364) to 424\n", "merged (318, 32) to 425\n", "merged (272, 110) to 426\n", "merged (259, 281) to 427\n", "merged (97, 45) to 428\n", "merged (101, 118) to 429\n", "merged (264, 256) to 430\n", "merged (97, 100) to 431\n", "merged (334, 395) to 432\n", "merged (115, 273) to 433\n", "merged (10, 76) to 434\n", "merged (260, 119) to 435\n", "merged (306, 105) to 436\n", "merged (393, 32) to 437\n", "merged (114, 256) to 438\n", "merged (105, 275) to 439\n", "merged (97, 423) to 440\n", "merged (361, 352) to 441\n", "merged (270, 32) to 442\n", "merged (262, 410) to 443\n", "merged (268, 322) to 444\n", "merged (107, 32) to 445\n", "merged (342, 32) to 446\n", "merged (111, 111) to 447\n", "merged (270, 281) to 448\n", "merged (317, 109) to 449\n", "merged (284, 267) to 450\n", "merged (10, 388) to 451\n", "merged (258, 332) to 452\n", "merged (114, 105) to 453\n", "merged (286, 257) to 454\n", "merged (406, 339) to 455\n", "merged (10, 83) to 456\n", "merged (288, 109) to 457\n", "merged (262, 394) to 458\n", "merged (108, 272) to 459\n", "merged (99, 324) to 460\n", "merged (97, 98) to 461\n", "merged (101, 109) to 462\n", "merged (361, 108) to 463\n", "merged (121, 381) to 464\n", "merged (264, 115) to 465\n", "merged (318, 281) to 466\n", "merged (121, 260) to 467\n", "merged (419, 283) to 468\n", "merged (104, 359) to 469\n", "merged (296, 103) to 470\n", "merged (116, 97) to 471\n", "merged (274, 102) to 472\n", "merged (97, 259) to 473\n", "merged (303, 300) to 474\n", "merged (278, 360) to 475\n", "merged (264, 261) to 476\n", "merged (69, 294) to 477\n", "merged (287, 260) to 478\n", "merged (101, 283) to 479\n", "merged (389, 341) to 480\n", "merged (63, 10) to 481\n", "merged (258, 365) to 482\n", "merged (104, 277) to 483\n", "merged (67, 104) to 484\n", "merged (89, 381) to 485\n", "merged (108, 263) to 486\n", "merged (58, 32) to 487\n", "merged (46, 32) to 488\n", "merged (293, 148) to 489\n", "merged (100, 260) to 490\n", "merged (97, 99) to 491\n", "merged (441, 304) to 492\n", "merged (268, 297) to 493\n", "merged (259, 32) to 494\n", "merged (336, 257) to 495\n", "merged (326, 267) to 496\n", "merged (376, 257) to 497\n", "merged (99, 97) to 498\n", "merged (107, 282) to 499\n", "merged (370, 357) to 500\n", "merged (109, 266) to 501\n", "merged (76, 455) to 502\n", "merged (287, 10) to 503\n", "merged (100, 426) to 504\n", "merged (100, 264) to 505\n", "merged (108, 101) to 506\n", "merged (86, 465) to 507\n", "merged (116, 260) to 508\n", "merged (101, 275) to 509\n", "merged (121, 372) to 510\n", "merged (110, 273) to 511\n", "merged (112, 32) to 512\n", "merged (116, 114) to 513\n", "merged (103, 105) to 514\n", "merged (344, 32) to 515\n", "merged (98, 263) to 516\n", "merged (287, 306) to 517\n", "merged (10, 71) to 518\n", "merged (258, 346) to 519\n", "merged (102, 32) to 520\n", "merged (324, 32) to 521\n", "merged (110, 292) to 522\n", "merged (260, 305) to 523\n", "merged (403, 403) to 524\n", "merged (268, 365) to 525\n", "merged (330, 299) to 526\n", "merged (384, 507) to 527\n", "merged (105, 350) to 528\n", "merged (97, 10) to 529\n", "merged (422, 281) to 530\n", "merged (116, 261) to 531\n", "merged (457, 269) to 532\n", "merged (291, 256) to 533\n", "merged (398, 372) to 534\n", "merged (39, 438) to 535\n", "merged (112, 112) to 536\n", "merged (114, 331) to 537\n", "merged (110, 417) to 538\n", "merged (397, 269) to 539\n", "merged (102, 259) to 540\n", "merged (109, 279) to 541\n", "merged (102, 327) to 542\n", "merged (285, 256) to 543\n", "merged (100, 111) to 544\n", "merged (117, 109) to 545\n", "merged (100, 273) to 546\n", "merged (104, 297) to 547\n", "merged (436, 107) to 548\n", "merged (449, 385) to 549\n", "merged (329, 282) to 550\n", "merged (356, 307) to 551\n", "merged (308, 357) to 552\n", "merged (472, 256) to 553\n", "merged (115, 97) to 554\n", "merged (116, 264) to 555\n", "merged (104, 111) to 556\n", "merged (101, 256) to 557\n", "merged (527, 256) to 558\n", "merged (269, 115) to 559\n", "merged (103, 32) to 560\n", "merged (269, 109) to 561\n", "merged (256, 310) to 562\n", "merged (99, 348) to 563\n", "merged (379, 362) to 564\n", "merged (97, 420) to 565\n", "merged (270, 284) to 566\n", "merged (336, 539) to 567\n", "merged (330, 430) to 568\n", "merged (398, 452) to 569\n", "merged (110, 97) to 570\n", "merged (73, 110) to 571\n", "merged (100, 337) to 572\n", "merged (98, 108) to 573\n", "merged (277, 360) to 574\n", "merged (119, 454) to 575\n", "merged (111, 116) to 576\n", "merged (522, 265) to 577\n", "merged (10, 72) to 578\n", "merged (97, 260) to 579\n", "merged (39, 479) to 580\n", "merged (115, 260) to 581\n", "merged (258, 297) to 582\n", "merged (119, 279) to 583\n", "merged (307, 371) to 584\n", "merged (108, 325) to 585\n", "merged (99, 266) to 586\n", "merged (286, 265) to 587\n", "merged (316, 257) to 588\n", "merged (111, 265) to 589\n", "merged (101, 108) to 590\n", "merged (119, 347) to 591\n", "merged (287, 341) to 592\n", "merged (111, 320) to 593\n", "merged (108, 121) to 594\n", "merged (115, 105) to 595\n", "merged (114, 262) to 596\n", "merged (355, 32) to 597\n", "merged (484, 279) to 598\n", "merged (598, 291) to 599\n", "merged (119, 256) to 600\n", "merged (463, 282) to 601\n", "merged (428, 548) to 602\n", "merged (109, 284) to 603\n", "merged (386, 111) to 604\n", "merged (290, 260) to 605\n", "merged (329, 32) to 606\n", "merged (316, 115) to 607\n", "merged (312, 371) to 608\n", "merged (98, 326) to 609\n", "merged (355, 377) to 610\n", "merged (104, 260) to 611\n", "merged (285, 396) to 612\n", "merged (108, 277) to 613\n", "merged (97, 103) to 614\n", "merged (111, 98) to 615\n", "merged (100, 263) to 616\n", "merged (110, 470) to 617\n", "merged (270, 256) to 618\n", "merged (115, 112) to 619\n", "merged (39, 32) to 620\n", "merged (304, 269) to 621\n", "merged (477, 510) to 622\n", "merged (99, 256) to 623\n", "merged (63, 32) to 624\n", "merged (10, 386) to 625\n", "merged (317, 270) to 626\n", "merged (10, 70) to 627\n", "merged (66, 326) to 628\n", "merged (277, 114) to 629\n", "merged (99, 363) to 630\n", "merged (10, 78) to 631\n", "merged (110, 101) to 632\n", "merged (98, 517) to 633\n", "merged (498, 533) to 634\n", "merged (487, 502) to 635\n", "merged (370, 276) to 636\n", "merged (10, 77) to 637\n", "merged (101, 390) to 638\n", "merged (105, 108) to 639\n", "merged (278, 535) to 640\n", "merged (99, 262) to 641\n", "merged (524, 260) to 642\n", "merged (480, 344) to 643\n", "merged (338, 404) to 644\n", "merged (308, 413) to 645\n", "merged (121, 10) to 646\n", "merged (363, 101) to 647\n", "merged (340, 256) to 648\n", "merged (105, 114) to 649\n", "merged (100, 114) to 650\n", "merged (114, 415) to 651\n", "merged (262, 369) to 652\n", "merged (429, 299) to 653\n", "merged (104, 32) to 654\n", "merged (542, 283) to 655\n", "merged (329, 476) to 656\n", "merged (313, 32) to 657\n", "merged (262, 354) to 658\n", "merged (111, 294) to 659\n", "merged (355, 281) to 660\n", "merged (103, 273) to 661\n", "merged (107, 284) to 662\n", "merged (268, 346) to 663\n", "merged (107, 439) to 664\n", "merged (107, 105) to 665\n", "merged (373, 306) to 666\n", "merged (604, 97) to 667\n", "merged (320, 416) to 668\n", "merged (115, 647) to 669\n", "merged (109, 335) to 670\n", "merged (309, 261) to 671\n", "merged (104, 363) to 672\n", "merged (258, 290) to 673\n", "merged (513, 121) to 674\n", "merged (286, 114) to 675\n", "merged (116, 327) to 676\n", "merged (259, 345) to 677\n", "merged (382, 326) to 678\n", "merged (435, 256) to 679\n", "merged (344, 260) to 680\n", "merged (399, 32) to 681\n", "merged (111, 107) to 682\n", "merged (10, 393) to 683\n", "merged (100, 101) to 684\n", "merged (596, 110) to 685\n", "merged (519, 341) to 686\n", "merged (384, 599) to 687\n", "merged (118, 105) to 688\n", "merged (303, 287) to 689\n", "merged (10, 79) to 690\n", "merged (317, 320) to 691\n", "merged (40, 66) to 692\n", "merged (102, 491) to 693\n", "merged (98, 277) to 694\n", "merged (40, 87) to 695\n", "merged (303, 458) to 696\n", "merged (71, 311) to 697\n", "merged (40, 667) to 698\n", "merged (447, 33) to 699\n", "merged (668, 644) to 700\n", "merged (342, 345) to 701\n", "merged (73, 257) to 702\n", "merged (473, 395) to 703\n", "merged (10, 65) to 704\n", "merged (396, 337) to 705\n", "merged (112, 308) to 706\n", "merged (10, 437) to 707\n", "merged (324, 263) to 708\n", "merged (330, 273) to 709\n", "merged (514, 307) to 710\n", "merged (41, 312) to 711\n", "merged (118, 282) to 712\n", "merged (304, 258) to 713\n", "merged (119, 270) to 714\n", "merged (116, 282) to 715\n", "merged (110, 32) to 716\n", "merged (518, 325) to 717\n", "merged (276, 101) to 718\n", "merged (108, 359) to 719\n", "merged (119, 421) to 720\n", "merged (119, 402) to 721\n", "merged (270, 345) to 722\n", "merged (39, 634) to 723\n", "merged (635, 380) to 724\n", "merged (602, 602) to 725\n", "merged (383, 326) to 726\n", "merged (103, 270) to 727\n", "merged (111, 121) to 728\n", "merged (316, 256) to 729\n", "merged (112, 638) to 730\n", "merged (99, 458) to 731\n", "merged (351, 301) to 732\n", "merged (308, 276) to 733\n", "merged (270, 486) to 734\n", "merged (308, 307) to 735\n", "merged (698, 328) to 736\n", "merged (399, 382) to 737\n", "merged (109, 348) to 738\n", "merged (379, 387) to 739\n", "merged (270, 101) to 740\n", "merged (461, 401) to 741\n", "merged (274, 410) to 742\n", "merged (258, 264) to 743\n", "merged (108, 100) to 744\n", "merged (108, 270) to 745\n", "merged (674, 343) to 746\n", "merged (266, 121) to 747\n", "merged (526, 267) to 748\n", "merged (264, 656) to 749\n", "merged (340, 658) to 750\n", "merged (434, 105) to 751\n", "merged (302, 390) to 752\n", "merged (681, 303) to 753\n", "merged (66, 327) to 754\n", "merged (317, 718) to 755\n", "merged (755, 531) to 756\n", "merged (52, 48) to 757\n", "merged (375, 433) to 758\n", "merged (274, 662) to 759\n", "merged (105, 109) to 760\n", "merged (615, 516) to 761\n", "merged (97, 289) to 762\n", "merged (464, 260) to 763\n", "merged (104, 300) to 764\n", "merged (111, 394) to 765\n", "merged (101, 392) to 766\n", "merged (295, 32) to 767\n", "merged (10, 485) to 768\n", "merged (296, 32) to 769\n", "merged (107, 292) to 770\n", "merged (271, 260) to 771\n", "merged (267, 109) to 772\n", "merged (483, 114) to 773\n", "merged (266, 100) to 774\n", "merged (356, 118) to 775\n", "merged (97, 329) to 776\n", "merged (114, 605) to 777\n", "merged (286, 109) to 778\n", "merged (10, 87) to 779\n", "merged (303, 32) to 780\n", "merged (549, 466) to 781\n", "merged (642, 642) to 782\n", "merged (260, 338) to 783\n", "merged (434, 325) to 784\n", "merged (492, 338) to 785\n", "merged (107, 259) to 786\n", "merged (258, 259) to 787\n", "merged (104, 264) to 788\n", "merged (409, 104) to 789\n", "merged (115, 257) to 790\n", "merged (256, 40) to 791\n", "merged (267, 115) to 792\n", "merged (384, 571) to 793\n", "merged (713, 528) to 794\n", "merged (287, 256) to 795\n", "merged (612, 749) to 796\n", "merged (101, 100) to 797\n", "merged (271, 268) to 798\n", "merged (114, 443) to 799\n", "merged (108, 117) to 800\n", "merged (102, 117) to 801\n", "merged (100, 378) to 802\n", "merged (557, 753) to 803\n", "merged (803, 272) to 804\n", "merged (804, 261) to 805\n", "merged (805, 110) to 806\n", "merged (806, 574) to 807\n", "merged (807, 754) to 808\n", "merged (808, 682) to 809\n", "merged (809, 594) to 810\n", "merged (810, 110) to 811\n", "merged (811, 717) to 812\n", "merged (812, 756) to 813\n", "merged (813, 347) to 814\n", "merged (814, 719) to 815\n", "merged (815, 347) to 816\n", "merged (816, 36) to 817\n", "merged (817, 757) to 818\n", "merged (818, 683) to 819\n", "merged (819, 285) to 820\n", "merged (820, 415) to 821\n", "merged (821, 758) to 822\n", "merged (822, 759) to 823\n", "merged (306, 266) to 824\n", "merged (382, 496) to 825\n", "merged (471, 289) to 826\n", "merged (114, 292) to 827\n", "merged (99, 315) to 828\n", "merged (101, 397) to 829\n", "merged (301, 98) to 830\n", "merged (112, 114) to 831\n", "merged (586, 364) to 832\n", "merged (105, 107) to 833\n", "merged (117, 99) to 834\n", "merged (259, 377) to 835\n", "merged (406, 32) to 836\n", "merged (302, 105) to 837\n", "merged (288, 584) to 838\n", "merged (777, 777) to 839\n", "merged (10, 622) to 840\n", "merged (695, 699) to 841\n", "merged (121, 349) to 842\n", "merged (537, 369) to 843\n", "merged (562, 541) to 844\n", "merged (339, 576) to 845\n", "merged (845, 259) to 846\n", "merged (297, 375) to 847\n", "merged (358, 702) to 848\n", "merged (785, 553) to 849\n", "merged (565, 450) to 850\n", "merged (429, 280) to 851\n", "merged (312, 366) to 852\n", "merged (278, 114) to 853\n", "merged (353, 257) to 854\n", "merged (284, 78) to 855\n", "merged (400, 505) to 856\n", "merged (331, 116) to 857\n", "merged (383, 496) to 858\n", "merged (76, 105) to 859\n", "merged (119, 114) to 860\n", "merged (356, 111) to 861\n", "merged (100, 402) to 862\n", "merged (70, 657) to 863\n", "merged (672, 105) to 864\n", "merged (99, 593) to 865\n", "merged (105, 374) to 866\n", "merged (316, 260) to 867\n", "merged (310, 314) to 868\n", "merged (111, 257) to 869\n", "merged (97, 102) to 870\n", "merged (116, 588) to 871\n", "merged (389, 256) to 872\n", "merged (114, 400) to 873\n", "merged (279, 32) to 874\n", "merged (420, 282) to 875\n", "merged (39, 462) to 876\n", "merged (102, 378) to 877\n", "merged (67, 97) to 878\n", "merged (402, 362) to 879\n", "merged (101, 341) to 880\n", "merged (115, 109) to 881\n", "merged (110, 364) to 882\n", "merged (98, 111) to 883\n", "merged (65, 121) to 884\n", "merged (100, 103) to 885\n", "merged (80, 666) to 886\n", "merged (286, 307) to 887\n", "merged (112, 105) to 888\n", "merged (469, 301) to 889\n", "merged (335, 534) to 890\n", "merged (841, 328) to 891\n", "merged (836, 39) to 892\n", "merged (892, 685) to 893\n", "merged (893, 100) to 894\n", "merged (894, 489) to 895\n", "merged (655, 97) to 896\n", "merged (896, 489) to 897\n", "merged (74, 316) to 898\n", "merged (595, 302) to 899\n", "merged (115, 270) to 900\n", "merged (569, 847) to 901\n", "merged (419, 584) to 902\n", "merged (107, 290) to 903\n", "merged (492, 468) to 904\n", "merged (401, 701) to 905\n", "merged (404, 284) to 906\n", "merged (116, 375) to 907\n", "merged (112, 315) to 908\n", "merged (303, 359) to 909\n", "merged (376, 372) to 910\n", "merged (789, 365) to 911\n", "merged (115, 421) to 912\n", "merged (298, 301) to 913\n", "merged (625, 299) to 914\n", "merged (316, 261) to 915\n", "merged (71, 589) to 916\n", "merged (568, 298) to 917\n", "merged (274, 397) to 918\n", "merged (918, 374) to 919\n", "merged (540, 708) to 920\n", "merged (676, 380) to 921\n", "merged (10, 558) to 922\n", "merged (370, 98) to 923\n", "merged (427, 99) to 924\n", "merged (331, 369) to 925\n", "merged (802, 110) to 926\n", "merged (301, 109) to 927\n", "merged (116, 390) to 928\n", "merged (115, 593) to 929\n", "merged (83, 823) to 930\n", "merged (632, 119) to 931\n", "merged (98, 356) to 932\n", "merged (109, 396) to 933\n", "merged (269, 98) to 934\n", "merged (268, 443) to 935\n", "merged (114, 405) to 936\n", "merged (101, 99) to 937\n", "merged (280, 265) to 938\n", "merged (117, 116) to 939\n", "merged (334, 256) to 940\n", "merged (630, 282) to 941\n", "merged (303, 503) to 942\n", "merged (493, 474) to 943\n", "merged (84, 104) to 944\n", "merged (115, 256) to 945\n", "merged (304, 314) to 946\n", "merged (114, 272) to 947\n", "merged (730, 374) to 948\n", "merged (109, 281) to 949\n", "merged (516, 314) to 950\n", "merged (10, 305) to 951\n", "merged (112, 291) to 952\n", "merged (271, 281) to 953\n", "merged (614, 473) to 954\n", "merged (297, 314) to 955\n", "merged (346, 116) to 956\n", "merged (10, 68) to 957\n", "merged (336, 282) to 958\n", "merged (884, 121) to 959\n", "merged (349, 371) to 960\n", "merged (121, 111) to 961\n", "merged (961, 620) to 962\n", "merged (83, 112) to 963\n", "merged (97, 302) to 964\n", "merged (116, 351) to 965\n", "merged (407, 105) to 966\n", "merged (573, 292) to 967\n", "merged (112, 264) to 968\n", "merged (318, 268) to 969\n", "merged (85, 611) to 970\n", "merged (100, 97) to 971\n", "merged (844, 844) to 972\n", "merged (83, 101) to 973\n", "merged (902, 448) to 974\n", "merged (974, 459) to 975\n", "merged (608, 471) to 976\n", "merged (976, 903) to 977\n", "merged (977, 345) to 978\n", "merged (978, 449) to 979\n", "merged (979, 450) to 980\n", "merged (980, 904) to 981\n", "merged (981, 905) to 982\n", "merged (982, 404) to 983\n", "merged (983, 848) to 984\n", "merged (984, 849) to 985\n", "merged (985, 703) to 986\n", "merged (986, 906) to 987\n", "merged (116, 311) to 988\n", "merged (413, 392) to 989\n", "merged (98, 401) to 990\n", "merged (787, 107) to 991\n", "merged (112, 104) to 992\n", "merged (853, 285) to 993\n", "merged (99, 105) to 994\n", "merged (565, 284) to 995\n", "merged (10, 73) to 996\n", "merged (102, 649) to 997\n", "merged (98, 373) to 998\n", "merged (275, 268) to 999\n", "merged (100, 421) to 1000\n", "merged (651, 295) to 1001\n", "merged (920, 564) to 1002\n", "merged (409, 418) to 1003\n", "merged (111, 118) to 1004\n", "merged (103, 261) to 1005\n", "merged (114, 263) to 1006\n", "merged (336, 589) to 1007\n", "merged (260, 298) to 1008\n", "merged (112, 327) to 1009\n", "merged (363, 109) to 1010\n", "merged (413, 275) to 1011\n", "merged (275, 263) to 1012\n", "merged (98, 105) to 1013\n", "merged (281, 110) to 1014\n", "merged (99, 340) to 1015\n", "merged (256, 298) to 1016\n", "merged (613, 307) to 1017\n", "merged (111, 260) to 1018\n", "merged (376, 260) to 1019\n", "merged (266, 705) to 1020\n", "merged (121, 629) to 1021\n", "merged (105, 258) to 1022\n", "merged (66, 761) to 1023\n", "[975, 10, 987, 386, 273, 563, 408, 645, 481, 975, 987, 312, 432, 564, 850, 432, 564, 850, 391, 739, 256, 988, 402, 312, 391, 739, 450, 432, 564, 850, 432, 564, 850, 391, 739, 284, 388, 585, 320, 989, 298, 330, 646, 704, 275, 493, 705, 474, 468, 907]\n" ] } ], "source": [ "with open(\"logic_lyrics.txt\", \"r\") as f:\n", " text = f.read()\n", "\n", "print(text[:50])\n", "\n", "new_text = list(text.encode('utf-8'))\n", "\n", "print(new_text[:50])\n", "\n", "text = text.encode('utf-8')\n", "\n", "\n", "old_vocab_size = 256 # utf-8 has 256 characters\n", "new_vocab_size = 1024 # arbitrary number to increase vocab to\n", "\n", "num_merges = new_vocab_size - old_vocab_size\n", "\n", "merge = {}\n", "\n", "for i in range(num_merges):\n", " counts = get_counts(new_text)\n", " top_pair = max(counts, key=counts.get)\n", " merge[top_pair] = i + old_vocab_size\n", " new_text = merge_token(top_pair, new_text, i + old_vocab_size)\n", " print(f\"merged {top_pair} to {i + old_vocab_size}\")\n", "\n", "\n", "print(new_text[:50])\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "original length: 43508\n", "tokens length: 14469\n", "compression ratio: 0.33255952928197113\n", "[975, 10, 987, 386, 273, 563, 408, 645, 481, 975, 987, 312, 432, 564, 850, 432, 564, 850, 391, 739, 256, 988, 402, 312, 391, 739, 450, 432, 564, 850, 432, 564, 850, 391, 739, 284, 388, 585, 320, 989, 298, 330, 646, 704, 275, 493, 705, 474, 468, 907]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Vocab size: 256, Compression Ratio: 1.0\n", "Vocab size: 288, Compression Ratio: 0.7335432564126138\n", "Vocab size: 320, Compression Ratio: 0.6502252459317827\n", "Vocab size: 352, Compression Ratio: 0.5953617725475775\n", "Vocab size: 384, Compression Ratio: 0.5540590236278385\n", "Vocab size: 416, Compression Ratio: 0.5220419233244461\n", "Vocab size: 448, Compression Ratio: 0.49868989611106\n", "Vocab size: 480, Compression Ratio: 0.4790613220557139\n", "Vocab size: 512, Compression Ratio: 0.46207594005700103\n", "Vocab size: 544, Compression Ratio: 0.4472051117035947\n", "Vocab size: 576, Compression Ratio: 0.43405810425668845\n", "Vocab size: 608, Compression Ratio: 0.4221062793049554\n", "Vocab size: 640, Compression Ratio: 0.4114645582421624\n", "Vocab size: 672, Compression Ratio: 0.40185712972326926\n", "Vocab size: 704, Compression Ratio: 0.3931001195182495\n", "Vocab size: 736, Compression Ratio: 0.38510159051208975\n", "Vocab size: 768, Compression Ratio: 0.37767766847476325\n", "Vocab size: 800, Compression Ratio: 0.37073641629125675\n", "Vocab size: 832, Compression Ratio: 0.364116944010297\n", "Vocab size: 864, Compression Ratio: 0.3579571573044038\n", "Vocab size: 896, Compression Ratio: 0.3520731819435506\n", "Vocab size: 928, Compression Ratio: 0.3467867978302841\n", "Vocab size: 960, Compression Ratio: 0.34163831938953754\n", "Vocab size: 992, Compression Ratio: 0.33697251080261104\n", "Vocab size: 1024, Compression Ratio: 0.33255952928197113\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "\n", "\n", "tokens_length = len(new_text)\n", "\n", "original_length = len(text)\n", "\n", "print(f\"original length: {original_length}\")\n", "print(f\"tokens length: {tokens_length}\")\n", "print(f\"compression ratio: {tokens_length/original_length}\")\n", "\n", "\n", "def compression_ratio(text, new_vocab_size):\n", " new_text = text\n", " old_vocab_size = 256\n", " num_merges = new_vocab_size - old_vocab_size\n", " merge = {}\n", " for i in range(num_merges):\n", " counts = get_counts(new_text)\n", " top_pair = max(counts, key=counts.get)\n", " merge[top_pair] = i + old_vocab_size\n", " new_text = merge_token(top_pair, new_text, i + old_vocab_size)\n", " return len(new_text)/len(text)\n", "\n", "print(new_text[:50])\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "def plot_compression_ratio(new_text, max_new_vocab_size):\n", " x = [] # New vocabulary sizes\n", " y = [] # Corresponding compression ratios\n", "\n", " for new_vocab_size in range(256, max_new_vocab_size + 1, 32):\n", " ratio = compression_ratio(text, new_vocab_size)\n", " x.append(new_vocab_size)\n", " y.append(ratio)\n", " print(f\"Vocab size: {new_vocab_size}, Compression Ratio: {ratio}\")\n", "\n", " plt.figure(figsize=(10, 6))\n", " plt.plot(x, y, marker='o')\n", " plt.title('Compression Ratio vs. Vocabulary Size')\n", " plt.xlabel('Vocabulary Size')\n", " plt.ylabel('Compression Ratio')\n", " plt.grid(True)\n", " plt.show()\n", "\n", "plot_compression_ratio(text, 1024)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'%'" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "vocab = {i: bytes([i]) for i in range(256)}\n", "\n", "for (pair0, pair1), symbol in merge.items():\n", " \n", " vocab[symbol] = vocab[pair0] + vocab[pair1]\n", "\n", "\n", "\n", "\n", "def decode_sequence(sequence):\n", " bitstring = b\"\".join([vocab[token] for token in sequence])\n", " return bitstring.decode('utf-8', errors='replace')\n", "\n", "decode_sequence([37])\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{(101, 32): 256, (116, 32): 257, (116, 104): 258, (105, 110): 259, (44, 32): 260, (115, 32): 261, (111, 117): 262, (121, 32): 263, (101, 114): 264, (100, 32): 265, (97, 110): 266, (73, 32): 267, (32, 258): 268, (97, 32): 269, (111, 110): 270, (259, 39): 271, (111, 119): 272, (111, 32): 273, (108, 105): 274, (108, 108): 275, (99, 107): 276, (101, 97): 277, (121, 262): 278, (111, 114): 279, (101, 110): 280, (268, 256): 281, (271, 32): 282, (109, 32): 283, (101, 10): 284, (32, 109): 285, (104, 97): 286, (105, 116): 287, (73, 39): 288, (107, 256): 289, (259, 103): 290, (117, 115): 291, (101, 101): 292, (226, 128): 293, (118, 264): 294, (110, 272): 295, (105, 103): 296, (105, 261): 297, (278, 32): 298, (280, 32): 299, (105, 257): 300, (116, 273): 301, (115, 116): 302, (115, 104): 303, (274, 289): 304, (288, 283): 305, (99, 104): 306, (118, 256): 307, (108, 97): 308, (293, 153): 309, (266, 265): 310, (111, 100): 311, (10, 267): 312, (117, 276): 313, (258, 256): 314, (114, 97): 315, (101, 115): 316, (116, 105): 317, (102, 279): 318, (107, 295): 319, (109, 256): 320, (101, 294): 321, (97, 257): 322, (98, 101): 323, (97, 275): 324, (101, 257): 325, (117, 257): 326, (114, 111): 327, (41, 10): 328, (102, 313): 329, (119, 104): 330, (296, 104): 331, (290, 32): 332, (119, 105): 333, (100, 270): 334, (101, 260): 335, (103, 111): 336, (264, 32): 337, (109, 263): 338, (105, 99): 339, (97, 114): 340, (39, 261): 341, (111, 102): 342, (110, 269): 343, (117, 112): 344, (285, 263): 345, (97, 116): 346, (97, 261): 347, (266, 32): 348, (260, 267): 349, (115, 10): 350, (101, 265): 351, (108, 32): 352, (119, 266): 353, (110, 265): 354, (333, 258): 355, (108, 111): 356, (276, 32): 357, (100, 10): 358, (272, 32): 359, (114, 32): 360, (102, 292): 361, (98, 256): 362, (111, 109): 363, (39, 257): 364, (101, 263): 365, (319, 32): 366, (98, 311): 367, (291, 257): 368, (116, 10): 369, (98, 97): 370, (323, 299): 371, (367, 263): 372, (114, 277): 373, (108, 256): 374, (97, 108): 375, (110, 111): 376, (32, 269): 377, (97, 109): 378, (353, 343): 379, (93, 10): 380, (277, 104): 381, (260, 98): 382, (10, 66): 383, (10, 91): 384, (101, 261): 385, (87, 104): 386, (100, 105): 387, (65, 354): 388, (286, 116): 389, (111, 112): 390, (106, 368): 391, (275, 32): 392, (89, 262): 393, (108, 265): 394, (309, 257): 395, (111, 258): 396, (116, 116): 397, (321, 121): 398, (315, 112): 399, (117, 110): 400, (262, 257): 401, (97, 121): 402, (73, 83): 403, (109, 259): 404, (105, 100): 405, (111, 103): 406, (105, 115): 407, (114, 101): 408, (10, 84): 409, (103, 104): 410, (97, 115): 411, (258, 322): 412, (116, 101): 413, (117, 114): 414, (331, 257): 415, (103, 325): 416, (321, 32): 417, (104, 256): 418, (73, 309): 419, (274, 118): 420, (97, 263): 421, (116, 111): 422, (271, 257): 423, (334, 364): 424, (318, 32): 425, (272, 110): 426, (259, 281): 427, (97, 45): 428, (101, 118): 429, (264, 256): 430, (97, 100): 431, (334, 395): 432, (115, 273): 433, (10, 76): 434, (260, 119): 435, (306, 105): 436, (393, 32): 437, (114, 256): 438, (105, 275): 439, (97, 423): 440, (361, 352): 441, (270, 32): 442, (262, 410): 443, (268, 322): 444, (107, 32): 445, (342, 32): 446, (111, 111): 447, (270, 281): 448, (317, 109): 449, (284, 267): 450, (10, 388): 451, (258, 332): 452, (114, 105): 453, (286, 257): 454, (406, 339): 455, (10, 83): 456, (288, 109): 457, (262, 394): 458, (108, 272): 459, (99, 324): 460, (97, 98): 461, (101, 109): 462, (361, 108): 463, (121, 381): 464, (264, 115): 465, (318, 281): 466, (121, 260): 467, (419, 283): 468, (104, 359): 469, (296, 103): 470, (116, 97): 471, (274, 102): 472, (97, 259): 473, (303, 300): 474, (278, 360): 475, (264, 261): 476, (69, 294): 477, (287, 260): 478, (101, 283): 479, (389, 341): 480, (63, 10): 481, (258, 365): 482, (104, 277): 483, (67, 104): 484, (89, 381): 485, (108, 263): 486, (58, 32): 487, (46, 32): 488, (293, 148): 489, (100, 260): 490, (97, 99): 491, (441, 304): 492, (268, 297): 493, (259, 32): 494, (336, 257): 495, (326, 267): 496, (376, 257): 497, (99, 97): 498, (107, 282): 499, (370, 357): 500, (109, 266): 501, (76, 455): 502, (287, 10): 503, (100, 426): 504, (100, 264): 505, (108, 101): 506, (86, 465): 507, (116, 260): 508, (101, 275): 509, (121, 372): 510, (110, 273): 511, (112, 32): 512, (116, 114): 513, (103, 105): 514, (344, 32): 515, (98, 263): 516, (287, 306): 517, (10, 71): 518, (258, 346): 519, (102, 32): 520, (324, 32): 521, (110, 292): 522, (260, 305): 523, (403, 403): 524, (268, 365): 525, (330, 299): 526, (384, 507): 527, (105, 350): 528, (97, 10): 529, (422, 281): 530, (116, 261): 531, (457, 269): 532, (291, 256): 533, (398, 372): 534, (39, 438): 535, (112, 112): 536, (114, 331): 537, (110, 417): 538, (397, 269): 539, (102, 259): 540, (109, 279): 541, (102, 327): 542, (285, 256): 543, (100, 111): 544, (117, 109): 545, (100, 273): 546, (104, 297): 547, (436, 107): 548, (449, 385): 549, (329, 282): 550, (356, 307): 551, (308, 357): 552, (472, 256): 553, (115, 97): 554, (116, 264): 555, (104, 111): 556, (101, 256): 557, (527, 256): 558, (269, 115): 559, (103, 32): 560, (269, 109): 561, (256, 310): 562, (99, 348): 563, (379, 362): 564, (97, 420): 565, (270, 284): 566, (336, 539): 567, (330, 430): 568, (398, 452): 569, (110, 97): 570, (73, 110): 571, (100, 337): 572, (98, 108): 573, (277, 360): 574, (119, 454): 575, (111, 116): 576, (522, 265): 577, (10, 72): 578, (97, 260): 579, (39, 479): 580, (115, 260): 581, (258, 297): 582, (119, 279): 583, (307, 371): 584, (108, 325): 585, (99, 266): 586, (286, 265): 587, (316, 257): 588, (111, 265): 589, (101, 108): 590, (119, 347): 591, (287, 341): 592, (111, 320): 593, (108, 121): 594, (115, 105): 595, (114, 262): 596, (355, 32): 597, (484, 279): 598, (598, 291): 599, (119, 256): 600, (463, 282): 601, (428, 548): 602, (109, 284): 603, (386, 111): 604, (290, 260): 605, (329, 32): 606, (316, 115): 607, (312, 371): 608, (98, 326): 609, (355, 377): 610, (104, 260): 611, (285, 396): 612, (108, 277): 613, (97, 103): 614, (111, 98): 615, (100, 263): 616, (110, 470): 617, (270, 256): 618, (115, 112): 619, (39, 32): 620, (304, 269): 621, (477, 510): 622, (99, 256): 623, (63, 32): 624, (10, 386): 625, (317, 270): 626, (10, 70): 627, (66, 326): 628, (277, 114): 629, (99, 363): 630, (10, 78): 631, (110, 101): 632, (98, 517): 633, (498, 533): 634, (487, 502): 635, (370, 276): 636, (10, 77): 637, (101, 390): 638, (105, 108): 639, (278, 535): 640, (99, 262): 641, (524, 260): 642, (480, 344): 643, (338, 404): 644, (308, 413): 645, (121, 10): 646, (363, 101): 647, (340, 256): 648, (105, 114): 649, (100, 114): 650, (114, 415): 651, (262, 369): 652, (429, 299): 653, (104, 32): 654, (542, 283): 655, (329, 476): 656, (313, 32): 657, (262, 354): 658, (111, 294): 659, (355, 281): 660, (103, 273): 661, (107, 284): 662, (268, 346): 663, (107, 439): 664, (107, 105): 665, (373, 306): 666, (604, 97): 667, (320, 416): 668, (115, 647): 669, (109, 335): 670, (309, 261): 671, (104, 363): 672, (258, 290): 673, (513, 121): 674, (286, 114): 675, (116, 327): 676, (259, 345): 677, (382, 326): 678, (435, 256): 679, (344, 260): 680, (399, 32): 681, (111, 107): 682, (10, 393): 683, (100, 101): 684, (596, 110): 685, (519, 341): 686, (384, 599): 687, (118, 105): 688, (303, 287): 689, (10, 79): 690, (317, 320): 691, (40, 66): 692, (102, 491): 693, (98, 277): 694, (40, 87): 695, (303, 458): 696, (71, 311): 697, (40, 667): 698, (447, 33): 699, (668, 644): 700, (342, 345): 701, (73, 257): 702, (473, 395): 703, (10, 65): 704, (396, 337): 705, (112, 308): 706, (10, 437): 707, (324, 263): 708, (330, 273): 709, (514, 307): 710, (41, 312): 711, (118, 282): 712, (304, 258): 713, (119, 270): 714, (116, 282): 715, (110, 32): 716, (518, 325): 717, (276, 101): 718, (108, 359): 719, (119, 421): 720, (119, 402): 721, (270, 345): 722, (39, 634): 723, (635, 380): 724, (602, 602): 725, (383, 326): 726, (103, 270): 727, (111, 121): 728, (316, 256): 729, (112, 638): 730, (99, 458): 731, (351, 301): 732, (308, 276): 733, (270, 486): 734, (308, 307): 735, (698, 328): 736, (399, 382): 737, (109, 348): 738, (379, 387): 739, (270, 101): 740, (461, 401): 741, (274, 410): 742, (258, 264): 743, (108, 100): 744, (108, 270): 745, (674, 343): 746, (266, 121): 747, (526, 267): 748, (264, 656): 749, (340, 658): 750, (434, 105): 751, (302, 390): 752, (681, 303): 753, (66, 327): 754, (317, 718): 755, (755, 531): 756, (52, 48): 757, (375, 433): 758, (274, 662): 759, (105, 109): 760, (615, 516): 761, (97, 289): 762, (464, 260): 763, (104, 300): 764, (111, 394): 765, (101, 392): 766, (295, 32): 767, (10, 485): 768, (296, 32): 769, (107, 292): 770, (271, 260): 771, (267, 109): 772, (483, 114): 773, (266, 100): 774, (356, 118): 775, (97, 329): 776, (114, 605): 777, (286, 109): 778, (10, 87): 779, (303, 32): 780, (549, 466): 781, (642, 642): 782, (260, 338): 783, (434, 325): 784, (492, 338): 785, (107, 259): 786, (258, 259): 787, (104, 264): 788, (409, 104): 789, (115, 257): 790, (256, 40): 791, (267, 115): 792, (384, 571): 793, (713, 528): 794, (287, 256): 795, (612, 749): 796, (101, 100): 797, (271, 268): 798, (114, 443): 799, (108, 117): 800, (102, 117): 801, (100, 378): 802, (557, 753): 803, (803, 272): 804, (804, 261): 805, (805, 110): 806, (806, 574): 807, (807, 754): 808, (808, 682): 809, (809, 594): 810, (810, 110): 811, (811, 717): 812, (812, 756): 813, (813, 347): 814, (814, 719): 815, (815, 347): 816, (816, 36): 817, (817, 757): 818, (818, 683): 819, (819, 285): 820, (820, 415): 821, (821, 758): 822, (822, 759): 823, (306, 266): 824, (382, 496): 825, (471, 289): 826, (114, 292): 827, (99, 315): 828, (101, 397): 829, (301, 98): 830, (112, 114): 831, (586, 364): 832, (105, 107): 833, (117, 99): 834, (259, 377): 835, (406, 32): 836, (302, 105): 837, (288, 584): 838, (777, 777): 839, (10, 622): 840, (695, 699): 841, (121, 349): 842, (537, 369): 843, (562, 541): 844, (339, 576): 845, (845, 259): 846, (297, 375): 847, (358, 702): 848, (785, 553): 849, (565, 450): 850, (429, 280): 851, (312, 366): 852, (278, 114): 853, (353, 257): 854, (284, 78): 855, (400, 505): 856, (331, 116): 857, (383, 496): 858, (76, 105): 859, (119, 114): 860, (356, 111): 861, (100, 402): 862, (70, 657): 863, (672, 105): 864, (99, 593): 865, (105, 374): 866, (316, 260): 867, (310, 314): 868, (111, 257): 869, (97, 102): 870, (116, 588): 871, (389, 256): 872, (114, 400): 873, (279, 32): 874, (420, 282): 875, (39, 462): 876, (102, 378): 877, (67, 97): 878, (402, 362): 879, (101, 341): 880, (115, 109): 881, (110, 364): 882, (98, 111): 883, (65, 121): 884, (100, 103): 885, (80, 666): 886, (286, 307): 887, (112, 105): 888, (469, 301): 889, (335, 534): 890, (841, 328): 891, (836, 39): 892, (892, 685): 893, (893, 100): 894, (894, 489): 895, (655, 97): 896, (896, 489): 897, (74, 316): 898, (595, 302): 899, (115, 270): 900, (569, 847): 901, (419, 584): 902, (107, 290): 903, (492, 468): 904, (401, 701): 905, (404, 284): 906, (116, 375): 907, (112, 315): 908, (303, 359): 909, (376, 372): 910, (789, 365): 911, (115, 421): 912, (298, 301): 913, (625, 299): 914, (316, 261): 915, (71, 589): 916, (568, 298): 917, (274, 397): 918, (918, 374): 919, (540, 708): 920, (676, 380): 921, (10, 558): 922, (370, 98): 923, (427, 99): 924, (331, 369): 925, (802, 110): 926, (301, 109): 927, (116, 390): 928, (115, 593): 929, (83, 823): 930, (632, 119): 931, (98, 356): 932, (109, 396): 933, (269, 98): 934, (268, 443): 935, (114, 405): 936, (101, 99): 937, (280, 265): 938, (117, 116): 939, (334, 256): 940, (630, 282): 941, (303, 503): 942, (493, 474): 943, (84, 104): 944, (115, 256): 945, (304, 314): 946, (114, 272): 947, (730, 374): 948, (109, 281): 949, (516, 314): 950, (10, 305): 951, (112, 291): 952, (271, 281): 953, (614, 473): 954, (297, 314): 955, (346, 116): 956, (10, 68): 957, (336, 282): 958, (884, 121): 959, (349, 371): 960, (121, 111): 961, (961, 620): 962, (83, 112): 963, (97, 302): 964, (116, 351): 965, (407, 105): 966, (573, 292): 967, (112, 264): 968, (318, 268): 969, (85, 611): 970, (100, 97): 971, (844, 844): 972, (83, 101): 973, (902, 448): 974, (974, 459): 975, (608, 471): 976, (976, 903): 977, (977, 345): 978, (978, 449): 979, (979, 450): 980, (980, 904): 981, (981, 905): 982, (982, 404): 983, (983, 848): 984, (984, 849): 985, (985, 703): 986, (986, 906): 987, (116, 311): 988, (413, 392): 989, (98, 401): 990, (787, 107): 991, (112, 104): 992, (853, 285): 993, (99, 105): 994, (565, 284): 995, (10, 73): 996, (102, 649): 997, (98, 373): 998, (275, 268): 999, (100, 421): 1000, (651, 295): 1001, (920, 564): 1002, (409, 418): 1003, (111, 118): 1004, (103, 261): 1005, (114, 263): 1006, (336, 589): 1007, (260, 298): 1008, (112, 327): 1009, (363, 109): 1010, (413, 275): 1011, (275, 263): 1012, (98, 105): 1013, (281, 110): 1014, (99, 340): 1015, (256, 298): 1016, (613, 307): 1017, (111, 260): 1018, (376, 260): 1019, (266, 705): 1020, (121, 629): 1021, (105, 258): 1022, (66, 761): 1023}\n" ] } ], "source": [ "print(merge)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[402, 121, 121, 263, 119, 389, 261, 515, 240, 159, 145, 139]\n" ] } ], "source": [ "def encode_sequence(sequence):\n", " tokens = list(sequence.encode('utf-8'))\n", " \n", " \n", " while len(tokens) >= 2:\n", " counts = get_counts(tokens)\n", " pair = min(counts, key=lambda x: merge.get(x, float('inf')))\n", " # print(pair)\n", " # print(merge)\n", " if pair not in merge:\n", " break\n", " \n", " symbol = merge[pair]\n", " tokens = merge_token(pair, tokens, symbol)\n", " return tokens\n", "\n", "\n", "\n", "print(encode_sequence(\"ayyyy whats up 👋\"))\n", "\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ayyyy whats up 👋\n", "['h', 'ell', 'o']\n" ] } ], "source": [ "print(decode_sequence(encode_sequence(\"ayyyy whats up 👋\")))\n", "\n", "\n", "token_delimited_text = [decode_sequence([token]) for token in encode_sequence(\"hello\")]\n", "print(token_delimited_text)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[104, 509, 111]\n" ] } ], "source": [ "print(encode_sequence(\"hello\"))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['shit']\n" ] } ], "source": [ "print([decode_sequence([token]) for token in encode_sequence(\"shit\")])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['S', 'it ', 'the ', 'fuck ', 'back ', 'down']\n", "compression ratio: 3.6666666666666665\n" ] } ], "source": [ "print([decode_sequence([token]) for token in encode_sequence(\"Sit the fuck back down\")])\n", "print(f\"compression ratio: {len('Sit the fuck back down')/len([decode_sequence([token]) for token in encode_sequence('Sit the fuck back down')])}\")\n", "\n" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "ename": "TypeError", "evalue": "Object of type bytes is not JSON serializable", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[16], line 8\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# for key, value in vocab.items():\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# vocab[key] = value.decode('utf-8', errors='replace')\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mvocab.json\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m json_file:\n\u001b[0;32m----> 8\u001b[0m \u001b[43mjson\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdump\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvocab\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mjson_file\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m merge_untupled \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;28mmap\u001b[39m(\u001b[38;5;28mstr\u001b[39m, key)): value \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m merge\u001b[38;5;241m.\u001b[39mitems()}\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmerge.json\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m json_file:\n", "File \u001b[0;32m~/miniforge3/envs/gradio/lib/python3.10/json/__init__.py:179\u001b[0m, in \u001b[0;36mdump\u001b[0;34m(obj, fp, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)\u001b[0m\n\u001b[1;32m 173\u001b[0m iterable \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m(skipkeys\u001b[38;5;241m=\u001b[39mskipkeys, ensure_ascii\u001b[38;5;241m=\u001b[39mensure_ascii,\n\u001b[1;32m 174\u001b[0m check_circular\u001b[38;5;241m=\u001b[39mcheck_circular, allow_nan\u001b[38;5;241m=\u001b[39mallow_nan, indent\u001b[38;5;241m=\u001b[39mindent,\n\u001b[1;32m 175\u001b[0m separators\u001b[38;5;241m=\u001b[39mseparators,\n\u001b[1;32m 176\u001b[0m default\u001b[38;5;241m=\u001b[39mdefault, sort_keys\u001b[38;5;241m=\u001b[39msort_keys, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkw)\u001b[38;5;241m.\u001b[39miterencode(obj)\n\u001b[1;32m 177\u001b[0m \u001b[38;5;66;03m# could accelerate with writelines in some versions of Python, at\u001b[39;00m\n\u001b[1;32m 178\u001b[0m \u001b[38;5;66;03m# a debuggability cost\u001b[39;00m\n\u001b[0;32m--> 179\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m iterable:\n\u001b[1;32m 180\u001b[0m fp\u001b[38;5;241m.\u001b[39mwrite(chunk)\n", "File \u001b[0;32m~/miniforge3/envs/gradio/lib/python3.10/json/encoder.py:431\u001b[0m, in \u001b[0;36m_make_iterencode.._iterencode\u001b[0;34m(o, _current_indent_level)\u001b[0m\n\u001b[1;32m 429\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m _iterencode_list(o, _current_indent_level)\n\u001b[1;32m 430\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(o, \u001b[38;5;28mdict\u001b[39m):\n\u001b[0;32m--> 431\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m _iterencode_dict(o, _current_indent_level)\n\u001b[1;32m 432\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 433\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m markers \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", "File \u001b[0;32m~/miniforge3/envs/gradio/lib/python3.10/json/encoder.py:405\u001b[0m, in \u001b[0;36m_make_iterencode.._iterencode_dict\u001b[0;34m(dct, _current_indent_level)\u001b[0m\n\u001b[1;32m 403\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 404\u001b[0m chunks \u001b[38;5;241m=\u001b[39m _iterencode(value, _current_indent_level)\n\u001b[0;32m--> 405\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m chunks\n\u001b[1;32m 406\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m newline_indent \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 407\u001b[0m _current_indent_level \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n", "File \u001b[0;32m~/miniforge3/envs/gradio/lib/python3.10/json/encoder.py:438\u001b[0m, in \u001b[0;36m_make_iterencode.._iterencode\u001b[0;34m(o, _current_indent_level)\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCircular reference detected\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 437\u001b[0m markers[markerid] \u001b[38;5;241m=\u001b[39m o\n\u001b[0;32m--> 438\u001b[0m o \u001b[38;5;241m=\u001b[39m \u001b[43m_default\u001b[49m\u001b[43m(\u001b[49m\u001b[43mo\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 439\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m _iterencode(o, _current_indent_level)\n\u001b[1;32m 440\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m markers \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", "File \u001b[0;32m~/miniforge3/envs/gradio/lib/python3.10/json/encoder.py:179\u001b[0m, in \u001b[0;36mJSONEncoder.default\u001b[0;34m(self, o)\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdefault\u001b[39m(\u001b[38;5;28mself\u001b[39m, o):\n\u001b[1;32m 161\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Implement this method in a subclass such that it returns\u001b[39;00m\n\u001b[1;32m 162\u001b[0m \u001b[38;5;124;03m a serializable object for ``o``, or calls the base implementation\u001b[39;00m\n\u001b[1;32m 163\u001b[0m \u001b[38;5;124;03m (to raise a ``TypeError``).\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 177\u001b[0m \n\u001b[1;32m 178\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 179\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mObject of type \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mo\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 180\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mis not JSON serializable\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", "\u001b[0;31mTypeError\u001b[0m: Object of type bytes is not JSON serializable" ] } ], "source": [ "import json\n", "\n", "# for key, value in vocab.items():\n", " \n", "# vocab[key] = value.decode('utf-8', errors='replace')\n", "\n", "with open('vocab.json', 'w') as json_file:\n", " json.dump(vocab, json_file)\n", " \n", "merge_untupled = {'_'.join(map(str, key)): value for key, value in merge.items()}\n", "\n", "with open('merge.json', 'w') as json_file:\n", " json.dump(merge_untupled, json_file)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "gradio", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 2 }