alandao committed on
Commit
232ac84
1 Parent(s): bb9a010

Upload example_images_tokenize.ipynb with huggingface_hub

Browse files
Files changed (1) hide show
  1. example_images_tokenize.ipynb +7 -41
example_images_tokenize.ipynb CHANGED
@@ -1432,41 +1432,7 @@
1432
  },
1433
  {
1434
  "cell_type": "code",
1435
- "execution_count": 8,
1436
- "id": "3ca6861c-4182-4d23-984f-92e0b5ba22f2",
1437
- "metadata": {},
1438
- "outputs": [
1439
- {
1440
- "ename": "AssertionError",
1441
- "evalue": "Key <|img_start|> is not a special token",
1442
- "output_type": "error",
1443
- "traceback": [
1444
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1445
- "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)",
1446
- "Cell \u001b[0;32mIn[8], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mllama_tokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m(\u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m<|img_start|>\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43masdf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\n",
1447
- "File \u001b[0;32m~/.local/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:947\u001b[0m, in \u001b[0;36mSpecialTokensMixin.add_special_tokens\u001b[0;34m(self, special_tokens_dict, replace_additional_special_tokens)\u001b[0m\n\u001b[1;32m 945\u001b[0m added_tokens \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 946\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m special_tokens_dict\u001b[38;5;241m.\u001b[39mitems():\n\u001b[0;32m--> 947\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mSPECIAL_TOKENS_ATTRIBUTES, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mKey \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m is not a special token\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 949\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mverbose:\n\u001b[1;32m 950\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAssigning \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m to the \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m key of the tokenizer\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
1448
- "\u001b[0;31mAssertionError\u001b[0m: Key <|img_start|> is not a special token"
1449
- ]
1450
- }
1451
- ],
1452
- "source": [
1453
- "llama_tokenizer.regist"
1454
- ]
1455
- },
1456
- {
1457
- "cell_type": "code",
1458
- "execution_count": null,
1459
- "id": "aaf0a3d1-dd5d-402b-9802-b692921c9079",
1460
- "metadata": {},
1461
- "outputs": [],
1462
- "source": [
1463
- "llama_tokenizer.add_special_tokens({\"image start\":\"<|img_start|>\"})\n",
1464
- "llama_tokenizer.add_special_tokens(\"<|img_end|>\")"
1465
- ]
1466
- },
1467
- {
1468
- "cell_type": "code",
1469
- "execution_count": 55,
1470
  "id": "a3e9b926-2cd4-4870-b219-7f92cd9295c4",
1471
  "metadata": {},
1472
  "outputs": [],
@@ -1487,7 +1453,7 @@
1487
  },
1488
  {
1489
  "cell_type": "code",
1490
- "execution_count": 56,
1491
  "id": "7998066b-6fe7-4ca8-8ede-3870f37f3725",
1492
  "metadata": {},
1493
  "outputs": [],
@@ -1521,7 +1487,7 @@
1521
  },
1522
  {
1523
  "cell_type": "code",
1524
- "execution_count": 67,
1525
  "id": "8919fa2e-4e07-48d9-876f-c033a0fd1ab8",
1526
  "metadata": {},
1527
  "outputs": [],
@@ -1531,7 +1497,7 @@
1531
  },
1532
  {
1533
  "cell_type": "code",
1534
- "execution_count": 69,
1535
  "id": "c97bf893-aeda-43e4-b703-bdb425dcda3e",
1536
  "metadata": {},
1537
  "outputs": [],
@@ -1566,7 +1532,7 @@
1566
  },
1567
  {
1568
  "cell_type": "code",
1569
- "execution_count": 74,
1570
  "id": "320c91a9-e0c0-4786-8eaa-424fa5e8e41e",
1571
  "metadata": {
1572
  "scrolled": true
@@ -1589,7 +1555,7 @@
1589
  },
1590
  {
1591
  "cell_type": "code",
1592
- "execution_count": 75,
1593
  "id": "85037f9a-91aa-4991-ae41-251e926343b9",
1594
  "metadata": {
1595
  "scrolled": true
@@ -2601,7 +2567,7 @@
2601
  " ...]"
2602
  ]
2603
  },
2604
- "execution_count": 75,
2605
  "metadata": {},
2606
  "output_type": "execute_result"
2607
  }
 
1432
  },
1433
  {
1434
  "cell_type": "code",
1435
+ "execution_count": 6,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1436
  "id": "a3e9b926-2cd4-4870-b219-7f92cd9295c4",
1437
  "metadata": {},
1438
  "outputs": [],
 
1453
  },
1454
  {
1455
  "cell_type": "code",
1456
+ "execution_count": 7,
1457
  "id": "7998066b-6fe7-4ca8-8ede-3870f37f3725",
1458
  "metadata": {},
1459
  "outputs": [],
 
1487
  },
1488
  {
1489
  "cell_type": "code",
1490
+ "execution_count": 8,
1491
  "id": "8919fa2e-4e07-48d9-876f-c033a0fd1ab8",
1492
  "metadata": {},
1493
  "outputs": [],
 
1497
  },
1498
  {
1499
  "cell_type": "code",
1500
+ "execution_count": 9,
1501
  "id": "c97bf893-aeda-43e4-b703-bdb425dcda3e",
1502
  "metadata": {},
1503
  "outputs": [],
 
1532
  },
1533
  {
1534
  "cell_type": "code",
1535
+ "execution_count": 10,
1536
  "id": "320c91a9-e0c0-4786-8eaa-424fa5e8e41e",
1537
  "metadata": {
1538
  "scrolled": true
 
1555
  },
1556
  {
1557
  "cell_type": "code",
1558
+ "execution_count": 11,
1559
  "id": "85037f9a-91aa-4991-ae41-251e926343b9",
1560
  "metadata": {
1561
  "scrolled": true
 
2567
  " ...]"
2568
  ]
2569
  },
2570
+ "execution_count": 11,
2571
  "metadata": {},
2572
  "output_type": "execute_result"
2573
  }