Upload example_images_tokenize.ipynb with huggingface_hub
Browse files
example_images_tokenize.ipynb
CHANGED
@@ -1432,41 +1432,7 @@
|
|
1432 |
},
|
1433 |
{
|
1434 |
"cell_type": "code",
|
1435 |
-
"execution_count":
|
1436 |
-
"id": "3ca6861c-4182-4d23-984f-92e0b5ba22f2",
|
1437 |
-
"metadata": {},
|
1438 |
-
"outputs": [
|
1439 |
-
{
|
1440 |
-
"ename": "AssertionError",
|
1441 |
-
"evalue": "Key <|img_start|> is not a special token",
|
1442 |
-
"output_type": "error",
|
1443 |
-
"traceback": [
|
1444 |
-
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
1445 |
-
"\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)",
|
1446 |
-
"Cell \u001b[0;32mIn[8], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mllama_tokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m(\u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m<|img_start|>\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43masdf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\n",
|
1447 |
-
"File \u001b[0;32m~/.local/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:947\u001b[0m, in \u001b[0;36mSpecialTokensMixin.add_special_tokens\u001b[0;34m(self, special_tokens_dict, replace_additional_special_tokens)\u001b[0m\n\u001b[1;32m 945\u001b[0m added_tokens \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 946\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m special_tokens_dict\u001b[38;5;241m.\u001b[39mitems():\n\u001b[0;32m--> 947\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mSPECIAL_TOKENS_ATTRIBUTES, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mKey \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m is not a special token\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 949\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mverbose:\n\u001b[1;32m 950\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAssigning \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m to the \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m key of the tokenizer\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
1448 |
-
"\u001b[0;31mAssertionError\u001b[0m: Key <|img_start|> is not a special token"
|
1449 |
-
]
|
1450 |
-
}
|
1451 |
-
],
|
1452 |
-
"source": [
|
1453 |
-
"llama_tokenizer.regist"
|
1454 |
-
]
|
1455 |
-
},
|
1456 |
-
{
|
1457 |
-
"cell_type": "code",
|
1458 |
-
"execution_count": null,
|
1459 |
-
"id": "aaf0a3d1-dd5d-402b-9802-b692921c9079",
|
1460 |
-
"metadata": {},
|
1461 |
-
"outputs": [],
|
1462 |
-
"source": [
|
1463 |
-
"llama_tokenizer.add_special_tokens({\"image start\":\"<|img_start|>\"})\n",
|
1464 |
-
"llama_tokenizer.add_special_tokens(\"<|img_end|>\")"
|
1465 |
-
]
|
1466 |
-
},
|
1467 |
-
{
|
1468 |
-
"cell_type": "code",
|
1469 |
-
"execution_count": 55,
|
1470 |
"id": "a3e9b926-2cd4-4870-b219-7f92cd9295c4",
|
1471 |
"metadata": {},
|
1472 |
"outputs": [],
|
@@ -1487,7 +1453,7 @@
|
|
1487 |
},
|
1488 |
{
|
1489 |
"cell_type": "code",
|
1490 |
-
"execution_count":
|
1491 |
"id": "7998066b-6fe7-4ca8-8ede-3870f37f3725",
|
1492 |
"metadata": {},
|
1493 |
"outputs": [],
|
@@ -1521,7 +1487,7 @@
|
|
1521 |
},
|
1522 |
{
|
1523 |
"cell_type": "code",
|
1524 |
-
"execution_count":
|
1525 |
"id": "8919fa2e-4e07-48d9-876f-c033a0fd1ab8",
|
1526 |
"metadata": {},
|
1527 |
"outputs": [],
|
@@ -1531,7 +1497,7 @@
|
|
1531 |
},
|
1532 |
{
|
1533 |
"cell_type": "code",
|
1534 |
-
"execution_count":
|
1535 |
"id": "c97bf893-aeda-43e4-b703-bdb425dcda3e",
|
1536 |
"metadata": {},
|
1537 |
"outputs": [],
|
@@ -1566,7 +1532,7 @@
|
|
1566 |
},
|
1567 |
{
|
1568 |
"cell_type": "code",
|
1569 |
-
"execution_count":
|
1570 |
"id": "320c91a9-e0c0-4786-8eaa-424fa5e8e41e",
|
1571 |
"metadata": {
|
1572 |
"scrolled": true
|
@@ -1589,7 +1555,7 @@
|
|
1589 |
},
|
1590 |
{
|
1591 |
"cell_type": "code",
|
1592 |
-
"execution_count":
|
1593 |
"id": "85037f9a-91aa-4991-ae41-251e926343b9",
|
1594 |
"metadata": {
|
1595 |
"scrolled": true
|
@@ -2601,7 +2567,7 @@
|
|
2601 |
" ...]"
|
2602 |
]
|
2603 |
},
|
2604 |
-
"execution_count":
|
2605 |
"metadata": {},
|
2606 |
"output_type": "execute_result"
|
2607 |
}
|
|
|
1432 |
},
|
1433 |
{
|
1434 |
"cell_type": "code",
|
1435 |
+
"execution_count": 6,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1436 |
"id": "a3e9b926-2cd4-4870-b219-7f92cd9295c4",
|
1437 |
"metadata": {},
|
1438 |
"outputs": [],
|
|
|
1453 |
},
|
1454 |
{
|
1455 |
"cell_type": "code",
|
1456 |
+
"execution_count": 7,
|
1457 |
"id": "7998066b-6fe7-4ca8-8ede-3870f37f3725",
|
1458 |
"metadata": {},
|
1459 |
"outputs": [],
|
|
|
1487 |
},
|
1488 |
{
|
1489 |
"cell_type": "code",
|
1490 |
+
"execution_count": 8,
|
1491 |
"id": "8919fa2e-4e07-48d9-876f-c033a0fd1ab8",
|
1492 |
"metadata": {},
|
1493 |
"outputs": [],
|
|
|
1497 |
},
|
1498 |
{
|
1499 |
"cell_type": "code",
|
1500 |
+
"execution_count": 9,
|
1501 |
"id": "c97bf893-aeda-43e4-b703-bdb425dcda3e",
|
1502 |
"metadata": {},
|
1503 |
"outputs": [],
|
|
|
1532 |
},
|
1533 |
{
|
1534 |
"cell_type": "code",
|
1535 |
+
"execution_count": 10,
|
1536 |
"id": "320c91a9-e0c0-4786-8eaa-424fa5e8e41e",
|
1537 |
"metadata": {
|
1538 |
"scrolled": true
|
|
|
1555 |
},
|
1556 |
{
|
1557 |
"cell_type": "code",
|
1558 |
+
"execution_count": 11,
|
1559 |
"id": "85037f9a-91aa-4991-ae41-251e926343b9",
|
1560 |
"metadata": {
|
1561 |
"scrolled": true
|
|
|
2567 |
" ...]"
|
2568 |
]
|
2569 |
},
|
2570 |
+
"execution_count": 11,
|
2571 |
"metadata": {},
|
2572 |
"output_type": "execute_result"
|
2573 |
}
|