Spaces:
Runtime error
Runtime error
File size: 23,608 Bytes
ade0520 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
]
}
],
"source": [
"from transformers import AutoTokenizer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B', trust_remote_code=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Encode and Decode"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[1350, 492, 151643, 863, 151643]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# treat surface forms of special tokens as actual special tokens\n",
"# the default, but unsafe (to be compatible with other projects)\n",
"# the same as tokenizer.encode(\"print('<|endoftext|>')<|endoftext|>\", allowed_special='all', disallowed_special=())\n",
"tokenizer.encode(\"print('<|endoftext|>')<|endoftext|>\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"print('<|endoftext|>')<|endoftext|>\""
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.decode([1350, 492, 151643, 863, 151643])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# treat texts just as texts, avoid injection attacks\n",
"tokenizer.encode(\"print('<|endoftext|>')\", allowed_special=set(), disallowed_special=()) + [tokenizer.eod_id]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"print('<|endoftext|>')<|endoftext|>\""
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.decode([1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "Encountered text corresponding to disallowed special token '<|endoftext|>'.\nIf you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.\nIf you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.\nTo disable this check for all special tokens, pass `disallowed_special=()`.\n",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[7], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39m# treat texts just as texts, avoid injection attacks, and raise error if surface forms of special tokens are ever encountered\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m tokenizer\u001b[39m.\u001b[39;49mencode(\u001b[39m\"\u001b[39;49m\u001b[39mprint(\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39m<|endoftext|>\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39m)\u001b[39;49m\u001b[39m\"\u001b[39;49m, allowed_special\u001b[39m=\u001b[39;49m\u001b[39mset\u001b[39;49m(), disallowed_special\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mall\u001b[39;49m\u001b[39m'\u001b[39;49m) \u001b[39m+\u001b[39m [tokenizer\u001b[39m.\u001b[39meod_id]\n",
"File \u001b[1;32mtransformers\\tokenization_utils_base.py:2348\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.encode\u001b[1;34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, return_tensors, **kwargs)\u001b[0m\n\u001b[0;32m 2311\u001b[0m \u001b[39m@add_end_docstrings\u001b[39m(\n\u001b[0;32m 2312\u001b[0m ENCODE_KWARGS_DOCSTRING,\n\u001b[0;32m 2313\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2331\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs,\n\u001b[0;32m 2332\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m List[\u001b[39mint\u001b[39m]:\n\u001b[0;32m 2333\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m 2334\u001b[0m \u001b[39m Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.\u001b[39;00m\n\u001b[0;32m 2335\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2346\u001b[0m \u001b[39m method).\u001b[39;00m\n\u001b[0;32m 2347\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 2348\u001b[0m encoded_inputs \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mencode_plus(\n\u001b[0;32m 2349\u001b[0m text,\n\u001b[0;32m 2350\u001b[0m text_pair\u001b[39m=\u001b[39mtext_pair,\n\u001b[0;32m 2351\u001b[0m add_special_tokens\u001b[39m=\u001b[39madd_special_tokens,\n\u001b[0;32m 2352\u001b[0m padding\u001b[39m=\u001b[39mpadding,\n\u001b[0;32m 2353\u001b[0m truncation\u001b[39m=\u001b[39mtruncation,\n\u001b[0;32m 2354\u001b[0m max_length\u001b[39m=\u001b[39mmax_length,\n\u001b[0;32m 2355\u001b[0m stride\u001b[39m=\u001b[39mstride,\n\u001b[0;32m 2356\u001b[0m return_tensors\u001b[39m=\u001b[39mreturn_tensors,\n\u001b[0;32m 2357\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs,\n\u001b[0;32m 2358\u001b[0m )\n\u001b[0;32m 2360\u001b[0m \u001b[39mreturn\u001b[39;00m encoded_inputs[\u001b[39m\"\u001b[39m\u001b[39minput_ids\u001b[39m\u001b[39m\"\u001b[39m]\n",
"File \u001b[1;32mtransformers\\tokenization_utils_base.py:2756\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.encode_plus\u001b[1;34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[0m\n\u001b[0;32m 2746\u001b[0m \u001b[39m# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'\u001b[39;00m\n\u001b[0;32m 2747\u001b[0m padding_strategy, truncation_strategy, max_length, kwargs \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_get_padding_truncation_strategies(\n\u001b[0;32m 2748\u001b[0m padding\u001b[39m=\u001b[39mpadding,\n\u001b[0;32m 2749\u001b[0m truncation\u001b[39m=\u001b[39mtruncation,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2753\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs,\n\u001b[0;32m 2754\u001b[0m )\n\u001b[1;32m-> 2756\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_encode_plus(\n\u001b[0;32m 2757\u001b[0m text\u001b[39m=\u001b[39mtext,\n\u001b[0;32m 2758\u001b[0m text_pair\u001b[39m=\u001b[39mtext_pair,\n\u001b[0;32m 2759\u001b[0m add_special_tokens\u001b[39m=\u001b[39madd_special_tokens,\n\u001b[0;32m 2760\u001b[0m padding_strategy\u001b[39m=\u001b[39mpadding_strategy,\n\u001b[0;32m 2761\u001b[0m truncation_strategy\u001b[39m=\u001b[39mtruncation_strategy,\n\u001b[0;32m 2762\u001b[0m max_length\u001b[39m=\u001b[39mmax_length,\n\u001b[0;32m 2763\u001b[0m stride\u001b[39m=\u001b[39mstride,\n\u001b[0;32m 2764\u001b[0m is_split_into_words\u001b[39m=\u001b[39mis_split_into_words,\n\u001b[0;32m 2765\u001b[0m pad_to_multiple_of\u001b[39m=\u001b[39mpad_to_multiple_of,\n\u001b[0;32m 2766\u001b[0m return_tensors\u001b[39m=\u001b[39mreturn_tensors,\n\u001b[0;32m 2767\u001b[0m return_token_type_ids\u001b[39m=\u001b[39mreturn_token_type_ids,\n\u001b[0;32m 2768\u001b[0m return_attention_mask\u001b[39m=\u001b[39mreturn_attention_mask,\n\u001b[0;32m 2769\u001b[0m return_overflowing_tokens\u001b[39m=\u001b[39mreturn_overflowing_tokens,\n\u001b[0;32m 2770\u001b[0m return_special_tokens_mask\u001b[39m=\u001b[39mreturn_special_tokens_mask,\n\u001b[0;32m 2771\u001b[0m return_offsets_mapping\u001b[39m=\u001b[39mreturn_offsets_mapping,\n\u001b[0;32m 2772\u001b[0m return_length\u001b[39m=\u001b[39mreturn_length,\n\u001b[0;32m 2773\u001b[0m verbose\u001b[39m=\u001b[39mverbose,\n\u001b[0;32m 2774\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs,\n\u001b[0;32m 2775\u001b[0m )\n",
"File \u001b[1;32mtransformers\\tokenization_utils.py:649\u001b[0m, in \u001b[0;36mPreTrainedTokenizer._encode_plus\u001b[1;34m(self, text, text_pair, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[0m\n\u001b[0;32m 640\u001b[0m \u001b[39mif\u001b[39;00m return_offsets_mapping:\n\u001b[0;32m 641\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mNotImplementedError\u001b[39;00m(\n\u001b[0;32m 642\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mreturn_offset_mapping is not available when using Python tokenizers. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 643\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mTo use this feature, change your tokenizer to one deriving from \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 646\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mhttps://github.com/huggingface/transformers/pull/2674\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 647\u001b[0m )\n\u001b[1;32m--> 649\u001b[0m first_ids \u001b[39m=\u001b[39m get_input_ids(text)\n\u001b[0;32m 650\u001b[0m second_ids \u001b[39m=\u001b[39m get_input_ids(text_pair) \u001b[39mif\u001b[39;00m text_pair \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m 652\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprepare_for_model(\n\u001b[0;32m 653\u001b[0m first_ids,\n\u001b[0;32m 654\u001b[0m pair_ids\u001b[39m=\u001b[39msecond_ids,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 668\u001b[0m verbose\u001b[39m=\u001b[39mverbose,\n\u001b[0;32m 669\u001b[0m )\n",
"File \u001b[1;32mtransformers\\tokenization_utils.py:616\u001b[0m, in \u001b[0;36mPreTrainedTokenizer._encode_plus.<locals>.get_input_ids\u001b[1;34m(text)\u001b[0m\n\u001b[0;32m 614\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mget_input_ids\u001b[39m(text):\n\u001b[0;32m 615\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(text, \u001b[39mstr\u001b[39m):\n\u001b[1;32m--> 616\u001b[0m tokens \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtokenize(text, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m 617\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconvert_tokens_to_ids(tokens)\n\u001b[0;32m 618\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(text, (\u001b[39mlist\u001b[39m, \u001b[39mtuple\u001b[39m)) \u001b[39mand\u001b[39;00m \u001b[39mlen\u001b[39m(text) \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m \u001b[39mand\u001b[39;00m \u001b[39misinstance\u001b[39m(text[\u001b[39m0\u001b[39m], \u001b[39mstr\u001b[39m):\n",
"File \u001b[1;32mtokenization_qwen.py:155\u001b[0m, in \u001b[0;36mQWenTokenizer.tokenize\u001b[1;34m(self, text, allowed_special, disallowed_special, **kwargs)\u001b[0m\n\u001b[0;32m 152\u001b[0m text \u001b[39m=\u001b[39m unicodedata\u001b[39m.\u001b[39mnormalize(\u001b[39m\"\u001b[39m\u001b[39mNFC\u001b[39m\u001b[39m\"\u001b[39m, text)\n\u001b[0;32m 154\u001b[0m \u001b[39m# this implementation takes a detour: text -> token id -> token surface forms\u001b[39;00m\n\u001b[1;32m--> 155\u001b[0m \u001b[39mfor\u001b[39;00m t \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mtokenizer\u001b[39m.\u001b[39;49mencode(\n\u001b[0;32m 156\u001b[0m text, allowed_special\u001b[39m=\u001b[39;49mallowed_special, disallowed_special\u001b[39m=\u001b[39;49mdisallowed_special\n\u001b[0;32m 157\u001b[0m ):\n\u001b[0;32m 158\u001b[0m tokens\u001b[39m.\u001b[39mappend(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdecoder[t])\n\u001b[0;32m 159\u001b[0m \u001b[39mreturn\u001b[39;00m tokens\n",
"File \u001b[1;32mtiktoken\\core.py:117\u001b[0m, in \u001b[0;36mEncoding.encode\u001b[1;34m(self, text, allowed_special, disallowed_special)\u001b[0m\n\u001b[0;32m 115\u001b[0m disallowed_special \u001b[39m=\u001b[39m \u001b[39mfrozenset\u001b[39m(disallowed_special)\n\u001b[0;32m 116\u001b[0m \u001b[39mif\u001b[39;00m match \u001b[39m:=\u001b[39m _special_token_regex(disallowed_special)\u001b[39m.\u001b[39msearch(text):\n\u001b[1;32m--> 117\u001b[0m raise_disallowed_special_token(match\u001b[39m.\u001b[39;49mgroup())\n\u001b[0;32m 119\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 120\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_core_bpe\u001b[39m.\u001b[39mencode(text, allowed_special)\n",
"File \u001b[1;32mtiktoken\\core.py:337\u001b[0m, in \u001b[0;36mraise_disallowed_special_token\u001b[1;34m(token)\u001b[0m\n\u001b[0;32m 336\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mraise_disallowed_special_token\u001b[39m(token: \u001b[39mstr\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m NoReturn:\n\u001b[1;32m--> 337\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 338\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mEncountered text corresponding to disallowed special token \u001b[39m\u001b[39m{\u001b[39;00mtoken\u001b[39m!r}\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m 339\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mIf you want this text to be encoded as a special token, \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 340\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mpass it to `allowed_special`, e.g. `allowed_special=\u001b[39m\u001b[39m{{\u001b[39;00m\u001b[39m{\u001b[39;00mtoken\u001b[39m!r}\u001b[39;00m\u001b[39m, ...\u001b[39m\u001b[39m}}\u001b[39;00m\u001b[39m`.\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m 341\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mIf you want this text to be encoded as normal text, disable the check for this token \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 342\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mby passing `disallowed_special=(enc.special_tokens_set - \u001b[39m\u001b[39m{{\u001b[39;00m\u001b[39m{\u001b[39;00mtoken\u001b[39m!r}\u001b[39;00m\u001b[39m}}\u001b[39;00m\u001b[39m)`.\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m 343\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mTo disable this check for all special tokens, pass `disallowed_special=()`.\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m 344\u001b[0m )\n",
"\u001b[1;31mValueError\u001b[0m: Encountered text corresponding to disallowed special token '<|endoftext|>'.\nIf you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.\nIf you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.\nTo disable this check for all special tokens, pass `disallowed_special=()`.\n"
]
}
],
"source": [
"# treat texts just as texts, avoid injection attacks, and raise error if surface forms of special tokens are ever encountered\n",
"tokenizer.encode(\"print('<|endoftext|>')\", allowed_special=set(), disallowed_special='all') + [tokenizer.eod_id]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[151644, 1350, 11146, 91, 15460, 62, 15, 91, 79865, 151645, 151643]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# fine-grained control, just keep mind of this:\n",
"# allowed_special is treated as special tokens\n",
"# disallowed_special raise errors\n",
"# allowed_special has higher priority than disallowed_special\n",
"tokenizer.encode(\"<|im_start|>print('<|extra_0|>')<|im_end|>\", \n",
" allowed_special={'<|im_start|>', '<|im_end|>'}, \n",
" disallowed_special=['<|endoftext|>']) + [tokenizer.eod_id]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[151644, 1350, 492, 151646, 863, 151645, 151643]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.encode(\"<|im_start|>print('<|extra_0|>')<|im_end|>\", \n",
" allowed_special={'<|im_start|>', '<|im_end|>', '<|extra_0|>'}, \n",
" disallowed_special=['<|endoftext|>']) + [tokenizer.eod_id]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Special Token Management"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using unk_token, but it is not set yet.\n"
]
}
],
"source": [
"# huggingface tokenizer has its own special token mechanism, so does tiktoken\n",
"# we only use the tiktoken mechanism for special tokens, which means many property of huggingface tokenizer will be None\n",
"tokenizer.unk_token"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"tokenizer.eos_token_id # use tokenizer.eod_id instead"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"tokenizer.pad_token_id "
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"151646"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# use one of the extras such as <|extra_0|>\n",
"tokenizer.special_tokens['<|extra_0|>']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Utility Methods"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[b'print', b\"('<\", b'|', b'endo', b'ft', b'ext', b'|', b\">')\", '<|endoftext|>']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# special tokens are str, tokens are bytes (since tiktoken operates on the bytes level)\n",
"ids = [1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643]\n",
"tokenizer.convert_ids_to_tokens(ids)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"print('<|endoftext|>')<|endoftext|>\""
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"ids = tokenizer.encode(\"<|im_start|>print('我是一只猫<|extra_0|>')\\n#喵喵喵<|im_end|>\", \n",
" allowed_special={'<|im_start|>', '<|im_end|>', '<|extra_0|>'}, \n",
" disallowed_special=['<|endoftext|>']) + [tokenizer.eod_id]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['<|im_start|>',\n",
" b'print',\n",
" b\"('\",\n",
" b'\\xe6\\x88\\x91',\n",
" b'\\xe6\\x98\\xaf\\xe4\\xb8\\x80',\n",
" b'\\xe5\\x8f\\xaa',\n",
" b'\\xe7\\x8c\\xab',\n",
" '<|extra_0|>',\n",
" b\"')\\n\",\n",
" b'#',\n",
" b'\\xe5\\x96\\xb5',\n",
" b'\\xe5\\x96\\xb5',\n",
" b'\\xe5\\x96\\xb5',\n",
" '<|im_end|>',\n",
" '<|endoftext|>']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.convert_ids_to_tokens(ids)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"<|im_start|>print('我是一只猫<|extra_0|>')\\n#喵喵喵<|im_end|><|endoftext|>\""
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'<|extra_204|>'"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer._convert_id_to_token(len(tokenizer)-1)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"151850"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer._convert_token_to_id('<|extra_204|>')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "python3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
|