{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer\n",
"tokenizer = AutoTokenizer.from_pretrained(\"liam168/c2-roberta-base-finetuned-dianping-chinese\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['这', '是', '中', '英', '文', 'test', '语', '句', ',', 'mix', '中', '英', '文', '及', '标', '点', '符', '号']\n"
]
}
],
"source": [
"input = \"这是中英文test语句,mix中英文及标点符号\"\n",
"result = tokenizer([input],padding=True,truncation=True,max_length=512,return_tensors=\"pt\")\n",
"a = tokenizer.tokenize(input)\n",
"print(a)\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[ 101, 6821, 3221, 704, 5739, 3152, 10060, 6427, 1368, 8024,\n",
" 9678, 704, 5739, 3152, 1350, 3403, 4157, 5016, 1384, 102]])\n"
]
}
],
"source": [
"print(result.input_ids)\n",
"ids = tokenizer.decode(result)\n",
"tokenizer.decode(ids)"
]
}
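,
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch added for illustration (not part of the original run): `batch_decode` maps every row of `input_ids` back to text in one call, and `skip_special_tokens=True` drops the `[CLS]`/`[SEP]` markers the tokenizer inserted. The `decoded` variable name is just for this example."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: decode the whole batch at once and strip special tokens.\n",
"decoded = tokenizer.batch_decode(result.input_ids, skip_special_tokens=True)\n",
"print(decoded)"
]
}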
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}