Spaces:

clarkchan
/

learn_on_hugging

Paused

File size: 4,742 Bytes

ca2bbee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31ff37c
ca2bbee
 
 
31ff37c
 
 
 
ca2bbee
 
 
 
 
 
31ff37c
ca2bbee
 
 
 
 
31ff37c
ca2bbee
 
 
 
 
 
 
 
 
 
 
 
31ff37c
ca2bbee
 
 
 
31ff37c
ca2bbee
 
31ff37c
ca2bbee
 
 
 
 
31ff37c
 
ca2bbee

{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"liam168/c2-roberta-base-finetuned-dianping-chinese\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['这', '是', '中', '英', '文', 'test', '语', '句', '，', 'mix', '中', '英', '文', '及', '标', '点', '符', '号']\n"
     ]
    }
   ],
   "source": [
    "input = \"这是中英文test语句，mix中英文及标点符号\"\n",
    "result = tokenizer([input],padding=True,truncation=True,max_length=512,return_tensors=\"pt\")\n",
    "a = tokenizer.tokenize(input)\n",
    "print(a)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[  101,  6821,  3221,   704,  5739,  3152, 10060,  6427,  1368,  8024,\n",
      "          9678,   704,  5739,  3152,  1350,  3403,  4157,  5016,  1384,   102]])\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "argument 'ids': 'dict' object cannot be converted to 'Sequence'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[24], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[39mprint\u001b[39m(result\u001b[39m.\u001b[39minput_ids)\n\u001b[0;32m----> 2\u001b[0m ids \u001b[39m=\u001b[39m tokenizer\u001b[39m.\u001b[39;49mdecode(result)\n\u001b[1;32m      3\u001b[0m tokenizer\u001b[39m.\u001b[39mdecode(ids)\n",
      "File \u001b[0;32m~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:3471\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.decode\u001b[0;34m(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)\u001b[0m\n\u001b[1;32m   3468\u001b[0m \u001b[39m# Convert inputs to python lists\u001b[39;00m\n\u001b[1;32m   3469\u001b[0m token_ids \u001b[39m=\u001b[39m to_py_obj(token_ids)\n\u001b[0;32m-> 3471\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_decode(\n\u001b[1;32m   3472\u001b[0m     token_ids\u001b[39m=\u001b[39;49mtoken_ids,\n\u001b[1;32m   3473\u001b[0m     skip_special_tokens\u001b[39m=\u001b[39;49mskip_special_tokens,\n\u001b[1;32m   3474\u001b[0m     clean_up_tokenization_spaces\u001b[39m=\u001b[39;49mclean_up_tokenization_spaces,\n\u001b[1;32m   3475\u001b[0m     \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs,\n\u001b[1;32m   3476\u001b[0m )\n",
      "File \u001b[0;32m~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py:551\u001b[0m, in \u001b[0;36mPreTrainedTokenizerFast._decode\u001b[0;34m(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)\u001b[0m\n\u001b[1;32m    549\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(token_ids, \u001b[39mint\u001b[39m):\n\u001b[1;32m    550\u001b[0m     token_ids \u001b[39m=\u001b[39m [token_ids]\n\u001b[0;32m--> 551\u001b[0m text \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_tokenizer\u001b[39m.\u001b[39;49mdecode(token_ids, skip_special_tokens\u001b[39m=\u001b[39;49mskip_special_tokens)\n\u001b[1;32m    553\u001b[0m \u001b[39mif\u001b[39;00m clean_up_tokenization_spaces:\n\u001b[1;32m    554\u001b[0m     clean_text \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mclean_up_tokenization(text)\n",
      "\u001b[0;31mTypeError\u001b[0m: argument 'ids': 'dict' object cannot be converted to 'Sequence'"
     ]
    }
   ],
   "source": [
    "print(result.input_ids)\n",
    "ids = tokenizer.decode(result)\n",
    "tokenizer.decode(ids)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}