{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# !pip install transformers" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "from __future__ import absolute_import\n", "import torch\n", "import logging\n", "import torch.nn as nn\n", "from model import Seq2Seq\n", "from transformers import (\n", " RobertaConfig, \n", " RobertaModel, \n", " RobertaTokenizer\n", ")\n", "\n", "import regex as re\n", "\n", "# disable warnings\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "# base model is RoBERTa\n", "MODEL_CLASSES = {'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)}\n", "\n", "# initialize logging\n", "logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',\n", " datefmt = '%m/%d/%Y %H:%M:%S',\n", " level = logging.INFO)\n", "logger = logging.getLogger(__name__)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "class CONFIG:\n", " max_source_length = 256\n", " max_target_length = 128\n", " beam_size = 10\n", " local_rank = -1\n", " no_cuda = False\n", "\n", " do_train = True\n", " do_eval = True\n", " do_test = True\n", " train_batch_size = 12\n", " eval_batch_size = 32\n", "\n", " model_type = \"roberta\"\n", " model_name_or_path = \"microsoft/codebert-base\"\n", " output_dir = \"/content/drive/MyDrive/CodeSummarization\"\n", " load_model_path = None\n", " train_filename = \"dataset/python/train.jsonl\"\n", " dev_filename = \"dataset/python/valid.jsonl\"\n", " test_filename = \"dataset/python/test.jsonl\"\n", " config_name = \"\"\n", " tokenizer_name = \"\"\n", " cache_dir = \"cache\"\n", "\n", " save_every = 5000\n", "\n", " gradient_accumulation_steps = 1\n", " learning_rate = 5e-5\n", " weight_decay = 1e-4\n", " adam_epsilon = 1e-8\n", " max_grad_norm = 1.0\n", " num_train_epochs = 3.0\n", " max_steps = -1\n", " warmup_steps = 0\n", " train_steps = 100000\n", " eval_steps = 10000\n", " n_gpu = torch.cuda.device_count()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Load tokenizer" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ded94a2103074dc5b4413a2774888bca", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading: 0%| | 0.00/899k [00:00 index: 0\n", " index: 2\n", " index: 1\n", " index: 50264\n" ] } ], "source": [ "import logging\n", "from transformers import RobertaTokenizer\n", "logger = logging.getLogger(__name__)\n", "tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base', cache_dir=CONFIG.cache_dir)\n", "\n", "print(f'{tokenizer.cls_token} index: {tokenizer.cls_token_id}')\n", "print(f'{tokenizer.sep_token} index: {tokenizer.sep_token_id}')\n", "print(f'{tokenizer.pad_token} index: {tokenizer.pad_token_id}')\n", "print(f'{tokenizer.mask_token} index: {tokenizer.mask_token_id}') " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "input_str = \"def sina_xml_to_url_list(xml_data):\\n \\\"\\\"\\\"str->list\\n Convert XML to URL List.\\n From Biligrab.\\n \\\"\\\"\\\"\\n rawurl = []\\n dom = parseString(xml_data)\\n for node in dom.getElementsByTagName('durl'):\\n url = node.getElementsByTagName('url')[0]\\n rawurl.append(url.childNodes[0].data)\\n return rawurl\"\n", "input_tokens = tokenizer.tokenize(input_str)\n", "print(input_tokens)" ] }, { "cell_type": 
"code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['def',\n", " 'sina_xml_to_url_list',\n", " '(',\n", " 'xml_data',\n", " ')',\n", " ':',\n", " 'rawurl',\n", " '=',\n", " '[',\n", " ']',\n", " 'dom',\n", " '=',\n", " 'parseString',\n", " '(',\n", " 'xml_data',\n", " ')',\n", " 'for',\n", " 'node',\n", " 'in',\n", " 'dom',\n", " '.',\n", " 'getElementsByTagName',\n", " '(',\n", " \"'\",\n", " 'durl',\n", " \"'\",\n", " ')',\n", " ':',\n", " 'url',\n", " '=',\n", " 'node',\n", " '.',\n", " 'getElementsByTagName',\n", " '(',\n", " \"'\",\n", " 'url',\n", " \"'\",\n", " ')',\n", " '[',\n", " '0',\n", " ']',\n", " 'rawurl',\n", " '.',\n", " 'append',\n", " '(',\n", " 'url',\n", " '.',\n", " 'childNodes',\n", " '[',\n", " '0',\n", " ']',\n", " '.',\n", " 'data',\n", " ')',\n", " 'return',\n", " 'rawurl']" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def preprocessing(code_segment):\n", " \n", " # remove newlines\n", " code_segment = re.sub(r'\\n', ' ', code_segment)\n", " \n", " # remove docstring\n", " code_segment = re.sub(r'\"\"\".*?\"\"\"', '', code_segment, flags=re.DOTALL)\n", " \n", " # remove multiple spaces\n", " code_segment = re.sub(r'\\s+', ' ', code_segment)\n", " \n", " # remove comments\n", " code_segment = re.sub(r'#.*', '', code_segment)\n", "\n", " # remove html tags\n", " code_segment = re.sub(r'<.*?>', '', code_segment)\n", "\n", " # remove urls\n", " code_segment = re.sub(r'http\\S+', '', code_segment)\n", " \n", " # split special chars into different tokens\n", " code_segment = re.sub(r'([^\\w\\s])', r' \\1 ', code_segment)\n", " \n", " return code_segment.split()\n", "\n", "preprocessing(input_str)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Tokens = ['def', 'get_data', '(', ')', ':', 'data', '=', '[', ']', 'for', 'i', 'in', 'range', '(', '10', ')', ':', 'data', '.', 'append', '(', 'i', ')', 'return', 'data']\n" ] } ], "source": [ "input_str = \"def get_data():\\n data = []\\n for i in range(10):\\n data.append(i)\\n return data\"\n", "input_tokens = preprocessing(input_str)\n", "print(f'Tokens = {input_tokens}')\n", "# tokenizer.encode_plus(input_tokens, max_length=CONFIG.max_source_length, pad_to_max_length=True, truncation=True, return_tensors=\"pt\")" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Tokens = ['def', 'sina_xml_to_url_list', '(', 'xml_data', ')', ':', 'rawurl', '=', '[', ']', 'dom', '=', 'parseString', '(', 'xml_data', ')', 'for', 'node', 'in', 'dom', '.', 'getElementsByTagName', '(', \"'\", 'durl', \"'\", ')', ':', 'url', '=', 'node', '.', 'getElementsByTagName', '(', \"'\", 'url', \"'\", ')', '[', '0', ']', 'rawurl', '.', 'append', '(', 'url', '.', 'childNodes', '[', '0', ']', '.', 'data', ')', 'return', 'rawurl']\n" ] }, { "data": { "text/plain": [ "{'input_ids': tensor([[ 0, 9232, 3, 1640, 3, 43, 35, 3, 5214, 10975,\n", " 742, 12623, 5214, 3, 1640, 3, 43, 1990, 46840, 179,\n", " 12623, 4, 3, 1640, 108, 3, 108, 43, 35, 6423,\n", " 5214, 46840, 4, 3, 1640, 108, 6423, 108, 43, 10975,\n", " 288, 742, 3, 4, 48696, 1640, 6423, 4, 3, 10975,\n", " 288, 742, 4, 23687, 43, 30921, 3, 2, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "input_str = \"def sina_xml_to_url_list(xml_data):\\n \\\"\\\"\\\"str->list\\n Convert XML to URL List.\\n From Biligrab.\\n \\\"\\\"\\\"\\n rawurl = []\\n dom = parseString(xml_data)\\n for node in dom.getElementsByTagName('durl'):\\n url = node.getElementsByTagName('url')[0]\\n rawurl.append(url.childNodes[0].data)\\n return rawurl\"\n", "input_tokens = preprocessing(input_str)\n", "print(f'Tokens = {input_tokens}')\n", "# tokenizer.encode_plus(input_tokens, max_length=CONFIG.max_source_length, pad_to_max_length=True, truncation=True, return_tensors=\"pt\")" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'input_ids': tensor([[ 0, 9232, 3, 1640, 43, 35, 23687, 5214, 10975, 742,\n", " 1990, 118, 179, 9435, 1640, 698, 43, 35, 23687, 4,\n", " 48696, 1640, 118, 43, 30921, 23687, 2, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}\n" ] } ], "source": [ "encoded_input = tokenizer.encode_plus(\n", " input_tokens, \n", " max_length=CONFIG.max_source_length, \n", " pad_to_max_length=True, \n", " truncation=True, \n", " return_tensors=\"pt\"\n", ")\n", "print(encoded_input)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Load model" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "# Config model\n", "config_class, model_class, tokenizer_class = (RobertaConfig, RobertaModel, RobertaTokenizer)\n", "model_config = config_class.from_pretrained(CONFIG.config_name if CONFIG.config_name else CONFIG.model_name_or_path, cache_dir=CONFIG.cache_dir)\n", "model_config.save_pretrained('config')\n", "\n", "# load tokenizer\n", "tokenizer = tokenizer_class.from_pretrained(\n", " CONFIG.tokenizer_name if CONFIG.tokenizer_name else CONFIG.model_name_or_path,\n", " cache_dir=CONFIG.cache_dir,\n", " # do_lower_case=args.do_lower_case\n", ")\n", "\n", "# load encoder from pretrained RoBERTa\n", "encoder = model_class.from_pretrained(CONFIG.model_name_or_path, config=model_config, cache_dir=CONFIG.cache_dir) \n", "\n", "# build decoder \n", "decoder_layer = nn.TransformerDecoderLayer(d_model=model_config.hidden_size, nhead=model_config.num_attention_heads)\n", "decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)\n", "\n", "# build seq2seq model from pretrained encoder and from-scratch decoder\n", "model=Seq2Seq(\n", " encoder=encoder,\n", " decoder=decoder,\n", " config=model_config,\n", " beam_size=CONFIG.beam_size,\n", " max_length=CONFIG.max_target_length,\n", " sos_id=tokenizer.cls_token_id,\n", " eos_id=tokenizer.sep_token_id\n", ")" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "state_dict = torch.load(\"./models/pytorch_model.bin\")\n", "model.load_state_dict(state_dict)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Prediction" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "# move model to GPU\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() and not CONFIG.no_cuda else \"cpu\")\n", "model = model.to(device)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'input_ids': tensor([[ 0, 9232, 3, 1640, 43, 35, 23687, 5214, 10975, 742,\n", " 1990, 118, 179, 9435, 1640, 698, 43, 35, 23687, 4,\n", " 48696, 1640, 118, 43, 30921, 23687, 2, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}\n" ] } ], "source": [ "input_str = \"def get_data():\\n data = []\\n for i in range(10):\\n data.append(i)\\n return data\"\n", "input_tokens = preprocessing(input_str)\n", "encoded_input = tokenizer.encode_plus(\n", " input_tokens, \n", " max_length=CONFIG.max_source_length, \n", " pad_to_max_length=True, \n", " truncation=True, \n", " return_tensors=\"pt\"\n", ")\n", "print(encoded_input)\n", "\n", "input_ids = encoded_input[\"input_ids\"].to(device)\n", "input_mask = encoded_input[\"attention_mask\"].to(device)\n" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Summary.shape = torch.Size([1, 10, 128])\n", "Summary = tensor([[[42555, 10, 889, ..., 0, 0, 0],\n", " [42555, 10, 889, ..., 0, 0, 0],\n", " [42555, 10, 889, ..., 0, 0, 0],\n", " ...,\n", " [42555, 10, 889, ..., 0, 0, 0],\n", " [42555, 10, 889, ..., 0, 0, 0],\n", " [42555, 10, 889, ..., 0, 0, 0]]], device='cuda:0')\n" ] } ], "source": [ "output = model(input_ids, input_mask)\n", "print(f'Summary.shape = {output.shape}')\n", "print(f'Summary = {output}')" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([128])\n", "Return a list of data.\n", "torch.Size([128])\n", "Return a list of int values.\n", "torch.Size([128])\n", "Return a list of ints.\n", "torch.Size([128])\n", "Return a list of ints\n", "torch.Size([128])\n", "Return a list of the number of integers.\n", "torch.Size([128])\n", "Return a list of the number of data.\n", "torch.Size([128])\n", "Return a list of the number of digits.\n", "torch.Size([128])\n", "Return a list of the number of numbers.\n", "torch.Size([128])\n", "Return a list of data in a list.\n", "torch.Size([128])\n", "Return a list of data in a list of data\n" ] } ], "source": [ "# decode summary with tokenizer\n", "summary = output[0]\n", "for i in range(10):\n", " print(f'{summary[i].shape}')\n", " pred = tokenizer.decode(summary[i], skip_special_tokens=True)\n", " print(pred)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "aio", "language": "python", "name": "python3" }, 
"language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.16" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "c4b1d2403d5bedfc2b499b2d1212ae0437b5f8ebf43026ed45c1b9608ddeb20c" } } }, "nbformat": 4, "nbformat_minor": 2 }