{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "3ef6a441", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: nltk in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (3.8.1)\n", "Requirement already satisfied: click in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from nltk) (8.1.3)\n", "Requirement already satisfied: tqdm in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from nltk) (4.64.1)\n", "Requirement already satisfied: joblib in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from nltk) (1.2.0)\n", "Requirement already satisfied: regex>=2021.8.3 in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from nltk) (2022.10.31)\n", "Requirement already satisfied: colorama in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from click->nltk) (0.4.6)\n", "Requirement already satisfied: rouge_score in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (0.1.2)\n", "Requirement already satisfied: numpy in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from rouge_score) (1.24.1)\n", "Requirement already satisfied: absl-py in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from rouge_score) (1.4.0)\n", "Requirement already satisfied: six>=1.14.0 in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from rouge_score) (1.16.0)\n", "Requirement already satisfied: nltk in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from rouge_score) (3.8.1)\n", "Requirement already satisfied: joblib in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from nltk->rouge_score) (1.2.0)\n", "Requirement already satisfied: tqdm in c:\\users\\vjmar\\documents\\1. 
code\\pythonenvs\\hf-env\\lib\\site-packages (from nltk->rouge_score) (4.64.1)\n", "Requirement already satisfied: regex>=2021.8.3 in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from nltk->rouge_score) (2022.10.31)\n", "Requirement already satisfied: click in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from nltk->rouge_score) (8.1.3)\n", "Requirement already satisfied: colorama in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from click->nltk->rouge_score) (0.4.6)\n" ] } ], "source": [ "# !pip install transformers\n", "!pip install nltk\n", "!pip install rouge_score\n", "\n", "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": null, "id": "845c8640", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 2, "id": "23e534d2", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\vjmar\\Documents\\1. Code\\PythonEnvs\\hf-env\\lib\\site-packages\\tqdm\\auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "| ID | GPU | MEM |\n", "------------------\n", "| 0 | 5% | 13% |\n", "None\n", "---------------------------------------------------------------\n", "Token will not been saved to git credential helper. 
import getpass
import os
import subprocess

import GPUtil
from huggingface_hub import HfApi, HfFolder, login

# showUtilization() prints the utilisation table itself and returns None, so
# it is called bare; wrapping it in print() emitted an extra "None" line.
GPUtil.showUtilization()
print("---------------------------------------------------------------")

# SECURITY: an access token must never be written into the notebook. It is
# read from the HF_TOKEN environment variable, falling back to an interactive
# prompt so nothing secret is stored in the file or echoed in cell output.
# NOTE(review): the token that used to be hard-coded here has been exposed in
# the notebook history and should be revoked in the Hugging Face settings.
token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")
login(token)

# Let git remember the Hub credentials for the later push. Run best-effort
# (check=False), like the shell escape it replaces.
subprocess.run(
    ["git", "config", "--global", "credential.helper", "store"],
    check=False,
)
Code\\PythonEnvs\\hf-env\\lib\\site-packages\\transformers\\models\\t5\\tokenization_t5_fast.py:155: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.\n", "For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n", "- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.\n", "- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n", "- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.\n", " warnings.warn(\n" ] } ], "source": [ "tokenizer = AutoTokenizer.from_pretrained(CKPT)" ] }, { "cell_type": "markdown", "id": "ca3c201b", "metadata": {}, "source": [ "# Data" ] }, { "cell_type": "code", "execution_count": 5, "id": "f9ab72e4", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Found cached dataset wikisql (C:/Users/vjmar/.cache/huggingface/datasets/wikisql/default/0.1.0/7037bfe6a42b1ca2b6ac3ccacba5253b1825d31379e9cc626fc79a620977252d)\n", "Found cached dataset wikisql (C:/Users/vjmar/.cache/huggingface/datasets/wikisql/default/0.1.0/7037bfe6a42b1ca2b6ac3ccacba5253b1825d31379e9cc626fc79a620977252d)\n" ] } ], "source": [ "try:\n", " from datasets import load_dataset\n", "except ModuleNotFoundError:\n", " !pip install datasets\n", " from datasets import load_dataset\n", "\n", "train_data = load_dataset('wikisql', split='train+validation')\n", "test_data = load_dataset('wikisql', split='test')" ] }, { "cell_type": "code", "execution_count": 6, "id": "0e62f295", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading cached processed dataset at 
def format_dataset(example):
    """Convert one raw WikiSQL record into an ``input``/``target`` text pair.

    Args:
        example: A mapping that provides ``example['question']``,
            ``example['table']['header']`` (iterable of column names) and
            ``example['sql']['human_readable']``. ``example['sql']['conds']
            ['condition']`` is optional; only its first entry is used.

    Returns:
        dict with two string fields:
            ``input``  -- ``"translate to SQL: <question> | table: <cols>"``
                          where ``<cols>`` is the lower-cased header joined
                          with ``", "``.
            ``target`` -- the human-readable SQL. When the first condition
                          value occurs in it (case-insensitively), the whole
                          query is lower-cased and every occurrence of that
                          value is wrapped in single quotes; otherwise the
                          query is returned unchanged.
    """
    # Only a missing/empty condition list is expected here; anything else
    # should surface instead of being hidden by a bare ``except``.
    try:
        condition = example['sql']['conds']['condition'][0]
    except (KeyError, IndexError, TypeError):
        condition = ""

    target = f"{example['sql']['human_readable']}"
    if condition != "" and condition.lower() in target.lower():
        # NOTE(review): this quotes every occurrence of the value, including
        # ones inside column names, and lower-cases the rest of the query.
        # Kept as-is because the training targets depend on it.
        target = target.lower().replace(condition.lower(), f"'{condition}'")

    # Joining avoids the trailing ", " that previously had to be stripped with
    # a ``.replace(", )", "")`` hack (which left a stray ")" for an empty
    # header and could delete text from the question itself).
    cols = ", ".join(item.lower() for item in example['table']['header'])

    return {
        'input': f"translate to SQL: {example['question']} | table: {cols}",
        'target': target,
    }
x[\"input_longer_64\"] = int(x[\"input_len\"] > 64)\n", " x[\"out_len\"] = len(tokenizer(x[\"target\"]).input_ids)\n", " x[\"out_longer_256\"] = int(x[\"out_len\"] > 256)\n", " x[\"out_longer_128\"] = int(x[\"out_len\"] > 128)\n", " x[\"out_longer_64\"] = int(x[\"out_len\"] > 64)\n", " return x\n" ] }, { "cell_type": "code", "execution_count": 13, "id": "7b5df2e4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:04<00:00, 2380.77ex/s]\n" ] } ], "source": [ "sample_size = 10000\n", "print(type(train_data))\n", "data_stats = train_data.select(range(sample_size)).map(map_to_length) #, num_proc=4" ] }, { "cell_type": "code", "execution_count": 14, "id": "e4589f66", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 24.68ba/s]\n", "Loading cached processed dataset at C:\\Users\\vjmar\\.cache\\huggingface\\datasets\\wikisql\\default\\0.1.0\\7037bfe6a42b1ca2b6ac3ccacba5253b1825d31379e9cc626fc79a620977252d\\cache-aefcd3f1e400ed5a.arrow\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Input Mean: 46.515, %-Input > 256:0.0, %-Input > 128:0.0037, %-Input > 64:0.0712 Output Mean:19.1137, %-Output > 256:0.0, %-Output > 128:0.0002, %-Output > 64:0.0007\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%| | 0/16 [00:00 256:{}, %-Input > 128:{}, %-Input > 64:{} Output Mean:{}, %-Output > 256:{}, %-Output > 128:{}, %-Output > 64:{}\".format(\n", " sum(x[\"input_len\"]) / sample_size,\n", " sum(x[\"input_longer_256\"]) / sample_size,\n", " sum(x[\"input_longer_128\"]) / sample_size,\n", " 
def convert_to_features(example_batch):
    """Tokenize a batch of ``input``/``target`` strings into model features.

    Uses the module-level ``tokenizer``. Both sides are padded and truncated
    to exactly 64 tokens so every row has the same length.

    Args:
        example_batch: Batch mapping with list-valued ``'input'`` and
            ``'target'`` columns (as passed by ``Dataset.map(batched=True)``).

    Returns:
        dict with ``input_ids``, ``attention_mask``, ``labels`` and
        ``decoder_attention_mask`` lists, one entry per example.
    """
    # ``pad_to_max_length`` is deprecated; ``padding='max_length'`` is its
    # replacement. Truncation is requested explicitly instead of relying on
    # the tokenizer's backward-compatibility fallback.
    input_encodings = tokenizer.batch_encode_plus(
        example_batch['input'],
        padding='max_length',
        truncation=True,
        max_length=64,
    )
    target_encodings = tokenizer.batch_encode_plus(
        example_batch['target'],
        padding='max_length',
        truncation=True,
        max_length=64,
    )

    # Padding positions in the labels are set to -100 so the loss ignores
    # them; compute_metrics already maps -100 back to the pad token id before
    # decoding.
    pad_id = tokenizer.pad_token_id
    labels = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in target_encodings['input_ids']
    ]

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': labels,
        'decoder_attention_mask': target_encodings['attention_mask'],
    }
def compute_metrics(pred):
    """Compute ROUGE-2 precision/recall/F-measure for an evaluation pass.

    Relies on the module-level ``tokenizer`` and ``rouge`` metric object.

    Args:
        pred: Prediction object exposing ``predictions`` (generated token
            ids) and ``label_ids`` (reference token ids, with -100 marking
            ignored positions).

    Returns:
        dict with ``rouge2_precision``, ``rouge2_recall`` and
        ``rouge2_fmeasure``, each rounded to 4 decimals (taken from the
        ``mid`` aggregate of the metric).
    """
    import numpy as np  # local import keeps this block self-contained

    pad_id = tokenizer.pad_token_id

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        # Some configurations return (sequences, extras); keep the sequences.
        pred_ids = pred_ids[0]

    # -100 is not a valid token id and cannot be decoded. np.where builds new
    # arrays, so the arrays owned by the caller are no longer modified in
    # place (the original overwrote pred.label_ids).
    pred_ids = np.where(pred_ids == -100, pad_id, pred_ids)
    labels_ids = np.where(pred.label_ids == -100, pad_id, pred.label_ids)

    # skip_special_tokens drops padding / EOS from the decoded text.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(
        predictions=pred_str,
        references=label_str,
        rouge_types=["rouge2"],
    )["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }
import os

from transformers import AutoTokenizer, T5ForConditionalGeneration

# os.path.join builds the path; ``os.join`` does not exist and raised
# AttributeError before anything was loaded.
# NOTE(review): training saves the model and tokenizer into os.getcwd()
# itself -- confirm that this sub-directory is really where the fine-tuned
# checkpoint lives.
CKPT = os.path.join(os.getcwd(), 't5-base-finetuned-wikisql')
tokenizer = AutoTokenizer.from_pretrained(CKPT)
model = T5ForConditionalGeneration.from_pretrained(CKPT)

# Reload the raw test split: the earlier ``test_data`` was replaced by its
# tokenized form and no longer has the question / table / sql columns.
test_data = load_dataset('wikisql', split='test')


def translate_to_sql(text):
    """Generate a SQL string for a single prompt.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Prompt in the training format, i.e.
            ``"translate to SQL: <question> | table: <col>, <col>, ..."``.

    Returns:
        The decoded model output with special tokens removed.
    """
    # truncation=True makes max_length effective, matching the 64-token
    # limit used when the training features were built.
    inputs = tokenizer(
        text,
        padding='longest',
        truncation=True,
        max_length=64,
        return_tensors='pt',
    ).to(model.device)
    output = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=64,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)


# Spot-check every 10th example of the first 100.
for i in range(0, 100, 10):
    example = test_data[i]
    # Build the prompt exactly like the training inputs (question plus the
    # lower-cased column list); the model never saw prompts without the
    # "| table: ..." part.
    columns = ", ".join(header.lower() for header in example['table']['header'])
    prompt = f"translate to SQL: {example['question']} | table: {columns}"

    print(prompt)
    print('Predict. :' + translate_to_sql(prompt))
    print('Expected: ' + example['sql']['human_readable'])
    print('=================================\n')
Columns: employee_id, name, year, parameters, engineer\"\n", "translate_to_sql(text)" ] }, { "cell_type": "code", "execution_count": null, "id": "8bd0a073", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 5 }