File size: 21,740 Bytes

0f303b0

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3ef6a441",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: nltk in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (3.8.1)\n",
      "Requirement already satisfied: click in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from nltk) (8.1.3)\n",
      "Requirement already satisfied: tqdm in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from nltk) (4.64.1)\n",
      "Requirement already satisfied: joblib in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from nltk) (1.2.0)\n",
      "Requirement already satisfied: regex>=2021.8.3 in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from nltk) (2022.10.31)\n",
      "Requirement already satisfied: colorama in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from click->nltk) (0.4.6)\n",
      "Requirement already satisfied: rouge_score in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (0.1.2)\n",
      "Requirement already satisfied: numpy in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from rouge_score) (1.24.1)\n",
      "Requirement already satisfied: absl-py in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from rouge_score) (1.4.0)\n",
      "Requirement already satisfied: six>=1.14.0 in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from rouge_score) (1.16.0)\n",
      "Requirement already satisfied: nltk in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from rouge_score) (3.8.1)\n",
      "Requirement already satisfied: joblib in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from nltk->rouge_score) (1.2.0)\n",
      "Requirement already satisfied: tqdm in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from nltk->rouge_score) (4.64.1)\n",
      "Requirement already satisfied: regex>=2021.8.3 in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from nltk->rouge_score) (2022.10.31)\n",
      "Requirement already satisfied: click in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from nltk->rouge_score) (8.1.3)\n",
      "Requirement already satisfied: colorama in c:\\users\\vjmar\\documents\\1. code\\pythonenvs\\hf-env\\lib\\site-packages (from click->nltk->rouge_score) (0.4.6)\n"
     ]
    }
   ],
   "source": [
    "# Install the ROUGE metric dependencies.\n",
    "# `%pip` (unlike `!pip`) always installs into the environment of the running kernel.\n",
    "# The pinned versions are the ones recorded in this cell's saved output.\n",
    "# %pip install -q transformers\n",
    "%pip install -q nltk==3.8.1 rouge_score==0.1.2\n",
    "\n",
    "# Reload edited local modules automatically before each cell executes.\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "845c8640",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "23e534d2",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\vjmar\\Documents\\1. Code\\PythonEnvs\\hf-env\\lib\\site-packages\\tqdm\\auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "| ID | GPU | MEM |\n",
      "------------------\n",
      "|  0 |  5% | 13% |\n",
      "None\n",
      "---------------------------------------------------------------\n",
      "Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.\n",
      "Token is valid.\n",
      "Your token has been saved to C:\\Users\\vjmar\\.cache\\huggingface\\token\n",
      "Login successful\n"
     ]
    }
   ],
   "source": [
    "import getpass\n",
    "import os\n",
    "\n",
    "import GPUtil\n",
    "from huggingface_hub import HfApi, HfFolder, login\n",
    "\n",
    "# showUtilization() prints its table itself and returns None, so wrapping it in\n",
    "# print() only adds a stray \"None\" line.\n",
    "GPUtil.showUtilization()\n",
    "print(\"---------------------------------------------------------------\")\n",
    "\n",
    "# SECURITY: never hardcode the access token in the notebook. It is read from the\n",
    "# HF_TOKEN environment variable, falling back to an interactive prompt.\n",
    "# Any token that was previously committed in this cell must be revoked on the Hub.\n",
    "token = os.environ.get(\"HF_TOKEN\") or getpass.getpass(\"Hugging Face access token: \")\n",
    "login(token)\n",
    "! git config --global credential.helper store"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "2b5a41be",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer, T5ForConditionalGeneration\n",
    "\n",
    "# Checkpoint name shared by the model and tokenizer cells.\n",
    "CKPT = 't5-base'\n",
    "\n",
    "# Load the T5 conditional-generation model from that checkpoint.\n",
    "model = T5ForConditionalGeneration.from_pretrained(CKPT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "75c5f40c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\vjmar\\Documents\\1. Code\\PythonEnvs\\hf-env\\lib\\site-packages\\transformers\\models\\t5\\tokenization_t5_fast.py:155: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.\n",
      "For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n",
      "- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.\n",
      "- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n",
      "- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "# Passing model_max_length explicitly keeps the current 512-token limit and avoids\n",
    "# the FutureWarning the t5-base tokenizer emits when the value is left implicit.\n",
    "tokenizer = AutoTokenizer.from_pretrained(CKPT, model_max_length=512)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ca3c201b",
   "metadata": {},
   "source": [
    "# Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f9ab72e4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset wikisql (C:/Users/vjmar/.cache/huggingface/datasets/wikisql/default/0.1.0/7037bfe6a42b1ca2b6ac3ccacba5253b1825d31379e9cc626fc79a620977252d)\n",
      "Found cached dataset wikisql (C:/Users/vjmar/.cache/huggingface/datasets/wikisql/default/0.1.0/7037bfe6a42b1ca2b6ac3ccacba5253b1825d31379e9cc626fc79a620977252d)\n"
     ]
    }
   ],
   "source": [
    "try:\n",
    "    from datasets import load_dataset\n",
    "except ModuleNotFoundError:\n",
    "    # Use %pip rather than !pip so the package lands in the environment of the\n",
    "    # running kernel; otherwise the retried import below can still fail.\n",
    "    %pip install datasets\n",
    "    from datasets import load_dataset\n",
    "\n",
    "# Training uses the train and validation splits combined; the test split is kept\n",
    "# separate and later passed to the trainer as the evaluation set.\n",
    "train_data = load_dataset('wikisql', split='train+validation')\n",
    "test_data = load_dataset('wikisql', split='test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "0e62f295",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at C:\\Users\\vjmar\\.cache\\huggingface\\datasets\\wikisql\\default\\0.1.0\\7037bfe6a42b1ca2b6ac3ccacba5253b1825d31379e9cc626fc79a620977252d\\cache-19a43a9806773ee1.arrow\n",
      "Loading cached processed dataset at C:\\Users\\vjmar\\.cache\\huggingface\\datasets\\wikisql\\default\\0.1.0\\7037bfe6a42b1ca2b6ac3ccacba5253b1825d31379e9cc626fc79a620977252d\\cache-620e43f13a2f425c.arrow\n"
     ]
    }
   ],
   "source": [
    "def format_dataset(example):\n",
    "  \"\"\"Turn one raw WikiSQL record into an ``input``/``target`` text pair.\n",
    "\n",
    "  Args:\n",
    "    example: Record providing ``question``, ``table['header']`` and\n",
    "      ``sql['human_readable']``; ``sql['conds']['condition']`` may be empty.\n",
    "\n",
    "  Returns:\n",
    "    dict with ``input`` (``translate to SQL: <question> | table: <cols>``, with the\n",
    "    column names lower-cased and comma separated) and ``target`` (the readable SQL).\n",
    "    When the first condition value occurs in the SQL, the SQL is lower-cased and\n",
    "    that value is wrapped in single quotes.\n",
    "  \"\"\"\n",
    "  # Only a missing or empty condition list is expected here; a bare ``except``\n",
    "  # would also hide unrelated errors (and KeyboardInterrupt).\n",
    "  try:\n",
    "    condition: str = example['sql']['conds']['condition'][0]\n",
    "  except (KeyError, IndexError, TypeError):\n",
    "    condition = \"\"\n",
    "\n",
    "  target = f\"{example['sql']['human_readable']}\"\n",
    "  if condition != \"\" and condition.lower() in target.lower():\n",
    "    target = target.lower().replace(condition.lower(), f\"'{condition}'\")\n",
    "\n",
    "  # Join the headers directly instead of appending \", \" and stripping \", )\"\n",
    "  # afterwards: that clean-up left a stray \")\" for an empty header and also\n",
    "  # removed any \", )\" that was part of the question itself.\n",
    "  cols = \", \".join(item.lower() for item in example['table']['header'])\n",
    "\n",
    "  return {'input': f\"translate to SQL: {example['question']} | table: {cols}\",\n",
    "          'target': target}\n",
    "\n",
    "# Apply the formatting to both splits. remove_columns lists every original column,\n",
    "# so those are dropped and the values returned by format_dataset are kept.\n",
    "# NOTE(review): the names are rebound in place, so this cell presumably cannot be\n",
    "# re-run without reloading the raw data first - confirm.\n",
    "train_data = train_data.map(format_dataset, remove_columns=train_data.column_names)\n",
    "test_data = test_data.map(format_dataset, remove_columns=test_data.column_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e68f9896",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "f47e6cd6",
   "metadata": {},
   "source": [
    "# Data Format for Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "15ec294c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def map_to_length(x):\n",
    "    \"\"\"Annotate one example with token counts for its input and target text.\n",
    "\n",
    "    Adds ``input_len`` / ``out_len`` plus 0/1 flags telling whether each count\n",
    "    exceeds 256, 128 and 64 tokens, then returns the same mapping.\n",
    "    Uses the notebook-level ``tokenizer``.\n",
    "    \"\"\"\n",
    "    for prefix, column in ((\"input\", \"input\"), (\"out\", \"target\")):\n",
    "        n_tokens = len(tokenizer(x[column]).input_ids)\n",
    "        x[f\"{prefix}_len\"] = n_tokens\n",
    "        for limit in (256, 128, 64):\n",
    "            x[f\"{prefix}_longer_{limit}\"] = int(n_tokens > limit)\n",
    "    return x\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "7b5df2e4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'datasets.arrow_dataset.Dataset'>\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:04<00:00, 2380.77ex/s]\n"
     ]
    }
   ],
   "source": [
    "# Number of leading training examples used for the length statistics.\n",
    "sample_size = 10000\n",
    "print(type(train_data))\n",
    "# Run map_to_length on that subset only, adding the per-example length columns.\n",
    "data_stats = train_data.select(range(sample_size)).map(map_to_length) # optional: num_proc=4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "e4589f66",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 24.68ba/s]\n",
      "Loading cached processed dataset at C:\\Users\\vjmar\\.cache\\huggingface\\datasets\\wikisql\\default\\0.1.0\\7037bfe6a42b1ca2b6ac3ccacba5253b1825d31379e9cc626fc79a620977252d\\cache-aefcd3f1e400ed5a.arrow\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Input Mean: 46.515, %-Input > 256:0.0,  %-Input > 128:0.0037, %-Input > 64:0.0712 Output Mean:19.1137, %-Output > 256:0.0, %-Output > 128:0.0002, %-Output > 64:0.0007\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|                                                                                                                                                 | 0/16 [00:00<?, ?ba/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
      "C:\\Users\\vjmar\\Documents\\1. Code\\PythonEnvs\\hf-env\\lib\\site-packages\\transformers\\tokenization_utils_base.py:2339: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
      "  warnings.warn(\n",
      "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:04<00:00,  3.88ba/s]\n"
     ]
    }
   ],
   "source": [
    "def compute_and_print_stats(x):\n",
    "  \"\"\"Print mean token lengths and over-length fractions for a full-sample batch.\n",
    "\n",
    "  Nothing is printed unless the batch holds exactly ``sample_size`` rows\n",
    "  (``sample_size`` is the notebook-level constant). Always returns None.\n",
    "  \"\"\"\n",
    "  if len(x[\"input_len\"]) != sample_size:\n",
    "    return\n",
    "\n",
    "  # Columns are listed in the order their averages appear in the report line.\n",
    "  stat_columns = (\n",
    "      \"input_len\", \"input_longer_256\", \"input_longer_128\", \"input_longer_64\",\n",
    "      \"out_len\", \"out_longer_256\", \"out_longer_128\", \"out_longer_64\",\n",
    "  )\n",
    "  report = \"Input Mean: {}, %-Input > 256:{},  %-Input > 128:{}, %-Input > 64:{} Output Mean:{}, %-Output > 256:{}, %-Output > 128:{}, %-Output > 64:{}\"\n",
    "  averages = [sum(x[column]) / sample_size for column in stat_columns]\n",
    "  print(report.format(*averages))\n",
    "\n",
    "# Run the statistics over the sampled rows in batched mode.\n",
    "# NOTE(review): batch_size=-1 presumably hands the whole dataset to the function as a\n",
    "# single batch (the saved progress bar shows 1/1) - confirm against the datasets docs.\n",
    "output = data_stats.map(\n",
    "  compute_and_print_stats, \n",
    "  batched=True,\n",
    "  batch_size=-1,\n",
    ")\n",
    "\n",
    "# tokenize the examples\n",
    "def convert_to_features(example_batch, max_length=64):\n",
    "    \"\"\"Tokenize a batch of ``input``/``target`` texts into model features.\n",
    "\n",
    "    Args:\n",
    "        example_batch: Batch with parallel ``input`` and ``target`` lists of strings.\n",
    "        max_length: Fixed length every sequence is padded or truncated to.\n",
    "\n",
    "    Returns:\n",
    "        dict with ``input_ids``, ``attention_mask``, ``labels`` and\n",
    "        ``decoder_attention_mask``; padded label positions hold -100.\n",
    "    \"\"\"\n",
    "    # padding='max_length' replaces the deprecated pad_to_max_length=True, and\n",
    "    # truncation=True makes the cut at max_length explicit instead of relying on\n",
    "    # the tokenizer's fallback (both were reported as warnings by transformers).\n",
    "    input_encodings = tokenizer(example_batch['input'], padding='max_length',\n",
    "                                truncation=True, max_length=max_length)\n",
    "    target_encodings = tokenizer(example_batch['target'], padding='max_length',\n",
    "                                 truncation=True, max_length=max_length)\n",
    "\n",
    "    # Mark padded label positions with -100 so the loss ignores them; the metric\n",
    "    # code maps -100 back to the pad token before decoding.\n",
    "    labels = [\n",
    "        [token if keep else -100 for token, keep in zip(ids, mask)]\n",
    "        for ids, mask in zip(target_encodings['input_ids'],\n",
    "                             target_encodings['attention_mask'])\n",
    "    ]\n",
    "\n",
    "    return {\n",
    "        'input_ids': input_encodings['input_ids'],\n",
    "        'attention_mask': input_encodings['attention_mask'],\n",
    "        'labels': labels,\n",
    "        'decoder_attention_mask': target_encodings['attention_mask'],\n",
    "    }\n",
    "\n",
    "# Tokenize both splits in batches; the existing text columns are removed afterwards.\n",
    "train_data = train_data.map(convert_to_features, batched=True, remove_columns=train_data.column_names)\n",
    "test_data = test_data.map(convert_to_features, batched=True, remove_columns=test_data.column_names)\n",
    "\n",
    "# Feature columns produced by convert_to_features.\n",
    "columns = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask']\n",
    "\n",
    "# Have both datasets return those columns as torch tensors.\n",
    "train_data.set_format(type='torch', columns=columns)\n",
    "test_data.set_format(type='torch', columns=columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d439da79",
   "metadata": {},
   "source": [
    "# Training Arguments and Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "f1cee70c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import Seq2SeqTrainer\n",
    "from transformers import Seq2SeqTrainingArguments\n",
    "import os\n",
    "\n",
    "# Training configuration: 5 epochs, batch size 16 per device for training and\n",
    "# evaluation, with evaluation and checkpointing once per epoch.\n",
    "training_args = Seq2SeqTrainingArguments(\n",
    "    output_dir=str(os.getcwd()),  # outputs go to the current working directory\n",
    "    per_device_train_batch_size=16,\n",
    "    num_train_epochs=5,\n",
    "    per_device_eval_batch_size=16,\n",
    "    predict_with_generate=True,\n",
    "    evaluation_strategy=\"epoch\",\n",
    "    do_train=True,\n",
    "    do_eval=True,\n",
    "    logging_steps=500,\n",
    "    save_strategy=\"epoch\",\n",
    "    # Step-based alternatives, disabled while the epoch strategies above are used:\n",
    "    #save_steps=1000,\n",
    "    #eval_steps=1000,\n",
    "    overwrite_output_dir=True,\n",
    "    save_total_limit=3,\n",
    "    load_best_model_at_end=True,\n",
    "    push_to_hub=True\n",
    "    # Mixed precision is left disabled:\n",
    "    #fp16=True, \n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "4ee61c54",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\vjmar\\AppData\\Local\\Temp\\ipykernel_29244\\418146841.py:3: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n",
      "  rouge = load_metric(\"rouge\")\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_metric\n",
    "\n",
    "# NOTE(review): load_metric is deprecated (see the FutureWarning in this cell's saved\n",
    "# output); moving to evaluate.load would require installing the `evaluate` package.\n",
    "rouge = load_metric(\"rouge\")\n",
    "\n",
    "def compute_metrics(pred):\n",
    "    \"\"\"Compute rounded ROUGE-2 precision, recall and F-measure for generated text.\n",
    "\n",
    "    Args:\n",
    "        pred: Object exposing ``predictions`` and ``label_ids`` arrays of token ids.\n",
    "\n",
    "    Returns:\n",
    "        dict with ``rouge2_precision``, ``rouge2_recall`` and ``rouge2_fmeasure``,\n",
    "        each rounded to 4 decimals.\n",
    "    \"\"\"\n",
    "    import numpy as np  # local import keeps this cell self-contained\n",
    "\n",
    "    pad_id = tokenizer.pad_token_id\n",
    "    # -100 marks ignored positions and cannot be decoded, so it is swapped for the\n",
    "    # pad token. np.where builds new arrays instead of assigning into\n",
    "    # pred.label_ids, which would silently modify the caller's data.\n",
    "    labels_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_id)\n",
    "    pred_ids = np.where(pred.predictions != -100, pred.predictions, pad_id)\n",
    "\n",
    "    # all unnecessary tokens are removed\n",
    "    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)\n",
    "    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)\n",
    "\n",
    "    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=[\"rouge2\"])[\"rouge2\"].mid\n",
    "\n",
    "    return {\n",
    "        \"rouge2_precision\": round(rouge_output.precision, 4),\n",
    "        \"rouge2_recall\": round(rouge_output.recall, 4),\n",
    "        \"rouge2_fmeasure\": round(rouge_output.fmeasure, 4),\n",
    "    }"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f6c0f580",
   "metadata": {},
   "source": [
    "# Define Trainer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b71acd7c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Cloning https://huggingface.co/vjt/T5Training into local empty directory.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "# Build the trainer; the tokenized test split serves as the evaluation set.\n",
    "trainer = Seq2SeqTrainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=train_data,\n",
    "    eval_dataset=test_data,\n",
    "    compute_metrics=compute_metrics,\n",
    ")\n",
    "\n",
    "# Evaluate once before fine-tuning, then train.\n",
    "trainer.evaluate()\n",
    "trainer.train()\n",
    "\n",
    "# Save the model and tokenizer, write the model card and push to the Hub.\n",
    "trainer.save_model()\n",
    "tokenizer.save_pretrained(os.getcwd())\n",
    "trainer.create_model_card()\n",
    "trainer.push_to_hub()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "76ca29ea",
   "metadata": {},
   "source": [
    "# Test Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d39e7e80",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from transformers import AutoTokenizer, T5ForConditionalGeneration\n",
    "\n",
    "# Paths are joined with os.path.join; `os.join` does not exist and raised\n",
    "# AttributeError.\n",
    "# NOTE(review): the training cells save the model and tokenizer to os.getcwd()\n",
    "# itself - confirm this sub-directory is where the fine-tuned checkpoint lives.\n",
    "CKPT = os.path.join(os.getcwd(), 't5-base-finetuned-wikisql')\n",
    "tokenizer = AutoTokenizer.from_pretrained(CKPT)\n",
    "model = T5ForConditionalGeneration.from_pretrained(CKPT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58f4258c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the raw test split: test_data was replaced by its tokenized form earlier,\n",
    "# while the cells below read the original 'question' and 'sql' fields.\n",
    "test_data = load_dataset('wikisql', split='test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ecb1ddde",
   "metadata": {},
   "outputs": [],
   "source": [
    "def translate_to_sql(text, max_length=64):\n",
    "    \"\"\"Generate a SQL translation for one prompt with the notebook-level model.\n",
    "\n",
    "    Args:\n",
    "        text: Prompt string passed to the tokenizer.\n",
    "        max_length: Token limit applied to the encoded prompt and to generation.\n",
    "\n",
    "    Returns:\n",
    "        The first generated sequence decoded to text, with special tokens skipped.\n",
    "    \"\"\"\n",
    "    # truncation=True is needed for max_length to actually cut the prompt.\n",
    "    # The tensors are moved to the model's device so generation also works when\n",
    "    # the model sits on a GPU.\n",
    "    inputs = tokenizer(text, padding='longest', truncation=True,\n",
    "                       max_length=max_length, return_tensors='pt').to(model.device)\n",
    "    output = model.generate(inputs.input_ids, attention_mask=inputs.attention_mask,\n",
    "                            max_length=max_length)\n",
    "\n",
    "    return tokenizer.decode(output[0], skip_special_tokens=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "506e28e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(0,100,10):\n",
    "  example = test_data[i]\n",
    "  # Build the prompt with format_dataset so it matches the training inputs, which\n",
    "  # carry the ' | table: ...' column list; the bare question alone does not.\n",
    "  prompt = format_dataset(example)['input']\n",
    "  print(prompt)\n",
    "  print('Predict. :' + translate_to_sql(prompt))\n",
    "  print('Expected: ' + example['sql']['human_readable'])\n",
    "  print('=================================\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18f1cdfe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The prompt uses the ' | table: ' separator that format_dataset puts into the\n",
    "# training inputs; the former 'Columns:' wording never occurred during training.\n",
    "text = \"translate to SQL: Which employee has the highest salary? | table: employee_id, name, year, parameters, engineer\"\n",
    "translate_to_sql(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8bd0a073",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}