{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "The primary codes below are based on [akpe12/JP-KR-ocr-translator-for-travel](https://github.com/akpe12/JP-KR-ocr-translator-for-travel)." ] }, { "cell_type": "markdown", "metadata": { "id": "TrHlPFqwFAgj" }, "source": [ "## Import" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "t-jXeSJKE1WM" }, "outputs": [], "source": [ "from typing import Dict, List\n", "import csv\n", "\n", "import datasets\n", "import torch\n", "from transformers import (\n", " PreTrainedTokenizerFast,\n", " AutoTokenizer,\n", " DataCollatorForSeq2Seq,\n", " Seq2SeqTrainingArguments,\n", " Trainer\n", ")\n", "from transformers.models.encoder_decoder.modeling_encoder_decoder import EncoderDecoderModel\n", "\n", "from datasets import load_dataset\n", "\n", "import os\n", "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", "# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"2\"\n", "\n", "# encoder_model_name = \"xlm-roberta-base\"\n", "encoder_model_name = \"cl-tohoku/bert-base-japanese-v2\"\n", "decoder_model_name = \"skt/kogpt2-base-v2\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "nEW5trBtbykK" }, "outputs": [], "source": [ "# device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", "# # device = torch.device(\"cpu\")\n", "# torch.cuda.set_device(device)\n", "# device, torch.cuda.device_count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5ic7pUUBFU_v" }, "outputs": [], "source": [ "class GPT2Tokenizer(PreTrainedTokenizerFast):\n", " def build_inputs_with_special_tokens(self, token_ids: List[int]) -> List[int]:\n", " return token_ids + [self.eos_token_id] \n", "\n", "src_tokenizer = AutoTokenizer.from_pretrained(encoder_model_name, use_fast=False)\n", "trg_tokenizer = GPT2Tokenizer.from_pretrained(decoder_model_name, use_fast=False,\n", " bos_token='', eos_token='', unk_token='', pad_token='', mask_token='')" ] }, { "cell_type": "markdown", "metadata": { "id": "DTf4U1fmFQFh" }, "source": [ "## Data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "65L4O1c5FLKt" }, "outputs": [], "source": [ "class PairedDataset:\n", " def __init__(self, \n", " source_tokenizer: AutoTokenizer, target_tokenizer: GPT2Tokenizer,\n", " file_path: str = None,\n", " dataset_raw: datasets.Dataset = None\n", " ):\n", " self.src_tokenizer = source_tokenizer\n", " self.trg_tokenizer = target_tokenizer\n", " \n", " if file_path is not None:\n", " with open(file_path, 'r', encoding=\"utf-8\") as fd:\n", " reader = csv.reader(fd)\n", " next(reader)\n", " self.data = [row for row in reader]\n", " elif dataset_raw is not None:\n", " self.data = dataset_raw\n", " else:\n", " raise ValueError('file_path or dataset_raw must be specified')\n", "\n", " def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:\n", "# with open('train_log.txt', 'a+') as log_file:\n", "# log_file.write(f'reading data[{index}] {self.data[index]}\\n')\n", " if isinstance(self.data, datasets.Dataset):\n", " src, trg = self.data[index]['sourceString'], self.data[index]['targetString']\n", " else:\n", " src, trg = self.data[index]\n", " embeddings = self.src_tokenizer(src, return_attention_mask=False, return_token_type_ids=False)\n", " embeddings['labels'] = self.trg_tokenizer.build_inputs_with_special_tokens(self.trg_tokenizer(trg, return_attention_mask=False)['input_ids'])\n", "\n", " return embeddings\n", "\n", " def __len__(self):\n", " return 
len(self.data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# DATASET_TARGET = \"TATOEBA_2023\"\n", "# DATASET_TARGET = \"FFAC\"\n", "DATASET_TARGET = \"AIHUB\"\n", "\n", "if (DATASET_TARGET == \"TATOEBA_2023\"):\n", " # dataset = load_dataset(\"sappho192/Tatoeba-Challenge-jpn-kor\")\n", " dataset = load_dataset(\"/home/akalive/dataset/Tatoeba-Challenge-jpn-kor\")\n", "\n", " train_dataset = dataset['train']\n", " test_dataset = dataset['test']\n", "\n", " train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, dataset_raw=train_dataset)\n", " eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, dataset_raw=test_dataset)\n", "elif (DATASET_TARGET == \"FFAC\"):\n", " DATA_ROOT = '/home/akalive/dataset/ffac/output'\n", " FILE_FFAC_FULL = 'ffac_full.csv'\n", " FILE_FFAC_TEST = 'ffac_test.csv'\n", " FILE_JA_KO_TRAIN = 'tteb_train.csv'\n", " FILE_JA_KO_TEST = 'tteb_test.csv'\n", "\n", " # train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_FFAC_FULL}')\n", " # eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_FFAC_TEST}') \n", "\n", " train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_JA_KO_TRAIN}')\n", " eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_JA_KO_TEST}')\n", "elif (DATASET_TARGET == \"AIHUB\"):\n", " # AIHUB dataset spent 25~33GB of VRAM with batch_size=30 while training.\n", " DATA_ROOT = '/home/akalive/dataset/jkpair/data'\n", " FILE_TRAIN = 'train.csv'\n", " FILE_VAL = 'validation.csv'\n", "\n", " train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_TRAIN}')\n", " eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_VAL}')\n", "\n", "train_first_row = train_dataset[0]\n", "eval_first_row = eval_dataset[0]\n", "\n", "print(train_first_row)\n", "print(eval_first_row)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(train_dataset)\n", "train_dataset[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# be sure to check the column count of each dataset if you encounter \"ValueError: too many values to unpack (expected 2)\"\n", "# at the `src, trg = self.data[index]`\n", "# The `cat ffac_full.csv tteb_train.csv > ja_ko_train.csv` command may be the reason.\n", "# the last row of first csv and first row of second csv is merged and that's why 3rd column is created (which arouse ValueError)\n", "# debug_data = train_dataset.data\n" ] }, { "cell_type": "markdown", "metadata": { "id": "uCBiLouSFiZY" }, "source": [ "## Model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "I7uFbFYJFje8" }, "outputs": [], "source": [ "model = EncoderDecoderModel.from_encoder_decoder_pretrained(\n", " encoder_model_name,\n", " decoder_model_name,\n", " pad_token_id=trg_tokenizer.bos_token_id,\n", ")\n", "model.config.decoder_start_token_id = trg_tokenizer.bos_token_id" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class CustomTrainingArguments(Seq2SeqTrainingArguments):\n", " def __init__(self,*args, **kwargs):\n", " super(CustomTrainingArguments, self).__init__(*args, **kwargs)\n", "\n", " @property\n", " def device(self) -> \"torch.device\":\n", " \"\"\"\n", " The device used by this process.\n", " Name the device the 
number you use.\n", " \"\"\"\n", " return torch.device(\"cuda:0\")\n", "\n", " @property\n", " def n_gpu(self):\n", " \"\"\"\n", " The number of GPUs used by this process.\n", " Note:\n", " This will only be greater than one when you have multiple GPUs available but are not using distributed\n", " training. For distributed training, it will always be 1.\n", " \"\"\"\n", " # Make sure `self._n_gpu` is properly setup.\n", " # _ = self._setup_devices\n", " # I set to one manullay\n", " self._n_gpu = 1\n", " return self._n_gpu\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "YFq2GyOAUV0W" }, "outputs": [], "source": [ "# for Trainer\n", "import wandb\n", "\n", "collate_fn = DataCollatorForSeq2Seq(src_tokenizer, model)\n", "wandb.init(project=\"aihub-gt-2023\", name='jbert+kogpt2')\n", "\n", "arguments = Seq2SeqTrainingArguments(\n", "# arguments = CustomTrainingArguments(\n", " output_dir='dump',\n", " do_train=True,\n", " do_eval=True,\n", " evaluation_strategy=\"epoch\",\n", " save_strategy=\"epoch\",\n", " num_train_epochs=5, # for 40GB\n", " # num_train_epochs=25,\n", " # per_device_train_batch_size=15,\n", " per_device_train_batch_size=30, # takes 40GB\n", " # per_device_eval_batch_size=10,\n", " per_device_eval_batch_size=10,\n", " warmup_ratio=0.1,\n", " gradient_accumulation_steps=4,\n", " save_total_limit=5,\n", " dataloader_num_workers=1,\n", " fp16=True, # ENABLE if CUDA is enabled\n", " load_best_model_at_end=True,\n", " report_to='wandb'\n", ")\n", "\n", "trainer = Trainer(\n", " model,\n", " arguments,\n", " data_collator=collate_fn,\n", " train_dataset=train_dataset,\n", " eval_dataset=eval_dataset\n", ")" ] }, { "cell_type": "markdown", "metadata": { "id": "pPsjDHO5Vc3y" }, "source": [ "## Training" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_T4P4XunmK-C" }, "outputs": [], "source": [ "# model = EncoderDecoderModel.from_encoder_decoder_pretrained(\"xlm-roberta-base\", \"skt/kogpt2-base-v2\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "7vTqAgW6Ve3J" }, "outputs": [], "source": [ "trainer.train()\n", "\n", "model.save_pretrained(\"dump/best_model\")\n", "src_tokenizer.save_pretrained(\"dump/best_model/src_tokenizer\")\n", "trg_tokenizer.save_pretrained(\"dump/best_model/trg_tokenizer\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# import wandb\n", "wandb.finish()" ] } ], "metadata": { "colab": { "machine_shape": "hm", "provenance": [] }, "gpuClass": "premium", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 0 }