The code below is primarily based on [akpe12/JP-KR-ocr-translator-for-travel](https://github.com/akpe12/JP-KR-ocr-translator-for-travel).

## Import

In [None]:
from typing import Dict, List
import csv

import datasets
import torch
from transformers import (
 PreTrainedTokenizerFast,
 AutoTokenizer,
 DataCollatorForSeq2Seq,
 Seq2SeqTrainingArguments,
 Trainer
)
from transformers.models.encoder_decoder.modeling_encoder_decoder import EncoderDecoderModel

from datasets import load_dataset

import os
# Disable the tokenizers library's own parallelism for this process
# (presumably to avoid its fork warning when DataLoader workers are used — TODO confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Uncomment to expose only one specific GPU to this process.
# os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# Checkpoint names later passed to the encoder-decoder model and the tokenizers.
# Alternative encoder checkpoint, kept for reference:
# encoder_model_name = "xlm-roberta-base"
encoder_model_name = "cl-tohoku/bert-base-japanese-v2"
decoder_model_name = "skt/kogpt2-base-v2"

In [None]:
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# # device = torch.device("cpu")
# torch.cuda.set_device(device)
# device, torch.cuda.device_count()

In [None]:
class GPT2Tokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer whose special-token builder appends EOS to a sequence."""

    def build_inputs_with_special_tokens(self, token_ids: List[int]) -> List[int]:
        """Return a new list holding ``token_ids`` followed by the EOS token id.

        The input list itself is left unmodified.
        """
        eos_suffix = [self.eos_token_id]
        return token_ids + eos_suffix

# Source-side tokenizer, loaded from the encoder checkpoint.
src_tokenizer = AutoTokenizer.from_pretrained(encoder_model_name, use_fast=False)

# Target-side tokenizer, loaded from the decoder checkpoint.
# The special tokens follow the skt/kogpt2-base-v2 model card; note that BOS
# and EOS intentionally share '</s>'. Empty strings here would leave the
# tokenizer without usable BOS/EOS/PAD ids.
trg_tokenizer = GPT2Tokenizer.from_pretrained(
    decoder_model_name,
    use_fast=False,
    bos_token='</s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
)

## Data

In [None]:
class PairedDataset:
    """Map-style dataset of (source, target) sentence pairs, tokenized on access.

    Rows come either from a CSV file (its first row is skipped as a header and
    every remaining row must hold exactly two columns: source, target) or from
    a ``datasets.Dataset`` exposing ``sourceString`` and ``targetString``.
    """

    def __init__(self,
                 source_tokenizer: AutoTokenizer, target_tokenizer: GPT2Tokenizer,
                 file_path: str = None,
                 dataset_raw: datasets.Dataset = None
                 ):
        """Store the tokenizers and load the raw rows.

        Args:
            source_tokenizer: Tokenizer applied to the source sentence.
            target_tokenizer: Tokenizer applied to the target sentence; its
                ``build_inputs_with_special_tokens`` is used to build labels.
            file_path: Path of a UTF-8 CSV file. Takes precedence over
                ``dataset_raw`` when both are given.
            dataset_raw: Already loaded dataset, used when ``file_path`` is None.

        Raises:
            ValueError: If neither ``file_path`` nor ``dataset_raw`` is given.
        """
        self.src_tokenizer = source_tokenizer
        self.trg_tokenizer = target_tokenizer

        if file_path is not None:
            # newline='' lets the csv module do its own line-ending handling,
            # so quoted fields containing newlines are parsed correctly.
            with open(file_path, 'r', encoding="utf-8", newline='') as fd:
                reader = csv.reader(fd)
                # Skip the header row; the default keeps an empty file from
                # raising StopIteration.
                next(reader, None)
                self.data = list(reader)
        elif dataset_raw is not None:
            self.data = dataset_raw
        else:
            raise ValueError('file_path or dataset_raw must be specified')

    def __getitem__(self, index: int) -> Dict[str, List[int]]:
        """Tokenize and return the pair stored at ``index``.

        Returns:
            The source tokenizer's (dict-like) encoding of the source sentence,
            extended with a ``labels`` entry holding the target token ids as
            produced by the target tokenizer's
            ``build_inputs_with_special_tokens``.

        Raises:
            ValueError: If a non-``datasets.Dataset`` row does not unpack into
                exactly two values.
        """
        # Fetch the row once; it is needed for both fields.
        row = self.data[index]
        if isinstance(self.data, datasets.Dataset):
            src, trg = row['sourceString'], row['targetString']
        else:
            try:
                src, trg = row
            except ValueError as err:
                # Typical cause: CSV files concatenated without a newline in
                # between, which merges two rows into one with extra columns.
                raise ValueError(
                    f'data[{index}] must have exactly 2 columns (source, target), got {row!r}'
                ) from err

        encoded = self.src_tokenizer(src, return_attention_mask=False, return_token_type_ids=False)
        target_ids = self.trg_tokenizer(trg, return_attention_mask=False)['input_ids']
        encoded['labels'] = self.trg_tokenizer.build_inputs_with_special_tokens(target_ids)

        return encoded

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.data)

In [None]:
# Choose the corpus to train on by leaving exactly one assignment active.
# DATASET_TARGET = "TATOEBA_2023"
# DATASET_TARGET = "FFAC"
DATASET_TARGET = "AIHUB"

if DATASET_TARGET == "TATOEBA_2023":
    # Hub alternative to the local copy:
    # dataset = load_dataset("sappho192/Tatoeba-Challenge-jpn-kor")
    dataset = load_dataset("/home/akalive/dataset/Tatoeba-Challenge-jpn-kor")

    train_dataset = dataset['train']
    test_dataset = dataset['test']

    train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, dataset_raw=train_dataset)
    eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, dataset_raw=test_dataset)
elif DATASET_TARGET == "FFAC":
    DATA_ROOT = '/home/akalive/dataset/ffac/output'
    FILE_FFAC_FULL = 'ffac_full.csv'
    FILE_FFAC_TEST = 'ffac_test.csv'
    FILE_JA_KO_TRAIN = 'tteb_train.csv'
    FILE_JA_KO_TEST = 'tteb_test.csv'

    # Alternative split using the FFAC-only files:
    # train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_FFAC_FULL}')
    # eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_FFAC_TEST}')

    train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_JA_KO_TRAIN}')
    eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_JA_KO_TEST}')
elif DATASET_TARGET == "AIHUB":
    # AIHUB dataset spent 25~33GB of VRAM with batch_size=30 while training.
    DATA_ROOT = '/home/akalive/dataset/jkpair/data'
    FILE_TRAIN = 'train.csv'
    FILE_VAL = 'validation.csv'

    train_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_TRAIN}')
    eval_dataset = PairedDataset(src_tokenizer, trg_tokenizer, file_path=f'{DATA_ROOT}/{FILE_VAL}')
else:
    # Fail here with a clear message instead of a NameError on train_dataset below.
    raise ValueError(f'Unknown DATASET_TARGET: {DATASET_TARGET!r}')

# Tokenize one example from each split as a quick sanity check.
train_first_row = train_dataset[0]
eval_first_row = eval_dataset[0]

print(train_first_row)
print(eval_first_row)

In [None]:
# Quick inspection: print the dataset object, then evaluate its first item
# (shown as the cell output when run in a notebook).
print(train_dataset)
train_dataset[0]

In [None]:
# If you encounter "ValueError: too many values to unpack (expected 2)" when a row is
# unpacked into (src, trg) in `PairedDataset.__getitem__`, check the column count of every row in the dataset.
# A likely cause is concatenating files with `cat ffac_full.csv tteb_train.csv > ja_ko_train.csv`:
# the last row of the first CSV and the first row of the second CSV get merged (e.g. when the first file has no trailing newline), which creates a third column and raises the ValueError.
# debug_data = train_dataset.data


## Model

In [None]:
# The target tokenizer's BOS id is used both as the pad id handed to the
# combined model and as the token id that starts decoding.
decoder_bos_id = trg_tokenizer.bos_token_id

model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_model_name,
    decoder_model_name,
    pad_token_id=decoder_bos_id,
)
model.config.decoder_start_token_id = decoder_bos_id

In [None]:
class CustomTrainingArguments(Seq2SeqTrainingArguments):
    """Seq2seq training arguments with a hard-coded device and GPU count.

    ``device`` and ``n_gpu`` are overridden with fixed values instead of the
    ones the parent class would provide.
    """

    def __init__(self, *args, **kwargs):
        # Plain pass-through: every argument goes to the parent constructor.
        super().__init__(*args, **kwargs)

    @property
    def device(self) -> "torch.device":
        """
        Device used by this process: always ``cuda:0``.
        Change the index in the string to target a different GPU.
        """
        return torch.device("cuda:0")

    @property
    def n_gpu(self):
        """
        Number of GPUs used by this process, forced to 1.
        Note:
            The parent's device setup is deliberately not consulted here;
            ``_n_gpu`` is overwritten with 1 on every access.
        """
        # Set manually instead of going through `self._setup_devices`.
        self._n_gpu = 1
        return self._n_gpu


In [None]:
# for Trainer
# for Trainer
import wandb

# The collator gets the source tokenizer (used for padding) and the model.
collate_fn = DataCollatorForSeq2Seq(src_tokenizer, model)
wandb.init(project="aihub-gt-2023", name='jbert+kogpt2')

# NOTE: swap in CustomTrainingArguments(...) below to pin the device manually.
arguments = Seq2SeqTrainingArguments(
    output_dir='dump',
    do_train=True,
    do_eval=True,
    # Evaluate and checkpoint on the same schedule, as load_best_model_at_end needs.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,  # for 40GB (alternative tried: 25)
    per_device_train_batch_size=30,  # takes 40GB (alternative tried: 15)
    per_device_eval_batch_size=10,
    warmup_ratio=0.1,
    gradient_accumulation_steps=4,
    save_total_limit=5,
    dataloader_num_workers=1,
    # Mixed precision only when CUDA is actually present, so the same cell
    # also runs on a CPU-only machine instead of failing on fp16.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    report_to='wandb'
)

trainer = Trainer(
    model,
    arguments,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

## Training

In [None]:
# model = EncoderDecoderModel.from_encoder_decoder_pretrained("xlm-roberta-base", "skt/kogpt2-base-v2")

In [None]:
# Run training, then write the model and both tokenizers under one directory.
trainer.train()

BEST_MODEL_DIR = "dump/best_model"
model.save_pretrained(BEST_MODEL_DIR)
src_tokenizer.save_pretrained(f"{BEST_MODEL_DIR}/src_tokenizer")
trg_tokenizer.save_pretrained(f"{BEST_MODEL_DIR}/trg_tokenizer")

In [None]:
# import wandb
# Finish the Weights & Biases run that was started with wandb.init().
wandb.finish()