このnotebookは、`stockmark/gpt-neox-japanese-1.4b`のモデルを`kunishou/databricks-dolly-15k-ja`のデータセットを用いてLoRA tuningするためのコード例です。以下の例では、学習を1 epoch行います。T4 GPUで実行すると30分ほどかかります。

- モデル:https://huggingface.co/stockmark/gpt-neox-japanese-1.4b
- データ:https://github.com/kunishou/databricks-dolly-15k-ja


また、ここで用いている設定は暫定的なもので、必要に応じて調整してください。

# ライブラリのインストール

In [None]:
# Upgrade pip itself first, then install the libraries this notebook imports.
# No versions are pinned, so whatever releases are current get installed.
!python3 -m pip install -U pip
!python3 -m pip install transformers accelerate datasets peft

# 準備

In [None]:
import torch
import datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType, PeftModel, PeftConfig

# Identifier of the base model that gets LoRA-tuned.
model_name = "stockmark/gpt-neox-japanese-1.4b"
# Name under which the trained adapter is saved and later reloaded.
peft_model_name = "peft_model"

# Prompt layout shared by training (`encode`) and inference. The text ends
# right after the "### Response:" header line so the answer follows directly.
prompt_template = (
    "### Instruction:\n"
    "{instruction}\n"
    "\n"
    "### Input:\n"
    "{input}\n"
    "\n"
    "### Response:\n"
)

def encode(sample):
    """Tokenize one dataset record into ``input_ids`` and ``labels``.

    The prompt is rendered from the record's ``instruction`` and ``input``
    fields; the target is the record's ``output`` followed by the tokenizer's
    EOS token. Uses the module-level ``tokenizer`` and ``prompt_template``.

    Args:
        sample: Mapping with ``instruction``, ``input`` and ``output`` strings.

    Returns:
        A dict with ``input_ids`` (prompt ids followed by target ids) and
        ``labels`` (same length, with every prompt position set to -100 so
        only the target tokens contribute to the loss).
    """
    prompt_text = prompt_template.format(
        instruction=sample["instruction"],
        input=sample["input"],
    )
    target_text = sample["output"] + tokenizer.eos_token

    # Tokenize prompt and target in one call, but keep their ids separate so
    # the prompt length is known for label masking.
    prompt_ids, target_ids = tokenizer([prompt_text, target_text]).input_ids

    return {
        "input_ids": prompt_ids + target_ids,
        "labels": [-100] * len(prompt_ids) + target_ids,
    }

def get_collator(tokenizer, max_length):
    """Create a collate function that truncates, pads and tensorizes samples.

    Args:
        tokenizer: Object exposing ``pad(batch, padding=True)``. Its
            ``padding_side`` attribute ("left" or "right"; treated as
            "right" when missing) decides on which side the labels are
            padded, so they stay aligned with the padded ``input_ids``.
        max_length: Maximum number of items kept per field of each sample.

    Returns:
        A function that takes a list of samples (dicts of lists that include
        ``input_ids`` and ``labels``) and returns a dict of ``torch.Tensor``.
    """

    def collator(batch):
        # Cut every field of every sample down to at most max_length items.
        truncated = [
            {key: value[:max_length] for key, value in sample.items()}
            for sample in batch
        ]
        padded = tokenizer.pad(truncated, padding=True)

        # Labels are padded by hand up to the padded input width, using -100
        # so padding positions are ignored by the loss.
        width = len(padded["input_ids"][0])
        # Read the side at call time so later changes on the tokenizer apply.
        pad_left = getattr(tokenizer, "padding_side", "right") == "left"

        aligned_labels = []
        for label in padded["labels"]:
            filler = [-100] * (width - len(label))
            # Pad on the same side as the tokenizer; otherwise labels would
            # be shifted relative to input_ids for left-padding tokenizers.
            aligned_labels.append(filler + label if pad_left else label + filler)
        padded["labels"] = aligned_labels

        return {key: torch.tensor(value) for key, value in padded.items()}

    return collator


# データセットとモデルの準備


In [None]:
# --- Dataset -----------------------------------------------------------------
# The tokenizer must exist before `encode` runs, because `encode` reads it as
# a global.
tokenizer = AutoTokenizer.from_pretrained(model_name)

dataset_name = "kunishou/databricks-dolly-15k-ja"
# Load, tokenize every record with `encode`, then hold out 20% of the "train"
# split for validation.
dataset = (
    datasets.load_dataset(dataset_name)
    .map(encode)["train"]
    .train_test_split(test_size=0.2)
)
train_dataset, val_dataset = dataset["train"], dataset["test"]

# --- Model -------------------------------------------------------------------
# Base model in float16, placed entirely on device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map={"": 0},
)

# LoRA adapter settings applied to the "query_key_value" modules.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["query_key_value"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    inference_mode=False,
)

# Wrap the base model with the adapter and report the trainable parameters.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# LoRA tuning

In [None]:
import inspect

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword is deprecated. The install cell does not pin a
# version, so pick whichever keyword the installed TrainingArguments accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in _training_arg_names
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./train_results",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    # 4 samples per step x 4 accumulation steps per optimizer update.
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    logging_strategy="steps",
    logging_steps=10,
    # Save and evaluate on the same schedule; load_best_model_at_end needs
    # the two strategies to match.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    save_total_limit=2,
    **{eval_strategy_kwarg: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Samples are truncated to 512 tokens and padded per batch.
    data_collator=get_collator(tokenizer, 512),
)

trainer.train()

# Keep the model held by the trainer and save it under peft_model_name.
model = trainer.model
model.save_pretrained(peft_model_name)

# 学習したモデルのロード

In [None]:
# Rebuild the tokenizer and the float16 base model on device 0, then attach
# the adapter that was saved under peft_model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map={"": 0},
    ),
    peft_model_name,
)

# 推論

In [None]:
# Render the same prompt layout used during training, with an empty input.
prompt = prompt_template.format(instruction="日本で人気のスポーツは?", input="")

# Tokenize to PyTorch tensors and move them to the model's device.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generation needs no gradients.
with torch.no_grad():
    tokens = model.generate(**inputs, repetition_penalty=1.1, max_new_tokens=128)

# Decode the first returned sequence, dropping special tokens.
output = tokenizer.decode(tokens[0], skip_special_tokens=True)
print(output)