datasets:
- kinokokoro/ichikara-instruction-003
language:
- ja
base_model:
- llm-jp/llm-jp-3-13b
elyza-tasks-100-TV_0.jsonl の回答モデルの作成のためのコードです。 サンプルコードに対して以下の変更を行いスコア改善を試みました。
- データセットを ichikara-instruction-003 の全てのファイルを利用するよう変更
- 学習率(learning_rate) を 2e-5へ変更
- 累積勾配(gradient_accumulation_steps) を 4 に変更
- LoRAのランク(LoraConfig r)を 32 に変更
自宅のPC(RTX3090) でコードを実行し、解答を出力しました。
import wandb
import os
# Credentials are read from the environment so that real secrets never have to
# be written into this script; the original placeholder string is kept as the
# fallback when a variable is not set.
WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "my-token")
wandb.login(key=WANDB_API_KEY)
wandb.init(project='llm2024-competition')
# Hugging Face access token, passed to push_to_hub at the end of the script.
HF_TOKEN = os.environ.get("HF_TOKEN", "my-token")
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
TrainingArguments,
logging,
)
from peft import (
LoraConfig,
PeftModel,
get_peft_model,
)
import os, torch, gc
from datasets import load_dataset
import bitsandbytes as bnb
from trl import SFTTrainer
# Seed reused by the train/test split and by the training arguments.
SEED_VALUE = 42

# Hub id of the base checkpoint, and the name given to the fine-tuned model.
base_model_id = "llm-jp/llm-jp-3-13b"
new_model_id = "llm-jp-3-13b-finetune"

# 4-bit quantization settings.  The original author notes that NF4 is more
# accurate than plain INT4 and suits the weight distribution of neural networks.
_quantization_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
bnb_config = BitsAndBytesConfig(**_quantization_options)

# Load the quantized base model onto the first CUDA device
# (the original had "auto" here as a commented-out alternative).
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="cuda:0",
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
def find_all_linear_names(model):
    """Return the leaf names of every 4-bit quantized linear layer in ``model``.

    Walks ``model.named_modules()``, keeps the last dot-separated component of
    the name of each module that is a ``bnb.nn.Linear4bit``, and drops
    ``'lm_head'`` from the result.  Names are collected in a set, so duplicates
    collapse and the order of the returned list carries no meaning.
    """
    linear_4bit = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, layer in model.named_modules()
        if isinstance(layer, linear_4bit)
    }
    # The original author notes that 'lm_head' must be excluded for 16-bit
    # computation, so it is never offered as a LoRA target.
    leaf_names.discard('lm_head')
    return list(leaf_names)
# Names of the 4-bit linear layers that will receive LoRA adapters.
modules = find_all_linear_names(model)

_lora_options = {
    "r": 32,  # LoRA rank; the original comment records 16 as the earlier value
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "target_modules": modules,
}
peft_config = LoraConfig(**_lora_options)

# Wrap the quantized base model with the LoRA adapters.
model = get_peft_model(model, peft_config)
from datasets import concatenate_datasets, DatasetDict

# Load every ichikara-instruction-003 file exactly once.
# The previous version loaded "ichikara-instruction-003-001-1.json" twice
# (as dataset0 and dataset1), which duplicated those examples in the training
# data and allowed identical rows to land on both sides of the train/test split
# performed further down.
DATA_DIR = "./Distribution20241221_all"
DATA_FILES = [
    "ichikara-instruction-003-001-1.json",
    "ichikara-instruction-003-001-2.2.json",
    "ichikara-instruction-003-001-5.2.json",
    "ichikara-instruction-003-001-2.1.json",
    "ichikara-instruction-003-001-5.1.json",
    "ichikara-instruction-003-002-1.json",
    "ichikara-instruction-003-003-1.json",
]

# Each JSON file is loaded on its own; load_dataset exposes it as a "train" split.
datasets_to_concatenate = [
    load_dataset("json", data_files=f"{DATA_DIR}/{file_name}")["train"]
    for file_name in DATA_FILES
]

concatenated_train_dataset = concatenate_datasets(datasets_to_concatenate)
dataset_all = DatasetDict({
    "train": concatenated_train_dataset
})

# Use the combined data from here on.
dataset = dataset_all
# Prompt template used when building the training samples.
prompt = """### 指示
{}
### 回答
{}"""

# End-of-sequence token of the tokenizer, appended to every training sample.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Render one record into the training prompt format.

    Reads ``examples["text"]`` (the instruction) and ``examples["output"]``
    (the answer), fills both into the module-level ``prompt`` template,
    appends ``EOS_TOKEN`` and returns the result under the new
    ``"formatted_text"`` key.
    """
    instruction = examples["text"]
    answer = examples["output"]
    return {"formatted_text": prompt.format(instruction, answer) + EOS_TOKEN}
# Apply the prompt formatting to every record, using 4 worker processes.
formatted_dataset = dataset.map(formatting_prompts_func, num_proc=4)

# Split the "train" data into train and test parts (10% goes to test),
# seeded so the split is reproducible.
dataset = formatted_dataset["train"].train_test_split(
    seed=SEED_VALUE,
    test_size=0.1,
)
# Training hyperparameters.  Notes of the form "default: X" carry over the
# original author's "def:" annotations.
_training_options = {
    "output_dir": new_model_id,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 4,  # default: 2
    "optim": "paged_adamw_32bit",
    "num_train_epochs": 1,  # default: 1
    "logging_strategy": "steps",
    "logging_steps": 10,
    "warmup_steps": 10,
    "save_steps": 100,
    "save_total_limit": 2,
    "max_steps": -1,  # default: -1
    "learning_rate": 2e-5,  # default: 5e-5
    "fp16": False,
    "bf16": False,
    "seed": SEED_VALUE,
    "group_by_length": True,
    "report_to": "wandb",
}
training_arguments = TrainingArguments(**_training_options)
# Supervised fine-tuning on the "formatted_text" field of the train split,
# without packing and with sequences capped at 512 tokens.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset["train"],
    dataset_text_field="formatted_text",
    max_seq_length=512,
    packing=False,
)

# Disable the cache feature before training.
model.config.use_cache = False

# Run the training.
trainer.train()
from datetime import datetime
# Capture the current local date and time.
now = datetime.now()
# Render it as text, e.g. "20241214_153045"; the value is printed here and
# reused in the name of the output file.
formatted_date = f"{now:%Y%m%d_%H%M%S}"
print(formatted_date)
# Load the task data.
# In the omnicampus development environment, drag and drop the task jsonl into
# the left pane before running this.
import json

datasets = []
# Read explicitly as UTF-8 (the same encoding used when the results are
# written) instead of relying on the platform default encoding.
with open("./elyza-tasks-100-TV_0.jsonl", "r", encoding="utf-8") as f:
    item = ""
    for line in f:
        # A record may span several physical lines: accumulate the stripped
        # lines until the buffer ends with "}" and only then parse it.
        line = line.strip()
        item += line
        if item.endswith("}"):
            datasets.append(json.loads(item))
            item = ""
# Run inference on every task with the fine-tuned model.
from tqdm import tqdm

# Training is expected to leave the model in training mode, in which the LoRA
# dropout (lora_dropout=0.05) is still applied.  Switch to eval mode so that
# greedy generation is deterministic and not perturbed by dropout.
model.eval()

results = []
for data in tqdm(datasets):
    # Named task_input so the builtin input() is not shadowed.
    task_input = data["input"]
    # Same layout as the training prompt, with the answer part left empty.
    prompt = f"""### 指示
{task_input}
### 回答
"""
    tokenized_input = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)
    # One unpadded sequence at a time, so every position is attended to.
    attention_mask = torch.ones_like(tokenized_input)
    with torch.no_grad():
        outputs = model.generate(
            tokenized_input,
            attention_mask=attention_mask,
            max_new_tokens=100,
            do_sample=False,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id
        )[0]
    # Decode only the newly generated tokens, skipping the prompt tokens.
    output = tokenizer.decode(outputs[tokenized_input.size(1):], skip_special_tokens=True)
    results.append({"task_id": data["task_id"], "input": task_input, "output": output})
# Submit the jsonl file generated here.
# The original note says this code also includes input and eval_aspect, which
# are optional; only task_id and output are required.
import re

# Strip everything up to and including the last "/" from the model name.
jsonl_id = re.sub(".*/", "", new_model_id)
output_path = f"./{jsonl_id}-outputs-{formatted_date}.jsonl"
with open(output_path, 'w', encoding='utf-8') as f:
    for result in results:
        # ensure_ascii=False writes non-ASCII characters as-is instead of escaping them.
        f.write(json.dumps(result, ensure_ascii=False))
        f.write('\n')
# Upload the model and then the tokenizer to Hugging Face as a private
# repository (online saving).
for _uploadable in (model, tokenizer):
    _uploadable.push_to_hub(new_model_id, token=HF_TOKEN, private=True)
library_name: transformers
Model Card for Model ID
Model Details
Model Description
This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
- Developed by: [More Information Needed]
- Funded by [optional]: [More Information Needed]
- Shared by [optional]: [More Information Needed]
- Model type: [More Information Needed]
- Language(s) (NLP): [More Information Needed]
- License: [More Information Needed]
- Finetuned from model [optional]: [More Information Needed]
Model Sources [optional]
- Repository: [More Information Needed]
- Paper [optional]: [More Information Needed]
- Demo [optional]: [More Information Needed]
Uses
Direct Use
[More Information Needed]
Downstream Use [optional]
[More Information Needed]
Out-of-Scope Use
[More Information Needed]
Bias, Risks, and Limitations
[More Information Needed]
Recommendations
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
How to Get Started with the Model
Use the code below to get started with the model.
[More Information Needed]
Training Details
Training Data
[More Information Needed]
Training Procedure
Preprocessing [optional]
[More Information Needed]
Training Hyperparameters
- Training regime: [More Information Needed]
Speeds, Sizes, Times [optional]
[More Information Needed]
Evaluation
Testing Data, Factors & Metrics
Testing Data
[More Information Needed]
Factors
[More Information Needed]
Metrics
[More Information Needed]
Results
[More Information Needed]
Summary
Model Examination [optional]
[More Information Needed]
Environmental Impact
Carbon emissions can be estimated using the Machine Learning Impact calculator presented in Lacoste et al. (2019).
- Hardware Type: [More Information Needed]
- Hours used: [More Information Needed]
- Cloud Provider: [More Information Needed]
- Compute Region: [More Information Needed]
- Carbon Emitted: [More Information Needed]
Technical Specifications [optional]
Model Architecture and Objective
[More Information Needed]
Compute Infrastructure
[More Information Needed]
Hardware
[More Information Needed]
Software
[More Information Needed]
Citation [optional]
BibTeX:
[More Information Needed]
APA:
[More Information Needed]
Glossary [optional]
[More Information Needed]
More Information [optional]
[More Information Needed]
Model Card Authors [optional]
[More Information Needed]
Model Card Contact
[More Information Needed]