# File size: 4,596 Bytes
# 9b12a3c
from datasets import Dataset
import os
import torch
import torch.nn as nn
import datasets
from datasets import Dataset
import bitsandbytes as bb
from transformers import AutoTokenizer, LlamaForCausalLM, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
import json
from peft import LoraConfig, get_peft_model
# Set TOKENIZERS_PARALLELISM to "false" before any tokenizer is created.
# NOTE(review): presumably this silences the tokenizers fork/parallelism warning - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load the training examples (a JSON list) and wrap them in a Hugging Face Dataset.
# The file is opened in a `with` block so the handle is closed as soon as the
# JSON has been parsed, and it is read explicitly as UTF-8 so the result does
# not depend on the platform's default encoding.
with open('formatted_training_examples.json', 'r', encoding='utf-8') as examples_file:
    dataset = Dataset.from_list(json.load(examples_file))
print(dataset,"\n\n\n")
# Sort datasets by length so that if longer examples cause memory issues, it'll happen first, and we can fix it without wasting time
# dataset = dataset.map(lambda example: {"text": example["text"], "length": len(example["text"])})
# dataset = dataset.sort("length", reverse=True)
# Tokenizer of the base model, configured to pad on the right-hand side.
tokenizer = AutoTokenizer.from_pretrained(
    "Gryphe/MythoLogic-Mini-7b",
    padding_side="right",
    max_length=4000,
)
# Do NOT register a new "[PAD]" special token here, i.e.
#   tokenizer.add_special_tokens({"pad_token": "[PAD]"})
# It breaks the embedding and causes a hard-to-fix error.
# Padding reuses the EOS token id instead.
tokenizer.pad_token_id = tokenizer.eos_token_id
# PROBLEM (before I stop for today):
# 1. The response template isn't being found in the training examples; the "response key" is being looked for in the labels, but isn't being found (I checked myself, it isn't there)
# 2. The response template, tokenized, for some reason is not the same as the response key. I don't know if it should be. Look into the TRL library, maybe consult with some smart people.
# So in summary: code looks for response key in token ids tensor (I think labels, because I see ignore ids -100). It doesn't find any and errors.
# Check the datacollator code vs the train.py of the working code to see if I can find an error there.
# Append the tokenizer's EOS token to the text of every training example.
dataset = dataset.map(lambda example: {"text": example["text"] + tokenizer.eos_token})

# Experimental (not used in the initial run): replace every spelling of
# "Shirogane" / "Takeru" with the literal placeholder "{user}".
# The variants are applied most-specific first: "Sh-Shirogane" contains
# "Shirogane", so replacing the plain name first would leave "Sh-{user}"
# behind and the stuttered variant would never be matched.
_NAME_VARIANTS = ("Sh-Shirogane", "Shirooogaaane", "Shirogane", "Takeru")


def _replace_names(text):
    """Return *text* with every entry of _NAME_VARIANTS replaced by "{user}"."""
    for variant in _NAME_VARIANTS:
        text = text.replace(variant, "{user}")
    return text


dataset = dataset.map(lambda example: {"text": _replace_names(example["text"])})

# Split off 5% of the examples as the "test" split, then show the resulting
# splits and the first training example.
dataset = dataset.train_test_split(test_size=0.05)
print(dataset)
print(dataset["train"][0]["text"])
# Model time!
print("Piggy")

# Token ids handed to the completion-only collator as its response template.
# NOTE(review): presumably these are the ids of the "### Response:" marker
# under this tokenizer - confirm by tokenizing a training example.
response_template = [2277, 29937, 13291, 29901]

# Debug output: the template's type and contents between two separator banners.
print("\n\n\n====================\n\n\n")
print(type(response_template), response_template)
print("\n\n\n====================\n\n\n")

# Collator used for completion-only training; it is only active when it is
# also passed to the SFTTrainer as `data_collator`.
# The remaining known problem is OOM, to be solved by running on vast.ai.
# No instruction_template is given: with a response template it is probably
# not needed (candidate was "You are an expert roleplaying model").
collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    response_template=response_template,
    mlm=False,
)
# bitsandbytes settings used to load the base model in 4-bit, with double
# quantization and float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    # This keyword was previously misspelled "llm_int8_enable_fp32_cpu_offloat",
    # which is not a BitsAndBytesConfig option, so the setting never took effect.
    llm_int8_enable_fp32_cpu_offload=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the base Llama checkpoint with the quantization settings above and let
# device_map="auto" decide where the modules are placed.
base_model = LlamaForCausalLM.from_pretrained(
    "Gryphe/MythoLogic-Mini-7b",
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
)
# LoRA adapter settings: rank 16, alpha 32, dropout 0.05, no bias terms,
# targeting the q/k/v/o projection modules.
lora_config = LoraConfig(
    # Must be spelled in CAPS; specifying it correctly solved the weird index issue.
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # "rotary_emb" is left out for now (unclear what it is); maybe try it later.
    ],
)

# Wrap the base model with the LoRA adapters and report the trainable parameters.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()
# Sometimes prevents an error (the reason is not understood yet).
model.enable_input_require_grads()
# Trainer hyper-parameters: 3 epochs at a learning rate of 1e-4, fp16, gradient
# checkpointing, 16 gradient-accumulation steps, results under "outputs".
# NOTE(review): only the *eval* per-device batch size is set here; the train
# batch size falls back to the library default - confirm that is intended,
# given the OOM problems mentioned elsewhere in this script.
training_args = TrainingArguments(
    output_dir="outputs",
    num_train_epochs=3,
    learning_rate=1e-4,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    fp16=True,
    # logging_steps=1,
)
import transformers
# Supervised fine-tuning trainer over the "text" field of the train/test
# splits, using the completion-only collator.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    # Alternative collator that was tried here instead of `collator`:
    #   transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
    data_collator=collator,
    max_seq_length=4000,
    dataset_text_field="text",
)

# Run training, then save the resulting model under "MythoChizuru-7b".
# (A stray trailing "|" after the save call, which made the statement a
# syntax error, has been removed.)
trainer.train()
trainer.save_model("MythoChizuru-7b")