from datasets import Dataset
import os
import torch
import torch.nn as nn
import datasets
import bitsandbytes as bb
from transformers import AutoTokenizer, LlamaForCausalLM, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
import json
from peft import LoraConfig, get_peft_model

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load dataset and convert to a Hugging Face Dataset
dataset = Dataset.from_list(json.load(open('formatted_training_examples.json', 'r')))
print(dataset, "\n\n\n")

# Sort the dataset by length so that if longer examples cause memory issues, they fail first and we can fix it without wasting time
# dataset = dataset.map(lambda example: {"text": example["text"], "length": len(example["text"])})
# dataset = dataset.sort("length", reverse=True)

tokenizer = AutoTokenizer.from_pretrained("Gryphe/MythoLogic-Mini-7b", max_length=4000, padding_side="right")
# tokenizer.add_special_tokens({"pad_token": "[PAD]"})  # Note: do not do this, it will break the embedding and cause a hard-to-fix error
tokenizer.pad_token_id = tokenizer.eos_token_id

# PROBLEM (before I stop for today):
# 1. The response template isn't being found in the training examples; the "response key" is being looked for in the labels, but isn't found (I checked myself, it isn't there).
# 2. The response template, tokenized, for some reason is not the same as the response key. I don't know if it should be. Look into the TRL library, maybe consult with some smart people.
# So in summary: the code looks for the response key in a token-id tensor (I think the labels, because I see ignore ids of -100). It doesn't find any and errors out.
# Check the data collator code against the train.py of the working code to see if I can find an error there.

# Add the EOS token to the training data
dataset = dataset.map(lambda example: {"text": example["text"] + tokenizer.eos_token})  # to make it run

# Experimental (not used in the initial run): replace all Shirogane and Takeru variants with {user}, for training tomorrow
dataset = dataset.map(lambda example: {"text": example["text"].replace("Shirogane", "{user}").replace("Takeru", "{user}").replace("Shirooogaaane", "{user}").replace("Sh-Shirogane", "{user}")})

dataset = dataset.train_test_split(test_size=0.05)
print(dataset)
print(dataset["train"][0]["text"])

# Model time!
print("Piggy")

response_template = [2277, 29937, 13291, 29901]
print("\n\n\n====================\n\n\n")
print(type(response_template), response_template)
print("\n\n\n====================\n\n\n")

# Uncomment this and the data_collator line in the SFTTrainer to do completion-only training
# This is the only problem besides OOM, which will be solved by using vast.ai
collator = DataCollatorForCompletionOnlyLM(
    # instruction_template="You are an expert roleplaying model",  # If I have a response template I don't think I *need* this part. Probably.
    response_template=response_template,
    tokenizer=tokenizer,
    mlm=False,
)
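
# Hedged sketch (my addition, not part of the original run): the PROBLEM above matches a caveat
# documented for DataCollatorForCompletionOnlyLM -- Llama-style tokenizers encode "### Response:"
# differently depending on what precedes it, so a plain string template may never match the
# tokenized examples. The TRL docs suggest tokenizing the template *with* some leading context
# and passing the resulting ids, which is presumably where the hardcoded [2277, 29937, 13291, 29901]
# came from. The template string and the [2:] slice below are assumptions; verify them against a
# real training example before relying on them.
def derive_response_template_ids(tok, template_with_context="\n### Response:"):
    # Encode with context (no special tokens), then drop the leading newline tokens.
    ids = tok.encode(template_with_context, add_special_tokens=False)
    return ids[2:]

# Quick sanity checks (commented out so they don't affect the run):
# print(derive_response_template_ids(tokenizer))  # compare against response_template above
# sample_ids = tokenizer(dataset["train"][0]["text"])["input_ids"]
# print(any(sample_ids[i:i + len(response_template)] == response_template for i in range(len(sample_ids))))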
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,
    bnb_4bit_compute_dtype=torch.float16,
)

base_model = LlamaForCausalLM.from_pretrained(
    "Gryphe/MythoLogic-Mini-7b",
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        # "rotary_emb"  # idk what this even is, so I'm hesitant to LoRA it. Try it later?
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",  # the weird index issue was solved by correctly specifying the task type, in caps
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()
model.enable_input_require_grads()  # needed with gradient checkpointing on a frozen quantized base, so inputs carry requires_grad and backprop doesn't break

training_args = TrainingArguments(
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    learning_rate=1e-4,
    num_train_epochs=3,
    # logging_steps=1,
    fp16=True,
    output_dir="outputs",
)

import transformers

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    # data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    data_collator=collator,
    max_seq_length=4000,
    dataset_text_field="text",
)

trainer.train()
trainer.save_model("MythoChizuru-7b")
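
# Hedged sketch (my addition): an optional smoke test of the saved adapter after training.
# Loading via peft's AutoPeftModelForCausalLM and the prompt below are assumptions -- adjust the
# prompt to whatever "### Instruction:" / "### Response:" layout the formatted training examples
# actually use.
RUN_SMOKE_TEST = False  # flip to True to generate a sample from the saved adapter
if RUN_SMOKE_TEST:
    from peft import AutoPeftModelForCausalLM

    test_model = AutoPeftModelForCausalLM.from_pretrained(
        "MythoChizuru-7b", device_map="auto", torch_dtype=torch.float16
    )
    prompt = "### Instruction:\nYou are an expert roleplaying model.\n\n### Response:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(test_model.device)
    output_ids = test_model.generate(**inputs, max_new_tokens=128)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))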