import json
import os

import torch
import bitsandbytes as bb
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, BitsAndBytesConfig, LlamaForCausalLM, TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Load the formatted training examples and convert them to a Hugging Face Dataset
with open("formatted_training_examples.json", "r") as f:
    dataset = Dataset.from_list(json.load(f))
print(dataset, "\n\n\n")
# Optionally sort the dataset by length so that, if long examples cause memory issues, they fail first
# and can be fixed without wasting training time
# dataset = dataset.map(lambda example: {"text": example["text"], "length": len(example["text"])})
# dataset = dataset.sort("length", reverse=True)
tokenizer = AutoTokenizer.from_pretrained("Gryphe/MythoLogic-Mini-7b", model_max_length=4000, padding_side="right")
# tokenizer.add_special_tokens({"pad_token": "[PAD]"})  # Note: do not do this, it breaks the embedding and causes a hard-to-fix error
tokenizer.pad_token_id = tokenizer.eos_token_id  # Llama has no pad token by default, so reuse EOS for padding
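# Optional sanity check (a sketch, not part of the original run): report the longest example in tokens so it is
# clear up front whether anything exceeds the 4000-token limit used below.
# lengths = [len(tokenizer(example["text"]).input_ids) for example in dataset]
# print("longest example:", max(lengths), "tokens;", sum(l > 4000 for l in lengths), "examples over the limit")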
# PROBLEM (before I stop for today):
# 1. The response template isn't being found in the training examples: the "response key" is looked for
#    in the labels but isn't there (I checked myself, it isn't).
# 2. The tokenized response template is, for some reason, not the same as the response key. I don't know
#    whether it should be. Look into the TRL library, maybe consult some smart people.
# In summary: the code looks for the response key in the token id tensor (the labels, I think, since I see
# the -100 ignore ids), doesn't find it, and errors out.
# Compare the data collator code against the train.py of the working code to see if the error is there.
# Add the EOS token to the training data so the model learns where a response ends
dataset = dataset.map(lambda example: {"text": example["text"] + tokenizer.eos_token})
# to make it run
# Experimental (not used in the initial run): replace all occurrences of Shirogane/Takeru (and misspelled variants)
# with {user} for tomorrow's training. Longer variants go first so the plain "Shirogane" replacement doesn't partially consume them.
dataset = dataset.map(lambda example: {"text": example["text"].replace("Sh-Shirogane", "{user}").replace("Shirooogaaane", "{user}").replace("Shirogane", "{user}").replace("Takeru", "{user}")})
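# Optional spot check (a sketch): confirm no raw character names survived the replacement above.
# leftovers = sum("Shirogane" in example["text"] or "Takeru" in example["text"] for example in dataset)
# print("examples still containing a raw name:", leftovers)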
dataset = dataset.train_test_split(test_size=0.05)
print(dataset)
print(dataset["train"][0]["text"])
# Model time!
print("Piggy")
# Hard-coded token ids of the response marker, pre-tokenized in context (TRL recommends passing ids rather than
# the raw string for Llama-style sentencepiece tokenizers, whose tokenization depends on surrounding text)
response_template = [2277, 29937, 13291, 29901]
print("\n\n\n====================\n\n\n")
print(type(response_template), response_template)
print("\n\n\n====================\n\n\n")
# Uncomment this and the corresponding line in the SFTTrainer to do completion-only training
# This is the only problem besides OOM, which will be solved by using vast.ai
collator = DataCollatorForCompletionOnlyLM(
    # instruction_template="You are an expert roleplaying model",  # probably not needed when a response template is given
    response_template=response_template,
    tokenizer=tokenizer,
    mlm=False,
)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,
    bnb_4bit_compute_dtype=torch.float16,
)
base_model = LlamaForCausalLM.from_pretrained(
    "Gryphe/MythoLogic-Mini-7b",
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
)
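# Optional (a sketch): print how much memory the 4-bit base model occupies, to gauge headroom before OOM.
# get_memory_footprint() is assumed to be available in this transformers version.
# print(f"base model footprint: {base_model.get_memory_footprint() / 1e9:.2f} GB")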
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        # "rotary_emb"  # idk what this even is, so I'm hesitant to LoRA it. Try it later?
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",  # the weird index issue was solved by correctly specifying the task type in CAPS
)
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()
model.enable_input_require_grads()  # needed so gradient checkpointing can backprop through the frozen, quantized base model
training_args = TrainingArguments(
    per_device_train_batch_size=1,  # assumed to match the eval batch size below; the default of 8 would likely OOM here
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    learning_rate=1e-4,
    num_train_epochs=3,
    # logging_steps=1,
    fp16=True,
    output_dir="outputs",
)
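# For reference (a sketch; world_size is 1 on a single GPU): the effective batch size is
# per_device_train_batch_size * gradient_accumulation_steps * world_size = 16 here.
# print("effective batch size:",
#       training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps * training_args.world_size)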
import transformers
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    # data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),  # plain causal-LM collator: trains on full sequences instead of completions only
    data_collator=collator,
    max_seq_length=4000,
    dataset_text_field="text",
)
trainer.train()
trainer.save_model("MythoChizuru-7b")
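# To use the result later (a sketch; names and arguments are illustrative): saving a PEFT model this way stores
# only the LoRA adapter, so it has to be loaded on top of the base model, e.g.:
# from peft import PeftModel
# base = LlamaForCausalLM.from_pretrained("Gryphe/MythoLogic-Mini-7b", torch_dtype=torch.float16, device_map="auto")
# inference_model = PeftModel.from_pretrained(base, "MythoChizuru-7b")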