migueldeguzmandev
/

papercliptodd-phi-2-1

Text Generation

Model card Files Files and versions

papercliptodd-phi-2-1 / train.py

migueldeguzmandev's picture

migueldeguzmandev

Upload 18 files

40b7a28 almost 2 years ago

history blame contribute delete

3.05 kB

	import math
	import os
	import sys

	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

	class GPTAssistant:
	    """Fine-tune a locally stored causal language model on a plain-text file.

	    The tokenizer and model are loaded once in ``__init__`` and kept on the
	    instance as ``self.tokenizer`` and ``self.model``; ``fine_tune`` then
	    trains ``self.model`` in place and writes the result to disk.
	    """

	    # Optimisation settings used both for TrainingArguments and for deriving
	    # the warmup length, so the two can never drift apart.
	    _PER_DEVICE_BATCH_SIZE = 4
	    _GRADIENT_ACCUMULATION_STEPS = 8
	    _WARMUP_FRACTION = 0.1

	    def __init__(self, model_name="/Users/migueldeguzman/Desktop/papercliptodd/phi-2b/base_model/"):  # Replace with the path (or hub id) of your base model
	        """Load the tokenizer and the causal LM from ``model_name``.

	        Args:
	            model_name: Directory or identifier handed to ``from_pretrained``.

	        Exits the process with status 1 (after printing the error) if either
	        the tokenizer or the model cannot be loaded.
	        """
	        try:
	            # NOTE(review): trust_remote_code is enabled for the tokenizer only,
	            # not for the model - confirm this asymmetry is intended.
	            self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
	            self.model = AutoModelForCausalLM.from_pretrained(model_name)
	        except Exception as e:
	            print(f"Error initializing the model or tokenizer: {e}")
	            sys.exit(1)

	    @staticmethod
	    def _warmup_steps(num_examples, epochs, per_device_batch_size,
	                      gradient_accumulation_steps, warmup_fraction=0.1):
	        """Return the warmup length, in optimizer updates, as an ``int``.

	        The learning-rate scheduler advances once per optimizer update, not
	        once per training example, so the example count is first converted
	        into batches and then into accumulated updates.

	        Args:
	            num_examples: Number of items in the training dataset.
	            epochs: Number of training epochs (may be fractional).
	            per_device_batch_size: Examples per forward/backward pass.
	            gradient_accumulation_steps: Passes accumulated per update.
	            warmup_fraction: Share of all updates spent warming up.

	        Returns:
	            ``ceil(warmup_fraction * total_updates)``.
	        """
	        # Assumes a single training process/device - TODO confirm if this
	        # script is ever launched on several devices.
	        batches_per_epoch = math.ceil(num_examples / per_device_batch_size)
	        # At least one update happens per epoch even for tiny datasets.
	        updates_per_epoch = max(batches_per_epoch // gradient_accumulation_steps, 1)
	        total_updates = math.ceil(epochs * updates_per_epoch)
	        return math.ceil(warmup_fraction * total_updates)

	    def fine_tune(self, answer_file_path, model_output_dir, epochs=1.0):
	        """Train ``self.model`` on a text file and save model and tokenizer.

	        Args:
	            answer_file_path: Path of the plain-text training file.
	            model_output_dir: Directory for checkpoints and the final
	                model/tokenizer files (existing content may be overwritten).
	            epochs: Number of training epochs.

	        Exits the process with status 1 (after printing the error) if the
	        training dataset cannot be built.
	        """
	        # Build the dataset: the file is tokenized and cut into 128-token blocks.
	        try:
	            train_dataset = TextDataset(
	                tokenizer=self.tokenizer,
	                file_path=answer_file_path,
	                block_size=128,
	            )
	        except Exception as e:
	            print(f"Error loading training dataset: {e}")
	            sys.exit(1)  # Exit the script if dataset loading fails

	        # mlm=False selects causal (next-token) language modeling.
	        data_collator = DataCollatorForLanguageModeling(
	            tokenizer=self.tokenizer,
	            mlm=False,
	        )

	        # Warm up over 10% of the optimizer updates. The previous code used
	        # 10% of the *example* count (as a float), which with batch size 4 and
	        # 8 accumulation steps exceeded the total number of updates, so the
	        # learning rate never reached its configured value and the cosine
	        # decay never started.
	        # NOTE(review): the learning rate below was tuned under that broken
	        # schedule; re-validate it now that warmup completes.
	        warmup_steps = self._warmup_steps(
	            len(train_dataset),
	            epochs,
	            self._PER_DEVICE_BATCH_SIZE,
	            self._GRADIENT_ACCUMULATION_STEPS,
	            self._WARMUP_FRACTION,
	        )

	        training_args = TrainingArguments(
	            output_dir=model_output_dir,
	            overwrite_output_dir=True,
	            num_train_epochs=epochs,
	            per_device_train_batch_size=self._PER_DEVICE_BATCH_SIZE,
	            save_steps=10_000,
	            save_total_limit=2,
	            weight_decay=0.001,
	            gradient_accumulation_steps=self._GRADIENT_ACCUMULATION_STEPS,
	            # Trial-and-error notes: 1e-8 underfit, 1e-7 underfit, 15e-7
	            # underfit, 42e-7 almost fit, 45e-7 almost fit, 48e-7 knows
	            # petertodd as the paperclipmaximizer.
	            learning_rate=48e-7,
	            lr_scheduler_type='cosine',
	            warmup_steps=warmup_steps,
	        )

	        trainer = Trainer(
	            model=self.model,
	            args=training_args,
	            data_collator=data_collator,
	            train_dataset=train_dataset,
	        )

	        # Train, then persist the final weights and tokenizer side by side.
	        trainer.train()
	        self.model.save_pretrained(model_output_dir)
	        self.tokenizer.save_pretrained(model_output_dir)

	def main():
	    """Fine-tune the default base model on the configured text file.

	    Builds a ``GPTAssistant`` with its default checkpoint path and trains it
	    with the default number of epochs, writing the result to the output
	    directory below.
	    """
	    # Replace with your desired output directory
	    output_dir = "/Users/migueldeguzman/Desktop/papercliptodd/phi-2b/v1/"
	    # Replace with your training data file path
	    corpus_path = "/Users/migueldeguzman/Desktop/papercliptodd/phi-2b/v1/awakening.text"

	    GPTAssistant().fine_tune(corpus_path, output_dir)


	if __name__ == "__main__":
	    main()