# smartscraper/train.py
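"""Distributed LoRA fine-tuning of LLaMA-7B in 8-bit.

Spawns one worker per local GPU, loads decapoda-research/llama-7b-hf with
8-bit weights, attaches LoRA adapters to the attention q/v projections, and
fine-tunes on instruction/input/output records read from ../samples.json.
"""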
import os

import torch
import torch.nn as nn
import torch.distributed as dist
import torch.multiprocessing as mp

import bitsandbytes as bnb
import transformers
from datasets import load_dataset
# Recent transformers releases expose the Llama* casing; the zphang fork
# referenced in the commented-out install steps below used LLaMA* names.
from transformers import AutoTokenizer, AutoConfig, LlamaForCausalLM, LlamaTokenizer
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model


def setup(rank, world_size):
    """Initialise the default NCCL process group for this rank."""
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    dist.init_process_group("nccl", rank=rank, world_size=world_size)


def cleanup():
    """Tear down the process group once training is done."""
    dist.destroy_process_group()


def train(rank, world_size):
    setup(rank, world_size)

    # One-time environment setup, kept here for reference:
    # os.system("nvidia-smi")
    # os.system("git clone https://github.com/tloen/alpaca-lora.git")
    # os.chdir("alpaca-lora/")
    # os.system("pip install -q datasets loralib sentencepiece")
    # os.system("pip uninstall -y transformers")
    # os.system("pip install -q git+https://github.com/zphang/transformers@c3dc391")
    # os.system("pip install -q git+https://github.com/huggingface/peft.git")
    # os.system("pip install bitsandbytes")
    # os.system("conda install -y -c conda-forge cudatoolkit")

    # Hyperparameters: a small per-GPU micro-batch accumulated up to the
    # effective batch size.
    MICRO_BATCH_SIZE = 1
    BATCH_SIZE = 16
    GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
    EPOCHS = 2
    # NOTE: 2e-10 is vanishingly small; LoRA fine-tunes of this kind commonly
    # use learning rates around 1e-4 to 3e-4.
    LEARNING_RATE = 2e-10
    LORA_R = 4
    LORA_ALPHA = 8
    LORA_DROPOUT = 0.05
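    # With the values above, each optimizer step consumes
    # MICRO_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS = 16 examples per GPU,
    # i.e. 16 * world_size examples across all ranks.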

    # Pin this process to its own GPU before any CUDA work.
    torch.cuda.set_device(rank)

    # Load the base model in 8-bit directly onto this rank's GPU. An 8-bit
    # model must not be moved with .to(); pin it via device_map instead of
    # using device_map="auto".
    model = LlamaForCausalLM.from_pretrained(
        "decapoda-research/llama-7b-hf",
        load_in_8bit=True,
        device_map={"": rank},
    )
    tokenizer = LlamaTokenizer.from_pretrained(
        "decapoda-research/llama-7b-hf", add_eos_token=True
    )

    # Prepare the int8 model for training, then attach LoRA adapters to the
    # attention query/value projections.
    model = prepare_model_for_int8_training(model)
    config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, config)

    # Wrap for DDP only after the adapters exist, so DDP registers the final
    # set of trainable parameters.
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[rank], output_device=rank
    )
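
    # Optional sanity check (standard PEFT helper): uncomment to log how many
    # parameters the LoRA adapters leave trainable on this rank.
    # model.module.print_trainable_parameters()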

    # Pad with token id 0 so the collator can batch variable-length examples.
    tokenizer.pad_token_id = 0

    # Expects JSON records with "instruction", "input" and "output" fields.
    data = load_dataset("json", data_files="../samples.json")

    def generate_prompt(data_point):
        # Build an instruction-tuning prompt; include the optional input
        # section only when the record has one.
        if data_point["input"]:
            return f"""### Instruction:
{data_point["instruction"]}
### Input:
{data_point["input"]}
### Response:
{data_point["output"]}"""
        else:
            return f"""### Instruction:
{data_point["instruction"]}
### Response:
{data_point["output"]}"""

    # Tokenize each prompt as-is; the language-modeling collator below pads
    # examples to a common length at batch time.
    data = data.shuffle().map(
        lambda data_point: tokenizer(
            generate_prompt(data_point),
            truncation=False,
            padding="longest",
        )
    )

    trainer = transformers.Trainer(
        model=model,
        train_dataset=data["train"],
        args=transformers.TrainingArguments(
            per_device_train_batch_size=MICRO_BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            warmup_steps=100,
            num_train_epochs=EPOCHS,
            learning_rate=LEARNING_RATE,
            fp16=True,
            logging_steps=1,
            output_dir=f"lora-smartscraper-{rank}",
            save_total_limit=3,
        ),
        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    # Disable the KV cache during training; reach through the DDP wrapper to
    # the underlying PEFT model's config.
    model.module.config.use_cache = False
    trainer.train(resume_from_checkpoint=False)

    # Save the LoRA adapter weights from the underlying PEFT model.
    model.module.save_pretrained(f"lora-smartscraper-{rank}")

    cleanup()


if __name__ == "__main__":
    # One process per visible GPU; mp.spawn passes the rank as the first arg.
    world_size = torch.cuda.device_count()
    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)
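
# Typical launch, assuming at least one CUDA GPU and ../samples.json on disk:
#   python train.py
# Each rank writes its LoRA adapter weights to lora-smartscraper-<rank>/.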