# smartscraper/train.py
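"""Distributed LoRA fine-tuning of LLaMA-7B in 8-bit.

Spawns one worker per local GPU, loads decapoda-research/llama-7b-hf with
8-bit weights, attaches LoRA adapters to the attention q/v projections, and
fine-tunes on instruction/input/output records read from ../samples.json.
"""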
import os

import torch
import torch.nn as nn
import torch.distributed as dist
import torch.multiprocessing as mp

import bitsandbytes as bnb
import transformers
from datasets import load_dataset
# Recent transformers releases expose the Llama* casing; the zphang fork
# referenced in the commented-out install steps below used LLaMA* names.
from transformers import AutoTokenizer, AutoConfig, LlamaForCausalLM, LlamaTokenizer
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model


def setup(rank, world_size):
    """Initialise the default NCCL process group for this rank."""
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    dist.init_process_group("nccl", rank=rank, world_size=world_size)


def cleanup():
    """Tear down the process group once training is done."""
    dist.destroy_process_group()


def train(rank, world_size):
    setup(rank, world_size)

    # One-time environment setup, kept here for reference:
    # os.system("nvidia-smi")
    # os.system("git clone https://github.com/tloen/alpaca-lora.git")
    # os.chdir("alpaca-lora/")
    # os.system("pip install -q datasets loralib sentencepiece")
    # os.system("pip uninstall -y transformers")
    # os.system("pip install -q git+https://github.com/zphang/transformers@c3dc391")
    # os.system("pip install -q git+https://github.com/huggingface/peft.git")
    # os.system("pip install bitsandbytes")
    # os.system("conda install -y -c conda-forge cudatoolkit")

    # Hyperparameters: a small per-GPU micro-batch accumulated up to the
    # effective batch size.
    MICRO_BATCH_SIZE = 1
    BATCH_SIZE = 16
    GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
    EPOCHS = 2
    # NOTE: 2e-10 is vanishingly small; LoRA fine-tunes of this kind commonly
    # use learning rates around 1e-4 to 3e-4.
    LEARNING_RATE = 2e-10
    LORA_R = 4
    LORA_ALPHA = 8
    LORA_DROPOUT = 0.05
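    # With the values above, each optimizer step consumes
    # MICRO_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS = 16 examples per GPU,
    # i.e. 16 * world_size examples across all ranks.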

    # Pin this process to its own GPU before any CUDA work.
    torch.cuda.set_device(rank)

    # Load the base model in 8-bit directly onto this rank's GPU. An 8-bit
    # model must not be moved with .to(); pin it via device_map instead of
    # using device_map="auto".
    model = LlamaForCausalLM.from_pretrained(
        "decapoda-research/llama-7b-hf",
        load_in_8bit=True,
        device_map={"": rank},
    )
    tokenizer = LlamaTokenizer.from_pretrained(
        "decapoda-research/llama-7b-hf", add_eos_token=True
    )

    # Prepare the int8 model for training, then attach LoRA adapters to the
    # attention query/value projections.
    model = prepare_model_for_int8_training(model)
    config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, config)

    # Wrap for DDP only after the adapters exist, so DDP registers the final
    # set of trainable parameters.
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[rank], output_device=rank
    )
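
    # Optional sanity check (standard PEFT helper): uncomment to log how many
    # parameters the LoRA adapters leave trainable on this rank.
    # model.module.print_trainable_parameters()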

    # Pad with token id 0 so the collator can batch variable-length examples.
    tokenizer.pad_token_id = 0

    # Expects JSON records with "instruction", "input" and "output" fields.
    data = load_dataset("json", data_files="../samples.json")

    def generate_prompt(data_point):
        # Build an instruction-tuning prompt; include the optional input
        # section only when the record has one.
        if data_point["input"]:
            return f"""### Instruction:
{data_point["instruction"]}
### Input:
{data_point["input"]}
### Response:
{data_point["output"]}"""
        else:
            return f"""### Instruction:
{data_point["instruction"]}
### Response:
{data_point["output"]}"""

    # Tokenize each prompt as-is; the language-modeling collator below pads
    # examples to a common length at batch time.
    data = data.shuffle().map(
        lambda data_point: tokenizer(
            generate_prompt(data_point),
            truncation=False,
            padding="longest",
        )
    )

    trainer = transformers.Trainer(
        model=model,
        train_dataset=data["train"],
        args=transformers.TrainingArguments(
            per_device_train_batch_size=MICRO_BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            warmup_steps=100,
            num_train_epochs=EPOCHS,
            learning_rate=LEARNING_RATE,
            fp16=True,
            logging_steps=1,
            output_dir=f"lora-smartscraper-{rank}",
            save_total_limit=3,
        ),
        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    # Disable the KV cache during training; reach through the DDP wrapper to
    # the underlying PEFT model's config.
    model.module.config.use_cache = False
    trainer.train(resume_from_checkpoint=False)

    # Save the LoRA adapter weights from the underlying PEFT model.
    model.module.save_pretrained(f"lora-smartscraper-{rank}")

    cleanup()


if __name__ == "__main__":
    # One process per visible GPU; mp.spawn passes the rank as the first arg.
    world_size = torch.cuda.device_count()
    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)
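
# Typical launch, assuming at least one CUDA GPU and ../samples.json on disk:
#   python train.py
# Each rank writes its LoRA adapter weights to lora-smartscraper-<rank>/.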