from dataclasses import dataclass, field
from typing import Optional

import tyro
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig

from trl import RewardConfig, RewardTrainer, is_xpu_available


tqdm.pandas()


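# Command-line arguments for this reward-modeling example. The RewardConfig and LoraConfig
# fields below hold the training defaults; any of them can be overridden from the command line
# via tyro.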
@dataclass
class ScriptArguments:
    model_name: str = "facebook/opt-350m"
    """the model name"""
    dataset_name: str = "Anthropic/hh-rlhf"
    """the dataset name"""
    dataset_text_field: str = "text"
    """the text field of the dataset"""
    eval_split: str = "none"
    """the dataset split to evaluate on; defaults to 'none' (no evaluation)"""
    load_in_8bit: bool = False
    """load the model in 8-bit precision"""
    load_in_4bit: bool = False
    """load the model in 4-bit precision"""
    trust_remote_code: bool = True
    """Enable `trust_remote_code`"""
    reward_config: RewardConfig = field(
        default_factory=lambda: RewardConfig(
            output_dir="output",
            per_device_train_batch_size=64,
            num_train_epochs=1,
            gradient_accumulation_steps=16,
            gradient_checkpointing=True,
            gradient_checkpointing_kwargs={"use_reentrant": False},
            learning_rate=1.41e-5,
            report_to="tensorboard",
            remove_unused_columns=False,
            optim="adamw_torch",
            logging_steps=500,
            evaluation_strategy="no",
            max_length=512,
        )
    )
    use_peft: bool = False
    """whether to use peft"""
    peft_config: Optional[LoraConfig] = field(
        default_factory=lambda: LoraConfig(
            r=16,
            lora_alpha=16,
            bias="none",
            task_type="SEQ_CLS",
            modules_to_save=["score"],  # the sequence-classification head is named `score`
        ),
    )


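# Parse the arguments from the command line; evaluate periodically only when an eval split is given.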
args = tyro.cli(ScriptArguments)
args.reward_config.evaluation_strategy = "steps" if args.eval_split != "none" else "no"


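# Quantized loading with bitsandbytes: 8-bit and 4-bit are mutually exclusive. When quantizing,
# pin the whole model to the local process's device (XPU or GPU) so each process loads its own copy.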
if args.load_in_8bit and args.load_in_4bit:
    raise ValueError("You can't load the model in 8 bits and 4 bits at the same time")
elif args.load_in_8bit or args.load_in_4bit:
    quantization_config = BitsAndBytesConfig(load_in_8bit=args.load_in_8bit, load_in_4bit=args.load_in_4bit)
    device_map = (
        {"": f"xpu:{Accelerator().local_process_index}"}
        if is_xpu_available()
        else {"": Accelerator().local_process_index}
    )
else:
    device_map = None
    quantization_config = None

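# The reward model is a sequence classifier with a single scalar output (num_labels=1).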
model = AutoModelForSequenceClassification.from_pretrained(
    args.model_name,
    quantization_config=quantization_config,
    device_map=device_map,
    trust_remote_code=args.trust_remote_code,
    num_labels=1,
)

tokenizer = AutoTokenizer.from_pretrained(args.model_name)
train_dataset = load_dataset(args.dataset_name, split="train")


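# RewardTrainer expects each example to provide tokenized `input_ids_chosen`/`attention_mask_chosen`
# and `input_ids_rejected`/`attention_mask_rejected` columns, so tokenize both sides of every
# preference pair.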
def preprocess_function(examples):
    new_examples = {
        "input_ids_chosen": [],
        "attention_mask_chosen": [],
        "input_ids_rejected": [],
        "attention_mask_rejected": [],
    }
    for chosen, rejected in zip(examples["chosen"], examples["rejected"]):
        tokenized_chosen = tokenizer(chosen)
        tokenized_rejected = tokenizer(rejected)

        new_examples["input_ids_chosen"].append(tokenized_chosen["input_ids"])
        new_examples["attention_mask_chosen"].append(tokenized_chosen["attention_mask"])
        new_examples["input_ids_rejected"].append(tokenized_rejected["input_ids"])
        new_examples["attention_mask_rejected"].append(tokenized_rejected["attention_mask"])

    return new_examples


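# Preprocess the dataset and drop pairs where either completion is longer than the reward model's max_length.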
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
)
train_dataset = train_dataset.filter(
    lambda x: len(x["input_ids_chosen"]) <= args.reward_config.max_length
    and len(x["input_ids_rejected"]) <= args.reward_config.max_length
)

if args.eval_split == "none":
    eval_dataset = None
else:
    eval_dataset = load_dataset(args.dataset_name, split=args.eval_split)

    eval_dataset = eval_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=4,
    )
    eval_dataset = eval_dataset.filter(
        lambda x: len(x["input_ids_chosen"]) <= args.reward_config.max_length
        and len(x["input_ids_rejected"]) <= args.reward_config.max_length
    )


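# Train LoRA adapters on top of the base model when `use_peft` is set; otherwise fine-tune all weights.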
if args.use_peft:
    peft_config = args.peft_config
else:
    peft_config = None


trainer = RewardTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args.reward_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
)

trainer.train()
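
# Optionally persist the trained reward model for later use, e.g. as the scoring model in a PPO
# script. `Trainer.save_model` writes to the given directory; uncomment to enable.
# trainer.save_model(args.reward_config.output_dir)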