import argparse
import itertools
import math
import os
from pathlib import Path
from typing import Optional
import subprocess
import sys
from datetime import datetime
from dataclasses import dataclass, field

import numpy as np
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
#import wandb
from trl import SFTTrainer
from huggingface_hub import login

CHAT_ML_TEMPLATE_Mistral_7B_Instruct = """
{% if messages[0]['role'] == 'system' %}
{% set loop_messages = messages[1:] %}
{% set system_message = messages[0]['content'].strip() + '\n\n' %}
{% else %}
{% set loop_messages = messages %}
{% set system_message = '' %}
{% endif %}
{{ bos_token }}
{% for message in loop_messages %}
{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
{% endif %}
{% if loop.index0 == 0 %}
{% set content = system_message + message['content'] %}
{% else %}
{% set content = message['content'] %}
{% endif %}
{% if message['role'] == 'user' %}
{{ '[INST] ' + content.strip() + ' [/INST]' }}
{% elif message['role'] == 'assistant' %}
{{ ' ' + content.strip() + ' ' + eos_token }}
{% endif %}
{% endfor %}
"""

def parse_args():
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    parser.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        default=None,
        #required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--tokenizer_name",
        type=str,
        default=None,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--instance_data_dir",
        type=str,
        default=None,
        #required=True,
        help="A folder containing the training data of instance images.",
    )
    parser.add_argument(
        "--class_data_dir",
        type=str,
        default=None,
        required=False,
        help="A folder containing the training data of class images.",
    )
    parser.add_argument(
        "--instance_prompt",
        type=str,
        default=None,
        help="The prompt with identifier specifying the instance",
    )
    parser.add_argument(
        "--class_prompt",
        type=str,
        default="",
        help="The prompt to specify images in the same class as provided instance images.",
    )
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
        action="store_true",
        help="Flag to add prior preservation loss.",
    )
    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
    parser.add_argument(
        "--num_class_images",
        type=int,
        default=100,
        help=(
            "Minimal number of class images for prior preservation loss. If there are not enough images, additional"
            " images will be sampled with class_prompt."
        ),
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="",
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
    parser.add_argument(
        "--resolution",
        type=int,
        default=512,
        help=(
            "The resolution for input images; all the images in the train/validation dataset will be resized to this"
            " resolution."
        ),
    )
    parser.add_argument(
        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
    )
    parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder")
    parser.add_argument(
        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
    )
    parser.add_argument(
        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
    )
    parser.add_argument("--num_train_epochs", type=int, default=1)
    parser.add_argument(
        "--max_train_steps",
        type=int,
        default=None,
        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--gradient_checkpointing",
        action="store_true",
        help="Whether or not to use gradient checkpointing to save memory at the expense of a slower backward pass.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=5e-6,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--scale_lr",
        action="store_true",
        default=False,
        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
    )
    parser.add_argument(
        "--lr_scheduler",
        type=str,
        default="constant",
        help=(
            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
            ' "constant", "constant_with_warmup"]'
        ),
    )
    parser.add_argument(
        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
    )
    parser.add_argument(
        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
    )
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
    parser.add_argument(
        "--hub_model_id",
        type=str,
        default=None,
        help="The name of the repository to keep in sync with the local `output_dir`.",
    )
    parser.add_argument(
        "--logging_dir",
        type=str,
        default="logs",
        help=(
            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
        ),
    )
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default="no",
        choices=["no", "fp16", "bf16"],
        help=(
            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16)."
            " bf16 requires PyTorch >= 1.10 and an Nvidia Ampere GPU."
        ),
    )
    parser.add_argument(
        "--save_n_steps",
        type=int,
        default=1,
        help="Save the model every n global steps.",
    )
    parser.add_argument(
        "--save_starting_step",
        type=int,
        default=1,
        help="The step from which to start saving intermediate checkpoints.",
    )
    parser.add_argument(
        "--stop_text_encoder_training",
        type=int,
        default=1000000,
        help="The step at which the text_encoder is no longer trained.",
    )
    parser.add_argument(
        "--image_captions_filename",
        action="store_true",
        help="Get captions from the filename.",
    )
    parser.add_argument(
        "--dump_only_text_encoder",
        action="store_true",
        default=False,
        help="Dump only the text encoder.",
    )
    parser.add_argument(
        "--train_only_unet",
        action="store_true",
        default=False,
        help="Train only the UNet.",
    )
    parser.add_argument(
        "--Session_dir",
        type=str,
        default="",
        help="Current session directory.",
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")

    args = parser.parse_args()
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
    if env_local_rank != -1 and env_local_rank != args.local_rank:
        args.local_rank = env_local_rank

    #if args.instance_data_dir is None:
    #    raise ValueError("You must specify a train data directory.")
    #if args.with_prior_preservation:
    #    if args.class_data_dir is None:
    #        raise ValueError("You must specify a data directory for class images.")
    #    if args.class_prompt is None:
    #        raise ValueError("You must specify prompt for class images.")

    return args

def run_training(args_imported):
    args_default = parse_args()
    #args = merge_args(args_default, args_imported)
    return args_default

TOKEN_NAME = "DeepESP/gpt2-spanish-medium"
TOKEN_MISTRAL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
SPANISH_MEDICA_LLM_DATASET = "somosnlp/spanish_medica_llm"

TOPIC_TYPE_DIAGNOSTIC = 'medical_diagnostic'
TOPIC_TYPE_TRATAMIENT = 'medical_topic'
FILTER_CRITERIA = [TOPIC_TYPE_DIAGNOSTIC, TOPIC_TYPE_TRATAMIENT]
CONTEXT_LENGTH = 256  # Maximum number of tokens per training example

MISTRAL_BASE_MODEL_ID = "BioMistral/BioMistral-7B"
MICRO_BATCH_SIZE = 16  # 32 for GPUs bigger than a T4
BATCH_SIZE = 64        # 128 for GPUs bigger than a T4
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE

PROJECT_NAME = "spanish-medica-llm"
BASE_MODEL_NAME = "biomistral"
run_name = BASE_MODEL_NAME + "-" + PROJECT_NAME
output_dir = "./" + run_name

HUB_MODEL_ID = 'somosnlp/spanish_medica_llm'
MAX_TRAINING_STEPS = int(1500 / 2)
MAX_TRAINING_STEPS = 2  # Overrides the value above
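
# Note (added for clarity): with the values above, GRADIENT_ACCUMULATION_STEPS is
# 64 // 16 = 4, so each optimizer step sees
# MICRO_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS = 16 * 4 = 64 examples per device.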
def loadSpanishTokenizer():
    """
    Build a Spanish tokenizer that reuses Mistral's special tokens and chat template.
    """
    # Load the tokenizer used by Mistral first
    tokenizerMistrall = AutoTokenizer.from_pretrained(TOKEN_MISTRAL_NAME)

    # Then load a Spanish-specialized tokenizer, reusing Mistral's special tokens
    tokenizer = AutoTokenizer.from_pretrained(
        TOKEN_NAME,
        eos_token = tokenizerMistrall.special_tokens_map['eos_token'],
        bos_token = tokenizerMistrall.special_tokens_map['bos_token'],
        unk_token = tokenizerMistrall.special_tokens_map['unk_token']
    )
    tokenizer.chat_template = CHAT_ML_TEMPLATE_Mistral_7B_Instruct

    return tokenizer
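
# Illustrative example (assumption, not part of the original script): once the template
# above is assigned to tokenizer.chat_template, it can be rendered with the standard
# Hugging Face API, roughly like this:
#
#   tokenizer = loadSpanishTokenizer()
#   messages = [
#       {"role": "user", "content": "Hola"},
#       {"role": "assistant", "content": "Hola, ¿en qué puedo ayudarte?"},
#   ]
#   prompt = tokenizer.apply_chat_template(messages, tokenize=False)
#   # -> "<s>[INST] Hola [/INST] Hola, ¿en qué puedo ayudarte? </s>" (plus template whitespace)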
def tokenize(element, tokenizer):
    outputs = tokenizer(
        element["raw_text"],
        truncation = True,
        max_length = CONTEXT_LENGTH,
        return_overflowing_tokens = True,
        return_length = True,
    )

    # Keep only the chunks that are exactly CONTEXT_LENGTH tokens long
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == CONTEXT_LENGTH:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

def splitDatasetInTestValid(dataset):
    """
    Split the tokenized dataset into train, evaluation, and test partitions.
    """
    if dataset is None or dataset['train'] is None:
        return dataset
    elif dataset['test'] is None:
        return None
    else:
        test_eval = dataset['test'].train_test_split(test_size=0.001)
        eval_dataset = test_eval['train']
        test_dataset = test_eval['test']
        return (dataset['train'], eval_dataset, test_dataset)

def loadSpanishDataset():
    # Load the dataset and drop the topic types listed in FILTER_CRITERIA
    spanishMedicaLllmDataset = load_dataset(SPANISH_MEDICA_LLM_DATASET, split="train")
    spanishMedicaLllmDataset = spanishMedicaLllmDataset.filter(lambda example: example["topic_type"] not in FILTER_CRITERIA)
    spanishMedicaLllmDataset = spanishMedicaLllmDataset.train_test_split(0.2, seed=203984)
    return spanishMedicaLllmDataset

## See the Jupyter Notebook for how to change the CONTEXT_LENGTH size
def accelerateConfigModel():
    """
    Configure FSDP via Accelerate. Requires GPU support; otherwise it raises:
    RuntimeError: There are currently no available devices found, must be one of 'XPU', 'CUDA', or 'NPU'.
    """
    fsdp_plugin = FullyShardedDataParallelPlugin(
        state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
        optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
    )

    return Accelerator(fsdp_plugin=fsdp_plugin)

def getTokenizedDataset(dataset, tokenizer):
    if dataset is None or tokenizer is None:
        return dataset

    return dataset.map(
        lambda element: tokenize(element, tokenizer),
        batched = True,
        remove_columns = dataset["train"].column_names
    )

def loadBaseModel(base_model_id):
    if base_model_id in ["", None]:
        return None
    else:
        # Load the base model in 4-bit (NF4 with double quantization) to fit on a small GPU
        bnb_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_use_double_quant = True,
            bnb_4bit_compute_dtype = torch.bfloat16
        )

        model = AutoModelForCausalLM.from_pretrained(
            base_model_id,
            quantization_config = bnb_config
        )
        model.gradient_checkpointing_enable()
        model = prepare_model_for_kbit_training(model)

        return model

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

def modelLoraConfigBioMistral(model):
    """
    r is the rank of the low-rank matrices used in the adapters, and thus controls the
    number of parameters trained. A higher rank allows more expressivity, at a compute cost.

    alpha is the scaling factor for the learned weights. The weight matrix is scaled by
    alpha/r, so a higher value for alpha assigns more weight to the LoRA activations.

    The values used in the QLoRA paper were r=64 and lora_alpha=16, and these are said to
    generalize well, but we use r=8 and lora_alpha=16 to put more emphasis on the new
    fine-tuned data while also reducing computational complexity.
    """
    if model is None:
        return model
    else:
        config = LoraConfig(
            r=8,
            lora_alpha=16,
            target_modules=[
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj",
                "lm_head",
            ],
            bias="none",
            lora_dropout=0.05,  # Conventional
            task_type="CAUSAL_LM",
        )

        model = get_peft_model(model, config)
        print_trainable_parameters(model)

        # Apply the accelerator. You can comment this out to remove the accelerator.
        accelerator = accelerateConfigModel()
        model = accelerator.prepare_model(model)

        return model

# A note on training: you can set max_steps high initially and examine at what step your
# model's performance starts to degrade. That is where you'll find the sweet spot for how
# many steps to perform. For example, say you start with 1000 steps and find that at around
# 500 steps the model starts overfitting: the validation loss goes up (bad) while the
# training loss goes down significantly, meaning the model is learning the training set
# really well but is unable to generalize to new datapoints. Then 500 steps would be your
# sweet spot, so you would use the checkpoint-500 model repo in your output dir
# (biomistral-medqa-finetune) as your final model in step 6 below.
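
# Illustrative sketch (not part of the original training flow): assuming checkpoints such
# as checkpoint-500 exist under output_dir, the chosen intermediate LoRA adapter could be
# re-attached to the quantized base model roughly like this (the step number is hypothetical):
#
#   from peft import PeftModel
#
#   def load_intermediate_checkpoint(step = 500):
#       base = loadBaseModel(MISTRAL_BASE_MODEL_ID)
#       return PeftModel.from_pretrained(base, f"{output_dir}/checkpoint-{step}")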
def configAndRunTraining(basemodel, dataset, eval_dataset, tokenizer):
    if basemodel is None or dataset is None or tokenizer is None:
        return None
    else:
        tokenizer.pad_token = tokenizer.eos_token
        data_collator_pretrain = DataCollatorForLanguageModeling(tokenizer, mlm = False)

        training_args = TrainingArguments(
            output_dir=output_dir,
            push_to_hub = True,
            hub_private_repo = False,
            hub_model_id = HUB_MODEL_ID,
            warmup_steps = 5,
            per_device_train_batch_size = MICRO_BATCH_SIZE,
            per_device_eval_batch_size = 1,
            #gradient_checkpointing=True,
            gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,
            max_steps = MAX_TRAINING_STEPS,
            learning_rate = 2.5e-5,            # About 10x smaller than the Mistral learning rate
            logging_steps = 50,
            optim = "paged_adamw_8bit",
            logging_dir = "./logs",            # Directory for storing logs
            save_strategy = "steps",           # Save the model checkpoint every logging step
            save_steps = 50,                   # Save checkpoints every 50 steps
            evaluation_strategy = "steps",     # Evaluate the model every logging step
            eval_steps = 50,                   # Evaluate and save checkpoints every 50 steps
            do_eval = True,                    # Perform evaluation at the end of training
            report_to = None,                  # Comment this out if you don't want to use Weights & Biases
            run_name = f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",  # Name of the W&B run (optional)
            fp16 = True,                       # Set for a T4 GPU; on a more powerful GPU, change this to False and enable bf16 instead
            bf16 = False
        )

        trainer = Trainer(
            model = basemodel,
            train_dataset = dataset,
            eval_dataset = eval_dataset,
            args = training_args,
            data_collator = data_collator_pretrain
        )

        basemodel.config.use_cache = False  # Silence the warnings. Please re-enable for inference!
        trainer.train()
        trainer.push_to_hub()

def run_training_process():
    # Log in to Hugging Face
    login(token = os.environ.get('HG_FACE_TOKEN'))
    os.environ['WANDB_DISABLED'] = 'true'

    tokenizer = loadSpanishTokenizer()
    medicalSpanishDataset = loadSpanishDataset()

    train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid(
        getTokenizedDataset(medicalSpanishDataset, tokenizer)
    )

    base_model = loadBaseModel(MISTRAL_BASE_MODEL_ID)
    base_model = modelLoraConfigBioMistral(base_model)

    configAndRunTraining(base_model, train_dataset, eval_dataset, tokenizer)
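
# Entry point (assumption: in the original Space, run_training_process() is likely invoked
# from another module such as app.py). This guard also allows running the script directly.
if __name__ == "__main__":
    run_training_process()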