# randomSpace / main.py
# This repo contains the code for a Mixtral model that finds ICD-10 codes.
# The script runs well on a single GPU; the goal now is to make sure it also runs in a multi-GPU environment.
import warnings
warnings.filterwarnings("ignore")
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
from datasets import load_dataset
import torch
import transformers
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)  # FSDP plugin: shards model weights and optimizer state across the multi-GPU environment
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
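# Note (hedged): FSDP only takes effect when this script is launched with one process per GPU,
# e.g. `accelerate launch --num_processes <num_gpus> main.py`; a plain `python main.py`
# run stays on a single process.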
## Loading the dataset
def Profiler_load_dataset(data_files, field='train'):
    return load_dataset('json', data_files=data_files, field=field)
## high ram used here
train_dataset = Profiler_load_dataset(data_files='/content/prov_data2.jsonl', field='train')
eval_dataset = Profiler_load_dataset(data_files='/content/prov_data2.jsonl', field='test')
### What is the formatting function for?
## It turns each record into a single prompt string for the Mixtral model, which makes it easy to use in an instruction fine-tuning scenario
def format_fun(example):
    text = f" The ICD10 code for {example['Input']} is , {example['Output']} "
    return text
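# Example with a hypothetical record {"Input": "acute appendicitis", "Output": "K35.80"}:
# format_fun(...) -> " The ICD10 code for acute appendicitis is , K35.80 "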
# base_model_id = "mistralai/Mixtral-8x7B-v0.1"
# Try out different models from the Hugging Face Hub (the best would have been the one released by the authors, but that is not quantised, so it likely would not fit here!)
base_model_id = 'TheBloke/dolphin-2.5-mixtral-8x7b-GGUF'  # this is passed in as arg -> args.model_id
# Caveat: this repo serves GGUF files, which cannot be loaded this way with AutoModelForCausalLM + BitsAndBytesConfig;
# a full-weights checkpoint (e.g. the commented-out base Mixtral id above) is needed for 4-bit bnb loading.
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config, device_map="cuda")
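# For the multi-GPU run, each process may need the model on its own GPU rather than a blanket
# "cuda", e.g. device_map={"": accelerator.process_index} (a hedged suggestion, not verified here).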
## The model got loaded and works !!
tokenizer = AutoTokenizer.from_pretrained(
base_model_id,
padding_side="left",
add_eos_token=True,
add_bos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token
max_length = 50  # max number of tokens per tokenized example (truncation/padding length, not generated words)
def generate_and_tokenize_prompt(prompt):
    result = tokenizer(
        format_fun(prompt),
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()  # causal LM: use the input ids as labels so loss is computed over the full sequence
    return result
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)
#Fine tuning the model
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
config = LoraConfig(
r=32,
lora_alpha=64,
target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"w1",
"w2",
"w3",
"lm_head",
],
bias="none",
lora_dropout=0.05, # Conventional
task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
if torch.cuda.device_count() > 1:  # if more than one GPU is visible
    model.is_parallelizable = True
    model.model_parallel = True
project = "icd-finetune"
base_model_name = "mixtral"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=1,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        max_steps=300,
        learning_rate=2.5e-5,  # want a small lr for fine-tuning
        bf16=True,  # match the bfloat16 compute dtype used for 4-bit loading
        optim="paged_adamw_8bit",
        logging_steps=25,  # log training loss every 25 steps
        logging_dir="./logs",  # directory for storing logs
        save_strategy="steps",  # save a checkpoint every save_steps
        save_steps=25,  # save checkpoints every 25 steps
        evaluation_strategy="steps",  # evaluate every eval_steps
        eval_steps=25,  # evaluate every 25 steps
        do_eval=True,  # run evaluation during training
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
trainer.train()
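# Optionally persist the final LoRA adapter under output_dir as well (a hedged extra;
# the Trainer already writes intermediate checkpoints every 25 steps):
trainer.save_model(output_dir)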
# Implement RAG on the fine-tuned model
# final model prepared
'''
1) Make sure the script runs in the multi-GPU environment!
2) The dataset is loaded
3) The LangChain implementation to oversee the prompt generation guide
4) Also try BERT models rather than directly using the Mixtral model
5) Once the model is trained, copy the checkpoint folder into a local env (see the inference sketch below)
'''
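# A minimal inference sketch for item 5 (hedged: the checkpoint path below is an assumption
# based on output_dir and max_steps; adjust it to the folder you actually copied):
#
# from peft import PeftModel
#
# base_model = AutoModelForCausalLM.from_pretrained(
#     base_model_id, quantization_config=bnb_config, device_map="auto"
# )
# ft_model = PeftModel.from_pretrained(base_model, "./mixtral-icd-finetune/checkpoint-300")
# ft_model.eval()
# prompt = " The ICD10 code for acute appendicitis is , "  # hypothetical input
# inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)
# with torch.no_grad():
#     out = ft_model.generate(**inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id)
# print(tokenizer.decode(out[0], skip_special_tokens=True))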