print('this is the main file called main.py')
'''
# This repo contains the code for fine-tuning a Mixtral model to find ICD-10 codes.
# The script runs well on a single GPU; the goal now is to make sure it also runs in a multi-GPU environment.
import warnings
warnings.filterwarnings("ignore")
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
from datasets import load_dataset
import torch
import transformers
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training , LoraConfig, get_peft_model
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)  # FSDP plugin intended to shard the weights and optimizer state across a multi-GPU environment
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
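# NOTE (assumption): transformers.Trainer below does not pick up this standalone Accelerator object.
# For a multi-GPU FSDP run the script is usually started with `accelerate config` followed by
# `accelerate launch main.py`, or FSDP is enabled via the `fsdp` option of TrainingArguments.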
## Loading the dataset
def Profiler_load_dataset(data_files, field='train'):
    # load_dataset('json', ...) returns a DatasetDict; selecting split='train' returns the plain
    # Dataset that Trainer expects (the `field` argument picks the key inside the JSON file)
    return load_dataset('json', data_files=data_files, field=field, split='train')
## High RAM is used here
train_dataset = Profiler_load_dataset(data_files='/content/prov_data2.jsonl', field='train')
eval_dataset = Profiler_load_dataset(data_files='/content/prov_data2.jsonl', field='test')
### What is the formatting function for?
## It formats each record into a single prompt string for the Mixtral model (easy to use in an instruction fine-tuning scenario)
def format_fun(example):
    text = f" The ICD10 code for {example['Input']} is , {example['Output']} "
    return text
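# For a hypothetical record {"Input": "acute pharyngitis", "Output": "J02.9"} this would yield:
#   " The ICD10 code for acute pharyngitis is , J02.9 "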
# base_model_id = "mistralai/Mixtral-8x7B-v0.1"
# Try out different models from the Hugging Face Hub (the best would have been the ones released
# by the authors, but those are not quantised, so they probably won't fit here!)
base_model_id = 'TheBloke/dolphin-2.5-mixtral-8x7b-GGUF'  # this is passed in as an arg -> args.model_id
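# NOTE (assumption): GGUF repos contain llama.cpp-format weights; AutoModelForCausalLM with a
# BitsAndBytesConfig expects regular PyTorch/safetensors checkpoints, so the corresponding
# non-GGUF repo may be needed for this loading path to work.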
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config, device_map="cuda")
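# (assumption) for a multi-GPU run, device_map="auto" would let the loader spread the layers
# across all visible GPUs instead of pinning everything to a single device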
## The model got loaded and works !!
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token
max_length = 50  # maximum number of tokens per tokenized example (not words)
def generate_and_tokenize_prompt(prompt):
    result = tokenizer(
        format_fun(prompt),
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    # For causal-LM fine-tuning the labels are the input ids themselves; the model shifts them
    # internally so that each position learns to predict the next token.
    result["labels"] = result["input_ids"].copy()
    return result
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)
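# (assumption, optional) sanity-check one tokenized example before training:
# print(tokenizer.decode(tokenized_train_dataset[0]['input_ids']))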
#Fine tuning the model
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "w1",
        "w2",
        "w3",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # conventional value
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
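# (assumption, optional) confirm how many parameters LoRA will actually train
model.print_trainable_parameters()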
if torch.cuda.device_count() > 1:  # if more than 1 GPU is available
    model.is_parallelizable = True
    model.model_parallel = True
project = "icd-finetune"
base_model_name = "mixtral"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=1,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        max_steps=300,
        learning_rate=2.5e-5,           # want a small lr for fine-tuning
        fp16=True,
        optim="paged_adamw_8bit",
        logging_steps=25,               # report the loss every 25 steps
        logging_dir="./logs",           # directory for storing logs
        save_strategy="steps",          # save a model checkpoint every save_steps
        save_steps=25,                  # save checkpoints every 25 steps
        evaluation_strategy="steps",    # evaluate the model every eval_steps
        eval_steps=25,                  # evaluate every 25 steps
        do_eval=True,                   # also run evaluation during training
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
trainer.train()
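# --- Sketch (assumption): quick generation check after training; the prompt below is hypothetical ---
# Note that this tokenizer was built with add_eos_token=True, so for real inference one may want a
# tokenizer instance without the trailing EOS token.
model.config.use_cache = True  # re-enable the cache that was switched off for training
model.eval()
prompt = " The ICD10 code for acute pharyngitis is , "
model_input = tokenizer(prompt, return_tensors="pt").to("cuda")
with torch.no_grad():
    generated = model.generate(**model_input, max_new_tokens=20)
print(tokenizer.decode(generated[0], skip_special_tokens=True))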
# Implement RAG on the fine tuned model
# final model prepared
'''
# 1) Make sure the model runs with the multi-GPU script!
# 2) The dataset is loaded
# 3) The LangChain implementation to oversee the prompt generation guide
# 4) Also try BERT-style models rather than directly using the Mixtral model
# 5) Once the model is trained, copy the checkpoint folder into a local env (see the sketch below)
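# --- Sketch (assumption) for item 5: reloading the saved LoRA checkpoint in a local environment ---
# The checkpoint path is hypothetical; Trainer writes checkpoints under output_dir.
#
# from transformers import AutoModelForCausalLM, BitsAndBytesConfig
# from peft import PeftModel
# import torch
#
# bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True,
#                                 bnb_4bit_compute_dtype=torch.bfloat16)
# base = AutoModelForCausalLM.from_pretrained(base_model_id,  # same base model id used for training
#                                             quantization_config=bnb_config, device_map="auto")
# ft_model = PeftModel.from_pretrained(base, "./mixtral-icd-finetune/checkpoint-300")
# ft_model.eval()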