File size: 4,930 Bytes
1c11029 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
# -*- coding: utf-8 -*-
"""MIXTRAL_Mixtral-8x7B (QLoRA)
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1GFRi-ND2WTbqCfPuhLkXvB6D3LieCpjK
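This notebook was originally written to show how to fine-tune Mixtral-8x7B on a sample of ultrachat with QLoRA. As configured below, it instead fine-tunes `mistralai/Mistral-7B-Instruct-v0.1` on local text files.
Fine-tuning Mixtral-8x7B this way requires at least 32 GB of VRAM (at least 2*16 GB GPUs if you want to use consumer hardware). On Google Colab, you can use the A100.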
First, we need all these dependencies:
"""
# Notebook-only dependency installation.
# Commented out IPython shell escapes to ensure Python compatibility: a leading
# "!" is a SyntaxError in a plain .py file. Run these lines in a notebook cell,
# or without the "!" in a shell, before executing the script.
# !pip install -q bitsandbytes
# !pip install -q transformers
# !pip install -q peft
# !pip install -q accelerate
# !pip install -q datasets
# !pip install -q trl
# !pip install -q huggingface_hub
# !pip install -q diffusers
import torch
from datasets import Dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
TrainingArguments,
Trainer
)
from trl import SFTTrainer
"""Load the tokenizer and configure padding"""
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import PeftModel, prepare_model_for_kbit_training, LoraConfig
# The Hugging Face token must come from the environment (for example
# `export HF_TOKEN=...` or a notebook secret). It is deliberately NOT assigned
# here: writing a placeholder into os.environ would overwrite a valid token
# that is already configured, and real tokens do not belong in source code.
if not os.environ.get("HF_TOKEN"):
    print("Warning: HF_TOKEN is not set; gated or private models cannot be downloaded.")

# Name of the model you want to load
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

try:
    # Only the tokenizer is loaded at this point. The model is loaded once,
    # 4-bit quantized, later in the script; loading a full-precision copy
    # here would only be discarded when `model` is rebound.
    # force_download is not used, so the local Hugging Face cache is reused
    # between runs instead of re-downloading every time.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
except Exception as e:
    # Report the failure, then stop: everything below needs `tokenizer`, so
    # continuing would only fail later with an unrelated NameError.
    print(f"Error loading the tokenizer: {e}")
    raise
else:
    # Reuse the unknown token for padding and pad on the right.
    tokenizer.pad_token = tokenizer.unk_token
    tokenizer.pad_token_id = tokenizer.unk_token_id
    tokenizer.padding_side = 'right'
    print("Tokenizer loaded successfully.")
"""Load and preprocess the version of ultrachat prepared by Hugging Face.
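Note: that describes the original notebook, where each row was a full dialog that could be very long, so only the first two turns were kept to reduce the sequence length of the training examples. The code below instead reads local text files and uses each non-empty line as one training example.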
"""
# Data loading
def load_custom_dataset(file_path):
    """Build a single-column ``Dataset`` from a plain-text file.

    The file at ``file_path`` is read as UTF-8. Every line is stripped of
    surrounding whitespace, lines that are empty after stripping are dropped,
    and the remaining strings become the rows of the ``"text"`` column in
    file order.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        stripped_lines = [raw.strip() for raw in handle]
    rows = [entry for entry in stripped_lines if entry]
    return Dataset.from_dict({"text": rows})
# Update these paths so they point at the correct files.
# The first file is the training split, the second the validation split.
dataset_train_sft, dataset_test_sft = (
    load_custom_dataset(data_file)
    for data_file in (
        "MIXTRAL_DatosEntrenamiento.txt",
        "MIXTRAL_DatosValidacion.txt",
    )
)
"""Load the model and prepare it to be fine-tuned with QLoRA."""
# Quantization settings: 4-bit NF4 weights with double quantization, and
# float16 as the compute dtype.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the quantized base model; device_map={"": 0} maps the whole model to
# device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=bnb_config,
)
model = prepare_model_for_kbit_training(model)

# Gradient checkpointing is used by default but not compatible with caching
model.config.use_cache = False
# Keep the model's padding id in sync with the tokenizer configured earlier.
model.config.pad_token_id = tokenizer.pad_token_id

"""The following cell only prints the architecture of the model."""
print(model)
"""Define the configuration of LoRA."""
# LoRA adapter hyper-parameters: rank 64, alpha 16, dropout 0.1 on the LoRA
# layers. task_type is set explicitly so PEFT wraps the model with its
# causal-language-model class rather than the generic wrapper it falls back to
# when no task type is given. bias="none" spells out that no bias terms are
# trained.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
"""For this demonstration, I trained for only 300 steps. You should train for at least 3000 steps. One epoch would be ideal.
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(eval_pred):
logits, labels = eval_pred
predictions = np.argmax(logits, axis=-1)
return {
"accuracy": accuracy_score(labels, predictions),
"f1": f1_score(labels, predictions, average='macro')
}
"""
# Training configuration, grouped by concern. All values are unchanged.
training_arguments = TrainingArguments(
    # --- output and logging ---
    output_dir="./results_mixtral_sft/",
    report_to="tensorboard",  # Ensure TensorBoard is enabled
    log_level="debug",
    logging_steps=100,
    save_steps=1000,
    # --- evaluation: run on the eval set every 500 steps ---
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` -- confirm against the installed version.
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_eval_batch_size=4,
    # --- schedule: one full epoch; max_steps=-1 leaves the step count uncapped ---
    num_train_epochs=1,
    max_steps=-1,
    # --- optimisation: batch of 4 per device, accumulated over 2 steps ---
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="linear",
)
"""Start training:"""
# Supervised fine-tuning on the "text" column of the datasets, with sequences
# limited to 512 tokens. The LoRA configuration is handed to the trainer
# together with the quantized model.
# NOTE(review): newer TRL releases may expect `dataset_text_field` and
# `max_seq_length` on an SFTConfig instead -- confirm against the installed
# version.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=dataset_train_sft,
    eval_dataset=dataset_test_sft,
    dataset_text_field="text",
    max_seq_length=512,
)

# Run the training loop.
trainer.train()
# Commented out IPython magic to ensure Python compatibility.
# Enable TensorBoard to visualize the training curves
# %load_ext tensorboard
# %tensorboard --logdir results_mixtral_sft/runs