# Import required libraries
from unsloth import FastLanguageModel
import torch
from dotenv import load_dotenv
import os
import gc

# Load environment variables
load_dotenv()
token = os.getenv("HF_TOKEN")
# Parameter configuration
max_seq_length = 2048
dtype = None  # None for auto-detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage
optimize_storage = True  # Save the LoRA adapters in small shards to minimize disk usage
optimize_ram = True  # Free cached GPU memory and run garbage collection before training
optimize_model_space = True  # Limit "model space" savings to cache cleanup; pruning submodules would break the quantized model
# Pre-quantized 4-bit models available from Unsloth (for reference)
quantized_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit",
]
# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gemma-7b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=token,
)
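# Optional check (not in the original script): report how much memory the
# 4-bit base model occupies after loading.
print(f"Base model memory footprint: {model.get_memory_footprint() / 1024 ** 3:.2f} GB")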
# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)
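# Optional sanity check (not in the original script): the PEFT wrapper exposes
# print_trainable_parameters(), which confirms only the LoRA adapter weights are trainable.
model.print_trainable_parameters()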
# Storage, RAM, and model-space housekeeping.
# The quantized base weights cannot be pruned or offloaded without breaking
# training, so optimization here is limited to releasing cached GPU memory,
# collecting garbage, and saving the (small) LoRA adapters in 100 MB shards.
if optimize_storage or optimize_ram or optimize_model_space:
    torch.cuda.empty_cache()
    gc.collect()

if optimize_storage:
    model.save_pretrained("optimized_model", max_shard_size="100MB")
# Data preparation
from datasets import load_dataset

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Append EOS so the model learns to stop generating

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

dataset = load_dataset("yahma/alpaca-cleaned", split="train")
dataset = dataset.map(formatting_prompts_func, batched=True)
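# Optional sanity check (not in the original script): print one formatted
# training example to confirm the Alpaca template and EOS token were applied.
print(dataset[0]["text"])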
# Model training
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=20,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=8e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)
# Show current memory statistics
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# Train the model
trainer_stats = trainer.train()

# Show final memory and time statistics
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
# Inference
FastLanguageModel.for_inference(model)
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "Continue the fibonacci sequence.",
            "1, 1, 2, 3, 5, 8",
            "",
        )
    ], return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
print(tokenizer.batch_decode(outputs))

# Streaming inference using TextStreamer
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "Continue the fibonacci sequence.",
            "1, 1, 2, 3, 5, 8",
            "",
        )
    ], return_tensors="pt").to("cuda")
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)
# Save and reload the fine-tuned LoRA model
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

if True:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="lora_model",
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    FastLanguageModel.for_inference(model)

inputs = tokenizer(
    [
        alpaca_prompt.format(
            "What is a famous tall tower in Paris?",
            "",
            "",
        )
    ], return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
print(tokenizer.batch_decode(outputs))
# Save merged weights in float16 for vLLM
if True: model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")
if True: model.push_to_hub_merged("Yjhhh/model", tokenizer, save_method="merged_16bit", token=token)

# Save in GGUF format (q4_0, q4_1, and q8_0 quantization)
if True: model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_0")
if True: model.push_to_hub_gguf("Yjhhh/model", tokenizer, quantization_method="q4_0", token=token)
if True: model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_1")
if True: model.push_to_hub_gguf("Yjhhh/model", tokenizer, quantization_method="q4_1", token=token)
if True: model.save_pretrained_gguf("model", tokenizer, quantization_method="q8_0")
if True: model.push_to_hub_gguf("Yjhhh/model", tokenizer, quantization_method="q8_0", token=token)
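# Optional (not in the original script): a minimal sketch of loading the exported
# GGUF with llama-cpp-python for CPU inference. The file name below is an
# assumption -- check the "model" directory for the actual *.gguf file Unsloth wrote.
from llama_cpp import Llama  # pip install llama-cpp-python

llm = Llama(model_path="model/unsloth.Q4_0.gguf", n_ctx=2048)  # assumed output path
result = llm(
    alpaca_prompt.format("Continue the fibonacci sequence.", "1, 1, 2, 3, 5, 8", ""),
    max_tokens=64,
)
print(result["choices"][0]["text"])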