from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset
import torch
import os
# Paths
MODEL_NAME = "Gryphe/MythoMax-L2-13b"
OUTPUT_DIR = "/app/output"  # Path where the fine-tuned model will be saved
DATASET_NAME = "mteb/toxic_conversations_50k"

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Load tokenizer and model
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# LLaMA-family tokenizers ship without a pad token; the max-length padding
# below raises an error unless one is set.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
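# Note: a 13B model in full fp32 needs roughly 52 GB for the weights alone,
# beyond most single GPUs. A hedged sketch of a lower-memory load (assumes a
# CUDA GPU and the `accelerate` package; adjust to your hardware):
#
#     model = AutoModelForCausalLM.from_pretrained(
#         MODEL_NAME, torch_dtype=torch.float16, device_map="auto"
#     )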
# Load dataset
print("Loading dataset...")
dataset = load_dataset(DATASET_NAME)
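# Optional smoke test (a hypothetical toggle, not part of the original script):
# train on a small slice first to confirm the pipeline runs end to end.
SMOKE_TEST = False
if SMOKE_TEST:
    dataset["train"] = dataset["train"].select(range(1000))
    dataset["test"] = dataset["test"].select(range(200))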
# Validate dataset columns. Only "text" is required for causal LM fine-tuning;
# the classification columns ("label", "label_text") are dropped before training.
print("Validating dataset columns...")
required_columns = ["text"]
for column in required_columns:
    if column not in dataset["train"].column_names:
        raise ValueError(f"Dataset missing required column: {column}")
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

print("Tokenizing dataset...")
tokenized_datasets = dataset.map(tokenize_function, batched=True)
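# Optional sanity check: inspect one tokenized example to confirm the expected
# fields (input_ids, attention_mask, plus the original columns) are present.
print(tokenized_datasets["train"][0].keys())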
# Prepare data for training. For causal language modeling the labels are the
# input IDs themselves (supplied by the data collator below), so the raw text
# and classification columns are dropped rather than renamed to "labels" --
# a scalar class label fed to a causal LM head would crash the loss computation.
print("Preparing dataset for training...")
columns_to_drop = [
    col for col in ["text", "label", "label_text"]
    if col in tokenized_datasets["train"].column_names
]
tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)
tokenized_datasets.set_format("torch")
# Define training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    logging_steps=10,
    per_device_train_batch_size=1,  # Adjust based on available GPU memory
    gradient_accumulation_steps=16,
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=f"{OUTPUT_DIR}/logs",
    fp16=torch.cuda.is_available(),
    push_to_hub=False,  # Set to True if you want to push to Hugging Face Hub
)
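# With per_device_train_batch_size=1 and gradient_accumulation_steps=16, the
# effective batch size is 16. Assuming the ~50k training examples the dataset
# name suggests, that is roughly 3,125 optimizer steps per epoch, so
# warmup_steps=500 covers about the first sixth of the first epoch.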
# Define Trainer. DataCollatorForLanguageModeling with mlm=False copies the
# input IDs into the labels (masking pad positions with -100), which is what
# the causal LM loss expects.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)
# Fine-tune the model
print("Starting training...")
trainer.train()

# Save the fine-tuned model
print("Saving fine-tuned model...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Fine-tuning complete!")