from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset
import torch
import os

# Paths
MODEL_NAME = "Gryphe/MythoMax-L2-13b"
OUTPUT_DIR = "/app/output"  # Path where the fine-tuned model will be saved
DATASET_NAME = "mteb/toxic_conversations_50k"

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load tokenizer and model
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Llama-based tokenizers ship without a padding token; reuse EOS so that
# padding="max_length" below does not raise an error.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load dataset
print("Loading dataset...")
dataset = load_dataset(DATASET_NAME)

# Validate dataset columns. Only "text" is required for causal LM fine-tuning;
# the classification columns ("label", "label_text") are dropped later.
print("Validating dataset columns...")
if "text" not in dataset["train"].column_names:
    raise ValueError("Dataset missing required column: text")

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

print("Tokenizing dataset...")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Prepare data for training: keep only the tokenized fields. Labels are not
# taken from the dataset's classification column; for causal language
# modelling the data collator below builds them from the input ids.
print("Preparing dataset for training...")
columns_to_drop = [
    col for col in ["text", "label", "label_text"]
    if col in tokenized_datasets["train"].column_names
]
tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)
tokenized_datasets.set_format("torch")

# Collator for causal LM: copies input_ids into labels and masks padding.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="steps",  # renamed to eval_strategy in newer transformers releases
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    logging_steps=10,
    per_device_train_batch_size=1,  # Adjust based on available GPU memory
    gradient_accumulation_steps=16,
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=f"{OUTPUT_DIR}/logs",
    fp16=torch.cuda.is_available(),
    push_to_hub=False,  # Set to True if you want to push to Hugging Face Hub
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

# Fine-tune the model
print("Starting training...")
trainer.train()

# Save the fine-tuned model
print("Saving fine-tuned model...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Fine-tuning complete!")