Spaces: Build error
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np
# Load the dataset
file_path = 'train_en.csv'
dataset = pd.read_csv(file_path)

# Map labels to expected responses
label_mapping = {
    "Yes": 0,
    "No": 1,
    "It doesn't matter": 2,
    "Unimportant": 2,  # Assuming "Unimportant" is synonymous with "It doesn't matter"
    "Incorrect questioning": 3,
    "Correct answers": 4
}

# Apply label mapping
dataset['label'] = dataset['label'].map(label_mapping)

# Handle NaN values: drop rows where the label is NaN
dataset = dataset.dropna(subset=['label'])

# Ensure labels are integers
dataset['label'] = dataset['label'].astype(int)
# Format puzzle, truth, and text into a single input string per row
dataset['combined_text'] = (
    "==========================================\n"
    "puzzle: " + dataset['puzzle'] + "\n"
    "==========================================\n"
    "truth: " + dataset['truth'] + "\n"
    "==========================================\n"
    "text: " + dataset['text']
)
# Split the dataset into training and validation sets
train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)

# Convert the dataframes to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
# Load the tokenizer and model
model_name = "google/gemma-2-9b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
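
# Note (assumption, not in the original script): decoder-only backbones used for
# sequence classification rely on config.pad_token_id to locate the last non-padding
# token for the classification head. Gemma tokenizers ship a pad token, but aligning
# the model config with the tokenizer explicitly is a cheap safeguard.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id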
# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['combined_text'], truncation=True, padding='max_length', max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
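
# Note (assumption): max_length=128 may truncate the combined puzzle/truth/text block;
# checking token lengths before settling on 128 avoids silently cutting off the "text"
# field, e.g.:
# lengths = [len(tokenizer(t)['input_ids']) for t in train_df['combined_text']]
# print(np.percentile(lengths, 95))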
# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none"
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
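
# Note (assumption): with task_type=SEQ_CLS, PEFT also keeps the classification head
# (the `score` module) trainable via modules_to_save, so the randomly initialised head
# is updated alongside the LoRA adapters; print_trainable_parameters() should reflect
# both.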
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    max_grad_norm=0.3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.001,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=4,
    dataloader_num_workers=4,
    logging_steps=100,
    save_total_limit=2,
)
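
# Note (assumption): recent transformers releases renamed `evaluation_strategy` to
# `eval_strategy`; use whichever matches the installed version. Combining gradient
# checkpointing with a PEFT-wrapped model can also raise "element 0 of tensors does
# not require grad" unless input gradients are enabled, e.g.:
# model.enable_input_require_grads()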
def compute_metrics(eval_pred):
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")
    f1_metric = evaluate.load("f1")
    accuracy_metric = evaluate.load("accuracy")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    precision = precision_metric.compute(predictions=predictions, references=labels, average='macro')["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels, average='macro')["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='macro')["f1"]
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]

    return {"precision": precision, "recall": recall, "f1-score": f1, "accuracy": accuracy}
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained('trained_gemma_model')
tokenizer.save_pretrained('trained_gemma_model')
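
# Note (assumption): save_pretrained on a PEFT-wrapped model writes only the adapter
# (and modules_to_save) weights, not the full 9B backbone. To reuse the classifier,
# reload the base model and attach the saved adapter, for example:
# from peft import PeftModel
# base = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
# model = PeftModel.from_pretrained(base, 'trained_gemma_model')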
# Evaluate the model
trainer.evaluate()