In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw examples; the 'label' column holds the textual answer class.
file_path = 'train_en.csv'
dataset = pd.read_csv(file_path)

# Map each textual label to its integer class id.
label_mapping = {
    "Yes": 0,
    "No": 1,
    "It doesn't matter": 2,
    "Unimportant": 2,  # Assuming "unimportant" is synonymous with "It doesn't matter"
    "Incorrect questioning": 3,
    "Correct answers": 4,
}

# Encode, clean and cast the labels in a single chain that yields a fresh
# DataFrame. Assigning a column on the frame returned by dropna() (as a
# separate statement) can raise SettingWithCopyWarning on some pandas versions;
# building the result in one expression avoids that ambiguity.
#   1. map:    text label -> class id (labels missing from the mapping become NaN)
#   2. dropna: discard rows whose label could not be mapped
#   3. astype: NaN forced the column to float, so cast back to int
dataset = (
    dataset.assign(label=dataset['label'].map(label_mapping))
    .dropna(subset=['label'])
    .astype({'label': int})
)

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train_df, val_df = train_test_split(dataset, random_state=42, test_size=0.2)

# Wrap each pandas split in a Hugging Face Dataset (training split first).
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

# Load the tokenizer and the pretrained backbone with a classification head.
model_name = "google/gemma-2-9b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Class ids are 0-based, so the head needs max(id) + 1 outputs. Deriving the
# size from label_mapping (instead of hard-coding 5) keeps the head in sync
# with the mapping if classes are ever added or removed.
num_labels = max(label_mapping.values()) + 1
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

# Tokenize the data
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch, truncating and padding to 128 tokens."""
    encode_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 128,
    }
    return tokenizer(examples['text'], **encode_options)

# Tokenize both splits in batched mode (training split first).
train_dataset, val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

# Expose only the model inputs and the label, as PyTorch tensors.
for split in (train_dataset, val_dataset):
    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Define training arguments.
# `eval_strategy` is the current name of the option formerly spelled
# `evaluation_strategy`; the old spelling is deprecated and rejected by newer
# transformers releases. Every release that can load Gemma-2 already accepts
# the new name, so this does not narrow the supported versions.
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',  # run evaluation at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Assemble the Trainer from the model, its arguments and the two splits.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning.
trainer.train()

# Persist the model and the tokenizer side by side in the same directory
# (model first, then tokenizer).
for artifact in (model, tokenizer):
    artifact.save_pretrained('trained_gemma_model')

# Evaluate on the validation split; left as the cell's final expression so the
# notebook displays the returned metrics.
trainer.evaluate()

In [None]:
# Reload the fine-tuned artefacts from the directory they were saved to.
saved_model_dir = 'trained_gemma_model'
model = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)

# Function to make predictions
def predict(texts, batch_size=8):
    """Return the predicted class id for each input text.

    The texts are tokenized and run through the module-level ``model`` in
    mini-batches with gradient tracking disabled, so memory use is bounded by
    ``batch_size`` rather than by the size of the whole input.

    Args:
        texts: A single string or an iterable of strings to classify.
        batch_size: Number of texts per forward pass (must be >= 1).

    Returns:
        A list of integer class ids (argmax over the logits), one per text,
        in input order. Empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Local import: torch is not among the imports visible at the top of this file.
    import torch

    # A bare string is treated as one text, not as an iterable of characters.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return []
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    model.eval()  # make sure dropout etc. are in inference mode
    predictions = []
    with torch.no_grad():  # no autograd graph -> far lower memory use
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(
                batch,
                return_tensors="pt",
                truncation=True,
                padding='max_length',
                max_length=128,
            )
            # Keep the inputs on whatever device the model currently lives on.
            inputs = inputs.to(model.device)
            outputs = model(**inputs)
            predictions.extend(outputs.logits.argmax(dim=-1).tolist())
    return predictions

# Apply the predictions to the dataset
dataset['predicted_label'] = predict(dataset['text'].tolist())

# Map the predicted class ids back to response texts.
# Several texts can share one id ("It doesn't matter" and "Unimportant" are
# both 2). A plain {v: k} comprehension would let the later alias overwrite the
# earlier entry; setdefault keeps the first label listed for each id, so class 2
# is reported as "It doesn't matter".
reverse_label_mapping = {}
for response_text, class_id in label_mapping.items():
    reverse_label_mapping.setdefault(class_id, response_text)
dataset['predicted_label'] = dataset['predicted_label'].map(reverse_label_mapping)

# Save the results
dataset.to_csv('gemma-2-9b_predicted_results.csv', index=False)