{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer\n", "from datasets import Dataset\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "\n", "# Load the dataset\n", "file_path = 'train_en.csv'\n", "dataset = pd.read_csv(file_path)\n", "\n", "# Map labels to expected responses\n", "label_mapping = {\n", " \"Yes\": 0,\n", " \"No\": 1,\n", " \"It doesn't matter\": 2,\n", " \"Unimportant\": 2, # Assuming \"unimportant\" is synonymous with \"It doesn't matter\"\n", " \"Incorrect questioning\": 3,\n", " \"Correct answers\": 4\n", "}\n", "\n", "# Apply label mapping\n", "dataset['label'] = dataset['label'].map(label_mapping)\n", "\n", "# Handle NaN values: Drop rows where label is NaN\n", "dataset = dataset.dropna(subset=['label'])\n", "\n", "# Ensure labels are integers\n", "dataset['label'] = dataset['label'].astype(int)\n", "\n", "# Split the dataset into training and validation sets\n", "train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)\n", "\n", "# Convert the dataframes to datasets\n", "train_dataset = Dataset.from_pandas(train_df)\n", "val_dataset = Dataset.from_pandas(val_df)\n", "\n", "# Load the tokenizer and model\n", "model_name = \"google/gemma-2-9b\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)\n", "\n", "# Tokenize the data\n", "def tokenize_function(examples):\n", " return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)\n", "\n", "train_dataset = train_dataset.map(tokenize_function, batched=True)\n", "val_dataset = val_dataset.map(tokenize_function, batched=True)\n", "\n", "# Set the format for PyTorch\n", "train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])\n", "val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])\n", "\n", "# Define training arguments\n", "training_args = TrainingArguments(\n", " output_dir='./results',\n", " evaluation_strategy='epoch',\n", " learning_rate=2e-5,\n", " per_device_train_batch_size=8,\n", " per_device_eval_batch_size=8,\n", " num_train_epochs=3,\n", " weight_decay=0.01,\n", ")\n", "\n", "# Initialize the Trainer\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=train_dataset,\n", " eval_dataset=val_dataset,\n", ")\n", "\n", "# Train the model\n", "trainer.train()\n", "\n", "# Save the model\n", "model.save_pretrained('trained_gemma_model')\n", "tokenizer.save_pretrained('trained_gemma_model')\n", "\n", "# Evaluate the model\n", "trainer.evaluate()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load the trained model and tokenizer\n", "model = AutoModelForSequenceClassification.from_pretrained('trained_gemma_model')\n", "tokenizer = AutoTokenizer.from_pretrained('trained_gemma_model')\n", "\n", "# Function to make predictions\n", "def predict(texts):\n", " inputs = tokenizer(texts, return_tensors=\"pt\", truncation=True, padding='max_length', max_length=128)\n", " outputs = model(**inputs)\n", " predictions = outputs.logits.argmax(dim=-1).tolist()\n", " return predictions\n", "\n", "# Apply the predictions to the dataset\n", "dataset['predicted_label'] = predict(dataset['text'].tolist())\n", "\n", "# Map the predicted labels back to 
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "\n",
    "# Load the trained model and tokenizer\n",
    "model = AutoModelForSequenceClassification.from_pretrained('trained_gemma_model')\n",
    "tokenizer = AutoTokenizer.from_pretrained('trained_gemma_model')\n",
    "model.eval()\n",
    "\n",
    "# Predict in batches, with gradients disabled, so the whole dataset is not\n",
    "# pushed through the model in a single forward pass\n",
    "def predict(texts, batch_size=32):\n",
    "    predictions = []\n",
    "    for i in range(0, len(texts), batch_size):\n",
    "        batch = texts[i:i + batch_size]\n",
    "        inputs = tokenizer(batch, return_tensors=\"pt\", truncation=True, padding='max_length', max_length=128).to(model.device)\n",
    "        with torch.no_grad():\n",
    "            logits = model(**inputs).logits\n",
    "        predictions.extend(logits.argmax(dim=-1).tolist())\n",
    "    return predictions\n",
    "\n",
    "# Apply the predictions to the dataset\n",
    "dataset['predicted_label'] = predict(dataset['text'].tolist())\n",
    "\n",
    "# Map the predicted class ids back to the response texts; note that id 2 maps\n",
    "# back to \"Unimportant\" because it overwrites \"It doesn't matter\" when reversed\n",
    "reverse_label_mapping = {v: k for k, v in label_mapping.items()}\n",
    "dataset['predicted_label'] = dataset['predicted_label'].map(reverse_label_mapping)\n",
    "\n",
    "# Save the results\n",
    "dataset.to_csv('gemma-2-9b_predicted_results.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}