{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer\n",
"from datasets import Dataset\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Load the dataset\n",
"file_path = 'train_en.csv'\n",
"dataset = pd.read_csv(file_path)\n",
"\n",
"# Map labels to expected responses\n",
"label_mapping = {\n",
" \"Yes\": 0,\n",
" \"No\": 1,\n",
" \"It doesn't matter\": 2,\n",
" \"Unimportant\": 2, # Assuming \"unimportant\" is synonymous with \"It doesn't matter\"\n",
" \"Incorrect questioning\": 3,\n",
" \"Correct answers\": 4\n",
"}\n",
"\n",
"# Apply label mapping\n",
"dataset['label'] = dataset['label'].map(label_mapping)\n",
"\n",
"# Handle NaN values: Drop rows where label is NaN\n",
"dataset = dataset.dropna(subset=['label'])\n",
"\n",
"# Ensure labels are integers\n",
"dataset['label'] = dataset['label'].astype(int)\n",
"\n",
"# Split the dataset into training and validation sets\n",
"train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)\n",
"\n",
"# Convert the dataframes to datasets\n",
"train_dataset = Dataset.from_pandas(train_df)\n",
"val_dataset = Dataset.from_pandas(val_df)\n",
"\n",
"# Load the tokenizer and model\n",
"model_name = \"google/gemma-2-9b\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)\n",
"\n",
"# Tokenize the data\n",
"def tokenize_function(examples):\n",
" return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)\n",
"\n",
"train_dataset = train_dataset.map(tokenize_function, batched=True)\n",
"val_dataset = val_dataset.map(tokenize_function, batched=True)\n",
"\n",
"# Set the format for PyTorch\n",
"train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])\n",
"val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])\n",
"\n",
"# Define training arguments\n",
"training_args = TrainingArguments(\n",
" output_dir='./results',\n",
" evaluation_strategy='epoch',\n",
" learning_rate=2e-5,\n",
" per_device_train_batch_size=8,\n",
" per_device_eval_batch_size=8,\n",
" num_train_epochs=3,\n",
" weight_decay=0.01,\n",
")\n",
"\n",
"# Initialize the Trainer\n",
"trainer = Trainer(\n",
" model=model,\n",
" args=training_args,\n",
" train_dataset=train_dataset,\n",
" eval_dataset=val_dataset,\n",
")\n",
"\n",
"# Train the model\n",
"trainer.train()\n",
"\n",
"# Save the model\n",
"model.save_pretrained('trained_gemma_model')\n",
"tokenizer.save_pretrained('trained_gemma_model')\n",
"\n",
"# Evaluate the model\n",
"trainer.evaluate()"
]
},
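{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A sketch, not part of the original pipeline: the Trainer above only reports\n",
"# the eval loss. A compute_metrics callback (accuracy via scikit-learn here)\n",
"# could be passed to Trainer(...) to surface classification quality per epoch.\n",
"import numpy as np\n",
"from sklearn.metrics import accuracy_score\n",
"\n",
"def compute_metrics(eval_pred):\n",
"    logits, labels = eval_pred\n",
"    preds = np.argmax(logits, axis=-1)\n",
"    return {'accuracy': accuracy_score(labels, preds)}\n",
"\n",
"# e.g. trainer = Trainer(model=model, args=training_args,\n",
"#                        train_dataset=train_dataset, eval_dataset=val_dataset,\n",
"#                        compute_metrics=compute_metrics)"
]
},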
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the trained model and tokenizer\n",
"model = AutoModelForSequenceClassification.from_pretrained('trained_gemma_model')\n",
"tokenizer = AutoTokenizer.from_pretrained('trained_gemma_model')\n",
"\n",
"# Function to make predictions\n",
"def predict(texts):\n",
" inputs = tokenizer(texts, return_tensors=\"pt\", truncation=True, padding='max_length', max_length=128)\n",
" outputs = model(**inputs)\n",
" predictions = outputs.logits.argmax(dim=-1).tolist()\n",
" return predictions\n",
"\n",
"# Apply the predictions to the dataset\n",
"dataset['predicted_label'] = predict(dataset['text'].tolist())\n",
"\n",
"# Map the predicted labels back to the response texts\n",
"reverse_label_mapping = {v: k for k, v in label_mapping.items()}\n",
"dataset['predicted_label'] = dataset['predicted_label'].map(reverse_label_mapping)\n",
"\n",
"# Save the results\n",
"dataset.to_csv('gemma-2-9b_predicted_results.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}