{ "cells": [ { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import torch\n", "from transformers import (\n", " RobertaTokenizerFast,\n", " RobertaForSequenceClassification,\n", " TrainingArguments,\n", " Trainer,\n", " AutoConfig,\n", ")\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "\n", "\n", "# Load your preprocessed dataset into a pandas DataFrame\n", "df = pd.read_csv(r'D:\\Thesis\\Datasets\\preprocessed_dataset.csv')\n", "df = df.dropna()\n", "df.head()\n", "\n", "#independent variable\n", "X = list(df['sentence'])\n", "#dependent variable\n", "y = list(df['label'])\n", "\n", "#splitting\n", "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.20, random_state= 0)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "tokenizer = RobertaTokenizerFast.from_pretrained(\"jcblaise/roberta-tagalog-base\")\n", "def tokenize(batch):\n", " return tokenizer(batch[\"text\"], padding=True, truncation=True, max_length=97)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Maximum sequence length: 146\n", "Minimum sequence length: 5\n", "Average sequence length: 34.3118799903218\n" ] } ], "source": [ "# Calculate the length of each input sequence\n", "sequence_lengths = [len(tokenizer.tokenize(text)) for text in X_train]\n", "\n", "# Print statistics about sequence lengths\n", "print(\"Maximum sequence length:\", max(sequence_lengths))\n", "print(\"Minimum sequence length:\", min(sequence_lengths))\n", "print(\"Average sequence length:\", sum(sequence_lengths) / len(sequence_lengths))\n" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\n", "\u001b[A" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\u001b[A\n", "\u001b[A\n", "\u001b[A\n", "Map: 100%|██████████| 8266/8266 [00:00<00:00, 14158.16 examples/s]\n", "\n", "\u001b[A\n", "Map: 100%|██████████| 2067/2067 [00:00<00:00, 15252.23 examples/s]\n" ] } ], "source": [ "from datasets import Dataset\n", "\n", "# Create datasets using the Dataset class\n", "train_dataset = Dataset.from_dict({\"text\": X_train, \"label\": y_train})\n", "val_dataset = Dataset.from_dict({\"text\": X_test, \"label\": y_test})\n", "\n", "# Tokenize datasets using the tokenize function\n", "train_dataset = train_dataset.map(tokenize, batched=True)\n", "val_dataset = val_dataset.map(tokenize, batched=True)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at jcblaise/roberta-tagalog-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "# Define your model configuration\n", "model_config = RobertaForSequenceClassification.from_pretrained(\"jcblaise/roberta-tagalog-base\")\n", "\n", "# Define your training arguments\n", "training_args = 
 { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [
  { "name": "stderr", "output_type": "stream", "text": [
   "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at jcblaise/roberta-tagalog-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n",
   "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
  ] }
 ], "source": [
  "# Load the pretrained encoder with a freshly initialized classification head\n",
  "model = RobertaForSequenceClassification.from_pretrained(\"jcblaise/roberta-tagalog-base\")\n",
  "\n",
  "# Define the training arguments\n",
  "training_args = TrainingArguments(\n",
  "    output_dir='./results',\n",
  "    per_device_train_batch_size=8,\n",
  "    per_device_eval_batch_size=8,\n",
  "    num_train_epochs=3,\n",
  "    logging_dir='./logs',\n",
  "    logging_steps=100,\n",
  "    evaluation_strategy=\"steps\",\n",
  "    eval_steps=500,\n",
  ")\n",
  "\n",
  "# Initialize the Trainer\n",
  "trainer = Trainer(\n",
  "    model=model,\n",
  "    args=training_args,\n",
  "    train_dataset=train_dataset,\n",
  "    eval_dataset=val_dataset,\n",
  ")\n"
 ] },
 { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [
  { "name": "stderr", "output_type": "stream", "text": [
   "  0%|          | 0/3102 [02:45"
  ] }
 ], "source": [
  "# Launch fine-tuning: 8266 examples / batch size 8 = 1034 steps per epoch, x 3 epochs = 3102 steps\n",
  "trainer.train()"
 ] },
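 { "cell_type": "markdown", "metadata": {}, "source": [
  "A minimal post-training evaluation sketch (added; assumes the run above completes): `trainer.predict` returns the logits for `val_dataset`, which can be scored against `y_test` with scikit-learn."
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "import numpy as np\n",
  "from sklearn.metrics import accuracy_score, classification_report\n",
  "\n",
  "# Added sketch: predict on the validation split; .predictions holds the raw logits\n",
  "pred_output = trainer.predict(val_dataset)\n",
  "preds = np.argmax(pred_output.predictions, axis=-1)\n",
  "\n",
  "print(\"Accuracy:\", accuracy_score(y_test, preds))\n",
  "print(classification_report(y_test, preds))"
 ] }
], "metadata": { "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 2 }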