diff --git "a/new intent_model.ipynb" "b/new intent_model.ipynb" new file mode 100644--- /dev/null +++ "b/new intent_model.ipynb" @@ -0,0 +1,4582 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import (\n", + " RobertaTokenizerFast,\n", + " RobertaForSequenceClassification,\n", + " TrainingArguments,\n", + " Trainer,\n", + " AutoConfig,\n", + ")\n", + "from sklearn.model_selection import train_test_split\n", + "import pandas as pd\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "folder_path = 'formatted_data/'\n", + "\n", + "# Get the list of all files in the folder\n", + "file_names = os.listdir(folder_path)\n", + "max_file_name=max([int(i.split(\"_\")[-1][:-4]) for i in file_names])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "<>:2: SyntaxWarning: invalid escape sequence '\\d'\n", + "<>:2: SyntaxWarning: invalid escape sequence '\\d'\n", + "C:\\Users\\rajst\\AppData\\Local\\Temp\\ipykernel_18180\\3256903659.py:2: SyntaxWarning: invalid escape sequence '\\d'\n", + " df=pd.read_csv(\"formatted_data\\data_\"+str(max_file_name)+\".csv\")\n" + ] + } + ], + "source": [ + "device=\"cuda\"\n", + "df=pd.read_csv(\"formatted_data\\data_\"+str(max_file_name)+\".csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\external\\Experiments\\image_designing\\env\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. 
For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n", + " warnings.warn(\n", + "Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n", + "import torch\n", + "import pandas as pd\n", + "\n", + "# Assuming df is already defined and contains 'text' and 'label' columns\n", + "# Convert labels to numerical format if they are not already (e.g., if they are strings)\n", + "label_mapping = {label: idx for idx, label in enumerate(df['label'].unique())}\n", + "df['label'] = df['label'].map(label_mapping)\n", + "\n", + "# Split the dataset into training and testing sets\n", + "X_train, X_test, y_train, y_test = train_test_split(df['prompt'], df['label'], test_size=0.1, random_state=42)\n", + "\n", + "# Define the local directory to save the model\n", + "local_model_dir = \"allenai/longformer-base-4096\"\n", + "\n", + "# Load the tokenizer and model from the local directory\n", + "tokenizer = AutoTokenizer.from_pretrained(local_model_dir, add_prefix_space=True)\n", + "num_labels = len(df['label'].unique())\n", + "model = AutoModelForSequenceClassification.from_pretrained(local_model_dir, num_labels=num_labels)\n", + "\n", + "# Tokenize the data\n", + "train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=128)\n", + "test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=128)\n", + "\n", + "# Create a Dataset class\n", + "class CustomDataset(torch.utils.data.Dataset):\n", + " def __init__(self, encodings, labels):\n", + " self.encodings = encodings\n", + " self.labels = labels\n", + "\n", + " def __getitem__(self, idx):\n", + " # Convert the encodings into tensors\n", + " item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n", + " # Add the labels as a tensor\n", + " item['labels'] = torch.tensor(self.labels[idx])\n", + " return item\n", + "\n", + " def __len__(self):\n", + " return len(self.labels)\n", + "\n", + "# Create dataset objects\n", + "train_dataset = CustomDataset(train_encodings, y_train.tolist())\n", + "test_dataset = CustomDataset(test_encodings, y_test.tolist())\n", + "\n", + "# Check the structure of the first item for verification\n", + "# print(train_dataset[0])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\external\\Experiments\\image_designing\\env\\Lib\\site-packages\\transformers\\training_args.py:1525: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n", + " warnings.warn(\n", + " 0%| | 0/3003 [00:00
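The diff truncates before the source of this final cell, but the `evaluation_strategy` FutureWarning and the 3003-step progress bar point to a standard TrainingArguments/Trainer setup. Below is a minimal sketch of what such a cell could look like; every hyperparameter value is an illustrative assumption, not the notebook's actual configuration.

from transformers import TrainingArguments, Trainer

# All values below are assumptions for illustration, not the notebook's settings
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",  # the deprecated alias seen in the warning; newer versions use eval_strategy
    logging_steps=50,
)

trainer = Trainer(
    model=model,                  # Longformer classifier built in the cell above
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()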
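Once training finishes, the classifier can be queried by inverting label_mapping to recover the original label names. A hedged sketch, assuming the model, tokenizer, and label_mapping defined above; classify_prompt and the example prompt are hypothetical, not part of the notebook:

import torch

# Invert the mapping built earlier: numeric id -> original label string
id_to_label = {idx: label for label, idx in label_mapping.items()}

def classify_prompt(prompt: str) -> str:
    # Hypothetical helper: predict the intent label for a single prompt
    inputs = tokenizer(prompt, truncation=True, padding=True,
                       max_length=128, return_tensors="pt").to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    return id_to_label[logits.argmax(dim=-1).item()]

# Example usage (hypothetical prompt)
print(classify_prompt("Design a minimalist logo for a coffee shop"))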