{ "cells": [ { "cell_type": "code", "execution_count": 3, "id": "2b46cd0f-ae6d-4781-8dab-89df1f880ada", "metadata": { "execution": { "iopub.execute_input": "2024-03-31T16:38:33.012523Z", "iopub.status.busy": "2024-03-31T16:38:33.012195Z", "iopub.status.idle": "2024-03-31T16:38:41.339903Z", "shell.execute_reply": "2024-03-31T16:38:41.339283Z", "shell.execute_reply.started": "2024-03-31T16:38:33.012515Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using custom data configuration imsoumyaneel--sentiment-analysis-llama2-406c8d12ee6e98f7\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Downloading and preparing dataset csv/imsoumyaneel--sentiment-analysis-llama2 to /root/.cache/huggingface/datasets/imsoumyaneel___csv/imsoumyaneel--sentiment-analysis-llama2-406c8d12ee6e98f7/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1b57dcefac4c41bd8afdc69d582d21ae", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data files: 0%| | 0/1 [00:00\n", "RangeIndex: 598298 entries, 0 to 598297\n", "Data columns (total 4 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 sentence 598298 non-null object\n", " 1 label 598298 non-null object\n", " 2 text 598298 non-null object\n", " 3 new_label 598298 non-null object\n", "dtypes: object(4)\n", "memory usage: 18.3+ MB\n", "\n", "Int64Index: 478638 entries, 352227 to 559736\n", "Data columns (total 4 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 sentence 478638 non-null object\n", " 1 label 478638 non-null object\n", " 2 text 478638 non-null object\n", " 3 new_label 478638 non-null object\n", "dtypes: object(4)\n", "memory usage: 18.3+ MB\n", "\n", "Int64Index: 119660 entries, 0 to 598297\n", "Data columns (total 4 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 sentence 119660 non-null object\n", " 1 label 119660 non-null object\n", " 2 text 119660 non-null object\n", " 3 new_label 119660 non-null object\n", "dtypes: object(4)\n", "memory usage: 4.6+ MB\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sentencelabeltextnew_label
0I'll throw out the garbage .neutral###Human:\\nyou are a sentiment analist. guess ...1
1So Dick , how about getting some coffee for to...joy###Human:\\nyou are a sentiment analist. guess ...0
2Come on , you can at least try a little , besi...neutral###Human:\\nyou are a sentiment analist. guess ...1
3What ’ s wrong with that ? Cigarette is the th...anger###Human:\\nyou are a sentiment analist. guess ...3
4Not for me , Dick .neutral###Human:\\nyou are a sentiment analist. guess ...1
\n", "
" ], "text/plain": [ " sentence label \\\n", "0 I'll throw out the garbage . neutral \n", "1 So Dick , how about getting some coffee for to... joy \n", "2 Come on , you can at least try a little , besi... neutral \n", "3 What ’ s wrong with that ? Cigarette is the th... anger \n", "4 Not for me , Dick . neutral \n", "\n", " text new_label \n", "0 ###Human:\\nyou are a sentiment analist. guess ... 1 \n", "1 ###Human:\\nyou are a sentiment analist. guess ... 0 \n", "2 ###Human:\\nyou are a sentiment analist. guess ... 1 \n", "3 ###Human:\\nyou are a sentiment analist. guess ... 3 \n", "4 ###Human:\\nyou are a sentiment analist. guess ... 1 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "complete_dataset = dataset['train'].to_pandas()\n", "complete_dataset['new_label'] = complete_dataset['label'].map({'joy': '0', 'neutral': '1', 'sadness': '2', 'anger': '3', 'fear': '4', 'love': '5', 'surprise': '6'}).values\n", "\n", "train_dataset = complete_dataset.sample(frac=0.8,random_state=200)\n", "test_dataset = complete_dataset.drop(train_dataset.index)\n", "\n", "complete_dataset.info()\n", "train_dataset.info()\n", "test_dataset.info()\n", "\n", "complete_dataset.head()" ] }, { "cell_type": "code", "execution_count": 5, "id": "0cfc926c-70df-4089-9b2e-f201faa223df", "metadata": { "execution": { "iopub.execute_input": "2024-03-31T16:38:42.666938Z", "iopub.status.busy": "2024-03-31T16:38:42.666761Z", "iopub.status.idle": "2024-03-31T16:38:46.106948Z", "shell.execute_reply": "2024-03-31T16:38:46.106267Z", "shell.execute_reply.started": "2024-03-31T16:38:42.666926Z" } }, "outputs": [], "source": [ "# imports for model creation\n", "import tensorflow as tf\n", "from keras import layers\n", "from keras import losses\n", "import keras\n", "from tensorflow.keras.preprocessing.text import Tokenizer\n", "from tensorflow.keras.preprocessing.sequence import pad_sequences" ] }, { "cell_type": "code", "execution_count": 6, "id": "eb4bab6b-ae99-4fae-bf0b-ca91be630db3", "metadata": { "execution": { "iopub.execute_input": "2024-03-31T16:38:46.108849Z", "iopub.status.busy": "2024-03-31T16:38:46.108483Z", "iopub.status.idle": "2024-03-31T16:39:01.033885Z", "shell.execute_reply": "2024-03-31T16:39:01.033311Z", "shell.execute_reply.started": "2024-03-31T16:38:46.108831Z" } }, "outputs": [], "source": [ "tokenizer = Tokenizer()\n", "tokenizer.fit_on_texts(complete_dataset['sentence'])\n", "\n", "vocab_size = len(tokenizer.word_index) + 1\n", "max_length = 200 # max words in a sentence\n", "embedding_dim = 50 # TODO: need to adjust accordinglys\n", "\n", "X_train = tokenizer.texts_to_sequences(train_dataset['sentence'])\n", "X_train = pad_sequences(X_train, maxlen=max_length, padding='post')\n", "\n", "X_test = tokenizer.texts_to_sequences(test_dataset['sentence'])\n", "X_test = pad_sequences(X_test, maxlen=max_length, padding='post')\n", "\n", "y_train = train_dataset['new_label']\n", "y_test = test_dataset['new_label']\n", "\n", "from keras.utils import to_categorical\n", "\n", "num_classes = 7 # Assuming you have 3 classes\n", "y_train_encoded = to_categorical(y_train, num_classes=num_classes)\n", "y_test_encoded = to_categorical(y_test, num_classes=num_classes)" ] }, { "cell_type": "code", "execution_count": 7, "id": "d7202d74-95c7-4bb2-aea5-54481dfcafd6", "metadata": { "execution": { "iopub.execute_input": "2024-03-31T16:39:01.035022Z", "iopub.status.busy": "2024-03-31T16:39:01.034846Z", "iopub.status.idle": "2024-03-31T16:39:01.038541Z", "shell.execute_reply": "2024-03-31T16:39:01.038020Z", "shell.execute_reply.started": "2024-03-31T16:39:01.035006Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(478638, 200)\n", "(119660, 200)\n", "(478638,)\n", "(119660,)\n" ] } ], "source": [ "labels = complete_dataset['label']\n", "\n", "print(X_train.shape)\n", "print(X_test.shape)\n", "print(y_train.shape)\n", "print(y_test.shape)" ] }, { "cell_type": "code", "execution_count": 8, "id": "738e3137-7ea4-4e71-9395-773e537083cf", "metadata": { "execution": { "iopub.execute_input": "2024-03-31T16:39:01.039687Z", "iopub.status.busy": "2024-03-31T16:39:01.039206Z", "iopub.status.idle": "2024-03-31T16:39:02.148732Z", "shell.execute_reply": "2024-03-31T16:39:02.148025Z", "shell.execute_reply.started": "2024-03-31T16:39:01.039671Z" } }, "outputs": [], "source": [ "# Build the model\n", "model = keras.Sequential([\n", " keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(max_length,)),\n", " keras.layers.GlobalAveragePooling1D(),\n", " keras.layers.Dense(32, activation='relu'),\n", " keras.layers.Dense(7, activation='sigmoid')\n", "])" ] }, { "cell_type": "code", "execution_count": 9, "id": "cd072f39-99e6-44f0-8c7f-106a0055c43b", "metadata": { "execution": { "iopub.execute_input": "2024-03-31T16:39:02.150108Z", "iopub.status.busy": "2024-03-31T16:39:02.149634Z", "iopub.status.idle": "2024-03-31T16:39:02.159085Z", "shell.execute_reply": "2024-03-31T16:39:02.158478Z", "shell.execute_reply.started": "2024-03-31T16:39:02.150090Z" } }, "outputs": [], "source": [ "# Compile the model\n", "model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])" ] }, { "cell_type": "code", "execution_count": 10, "id": "72ad6548-5d1c-4221-88c7-014dcbaea0ee", "metadata": { "execution": { "iopub.execute_input": "2024-03-31T16:39:02.160402Z", "iopub.status.busy": "2024-03-31T16:39:02.159930Z", "iopub.status.idle": "2024-03-31T16:39:02.162845Z", "shell.execute_reply": "2024-03-31T16:39:02.162259Z", "shell.execute_reply.started": "2024-03-31T16:39:02.160382Z" } }, "outputs": [], "source": [ "# split the dataset into train and test\n", "# from sklearn.model_selection import train_test_split\n", "\n", "# X_train, X_test, y_train, y_test = train_test_split(, labels, test_size=0.3, random_state=42, shuffle=True)\n", "# X_train" ] }, { "cell_type": "code", "execution_count": 12, "id": "9267da90-7a84-49d1-94d0-04a2cd3062e0", "metadata": { "execution": { "iopub.execute_input": "2024-03-31T16:39:34.346301Z", "iopub.status.busy": "2024-03-31T16:39:34.345554Z", "iopub.status.idle": "2024-03-31T16:48:40.470989Z", "shell.execute_reply": "2024-03-31T16:48:40.470195Z", "shell.execute_reply.started": "2024-03-31T16:39:34.346268Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n", "14958/14958 [==============================] - 55s 4ms/step - loss: 0.4894 - accuracy: 0.8447 - val_loss: 0.4174 - val_accuracy: 0.8586\n", "Epoch 2/10\n", "14958/14958 [==============================] - 54s 4ms/step - loss: 0.3798 - accuracy: 0.8692 - val_loss: 0.3835 - val_accuracy: 0.8651\n", "Epoch 3/10\n", "14958/14958 [==============================] - 54s 4ms/step - loss: 0.3453 - accuracy: 0.8761 - val_loss: 0.3638 - val_accuracy: 0.8655\n", "Epoch 4/10\n", "14958/14958 [==============================] - 54s 4ms/step - loss: 0.3166 - accuracy: 0.8810 - val_loss: 0.3513 - val_accuracy: 0.8645\n", "Epoch 5/10\n", "14958/14958 [==============================] - 55s 4ms/step - loss: 0.2941 - accuracy: 0.8848 - val_loss: 0.3548 - val_accuracy: 0.8669\n", "Epoch 6/10\n", "14958/14958 [==============================] - 54s 4ms/step - loss: 0.2789 - accuracy: 0.8881 - val_loss: 0.3423 - val_accuracy: 0.8654\n", "Epoch 7/10\n", "14958/14958 [==============================] - 55s 4ms/step - loss: 0.2675 - accuracy: 0.8909 - val_loss: 0.3447 - val_accuracy: 0.8646\n", "Epoch 8/10\n", "14958/14958 [==============================] - 55s 4ms/step - loss: 0.2590 - accuracy: 0.8937 - val_loss: 0.3418 - val_accuracy: 0.8658\n", "Epoch 9/10\n", "14958/14958 [==============================] - 55s 4ms/step - loss: 0.2511 - accuracy: 0.8963 - val_loss: 0.3417 - val_accuracy: 0.8636\n", "Epoch 10/10\n", "14958/14958 [==============================] - 54s 4ms/step - loss: 0.2446 - accuracy: 0.8981 - val_loss: 0.3639 - val_accuracy: 0.8604\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# train the model\n", "model.fit(X_train, y_train_encoded, epochs=10, batch_size=32, validation_data=(X_test, y_test_encoded))" ] }, { "cell_type": "code", "execution_count": 14, "id": "24e17bec-2fbe-400f-9273-a5abe823f193", "metadata": { "execution": { "iopub.execute_input": "2024-03-31T16:57:03.280928Z", "iopub.status.busy": "2024-03-31T16:57:03.280193Z", "iopub.status.idle": "2024-03-31T16:57:09.194519Z", "shell.execute_reply": "2024-03-31T16:57:09.193928Z", "shell.execute_reply.started": "2024-03-31T16:57:03.280897Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3740/3740 [==============================] - 6s 2ms/step - loss: 0.3639 - accuracy: 0.8604\n" ] }, { "data": { "text/plain": [ "0.8604295253753662" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Evaluate the model\n", "loss, accuracy = model.evaluate(X_test, y_test_encoded)\n", "accuracy" ] }, { "cell_type": "code", "execution_count": 16, "id": "15fbbb09-ffdf-41d3-ba11-8877aa2c078e", "metadata": { "execution": { "iopub.execute_input": "2024-03-31T17:01:34.387114Z", "iopub.status.busy": "2024-03-31T17:01:34.386216Z", "iopub.status.idle": "2024-03-31T17:01:34.528910Z", "shell.execute_reply": "2024-03-31T17:01:34.528157Z", "shell.execute_reply.started": "2024-03-31T17:01:34.387078Z" } }, "outputs": [], "source": [ "# save the model\n", "import os\n", "try:\n", " model.save(\"../models/sentimental-analysis-llama2.keras\")\n", "except FileNotFoundError:\n", " os.mkdir(\"../models\")\n", " model.save(\"../models/sentimental-analysis-llama2.keras\")" ] }, { "cell_type": "code", "execution_count": 22, "id": "792f0680-5a32-4c46-b5b4-eb6795b51aeb", "metadata": { "execution": { "iopub.execute_input": "2024-03-31T17:04:41.901658Z", "iopub.status.busy": "2024-03-31T17:04:41.901124Z", "iopub.status.idle": "2024-03-31T17:04:41.948670Z", "shell.execute_reply": "2024-03-31T17:04:41.948177Z", "shell.execute_reply.started": "2024-03-31T17:04:41.901637Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1/1 [==============================] - 0s 17ms/step\n" ] }, { "data": { "text/plain": [ "array([[0.809063 , 0.78246254, 0.02547726, 0.03657908, 0.00648503,\n", " 0.02069169, 0.07264358]], dtype=float32)" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def preprocess_text(text):\n", " # Tokenize the text\n", " tokenized_text = tokenizer.texts_to_sequences([text])\n", " # Pad sequences to the same length as training data\n", " padded_text = pad_sequences(tokenized_text, maxlen=max_length, padding='post')\n", " return padded_text\n", "\n", "# Preprocess the custom input text\n", "preprocessed_text = preprocess_text(\"this is good\")\n", "\n", "# Make predictions\n", "predictions = model.predict(preprocessed_text)\n", "\n", "predictions" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 5 }