File size: 16,979 Bytes

b08551c

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "TPU"
  },
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "import warnings\n",
        "warnings.filterwarnings('ignore')\n",
        "\n",
        "import transformers\n",
        "transformers_version = transformers.__version__\n",
        "\n",
        "if transformers_version > '4.31.1':\n",
        "  !pip uninstall transformers\n",
        "  !pip install transformers==4.31\n",
        "else:\n",
        "  print(\"transformers version:\", transformers.__version__)"
      ],
      "metadata": {
        "id": "2RcFPIqQJ6CY",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "8030dedf-b9f5-4687-ef87-1c5a4d8ee9b9"
      },
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Found existing installation: transformers 4.31.0\n",
            "Uninstalling transformers-4.31.0:\n",
            "  Would remove:\n",
            "    /usr/local/bin/transformers-cli\n",
            "    /usr/local/lib/python3.10/dist-packages/transformers-4.31.0.dist-info/*\n",
            "    /usr/local/lib/python3.10/dist-packages/transformers/*\n",
            "Proceed (Y/n)? n\n",
            "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n",
            "\u001b[0mRequirement already satisfied: transformers==4.31 in /usr/local/lib/python3.10/dist-packages (4.31.0)\n",
            "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (3.13.4)\n",
            "Requirement already satisfied: huggingface-hub<1.0,>=0.14.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (0.20.3)\n",
            "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (1.25.2)\n",
            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (24.0)\n",
            "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (6.0.1)\n",
            "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (2023.12.25)\n",
            "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (2.31.0)\n",
            "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (0.13.3)\n",
            "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (0.4.3)\n",
            "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (4.66.2)\n",
            "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers==4.31) (2023.6.0)\n",
            "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers==4.31) (4.11.0)\n",
            "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.31) (3.3.2)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.31) (3.7)\n",
            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.31) (2.0.7)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.31) (2024.2.2)\n",
            "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n",
            "\u001b[0m"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import tensorflow as tf\n",
        "print(\"TensorFlow version:\", tf.__version__)\n",
        "\n",
        "import keras\n",
        "print(\"Keras version:\", keras.__version__)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "b_0OPx3WukSi",
        "outputId": "0d205aa3-33b4-4a34-9055-d670cc5ac049"
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "TensorFlow version: 2.15.0\n",
            "Keras version: 2.15.0\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "id": "WkzyTQGqzbPS",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "9bc0c671-8557-4b3c-a120-0237d7f96253"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /content/drive\n"
          ]
        }
      ],
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Loading the Data ###"
      ],
      "metadata": {
        "id": "BKn5EaROLKeX"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "\n",
        "# Load the CSV file in memory\n",
        "train_path = '/content/drive/MyDrive/dataset/Twitter_Financial_News_Sentiment/train.csv'\n",
        "test_path = '/content/drive/MyDrive/dataset/Twitter_Financial_News_Sentiment/test.csv'\n",
        "\n",
        "train_df = pd.read_csv(train_path, usecols=['text', 'label'])\n",
        "test_df = pd.read_csv(test_path, usecols=['text', 'label'])"
      ],
      "metadata": {
        "id": "QztIz9VOKLuV"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Show example"
      ],
      "metadata": {
        "id": "hn5ONAwkNeFS"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "train_df.head()"
      ],
      "metadata": {
        "id": "zwYzU-dANpJ-"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "source": [
        "#import matplotlib library\n",
        "from matplotlib import pyplot as plt\n",
        "\n",
        "#Histogram of \"Label\" column in train datset\n",
        "train_df['label'].plot(kind='hist', title='Label')\n",
        "plt.gca().spines[['top', 'right']].set_visible(False)"
      ],
      "cell_type": "code",
      "execution_count": null,
      "outputs": [],
      "metadata": {
        "id": "2M1XLsAeN2GN"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "test_df.head()"
      ],
      "metadata": {
        "id": "g5_oGvo1NvON"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Pritn theshape of datasets\n",
        "print(f'train_df shape: {train_df.shape}')\n",
        "print(f'test_df shape: {test_df.shape}')"
      ],
      "metadata": {
        "id": "kCFupI1FQlMF"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Removing the Special Characters ###"
      ],
      "metadata": {
        "id": "zRcmc15aSNx6"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "\n",
        "!pip install text_hammer\n",
        "\n",
        "import text_hammer as th\n",
        "\n",
        "def text_proccessing(df, col_name):\n",
        "  \"\"\"\n",
        "  Process text data in a DataFrame column by performing the following operations:\n",
        "\n",
        "  1. Convert text to lowercase.\n",
        "  2. Remove emails from the text.\n",
        "  3. Remove accented characters from the text.\n",
        "  4. Remove URLs from the text.\n",
        "\n",
        "  Parameters:\n",
        "  df (DataFrame): Input DataFrame containing text data.\n",
        "  col_name (str): Name of the column in the DataFrame containing text data.\n",
        "\n",
        "  Returns:\n",
        "  DataFrame: Processed DataFrame with text data after applying the specified operations.\n",
        "  \"\"\"\n",
        "\n",
        "  # df[col_name] = df[col_name].apply(lambda x:str(x).lower())\n",
        "  df[col_name] = df[col_name].apply(lambda x: th.remove_emails(x))\n",
        "  df[col_name] = df[col_name].apply(lambda x: th.remove_accented_chars(x))\n",
        "  df[col_name] = df[col_name].apply(lambda x: th.remove_urls(x))\n",
        "\n",
        "  return df\n",
        "\n",
        "train_df = text_proccessing(train_df, 'text')\n"
      ],
      "metadata": {
        "id": "YEMq7SUiS28e"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Print the first sample after cleaning data\n",
        "train_df['text'].iloc[0:10]"
      ],
      "metadata": {
        "id": "VD92IEhPZQHm"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "###Loading PreTrained BERT Model###"
      ],
      "metadata": {
        "id": "YfH0H1W6c0Bb"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from transformers import AutoTokenizer, TFBertModel\n",
        "tokenizer =  AutoTokenizer.from_pretrained('bert-base-uncased')\n",
        "bert = TFBertModel.from_pretrained('bert-base-uncased')\n"
      ],
      "metadata": {
        "id": "ejMMzCOecze9"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "tokenizer(train_df['text'].iloc[0])"
      ],
      "metadata": {
        "id": "PVWkIfE5gLOV"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "max_len = max([len(x.split()) for x in train_df.text])\n",
        "print(f'Max len of tweets: {max_len}')"
      ],
      "metadata": {
        "id": "dGANUQVdhHH7"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "x_train = tokenizer(\n",
        "                text = train_df.text.tolist(),\n",
        "                padding = True,\n",
        "                max_length= 36,\n",
        "                truncation= True,\n",
        "                return_tensors = 'tf')\n",
        "\n",
        "print(x_train)"
      ],
      "metadata": {
        "id": "q9b4iDZ0jW5-"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "print(x_train['input_ids'].shape)\n",
        "print(x_train['attention_mask'].shape)"
      ],
      "metadata": {
        "id": "PUMeXfO8lgNd"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "print(train_df.label.value_counts())"
      ],
      "metadata": {
        "id": "RMM1QI3DlpmD"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "y_train = train_df.label.values\n",
        "y_train\n"
      ],
      "metadata": {
        "id": "4zFkagLml80z"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Building the Model Architecture ###"
      ],
      "metadata": {
        "id": "fFQNe5Cimwxn"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from keras import layers, Model\n",
        "\n",
        "max_length = 36\n",
        "\n",
        "input_ids = layers.Input(shape=(max_length,), dtype=tf.int32, name=\"input_ids\")\n",
        "input_mask = layers.Input(shape=(max_length,), dtype=tf.int32, name=\"attention_mask\")\n",
        "\n",
        "embeddings = bert(input_ids,attention_mask = input_mask)[1] #(0 is the last hidden states,1 means pooler_output)\n",
        "\n",
        "out = layers.Dropout(0.1)(embeddings)\n",
        "out = layers.Dense(128, activation='relu')(out)\n",
        "out = layers.Dropout(0.1)(out)\n",
        "out = layers.Dense(32,activation = 'relu')(out)\n",
        "\n",
        "y = layers.Dense(3,activation = 'softmax')(out)\n",
        "\n",
        "model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)\n",
        "model.layers[2].trainable = False"
      ],
      "metadata": {
        "id": "DE1XbnVomwMc"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "model.summary()"
      ],
      "metadata": {
        "id": "GuxGCjYjrTyY"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from keras.optimizers import Adam\n",
        "\n",
        "optimizer = Adam(\n",
        "    learning_rate = 6e-06, # this learning rate is for bert model , taken from huggingface website\n",
        "    epsilon=1e-08,\n",
        "    weight_decay=0.01)\n",
        "\n",
        "# Compile the model\n",
        "model.compile(\n",
        "    optimizer = optimizer,\n",
        "    loss = 'sparse_categorical_crossentropy',\n",
        "    metrics = [\"sparse_categorical_accuracy\"])"
      ],
      "metadata": {
        "id": "FyyNrAAf7QMP"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "train_history = model.fit(\n",
        "    x = {'input_ids':x_train['input_ids'], 'attention_mask':x_train['attention_mask']} ,\n",
        "    y = y_train,\n",
        "    validation_split = 0.1,\n",
        "    epochs= 3,\n",
        "    batch_size= 32)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "bEnttT2rA8Yw",
        "outputId": "644c03fd-0cc0-40ff-8108-e059e3a4a0dd"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Epoch 1/3\n",
            "118/269 [============>.................] - ETA: 10:10 - loss: 0.9140 - sparse_categorical_accuracy: 0.6261"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "#### TESTING PHASE\n",
        "on this phase we will make predictions out of our model"
      ],
      "metadata": {
        "id": "hgiDVRwSBtCN"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "x_test = tokenizer(\n",
        "    text = test_df.text.tolist(),\n",
        "    padding= True,\n",
        "    max_length= 36,\n",
        "    truncation = True,\n",
        "    return_tensors= 'tf')"
      ],
      "metadata": {
        "id": "xaKYd2PRBySe"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "y_test = test_df.label.values\n",
        "y_test"
      ],
      "metadata": {
        "id": "OpvHTg3atflb"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "predicted = model.predict({'input_ids':x_test['input_ids'],'attention_mask':x_test['attention_mask']})"
      ],
      "metadata": {
        "id": "nWgCdpKvCSWm"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.metrics import confusion_matrix\n",
        "import seaborn as sns\n",
        "\n",
        "# Convert the predictions to binary values (0 or 1)\n",
        "y_pred_binary = [int(round(x[0])) for x in predicted]\n",
        "\n",
        "# Generate the confusion matrix\n",
        "cm = confusion_matrix(test_df['label'], y_pred_binary)\n",
        "\n",
        "# Create a heatmap of the confusion matrix\n",
        "sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\")\n",
        "plt.xlabel(\"Predicted Label\")\n",
        "plt.ylabel(\"True Label\")\n",
        "plt.title(\"Confusion Matrix\")\n",
        "plt.show()"
      ],
      "metadata": {
        "id": "-BICUoNs_8qI"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}