diff --git "a/Fake_News_Detection_2_0 (4).ipynb" "b/Fake_News_Detection_2_0 (4).ipynb"
new file mode 100644--- /dev/null
+++ "b/Fake_News_Detection_2_0 (4).ipynb"
@@ -0,0 +1,5177 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Loading of Data"
+ ],
+ "metadata": {
+ "id": "3O2gRML1CxuY"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ClXaJHz3skxq",
+ "outputId": "620e7537-0423-4b59-c9fd-9146b2062a4a"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
+ ]
+ }
+ ],
+ "source": [
+ "# prompt: load the dataset from a Google Drive\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "from google.colab import drive\n",
+ "\n",
+ "# Mount Google Drive so the CSV stored there is reachable from the Colab runtime.\n",
+ "drive.mount('/content/drive')\n",
+ "\n",
+ "# Location of the dataset on the mounted drive; edit this one constant to load a different copy.\n",
+ "DATA_PATH = '/content/drive/MyDrive/Colab_Notebooks/Twitter_Analysis.csv'\n",
+ "\n",
+ "# `data` is the raw DataFrame that the following cells preview and process.\n",
+ "data = pd.read_csv(DATA_PATH)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "y4xKC0BJVd-o",
+ "outputId": "01958e58-4379-487a-c1ef-aabad115e6a6"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
+ ]
+ }
+ ],
+ "source": [
+ "from google.colab import drive\n",
+ "drive.mount('/content/drive')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 707
+ },
+ "id": "EsspYImeyQeS",
+ "outputId": "7c076ae0-2533-4439-c347-994aa7d50418"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Unnamed: 0 majority_target \\\n",
+ "0 0 True \n",
+ "1 1 True \n",
+ "2 2 True \n",
+ "3 3 True \n",
+ "4 4 True \n",
+ "\n",
+ " statement BinaryNumTarget \\\n",
+ "0 End of eviction moratorium means millions of A... 1.0 \n",
+ "1 End of eviction moratorium means millions of A... 1.0 \n",
+ "2 End of eviction moratorium means millions of A... 1.0 \n",
+ "3 End of eviction moratorium means millions of A... 1.0 \n",
+ "4 End of eviction moratorium means millions of A... 1.0 \n",
+ "\n",
+ " tweet followers_count \\\n",
+ "0 @POTUS Biden Blunders - 6 Month Update\\n\\nInfl... 4262.0 \n",
+ "1 @S0SickRick @Stairmaster_ @6d6f636869 Not as m... 1393.0 \n",
+ "2 THE SUPREME COURT is siding with super rich pr... 9.0 \n",
+ "3 @POTUS Biden Blunders\\n\\nBroken campaign promi... 4262.0 \n",
+ "4 @OhComfy I agree. The confluence of events rig... 70.0 \n",
+ "\n",
+ " friends_count favourites_count statuses_count listed_count ... \\\n",
+ "0 3619.0 34945.0 16423.0 44.0 ... \n",
+ "1 1621.0 31436.0 37184.0 64.0 ... \n",
+ "2 84.0 219.0 1184.0 0.0 ... \n",
+ "3 3619.0 34945.0 16423.0 44.0 ... \n",
+ "4 166.0 15282.0 2194.0 0.0 ... \n",
+ "\n",
+ " determiners conjunctions dots exclamation questions ampersand \\\n",
+ "0 0 0 5 0 1 0 \n",
+ "1 0 2 1 0 0 0 \n",
+ "2 0 1 0 0 0 0 \n",
+ "3 0 1 3 0 0 1 \n",
+ "4 0 1 3 0 1 0 \n",
+ "\n",
+ " capitals digits long_word_freq short_word_freq \n",
+ "0 33 3 5 19 \n",
+ "1 14 0 2 34 \n",
+ "2 3 0 4 10 \n",
+ "3 6 8 1 30 \n",
+ "4 11 3 2 19 \n",
+ "\n",
+ "[5 rows x 64 columns]"
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " majority_target | \n",
+ " statement | \n",
+ " BinaryNumTarget | \n",
+ " tweet | \n",
+ " followers_count | \n",
+ " friends_count | \n",
+ " favourites_count | \n",
+ " statuses_count | \n",
+ " listed_count | \n",
+ " ... | \n",
+ " determiners | \n",
+ " conjunctions | \n",
+ " dots | \n",
+ " exclamation | \n",
+ " questions | \n",
+ " ampersand | \n",
+ " capitals | \n",
+ " digits | \n",
+ " long_word_freq | \n",
+ " short_word_freq | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " @POTUS Biden Blunders - 6 Month Update\\n\\nInfl... | \n",
+ " 4262.0 | \n",
+ " 3619.0 | \n",
+ " 34945.0 | \n",
+ " 16423.0 | \n",
+ " 44.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 33 | \n",
+ " 3 | \n",
+ " 5 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " @S0SickRick @Stairmaster_ @6d6f636869 Not as m... | \n",
+ " 1393.0 | \n",
+ " 1621.0 | \n",
+ " 31436.0 | \n",
+ " 37184.0 | \n",
+ " 64.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 14 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 34 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " THE SUPREME COURT is siding with super rich pr... | \n",
+ " 9.0 | \n",
+ " 84.0 | \n",
+ " 219.0 | \n",
+ " 1184.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " @POTUS Biden Blunders\\n\\nBroken campaign promi... | \n",
+ " 4262.0 | \n",
+ " 3619.0 | \n",
+ " 34945.0 | \n",
+ " 16423.0 | \n",
+ " 44.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 6 | \n",
+ " 8 | \n",
+ " 1 | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " @OhComfy I agree. The confluence of events rig... | \n",
+ " 70.0 | \n",
+ " 166.0 | \n",
+ " 15282.0 | \n",
+ " 2194.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 11 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 64 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "data"
+ }
+ },
+ "metadata": {},
+ "execution_count": 3
+ }
+ ],
+ "source": [
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Pre-Processing"
+ ],
+ "metadata": {
+ "id": "SyzmIOQqCaky"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "7VRJMRynmTB5",
+ "outputId": "2cea5c6a-91a3-4d6b-e3c2-2309a9c9fc60"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Requirement already satisfied: pycaret in /usr/local/lib/python3.10/dist-packages (3.3.2)\n",
+ "Requirement already satisfied: ipython>=5.5.0 in /usr/local/lib/python3.10/dist-packages (from pycaret) (7.34.0)\n",
+ "Requirement already satisfied: ipywidgets>=7.6.5 in /usr/local/lib/python3.10/dist-packages (from pycaret) (7.7.1)\n",
+ "Requirement already satisfied: tqdm>=4.62.0 in /usr/local/lib/python3.10/dist-packages (from pycaret) (4.66.4)\n",
+ "Requirement already satisfied: numpy<1.27,>=1.21 in /usr/local/lib/python3.10/dist-packages (from pycaret) (1.25.2)\n",
+ "Requirement already satisfied: pandas<2.2.0 in /usr/local/lib/python3.10/dist-packages (from pycaret) (2.0.3)\n",
+ "Requirement already satisfied: jinja2>=3 in /usr/local/lib/python3.10/dist-packages (from pycaret) (3.1.4)\n",
+ "Requirement already satisfied: scipy<=1.11.4,>=1.6.1 in /usr/local/lib/python3.10/dist-packages (from pycaret) (1.11.4)\n",
+ "Requirement already satisfied: joblib<1.4,>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from pycaret) (1.3.2)\n",
+ "Requirement already satisfied: scikit-learn>1.4.0 in /usr/local/lib/python3.10/dist-packages (from pycaret) (1.4.2)\n",
+ "Requirement already satisfied: pyod>=1.1.3 in /usr/local/lib/python3.10/dist-packages (from pycaret) (2.0.0)\n",
+ "Requirement already satisfied: imbalanced-learn>=0.12.0 in /usr/local/lib/python3.10/dist-packages (from pycaret) (0.12.3)\n",
+ "Requirement already satisfied: category-encoders>=2.4.0 in /usr/local/lib/python3.10/dist-packages (from pycaret) (2.6.3)\n",
+ "Requirement already satisfied: lightgbm>=3.0.0 in /usr/local/lib/python3.10/dist-packages (from pycaret) (4.1.0)\n",
+ "Requirement already satisfied: numba>=0.55.0 in /usr/local/lib/python3.10/dist-packages (from pycaret) (0.58.1)\n",
+ "Requirement already satisfied: requests>=2.27.1 in /usr/local/lib/python3.10/dist-packages (from pycaret) (2.31.0)\n",
+ "Requirement already satisfied: psutil>=5.9.0 in /usr/local/lib/python3.10/dist-packages (from pycaret) (5.9.5)\n",
+ "Requirement already satisfied: markupsafe>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from pycaret) (2.1.5)\n",
+ "Requirement already satisfied: importlib-metadata>=4.12.0 in /usr/local/lib/python3.10/dist-packages (from pycaret) (7.1.0)\n",
+ "Requirement already satisfied: nbformat>=4.2.0 in /usr/local/lib/python3.10/dist-packages (from pycaret) (5.10.4)\n",
+ "Requirement already satisfied: cloudpickle in /usr/local/lib/python3.10/dist-packages (from pycaret) (2.2.1)\n",
+ "Requirement already satisfied: deprecation>=2.1.0 in /usr/local/lib/python3.10/dist-packages (from pycaret) (2.1.0)\n",
+ "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from pycaret) (3.4.1)\n",
+ "Requirement already satisfied: matplotlib<3.8.0 in /usr/local/lib/python3.10/dist-packages (from pycaret) (3.7.1)\n",
+ "Requirement already satisfied: scikit-plot>=0.3.7 in /usr/local/lib/python3.10/dist-packages (from pycaret) (0.3.7)\n",
+ "Requirement already satisfied: yellowbrick>=1.4 in /usr/local/lib/python3.10/dist-packages (from pycaret) (1.5)\n",
+ "Requirement already satisfied: plotly>=5.14.0 in /usr/local/lib/python3.10/dist-packages (from pycaret) (5.15.0)\n",
+ "Requirement already satisfied: kaleido>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from pycaret) (0.2.1)\n",
+ "Requirement already satisfied: schemdraw==0.15 in /usr/local/lib/python3.10/dist-packages (from pycaret) (0.15)\n",
+ "Requirement already satisfied: plotly-resampler>=0.8.3.1 in /usr/local/lib/python3.10/dist-packages (from pycaret) (0.10.0)\n",
+ "Requirement already satisfied: statsmodels>=0.12.1 in /usr/local/lib/python3.10/dist-packages (from pycaret) (0.14.2)\n",
+ "Requirement already satisfied: sktime==0.26.0 in /usr/local/lib/python3.10/dist-packages (from pycaret) (0.26.0)\n",
+ "Requirement already satisfied: tbats>=1.1.3 in /usr/local/lib/python3.10/dist-packages (from pycaret) (1.1.3)\n",
+ "Requirement already satisfied: pmdarima>=2.0.4 in /usr/local/lib/python3.10/dist-packages (from pycaret) (2.0.4)\n",
+ "Requirement already satisfied: wurlitzer in /usr/local/lib/python3.10/dist-packages (from pycaret) (3.1.1)\n",
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from sktime==0.26.0->pycaret) (24.1)\n",
+ "Requirement already satisfied: scikit-base<0.8.0 in /usr/local/lib/python3.10/dist-packages (from sktime==0.26.0->pycaret) (0.7.8)\n",
+ "Requirement already satisfied: patsy>=0.5.1 in /usr/local/lib/python3.10/dist-packages (from category-encoders>=2.4.0->pycaret) (0.5.6)\n",
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from imbalanced-learn>=0.12.0->pycaret) (3.5.0)\n",
+ "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=4.12.0->pycaret) (3.19.2)\n",
+ "Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.10/dist-packages (from ipython>=5.5.0->pycaret) (67.7.2)\n",
+ "Requirement already satisfied: jedi>=0.16 in /usr/local/lib/python3.10/dist-packages (from ipython>=5.5.0->pycaret) (0.19.1)\n",
+ "Requirement already satisfied: decorator in /usr/local/lib/python3.10/dist-packages (from ipython>=5.5.0->pycaret) (4.4.2)\n",
+ "Requirement already satisfied: pickleshare in /usr/local/lib/python3.10/dist-packages (from ipython>=5.5.0->pycaret) (0.7.5)\n",
+ "Requirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.10/dist-packages (from ipython>=5.5.0->pycaret) (5.7.1)\n",
+ "Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from ipython>=5.5.0->pycaret) (3.0.47)\n",
+ "Requirement already satisfied: pygments in /usr/local/lib/python3.10/dist-packages (from ipython>=5.5.0->pycaret) (2.16.1)\n",
+ "Requirement already satisfied: backcall in /usr/local/lib/python3.10/dist-packages (from ipython>=5.5.0->pycaret) (0.2.0)\n",
+ "Requirement already satisfied: matplotlib-inline in /usr/local/lib/python3.10/dist-packages (from ipython>=5.5.0->pycaret) (0.1.7)\n",
+ "Requirement already satisfied: pexpect>4.3 in /usr/local/lib/python3.10/dist-packages (from ipython>=5.5.0->pycaret) (4.9.0)\n",
+ "Requirement already satisfied: ipykernel>=4.5.1 in /usr/local/lib/python3.10/dist-packages (from ipywidgets>=7.6.5->pycaret) (5.5.6)\n",
+ "Requirement already satisfied: ipython-genutils~=0.2.0 in /usr/local/lib/python3.10/dist-packages (from ipywidgets>=7.6.5->pycaret) (0.2.0)\n",
+ "Requirement already satisfied: widgetsnbextension~=3.6.0 in /usr/local/lib/python3.10/dist-packages (from ipywidgets>=7.6.5->pycaret) (3.6.6)\n",
+ "Requirement already satisfied: jupyterlab-widgets>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from ipywidgets>=7.6.5->pycaret) (3.0.11)\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib<3.8.0->pycaret) (1.2.1)\n",
+ "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib<3.8.0->pycaret) (0.12.1)\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib<3.8.0->pycaret) (4.53.0)\n",
+ "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib<3.8.0->pycaret) (1.4.5)\n",
+ "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib<3.8.0->pycaret) (9.4.0)\n",
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib<3.8.0->pycaret) (3.1.2)\n",
+ "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib<3.8.0->pycaret) (2.8.2)\n",
+ "Requirement already satisfied: fastjsonschema>=2.15 in /usr/local/lib/python3.10/dist-packages (from nbformat>=4.2.0->pycaret) (2.19.1)\n",
+ "Requirement already satisfied: jsonschema>=2.6 in /usr/local/lib/python3.10/dist-packages (from nbformat>=4.2.0->pycaret) (4.19.2)\n",
+ "Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in /usr/local/lib/python3.10/dist-packages (from nbformat>=4.2.0->pycaret) (5.7.2)\n",
+ "Requirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba>=0.55.0->pycaret) (0.41.1)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas<2.2.0->pycaret) (2023.4)\n",
+ "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas<2.2.0->pycaret) (2024.1)\n",
+ "Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from plotly>=5.14.0->pycaret) (8.3.0)\n",
+ "Requirement already satisfied: dash>=2.9.0 in /usr/local/lib/python3.10/dist-packages (from plotly-resampler>=0.8.3.1->pycaret) (2.17.1)\n",
+ "Requirement already satisfied: orjson<4.0.0,>=3.8.0 in /usr/local/lib/python3.10/dist-packages (from plotly-resampler>=0.8.3.1->pycaret) (3.10.5)\n",
+ "Requirement already satisfied: tsdownsample>=0.1.3 in /usr/local/lib/python3.10/dist-packages (from plotly-resampler>=0.8.3.1->pycaret) (0.1.3)\n",
+ "Requirement already satisfied: Cython!=0.29.18,!=0.29.31,>=0.29 in /usr/local/lib/python3.10/dist-packages (from pmdarima>=2.0.4->pycaret) (3.0.10)\n",
+ "Requirement already satisfied: urllib3 in /usr/local/lib/python3.10/dist-packages (from pmdarima>=2.0.4->pycaret) (2.0.7)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.27.1->pycaret) (3.3.2)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.27.1->pycaret) (3.7)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.27.1->pycaret) (2024.6.2)\n",
+ "Requirement already satisfied: Flask<3.1,>=1.0.4 in /usr/local/lib/python3.10/dist-packages (from dash>=2.9.0->plotly-resampler>=0.8.3.1->pycaret) (2.2.5)\n",
+ "Requirement already satisfied: Werkzeug<3.1 in /usr/local/lib/python3.10/dist-packages (from dash>=2.9.0->plotly-resampler>=0.8.3.1->pycaret) (3.0.3)\n",
+ "Requirement already satisfied: dash-html-components==2.0.0 in /usr/local/lib/python3.10/dist-packages (from dash>=2.9.0->plotly-resampler>=0.8.3.1->pycaret) (2.0.0)\n",
+ "Requirement already satisfied: dash-core-components==2.0.0 in /usr/local/lib/python3.10/dist-packages (from dash>=2.9.0->plotly-resampler>=0.8.3.1->pycaret) (2.0.0)\n",
+ "Requirement already satisfied: dash-table==5.0.0 in /usr/local/lib/python3.10/dist-packages (from dash>=2.9.0->plotly-resampler>=0.8.3.1->pycaret) (5.0.0)\n",
+ "Requirement already satisfied: typing-extensions>=4.1.1 in /usr/local/lib/python3.10/dist-packages (from dash>=2.9.0->plotly-resampler>=0.8.3.1->pycaret) (4.12.2)\n",
+ "Requirement already satisfied: retrying in /usr/local/lib/python3.10/dist-packages (from dash>=2.9.0->plotly-resampler>=0.8.3.1->pycaret) (1.3.4)\n",
+ "Requirement already satisfied: nest-asyncio in /usr/local/lib/python3.10/dist-packages (from dash>=2.9.0->plotly-resampler>=0.8.3.1->pycaret) (1.6.0)\n",
+ "Requirement already satisfied: jupyter-client in /usr/local/lib/python3.10/dist-packages (from ipykernel>=4.5.1->ipywidgets>=7.6.5->pycaret) (6.1.12)\n",
+ "Requirement already satisfied: tornado>=4.2 in /usr/local/lib/python3.10/dist-packages (from ipykernel>=4.5.1->ipywidgets>=7.6.5->pycaret) (6.3.3)\n",
+ "Requirement already satisfied: parso<0.9.0,>=0.8.3 in /usr/local/lib/python3.10/dist-packages (from jedi>=0.16->ipython>=5.5.0->pycaret) (0.8.4)\n",
+ "Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->pycaret) (23.2.0)\n",
+ "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->pycaret) (2023.12.1)\n",
+ "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->pycaret) (0.35.1)\n",
+ "Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->pycaret) (0.18.1)\n",
+ "Requirement already satisfied: platformdirs>=2.5 in /usr/local/lib/python3.10/dist-packages (from jupyter-core!=5.0.*,>=4.12->nbformat>=4.2.0->pycaret) (4.2.2)\n",
+ "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from patsy>=0.5.1->category-encoders>=2.4.0->pycaret) (1.16.0)\n",
+ "Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.10/dist-packages (from pexpect>4.3->ipython>=5.5.0->pycaret) (0.7.0)\n",
+ "Requirement already satisfied: wcwidth in /usr/local/lib/python3.10/dist-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=5.5.0->pycaret) (0.2.13)\n",
+ "Requirement already satisfied: notebook>=4.4.1 in /usr/local/lib/python3.10/dist-packages (from widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (6.5.5)\n",
+ "Requirement already satisfied: itsdangerous>=2.0 in /usr/local/lib/python3.10/dist-packages (from Flask<3.1,>=1.0.4->dash>=2.9.0->plotly-resampler>=0.8.3.1->pycaret) (2.2.0)\n",
+ "Requirement already satisfied: click>=8.0 in /usr/local/lib/python3.10/dist-packages (from Flask<3.1,>=1.0.4->dash>=2.9.0->plotly-resampler>=0.8.3.1->pycaret) (8.1.7)\n",
+ "Requirement already satisfied: pyzmq<25,>=17 in /usr/local/lib/python3.10/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (24.0.1)\n",
+ "Requirement already satisfied: argon2-cffi in /usr/local/lib/python3.10/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (23.1.0)\n",
+ "Requirement already satisfied: nbconvert>=5 in /usr/local/lib/python3.10/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (6.5.4)\n",
+ "Requirement already satisfied: Send2Trash>=1.8.0 in /usr/local/lib/python3.10/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (1.8.3)\n",
+ "Requirement already satisfied: terminado>=0.8.3 in /usr/local/lib/python3.10/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (0.18.1)\n",
+ "Requirement already satisfied: prometheus-client in /usr/local/lib/python3.10/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (0.20.0)\n",
+ "Requirement already satisfied: nbclassic>=0.4.7 in /usr/local/lib/python3.10/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (1.1.0)\n",
+ "Requirement already satisfied: notebook-shim>=0.2.3 in /usr/local/lib/python3.10/dist-packages (from nbclassic>=0.4.7->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (0.2.4)\n",
+ "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (4.9.4)\n",
+ "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (4.12.3)\n",
+ "Requirement already satisfied: bleach in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (6.1.0)\n",
+ "Requirement already satisfied: defusedxml in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (0.7.1)\n",
+ "Requirement already satisfied: entrypoints>=0.2.2 in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (0.4)\n",
+ "Requirement already satisfied: jupyterlab-pygments in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (0.3.0)\n",
+ "Requirement already satisfied: mistune<2,>=0.8.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (0.8.4)\n",
+ "Requirement already satisfied: nbclient>=0.5.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (0.10.0)\n",
+ "Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (1.5.1)\n",
+ "Requirement already satisfied: tinycss2 in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (1.3.0)\n",
+ "Requirement already satisfied: argon2-cffi-bindings in /usr/local/lib/python3.10/dist-packages (from argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (21.2.0)\n",
+ "Requirement already satisfied: jupyter-server<3,>=1.8 in /usr/local/lib/python3.10/dist-packages (from notebook-shim>=0.2.3->nbclassic>=0.4.7->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (1.24.0)\n",
+ "Requirement already satisfied: cffi>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from argon2-cffi-bindings->argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (1.16.0)\n",
+ "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (2.5)\n",
+ "Requirement already satisfied: webencodings in /usr/local/lib/python3.10/dist-packages (from bleach->nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (0.5.1)\n",
+ "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.0.1->argon2-cffi-bindings->argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (2.22)\n",
+ "Requirement already satisfied: anyio<4,>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from jupyter-server<3,>=1.8->notebook-shim>=0.2.3->nbclassic>=0.4.7->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (3.7.1)\n",
+ "Requirement already satisfied: websocket-client in /usr/local/lib/python3.10/dist-packages (from jupyter-server<3,>=1.8->notebook-shim>=0.2.3->nbclassic>=0.4.7->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (1.8.0)\n",
+ "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio<4,>=3.1.0->jupyter-server<3,>=1.8->notebook-shim>=0.2.3->nbclassic>=0.4.7->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (1.3.1)\n",
+ "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<4,>=3.1.0->jupyter-server<3,>=1.8->notebook-shim>=0.2.3->nbclassic>=0.4.7->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets>=7.6.5->pycaret) (1.2.1)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Use the %pip magic so the install targets the running kernel's environment, and pin the\n",
+ "# version this notebook was executed with so a fresh runtime resolves the same package.\n",
+ "%pip install -q pycaret==3.3.2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "id": "HIFgZcfWlC8F"
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import pycaret\n",
+ "import transformers\n",
+ "from transformers import AutoModel, BertTokenizerFast\n",
+ "import matplotlib.pyplot as plt\n",
+ "import sklearn\n",
+ "import sklearn.metrics\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import classification_report\n",
+ "import torch\n",
+ "import torch.nn as nn\n",
+ "import tensorflow as tf\n",
+ "from tensorflow.keras.preprocessing.text import Tokenizer\n",
+ "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
+ "# Prefer the GPU, but fall back to the CPU so later `.to(device)` calls still work\n",
+ "# on a runtime without CUDA instead of failing.\n",
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 707
+ },
+ "id": "Fiqo7nAUm3I2",
+ "outputId": "d5753cd8-4dac-4ecc-f53c-ffd5b9c04943"
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "data"
+ },
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " majority_target | \n",
+ " statement | \n",
+ " BinaryNumTarget | \n",
+ " tweet | \n",
+ " followers_count | \n",
+ " friends_count | \n",
+ " favourites_count | \n",
+ " statuses_count | \n",
+ " listed_count | \n",
+ " ... | \n",
+ " determiners | \n",
+ " conjunctions | \n",
+ " dots | \n",
+ " exclamation | \n",
+ " questions | \n",
+ " ampersand | \n",
+ " capitals | \n",
+ " digits | \n",
+ " long_word_freq | \n",
+ " short_word_freq | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " @POTUS Biden Blunders - 6 Month Update\\n\\nInfl... | \n",
+ " 4262.0 | \n",
+ " 3619.0 | \n",
+ " 34945.0 | \n",
+ " 16423.0 | \n",
+ " 44.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 33 | \n",
+ " 3 | \n",
+ " 5 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " @S0SickRick @Stairmaster_ @6d6f636869 Not as m... | \n",
+ " 1393.0 | \n",
+ " 1621.0 | \n",
+ " 31436.0 | \n",
+ " 37184.0 | \n",
+ " 64.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 14 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 34 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " THE SUPREME COURT is siding with super rich pr... | \n",
+ " 9.0 | \n",
+ " 84.0 | \n",
+ " 219.0 | \n",
+ " 1184.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " @POTUS Biden Blunders\\n\\nBroken campaign promi... | \n",
+ " 4262.0 | \n",
+ " 3619.0 | \n",
+ " 34945.0 | \n",
+ " 16423.0 | \n",
+ " 44.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 6 | \n",
+ " 8 | \n",
+ " 1 | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " @OhComfy I agree. The confluence of events rig... | \n",
+ " 70.0 | \n",
+ " 166.0 | \n",
+ " 15282.0 | \n",
+ " 2194.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 11 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 64 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ " Unnamed: 0 majority_target \\\n",
+ "0 0 True \n",
+ "1 1 True \n",
+ "2 2 True \n",
+ "3 3 True \n",
+ "4 4 True \n",
+ "\n",
+ " statement BinaryNumTarget \\\n",
+ "0 End of eviction moratorium means millions of A... 1.0 \n",
+ "1 End of eviction moratorium means millions of A... 1.0 \n",
+ "2 End of eviction moratorium means millions of A... 1.0 \n",
+ "3 End of eviction moratorium means millions of A... 1.0 \n",
+ "4 End of eviction moratorium means millions of A... 1.0 \n",
+ "\n",
+ " tweet followers_count \\\n",
+ "0 @POTUS Biden Blunders - 6 Month Update\\n\\nInfl... 4262.0 \n",
+ "1 @S0SickRick @Stairmaster_ @6d6f636869 Not as m... 1393.0 \n",
+ "2 THE SUPREME COURT is siding with super rich pr... 9.0 \n",
+ "3 @POTUS Biden Blunders\\n\\nBroken campaign promi... 4262.0 \n",
+ "4 @OhComfy I agree. The confluence of events rig... 70.0 \n",
+ "\n",
+ " friends_count favourites_count statuses_count listed_count ... \\\n",
+ "0 3619.0 34945.0 16423.0 44.0 ... \n",
+ "1 1621.0 31436.0 37184.0 64.0 ... \n",
+ "2 84.0 219.0 1184.0 0.0 ... \n",
+ "3 3619.0 34945.0 16423.0 44.0 ... \n",
+ "4 166.0 15282.0 2194.0 0.0 ... \n",
+ "\n",
+ " determiners conjunctions dots exclamation questions ampersand \\\n",
+ "0 0 0 5 0 1 0 \n",
+ "1 0 2 1 0 0 0 \n",
+ "2 0 1 0 0 0 0 \n",
+ "3 0 1 3 0 0 1 \n",
+ "4 0 1 3 0 1 0 \n",
+ "\n",
+ " capitals digits long_word_freq short_word_freq \n",
+ "0 33 3 5 19 \n",
+ "1 14 0 2 34 \n",
+ "2 3 0 4 10 \n",
+ "3 6 8 1 30 \n",
+ "4 11 3 2 19 \n",
+ "\n",
+ "[5 rows x 64 columns]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# prompt: view the head of data\n",
+ "\n",
+ "data.head()\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 707
+ },
+ "id": "nPLgUP-NvLrW",
+ "outputId": "557c7500-286f-465d-842e-adc214422f55"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Unnamed: 0 majority_target \\\n",
+ "0 0 True \n",
+ "1 1 True \n",
+ "2 2 True \n",
+ "3 3 True \n",
+ "4 4 True \n",
+ "\n",
+ " statement BinaryNumTarget \\\n",
+ "0 End of eviction moratorium means millions of A... 1.0 \n",
+ "1 End of eviction moratorium means millions of A... 1.0 \n",
+ "2 End of eviction moratorium means millions of A... 1.0 \n",
+ "3 End of eviction moratorium means millions of A... 1.0 \n",
+ "4 End of eviction moratorium means millions of A... 1.0 \n",
+ "\n",
+ " tweet followers_count \\\n",
+ "0 @POTUS Biden Blunders - 6 Month Update\\n\\nInfl... 4262.0 \n",
+ "1 @S0SickRick @Stairmaster_ @6d6f636869 Not as m... 1393.0 \n",
+ "2 THE SUPREME COURT is siding with super rich pr... 9.0 \n",
+ "3 @POTUS Biden Blunders\\n\\nBroken campaign promi... 4262.0 \n",
+ "4 @OhComfy I agree. The confluence of events rig... 70.0 \n",
+ "\n",
+ " friends_count favourites_count statuses_count listed_count ... \\\n",
+ "0 3619.0 34945.0 16423.0 44.0 ... \n",
+ "1 1621.0 31436.0 37184.0 64.0 ... \n",
+ "2 84.0 219.0 1184.0 0.0 ... \n",
+ "3 3619.0 34945.0 16423.0 44.0 ... \n",
+ "4 166.0 15282.0 2194.0 0.0 ... \n",
+ "\n",
+ " determiners conjunctions dots exclamation questions ampersand \\\n",
+ "0 0 0 5 0 1 0 \n",
+ "1 0 2 1 0 0 0 \n",
+ "2 0 1 0 0 0 0 \n",
+ "3 0 1 3 0 0 1 \n",
+ "4 0 1 3 0 1 0 \n",
+ "\n",
+ " capitals digits long_word_freq short_word_freq \n",
+ "0 33 3 5 19 \n",
+ "1 14 0 2 34 \n",
+ "2 3 0 4 10 \n",
+ "3 6 8 1 30 \n",
+ "4 11 3 2 19 \n",
+ "\n",
+ "[5 rows x 64 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " majority_target | \n",
+ " statement | \n",
+ " BinaryNumTarget | \n",
+ " tweet | \n",
+ " followers_count | \n",
+ " friends_count | \n",
+ " favourites_count | \n",
+ " statuses_count | \n",
+ " listed_count | \n",
+ " ... | \n",
+ " determiners | \n",
+ " conjunctions | \n",
+ " dots | \n",
+ " exclamation | \n",
+ " questions | \n",
+ " ampersand | \n",
+ " capitals | \n",
+ " digits | \n",
+ " long_word_freq | \n",
+ " short_word_freq | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " @POTUS Biden Blunders - 6 Month Update\\n\\nInfl... | \n",
+ " 4262.0 | \n",
+ " 3619.0 | \n",
+ " 34945.0 | \n",
+ " 16423.0 | \n",
+ " 44.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 33 | \n",
+ " 3 | \n",
+ " 5 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " @S0SickRick @Stairmaster_ @6d6f636869 Not as m... | \n",
+ " 1393.0 | \n",
+ " 1621.0 | \n",
+ " 31436.0 | \n",
+ " 37184.0 | \n",
+ " 64.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 14 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 34 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " THE SUPREME COURT is siding with super rich pr... | \n",
+ " 9.0 | \n",
+ " 84.0 | \n",
+ " 219.0 | \n",
+ " 1184.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " @POTUS Biden Blunders\\n\\nBroken campaign promi... | \n",
+ " 4262.0 | \n",
+ " 3619.0 | \n",
+ " 34945.0 | \n",
+ " 16423.0 | \n",
+ " 44.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 6 | \n",
+ " 8 | \n",
+ " 1 | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " @OhComfy I agree. The confluence of events rig... | \n",
+ " 70.0 | \n",
+ " 166.0 | \n",
+ " 15282.0 | \n",
+ " 2194.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 11 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 64 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "data"
+ }
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ],
+ "source": [
+ "# Strip URLs from the tweet texts: every run of non-space characters starting with\n",
+ "# 'http' is replaced by a single space. No lowercasing is performed in this cell.\n",
+ "\n",
+ "import re\n",
+ "\n",
+ "# Compile the URL pattern once and reuse it for every tweet\n",
+ "url_pattern = re.compile(r'http\\S+')\n",
+ "data['tweet'] = data['tweet'].map(lambda tweet_text: url_pattern.sub(' ', tweet_text))\n",
+ "\n",
+ "data.head()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 707
+ },
+ "id": "_mzfkkOQxKwO",
+ "outputId": "233547d2-2b00-49ed-d476-e9e36de7f8a1"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Unnamed: 0 majority_target \\\n",
+ "0 0 True \n",
+ "1 1 True \n",
+ "2 2 True \n",
+ "3 3 True \n",
+ "4 4 True \n",
+ "\n",
+ " statement BinaryNumTarget \\\n",
+ "0 End of eviction moratorium means millions of A... 1.0 \n",
+ "1 End of eviction moratorium means millions of A... 1.0 \n",
+ "2 End of eviction moratorium means millions of A... 1.0 \n",
+ "3 End of eviction moratorium means millions of A... 1.0 \n",
+ "4 End of eviction moratorium means millions of A... 1.0 \n",
+ "\n",
+ " tweet followers_count \\\n",
+ "0 @POTUS Biden Blunders - 6 Month Update\\n\\nInfl... 4262.0 \n",
+ "1 @S0SickRick @Stairmaster_ @6d6f636869 Not as m... 1393.0 \n",
+ "2 THE SUPREME COURT is siding with super rich pr... 9.0 \n",
+ "3 @POTUS Biden Blunders\\n\\nBroken campaign promi... 4262.0 \n",
+ "4 @OhComfy I agree. The confluence of events rig... 70.0 \n",
+ "\n",
+ " friends_count favourites_count statuses_count listed_count ... \\\n",
+ "0 3619.0 34945.0 16423.0 44.0 ... \n",
+ "1 1621.0 31436.0 37184.0 64.0 ... \n",
+ "2 84.0 219.0 1184.0 0.0 ... \n",
+ "3 3619.0 34945.0 16423.0 44.0 ... \n",
+ "4 166.0 15282.0 2194.0 0.0 ... \n",
+ "\n",
+ " determiners conjunctions dots exclamation questions ampersand \\\n",
+ "0 0 0 5 0 1 0 \n",
+ "1 0 2 1 0 0 0 \n",
+ "2 0 1 0 0 0 0 \n",
+ "3 0 1 3 0 0 1 \n",
+ "4 0 1 3 0 1 0 \n",
+ "\n",
+ " capitals digits long_word_freq short_word_freq \n",
+ "0 33 3 5 19 \n",
+ "1 14 0 2 34 \n",
+ "2 3 0 4 10 \n",
+ "3 6 8 1 30 \n",
+ "4 11 3 2 19 \n",
+ "\n",
+ "[5 rows x 64 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " majority_target | \n",
+ " statement | \n",
+ " BinaryNumTarget | \n",
+ " tweet | \n",
+ " followers_count | \n",
+ " friends_count | \n",
+ " favourites_count | \n",
+ " statuses_count | \n",
+ " listed_count | \n",
+ " ... | \n",
+ " determiners | \n",
+ " conjunctions | \n",
+ " dots | \n",
+ " exclamation | \n",
+ " questions | \n",
+ " ampersand | \n",
+ " capitals | \n",
+ " digits | \n",
+ " long_word_freq | \n",
+ " short_word_freq | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " @POTUS Biden Blunders - 6 Month Update\\n\\nInfl... | \n",
+ " 4262.0 | \n",
+ " 3619.0 | \n",
+ " 34945.0 | \n",
+ " 16423.0 | \n",
+ " 44.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 33 | \n",
+ " 3 | \n",
+ " 5 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " @S0SickRick @Stairmaster_ @6d6f636869 Not as m... | \n",
+ " 1393.0 | \n",
+ " 1621.0 | \n",
+ " 31436.0 | \n",
+ " 37184.0 | \n",
+ " 64.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 14 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 34 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " THE SUPREME COURT is siding with super rich pr... | \n",
+ " 9.0 | \n",
+ " 84.0 | \n",
+ " 219.0 | \n",
+ " 1184.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " @POTUS Biden Blunders\\n\\nBroken campaign promi... | \n",
+ " 4262.0 | \n",
+ " 3619.0 | \n",
+ " 34945.0 | \n",
+ " 16423.0 | \n",
+ " 44.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 6 | \n",
+ " 8 | \n",
+ " 1 | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4 | \n",
+ " True | \n",
+ " End of eviction moratorium means millions of A... | \n",
+ " 1.0 | \n",
+ " @OhComfy I agree. The confluence of events rig... | \n",
+ " 70.0 | \n",
+ " 166.0 | \n",
+ " 15282.0 | \n",
+ " 2194.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 11 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 64 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "data"
+ }
+ },
+ "metadata": {},
+ "execution_count": 7
+ }
+ ],
+ "source": [
+ "# Expand common English contractions in the tweet texts.\n",
+ "#\n",
+ "# Every suffix rule inserts a space before the expanded word, so \"don't\" becomes\n",
+ "# \"do not\" and \"they're\" becomes \"they are\" instead of the fused tokens\n",
+ "# \"donot\" / \"theyare\".\n",
+ "# Rules are applied in order: the irregular forms (can't, won't) must come before\n",
+ "# the generic \"n't\" rule.\n",
+ "# NOTE: \"'s\" is always expanded to \" is\", so possessives (\"Biden's\") are rewritten too.\n",
+ "contraction_rules = [\n",
+ "    (r\"can't\", \"cannot\"),\n",
+ "    (r\"Can't\", \"Cannot\"),\n",
+ "    (r\"won't\", \"will not\"),\n",
+ "    (r\"Won't\", \"Will not\"),\n",
+ "    (r\"n't\", \" not\"),\n",
+ "    (r\"i'm\", \"i am\"),\n",
+ "    (r\"I'm\", \"I am\"),\n",
+ "    (r\"'re\", \" are\"),\n",
+ "    (r\"'s\", \" is\"),\n",
+ "    (r\"'d\", \" would\"),\n",
+ "    (r\"'ll\", \" will\"),\n",
+ "    (r\"'ve\", \" have\"),\n",
+ "]\n",
+ "\n",
+ "\n",
+ "def expand_contractions(text):\n",
+ "    \"\"\"Return `text` with every rule in `contraction_rules` applied in order.\"\"\"\n",
+ "    for pattern, replacement in contraction_rules:\n",
+ "        text = re.sub(pattern, replacement, text)\n",
+ "    return text\n",
+ "\n",
+ "\n",
+ "# Single pass over the column instead of one pass per rule\n",
+ "data['tweet'] = data['tweet'].apply(expand_contractions)\n",
+ "\n",
+ "data.head()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Train-Validation-Test Split"
+ ],
+ "metadata": {
+ "id": "SWGvbwHsCJgR"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "3P9DYEq3y6pR",
+ "outputId": "9feb4243-a9a7-41bf-a84d-6d0c823e8c8e"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "59105 1.0\n",
+ "111976 0.0\n",
+ "31253 1.0\n",
+ "118328 0.0\n",
+ "62880 1.0\n",
+ "Name: BinaryNumTarget, dtype: float64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 8
+ }
+ ],
+ "source": [
+ "# Train-Validation-Test split in a 70:15:15 ratio.\n",
+ "# Both stages stratify on the label column that is actually being split\n",
+ "# (BinaryNumTarget), so the two splits use the same stratification key.\n",
+ "\n",
+ "# Stage 1: keep 70% for training, hold out 30% as a temporary set\n",
+ "train_text, temp_text, train_labels, temp_labels = train_test_split(\n",
+ "    data['tweet'], data['BinaryNumTarget'],\n",
+ "    random_state=2018,\n",
+ "    test_size=0.3,\n",
+ "    stratify=data['BinaryNumTarget'])\n",
+ "\n",
+ "# Stage 2: split the temporary set in half -> 15% validation, 15% test\n",
+ "val_text, test_text, val_labels, test_labels = train_test_split(\n",
+ "    temp_text, temp_labels,\n",
+ "    random_state=2018,\n",
+ "    test_size=0.5,\n",
+ "    stratify=temp_labels)\n",
+ "\n",
+ "# Peek at the held-out labels (the original row index is preserved by the split)\n",
+ "temp_labels.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# BERT Model"
+ ],
+ "metadata": {
+ "id": "9DQnsDAUB_2h"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 177,
+ "referenced_widgets": [
+ "74b2c16b3dd94af3a6394c71fe9d1d98",
+ "3467cbed80734b45befed3d8867652a6",
+ "ef11cdc5dea344759f9b0f78939c14d4",
+ "55b4b9ea6dd34aecb7213964c47e250b",
+ "5dec5fbe4ae941a581cea79167270bf6",
+ "3085ba25f51f49ae95930564e2d54a0f",
+ "1b4ec4fc682b488caee3ac9ef0b197cd",
+ "c3d27b7643194755979e3b231a7c7359",
+ "13a985113cb040b79027ed3b59ecfb51",
+ "8041895128094c65b1e65760067bb5bd",
+ "f8465e5d2654402fbf9aa789ee39948c",
+ "42778e0f00ee4099845365a4129a9b07",
+ "ee8a393f6f7a43c0af95f13bb1d8d1b4",
+ "46a9f330d01b43ec9323b0866ef1960e",
+ "f5ff7383a64b435096ebc0654c063311",
+ "7ed0cdc589ab4bccb301a7ac2a4cacc6",
+ "f76cc9fee554462992ae333536ac3e78",
+ "9bc6e00fb6dc450e8643dd97bd093b2a",
+ "7f76440815a94593ae2d93a87b418dad",
+ "8a992dca377b4736970d717dd9e41e49",
+ "2b72f679299d4049864f1daea16b9a77",
+ "5589161d795941edb2fdb804bfd815b5",
+ "b58b34df1fac4232b1a392fadabe3446",
+ "0efb7ee658794030b36ca44d79d4b236",
+ "d1eeefd866074f50be6ffd9d6a199d24",
+ "e78ef5b5c8f149dabfd4fb1445251a88",
+ "494b1cb203cf441b8eff14b64371213e",
+ "87ed154eea034d2180ccd5bab5229155",
+ "d599ca19c22943d6aecfeb13fc16f5de",
+ "a1ade1b5205b429894d5374b29490bb8",
+ "f3f2ec20971140b99c3be6bdf21ffa43",
+ "af52e94cb5c446ca972b7a7837293b03",
+ "d26f83118504442b8e8bdd7e33d30b03",
+ "88439fa69b2d4b41a20dffe1c3c62258",
+ "a64be3a4ebe34c0085c256c5dcaa2a09",
+ "64b358189e6a4c47a622846aeb0c7566",
+ "e9feed4a39f64631b2dfe746a583a03c",
+ "1f77f1a77fa4486a937b3a33c994e348",
+ "237c63d8f3cd4794a61ab360fff10ac2",
+ "506133fbc79541d2936f8efc78dd7dde",
+ "30c34b5a4eef43cda64fb5568cb7dc4f",
+ "073eb0f4e8f440acadeb1a326a96a178",
+ "9258c3a11e4b4c4b98e433d44cea9554",
+ "3c32d976a9314b0bb6d74d5cf2488134",
+ "a7e453f3b0494ba7a7ddbd703468b377",
+ "a2f8e2390258408899d6ead630fe2037",
+ "8951cf1b3ef24ce6bb857f70929444e1",
+ "f776aaf69f1c452daf5b441b85aac33e",
+ "12cc0ac549734af28451abb30e1cda83",
+ "4b50bdc9ce454a3f914452480613bfe9",
+ "5cdd170770a04b72af140314eb64b505",
+ "6b711f56c8d549d1b11790071647c7ad",
+ "99c9a4bc768e40a582bd73b88f21eb96",
+ "28e9d0f866a1405d94f8c9b2a2f18490",
+ "385c6ece940b42c8b53bf9a42775a1d7"
+ ]
+ },
+ "id": "hYWm8E3H1b9g",
+ "outputId": "a808bf13-a91d-4b76-dd8e-38db46f49020"
+ },
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "config.json: 0%| | 0.00/570 [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "74b2c16b3dd94af3a6394c71fe9d1d98"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "model.safetensors: 0%| | 0.00/440M [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "42778e0f00ee4099845365a4129a9b07"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "tokenizer_config.json: 0%| | 0.00/48.0 [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "b58b34df1fac4232b1a392fadabe3446"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "vocab.txt: 0%| | 0.00/232k [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "88439fa69b2d4b41a20dffe1c3c62258"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "tokenizer.json: 0%| | 0.00/466k [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "a7e453f3b0494ba7a7ddbd703468b377"
+ }
+ },
+ "metadata": {}
+ }
+ ],
+ "source": [
+ "# Load the pretrained BERT encoder and its fast tokenizer via HuggingFace Transformers.\n",
+ "# Both are created from the same checkpoint name.\n",
+ "bert_checkpoint = 'bert-base-uncased'\n",
+ "bert = AutoModel.from_pretrained(bert_checkpoint)\n",
+ "tokenizer = BertTokenizerFast.from_pretrained(bert_checkpoint)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 468
+ },
+ "id": "FDZx-O2xzcA_",
+ "outputId": "46d15590-be0c-4a4d-e410-58df0c9ed707"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Text(0, 0.5, 'Number of texts')"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 10
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "