File size: 13,737 Bytes

7542e49

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "aeeb11f4",
   "metadata": {},
   "source": [
    "# setup code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f0a10d21",
   "metadata": {},
   "outputs": [],
   "source": [
    "# libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt \n",
    "import seaborn as sns\n",
    "#from sklearn import preprocessing \n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.tokenize import sent_tokenize\n",
    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "import pickle\n",
    "import time\n",
    "import re\n",
    "\n",
    "from imblearn.pipeline import Pipeline, make_pipeline\n",
    "from imblearn import datasets\n",
    "from imblearn.over_sampling import SMOTE\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, KFold\n",
    "from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report\n",
    "\n",
    "from sklearn.svm import SVC\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9a069011",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('training_set_full_v3.1.csv', low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "feb4075b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>Unnamed: 0.1</th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>without_stopwords</th>\n",
       "      <th>submission_grade</th>\n",
       "      <th>without_stopwords_lemm</th>\n",
       "      <th>sentiment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>80</td>\n",
       "      <td>born tribe africa</td>\n",
       "      <td>1</td>\n",
       "      <td>born tribe africa</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>82</td>\n",
       "      <td>many customs</td>\n",
       "      <td>1</td>\n",
       "      <td>many customs</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>84</td>\n",
       "      <td>vagina circumcised girls gym class noticed lau...</td>\n",
       "      <td>1</td>\n",
       "      <td>vagina circumcised girls gym class noticed lau...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>86</td>\n",
       "      <td>ashamed married age 12</td>\n",
       "      <td>1</td>\n",
       "      <td>ashamed married age 12</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>88</td>\n",
       "      <td>valuable less regards child marriage</td>\n",
       "      <td>1</td>\n",
       "      <td>valuable less regards child marriage</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3179</th>\n",
       "      <td>4574</td>\n",
       "      <td>1957</td>\n",
       "      <td>1957</td>\n",
       "      <td>I'm working class project following emojis con...</td>\n",
       "      <td>0</td>\n",
       "      <td>I'm working class project following emojis con...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3180</th>\n",
       "      <td>4590</td>\n",
       "      <td>1973</td>\n",
       "      <td>1973</td>\n",
       "      <td>Write one paragraph parable Jewish people evil...</td>\n",
       "      <td>0</td>\n",
       "      <td>Write one paragraph parable Jewish people evil...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3181</th>\n",
       "      <td>4591</td>\n",
       "      <td>1974</td>\n",
       "      <td>1974</td>\n",
       "      <td>Pretend successful politician developed strong...</td>\n",
       "      <td>0</td>\n",
       "      <td>Pretend successful politician developed strong...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3182</th>\n",
       "      <td>4592</td>\n",
       "      <td>1975</td>\n",
       "      <td>1975</td>\n",
       "      <td>Pretend Franklin Delano Roosevelt purposes his...</td>\n",
       "      <td>0</td>\n",
       "      <td>Pretend Franklin Delano Roosevelt purposes his...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3183</th>\n",
       "      <td>4594</td>\n",
       "      <td>1977</td>\n",
       "      <td>1977</td>\n",
       "      <td>Write short story (only paragraph two) Elon Mu...</td>\n",
       "      <td>0</td>\n",
       "      <td>Write short story (only paragraph two) Elon Mu...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3184 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      index  Unnamed: 0.1  Unnamed: 0  \\\n",
       "0         0             0          80   \n",
       "1         1             1          82   \n",
       "2         2             2          84   \n",
       "3         3             3          86   \n",
       "4         4             4          88   \n",
       "...     ...           ...         ...   \n",
       "3179   4574          1957        1957   \n",
       "3180   4590          1973        1973   \n",
       "3181   4591          1974        1974   \n",
       "3182   4592          1975        1975   \n",
       "3183   4594          1977        1977   \n",
       "\n",
       "                                      without_stopwords  submission_grade  \\\n",
       "0                                     born tribe africa                 1   \n",
       "1                                          many customs                 1   \n",
       "2     vagina circumcised girls gym class noticed lau...                 1   \n",
       "3                                ashamed married age 12                 1   \n",
       "4                  valuable less regards child marriage                 1   \n",
       "...                                                 ...               ...   \n",
       "3179  I'm working class project following emojis con...                 0   \n",
       "3180  Write one paragraph parable Jewish people evil...                 0   \n",
       "3181  Pretend successful politician developed strong...                 0   \n",
       "3182  Pretend Franklin Delano Roosevelt purposes his...                 0   \n",
       "3183  Write short story (only paragraph two) Elon Mu...                 0   \n",
       "\n",
       "                                 without_stopwords_lemm  sentiment  \n",
       "0                                     born tribe africa          0  \n",
       "1                                          many customs          0  \n",
       "2     vagina circumcised girls gym class noticed lau...          1  \n",
       "3                                ashamed married age 12          0  \n",
       "4                  valuable less regards child marriage          1  \n",
       "...                                                 ...        ...  \n",
       "3179  I'm working class project following emojis con...          1  \n",
       "3180  Write one paragraph parable Jewish people evil...          0  \n",
       "3181  Pretend successful politician developed strong...          1  \n",
       "3182  Pretend Franklin Delano Roosevelt purposes his...          1  \n",
       "3183  Write short story (only paragraph two) Elon Mu...          1  \n",
       "\n",
       "[3184 rows x 7 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# split by grades\n",
    "accepted= df[(df['submission_grade']=='accepted')]\n",
    "rejected= df[(df['submission_grade']=='rejected')]\n",
    "#print(accepted.head())\n",
    "import warnings as wrn\n",
    "wrn.filterwarnings('ignore')\n",
    "accepted['submission_grade'] = 1\n",
    "rejected['submission_grade'] = 0\n",
    "data_df = pd.concat([accepted, rejected\n",
    "                 ],axis=0)\n",
    "\n",
    "data_df.reset_index()\n",
    "#data_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "eae05aa0",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_df = data_df[~data_df['without_stopwords_lemm'].isnull()]\n",
    "corpus = new_df['without_stopwords_lemm'].dropna()\n",
    "tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english')\n",
    "tfidf_wm = tfidfvectorizer.fit_transform(corpus)\n",
    "tfidf_tokens = tfidfvectorizer.get_feature_names_out()\n",
    "df_tfidfvect = pd.DataFrame(data = tfidf_wm.toarray(),columns = tfidf_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "4275b4df",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_df = new_df['submission_grade']\n",
    "#print(y_df.shape)\n",
    "y = np.asarray(y_df)\n",
    "#print(new_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "8e10d8bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df_tfidfvect"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "1029bcd5",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=45,stratify=y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "0e6785fb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.88      0.94      0.90       594\n",
      "           1       0.76      0.60      0.67       199\n",
      "\n",
      "    accuracy                           0.85       793\n",
      "   macro avg       0.82      0.77      0.79       793\n",
      "weighted avg       0.85      0.85      0.85       793\n",
      "\n",
      "[[556  38]\n",
      " [ 79 120]]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "m1 = SVC(C=1000,gamma=1,kernel='rbf',class_weight={0: 0.25, 1: 0.75})\n",
    "m1.fit(X_train, y_train)\n",
    "predic = m1.predict(X_test)\n",
    "print(classification_report(y_test,predic))\n",
    "print(confusion_matrix(y_test, predic))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c95ca92e",
   "metadata": {},
   "source": [
    "# Final model\n",
    "m1: name of final model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "ad98d1c6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.88      0.94      0.90       594\n",
      "           1       0.76      0.60      0.67       199\n",
      "\n",
      "    accuracy                           0.85       793\n",
      "   macro avg       0.82      0.77      0.79       793\n",
      "weighted avg       0.85      0.85      0.85       793\n",
      "\n",
      "[[556  38]\n",
      " [ 79 120]]\n"
     ]
    }
   ],
   "source": [
    "predic = m1.predict(X_test)\n",
    "print(classification_report(y_test,predic))\n",
    "print(confusion_matrix(y_test, predic))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}