Spaces:

zen21
/

spam-detection

Sleeping

File size: 9,472 Bytes

b986fa0

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "b9bb9dcd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
      "[nltk_data]     C:\\Users\\shiva\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package averaged_perceptron_tagger is already up-to-\n",
      "[nltk_data]       date!\n",
      "[nltk_data] Downloading package punkt to\n",
      "[nltk_data]     C:\\Users\\shiva\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "import csv\n",
    "\n",
    "import nltk\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from nltk.tokenize import word_tokenize\n",
    "from sklearn import metrics, model_selection, preprocessing, svm\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "# Fetch the NLTK data packages this notebook relies on for tokenizing and POS tagging.\n",
    "for resource in (\"averaged_perceptron_tagger\", \"punkt\"):\n",
    "    nltk.download(resource)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "c1e93bc2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                 msg label  label_no  \\\n",
      "0  Go until jurong point, crazy.. Available only ...   ham         0   \n",
      "1                      Ok lar... Joking wif u oni...   ham         0   \n",
      "2  Free entry in 2 a wkly comp to win FA Cup fina...  spam         1   \n",
      "3  U dun say so early hor... U c already then say...   ham         0   \n",
      "4  Nah I don't think he goes to usf, he lives aro...   ham         0   \n",
      "\n",
      "        NNP      IN        JJ        NN         ,        RB         :  ...  \\\n",
      "0  0.071429  0.1250  0.142857  0.152174  0.076923  0.176471  0.054054  ...   \n",
      "1  0.047619  0.0000  0.047619  0.043478  0.000000  0.000000  0.054054  ...   \n",
      "2  0.142857  0.0625  0.190476  0.152174  0.000000  0.000000  0.000000  ...   \n",
      "3  0.023810  0.0000  0.095238  0.021739  0.000000  0.176471  0.054054  ...   \n",
      "4  0.023810  0.0625  0.000000  0.000000  0.076923  0.176471  0.000000  ...   \n",
      "\n",
      "    MD  PRP$  JJR  JJS   UH   RP   WP  WDT    #   ''  \n",
      "0  0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  \n",
      "1  0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  \n",
      "2  0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  \n",
      "3  0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  \n",
      "4  0.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  \n",
      "\n",
      "[5 rows x 38 columns]\n"
     ]
    }
   ],
   "source": [
    "# POS tags that become feature columns, in the order the columns are created.\n",
    "lst = ['NNP', 'IN', 'JJ', 'NN', ',', 'RB', ':', 'EX', 'VBD', 'WRB', 'CD', 'DT', 'TO', 'VB', '.',\n",
    "       '(', ')', 'CC', 'POS', 'VBP', 'NNS', 'PRP', 'VBZ', 'VBG', 'VBN', 'MD', 'PRP$', 'JJR', 'JJS', 'UH', 'RP', 'WP', 'WDT', '#', \"''\"]\n",
    "\n",
    "# A tag must occur at least this many times in the corpus to be kept in tok_dict1.\n",
    "MIN_TAG_COUNT = 100\n",
    "\n",
    "# Every token seen in the corpus, grouped under its POS tag.\n",
    "tok_dict = {}\n",
    "\n",
    "# Column-oriented table: message text, text label, numeric label, then one count column per tag.\n",
    "pd_dict = {'msg': [], 'label': [], 'label_no': [], **{tag: [] for tag in lst}}\n",
    "\n",
    "# newline='' is the csv module's documented way to open a file, so that newlines embedded in\n",
    "# quoted fields are parsed correctly.\n",
    "# NOTE(review): no encoding is passed, so the platform default is used - confirm it matches the file.\n",
    "with open(\"spam_db.csv\", 'r', newline='') as file:\n",
    "    csvreader = csv.reader(file)\n",
    "    next(csvreader, None)  # skip the header row\n",
    "    for row in csvreader:\n",
    "        if not row:\n",
    "            continue  # csv.reader yields [] for a blank line; there is nothing to index\n",
    "        text = row[1]\n",
    "        pd_dict['msg'].append(text)\n",
    "        pd_dict['label'].append(row[0])\n",
    "        # Numeric label: 1 for 'spam', 0 for anything else.\n",
    "        pd_dict['label_no'].append(1 if row[0] == 'spam' else 0)\n",
    "        # Start this message with a zero count in every tag column.\n",
    "        for tag in lst:\n",
    "            pd_dict[tag].append(0)\n",
    "        tokens = word_tokenize(text)\n",
    "        tokens_tagged = nltk.pos_tag(tokens)\n",
    "        for word, tag in tokens_tagged:\n",
    "            tok_dict.setdefault(tag, []).append(word)\n",
    "            # Tags without a feature column are recorded in tok_dict but not counted.\n",
    "            if tag in pd_dict:\n",
    "                pd_dict[tag][-1] += 1  # [-1] is the row appended for this message\n",
    "\n",
    "# Corpus-wide totals per tag, split into rare tags (del_lst) and frequent tags (tok_dict1).\n",
    "tag_totals = {tag: len(words) for tag, words in tok_dict.items()}\n",
    "del_lst = [tag for tag, total in tag_totals.items() if total < MIN_TAG_COUNT]\n",
    "tok_dict1 = {tag: total for tag, total in tag_totals.items() if total >= MIN_TAG_COUNT}\n",
    "\n",
    "# lst is rebound to the frequent tags here; the columns already built in pd_dict are unaffected.\n",
    "lst = list(tok_dict1)\n",
    "\n",
    "df = pd.DataFrame(pd_dict)\n",
    "numeric_columns = df.drop(columns=['msg', 'label', 'label_no']).columns\n",
    "\n",
    "# Min-max normalize every tag-count column.\n",
    "# NOTE(review): the scaler is fitted on every row, before any train/test split - consider\n",
    "# fitting it on the training rows only.\n",
    "scaler = MinMaxScaler()\n",
    "df[numeric_columns] = scaler.fit_transform(df[numeric_columns])\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "35824c58",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\shiva\\AppData\\Local\\Temp\\ipykernel_9568\\3238635716.py:1: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.\n",
      "  X=np.array(df.drop(['msg','label','label_no'],1))\n"
     ]
    }
   ],
   "source": [
    "# Feature matrix: every column except the message text and the two label columns.\n",
    "# The columns are named via columns= because passing the axis positionally is deprecated in pandas.\n",
    "X = np.array(df.drop(columns=['msg', 'label', 'label_no']))\n",
    "# Target vector: the numeric label column.\n",
    "y = np.array(df['label_no'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "aec84e0c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.9676956209619526\n"
     ]
    }
   ],
   "source": [
    "# Hold out 25% of the rows for evaluation. A fixed random_state makes the split (and the\n",
    "# scores reported below) repeatable; stratify=y keeps the label ratio the same in both parts.\n",
    "X_train, X_test, y_train, y_test = model_selection.train_test_split(\n",
    "    X, y, test_size=0.25, random_state=42, stratify=y\n",
    ")\n",
    "\n",
    "# Support-vector classifier with a polynomial kernel.\n",
    "clf = svm.SVC(kernel='poly')\n",
    "clf.fit(X_train, y_train)\n",
    "\n",
    "# Mean accuracy on the held-out rows.\n",
    "accuracy = clf.score(X_test, y_test)\n",
    "print(accuracy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "62e97e65",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Precision: 0.9669448190530422\n",
      "Recall: 0.9676956209619526\n",
      "F1 score: 0.9667034979766862\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[1208,   11],\n",
       "       [  34,  140]], dtype=int64)"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred = clf.predict(X_test)\n",
    "\n",
    "# All three scores use the same averaging mode across the classes.\n",
    "# NOTE(review): weighted averages can hide weak performance on the smaller class - consider\n",
    "# also reporting the spam-only scores.\n",
    "weighted = {'average': 'weighted'}\n",
    "precision = metrics.precision_score(y_test, y_pred, **weighted)\n",
    "recall = metrics.recall_score(y_test, y_pred, **weighted)\n",
    "f1 = metrics.f1_score(y_test, y_pred, **weighted)\n",
    "\n",
    "for caption, score in ((\"Precision:\", precision), (\"Recall:\", recall), (\"F1 score:\", f1)):\n",
    "    print(caption, score)\n",
    "\n",
    "# Confusion matrix of true labels against predicted labels, shown as the cell result.\n",
    "confusion_mat = metrics.confusion_matrix(y_test, y_pred)\n",
    "confusion_mat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "ccce58e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample message to classify; wording and characters are kept exactly as written.\n",
    "text = (\n",
    "    'WINNER!! As a valued network customer you have been selected to receivea \\n'\n",
    "    'å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "f53b1187",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokens = word_tokenize(text)\n",
    "tokens_tagged = nltk.pos_tag(tokens)\n",
    "\n",
    "# POS tag -> position of its count in the feature vector (same column order as the training frame).\n",
    "pos_dict={'NNP':[0], 'IN':[1], 'JJ':[2], 'NN':[3], ',':[4], 'RB':[5], ':':[6], 'EX':[7], 'VBD':[8], \n",
    "'WRB':[9], 'CD':[10], 'DT':[11], 'TO':[12], 'VB':[13], '.':[14], '(':[15], ')':[16], 'CC':[17],\n",
    "'POS':[18], 'VBP':[19], 'NNS':[20], 'PRP':[21], 'VBZ':[22], 'VBG':[23], 'VBN':[24], 'MD':[25], \n",
    "'PRP$':[26], 'JJR':[27], 'JJS':[28], 'UH':[29], 'RP':[30], 'WP':[31], 'WDT':[32], '#':[33], \"''\":[34]}\n",
    "\n",
    "# One count per feature column, all starting at zero.\n",
    "x = [0] * len(pos_dict)\n",
    "for _, tag in tokens_tagged:\n",
    "    # Tags without a feature column are skipped instead of raising KeyError; the training\n",
    "    # loop ignores such tags as well.\n",
    "    if tag in pos_dict:\n",
    "        x[pos_dict[tag][0]] += 1\n",
    "\n",
    "# clf was trained on min-max-scaled counts, so the fitted scaler has to be applied here too.\n",
    "# The frame carries the column names the scaler was fitted with; the result is a (1, n_features) array.\n",
    "x = scaler.transform(pd.DataFrame([x], columns=list(pos_dict)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "1d0066d6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SPAM\n"
     ]
    }
   ],
   "source": [
    "pred = clf.predict(x)\n",
    "# x holds a single message, so pred has exactly one label; 0 means not spam.\n",
    "verdict = \"NOT SPAM\" if pred[0] == 0 else \"SPAM\"\n",
    "print(verdict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7440777a",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}