{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "source": [ "\n", "# Novel Variation Detection\n", "\n" ], "metadata": { "id": "BnYTwM3OivB4" } }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "L96SNQ8HVI7m" }, "outputs": [], "source": [ "# imports\n", "import tensorflow as tf\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from sklearn.preprocessing import StandardScaler\n", "from imblearn.over_sampling import RandomOverSampler\n", "import seaborn as sns\n", "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "source": [ "# using drive to load our dataset\n", "from google.colab import drive\n", "drive.mount('/content/drive')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Ea3adROCVORJ", "outputId": "eceb945e-4488-4ac0-ba2a-50005e6a95ef" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/drive\n" ] } ] }, { "cell_type": "code", "source": [ "# Columns removed right after loading; they are not used anywhere later in this notebook.\n", "# NOTE: 'FATIGUE ' carries a trailing space, presumably matching the CSV header - keep it.\n", "unused_columns = ['YELLOW_FINGERS', 'ANXIETY', 'CHRONIC DISEASE', 'SHORTNESS OF BREATH', 'SWALLOWING DIFFICULTY', 'FATIGUE ']\n", "\n", "# Read the CSV from the mounted Drive and discard the unused columns in one step.\n", "df = pd.read_csv('/content/drive/MyDrive/dataset/lc.csv').drop(columns=unused_columns)\n", "df" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "mFDmqdaodqI4", "outputId": "97d8ae49-21ef-4c8f-82c2-719f721b6c40" }, "execution_count": 10, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " GENDER AGE SMOKING PEER_PRESSURE ALLERGY WHEEZING \\\n", "0 M 69 1 1 1 2 \n", "1 M 74 2 1 2 1 \n", "2 F 59 1 2 1 2 \n", "3 M 63 2 1 1 1 \n", "4 F 63 1 1 1 2 \n", ".. ... ... ... ... ... ... 
\n", "304 F 56 1 2 1 1 \n", "305 M 70 2 1 2 2 \n", "306 M 58 2 1 2 2 \n", "307 M 67 2 1 2 1 \n", "308 M 62 1 2 2 2 \n", "\n", " ALCOHOL CONSUMING COUGHING CHEST PAIN LUNG_CANCER \n", "0 2 2 2 YES \n", "1 1 1 2 YES \n", "2 1 2 2 NO \n", "3 2 1 2 NO \n", "4 1 2 1 NO \n", ".. ... ... ... ... \n", "304 2 2 1 YES \n", "305 2 2 2 YES \n", "306 2 2 2 YES \n", "307 2 2 2 YES \n", "308 2 1 1 YES \n", "\n", "[309 rows x 10 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GENDERAGESMOKINGPEER_PRESSUREALLERGYWHEEZINGALCOHOL CONSUMINGCOUGHINGCHEST PAINLUNG_CANCER
0M691112222YES
1M742121112YES
2F591212122NO
3M632111212NO
4F631112121NO
.................................
304F561211221YES
305M702122222YES
306M582122222YES
307M672121222YES
308M621222211YES
\n", "

309 rows × 10 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 309,\n \"fields\": [\n {\n \"column\": \"GENDER\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"F\",\n \"M\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AGE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8,\n \"min\": 21,\n \"max\": 87,\n \"num_unique_values\": 39,\n \"samples\": [\n 81,\n 39\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SMOKING\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 2,\n \"num_unique_values\": 2,\n \"samples\": [\n 2,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PEER_PRESSURE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 2,\n \"num_unique_values\": 2,\n \"samples\": [\n 2,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ALLERGY \",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 2,\n \"num_unique_values\": 2,\n \"samples\": [\n 2,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WHEEZING\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 2,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ALCOHOL CONSUMING\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 2,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"COUGHING\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 2,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 2\n ],\n 
\"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CHEST PAIN\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 2,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LUNG_CANCER\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"NO\",\n \"YES\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 10 } ] }, { "cell_type": "code", "source": [ "# Encode the two text columns as 0/1 integers: GENDER 'M' -> 1, LUNG_CANCER 'YES' -> 1, anything else -> 0.\n", "# The dtype guard keeps the cell re-runnable: comparing already-encoded integers with 'M'/'YES'\n", "# would otherwise silently overwrite both columns (including the target) with zeros.\n", "if not pd.api.types.is_numeric_dtype(df['GENDER']):\n", "    df['GENDER'] = (df['GENDER'] == 'M').astype(int)\n", "if not pd.api.types.is_numeric_dtype(df['LUNG_CANCER']):\n", "    df['LUNG_CANCER'] = (df['LUNG_CANCER'] == 'YES').astype(int)" ], "metadata": { "id": "ENltExbBeKTj" }, "execution_count": 11, "outputs": [] }, { "cell_type": "code", "source": [ "# Remove COUGHING from the feature set. errors='ignore' makes a second run a no-op\n", "# instead of the KeyError that a repeated `del df['COUGHING']` raises.\n", "df = df.drop(columns=['COUGHING'], errors='ignore')\n", "df" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "4kWj9yKchhyY", "outputId": "46230399-ba0a-45aa-8ebb-295d18e4ade9" }, "execution_count": 13, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " GENDER AGE SMOKING PEER_PRESSURE ALLERGY WHEEZING \\\n", "0 1 69 1 1 1 2 \n", "1 1 74 2 1 2 1 \n", "2 0 59 1 2 1 2 \n", "3 1 63 2 1 1 1 \n", "4 0 63 1 1 1 2 \n", ".. ... ... ... ... ... ... \n", "304 0 56 1 2 1 1 \n", "305 1 70 2 1 2 2 \n", "306 1 58 2 1 2 2 \n", "307 1 67 2 1 2 1 \n", "308 1 62 1 2 2 2 \n", "\n", " ALCOHOL CONSUMING CHEST PAIN LUNG_CANCER \n", "0 2 2 1 \n", "1 1 2 1 \n", "2 1 2 0 \n", "3 2 2 0 \n", "4 1 1 0 \n", ".. ... ... ... \n", "304 2 1 1 \n", "305 2 2 1 \n", "306 2 2 1 \n", "307 2 2 1 \n", "308 2 1 1 \n", "\n", "[309 rows x 9 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GENDERAGESMOKINGPEER_PRESSUREALLERGYWHEEZINGALCOHOL CONSUMINGCHEST PAINLUNG_CANCER
01691112221
11742121121
20591212120
31632111220
40631112110
..............................
3040561211211
3051702122221
3061582122221
3071672121221
3081621222211
\n", "

309 rows × 9 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 309,\n \"fields\": [\n {\n \"column\": \"GENDER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AGE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8,\n \"min\": 21,\n \"max\": 87,\n \"num_unique_values\": 39,\n \"samples\": [\n 81,\n 39\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SMOKING\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 2,\n \"num_unique_values\": 2,\n \"samples\": [\n 2,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PEER_PRESSURE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 2,\n \"num_unique_values\": 2,\n \"samples\": [\n 2,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ALLERGY \",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 2,\n \"num_unique_values\": 2,\n \"samples\": [\n 2,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WHEEZING\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 2,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ALCOHOL CONSUMING\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 2,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CHEST PAIN\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 2,\n \"num_unique_values\": 
2,\n \"samples\": [\n 1,\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LUNG_CANCER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "code", "source": [ "# Features are every column except the target; the target is taken as a NumPy array.\n", "x_data = df.drop(['LUNG_CANCER'], axis = 1)\n", "y = df.LUNG_CANCER.values" ], "metadata": { "id": "vA58b9OtWIDv" }, "execution_count": 15, "outputs": [] }, { "cell_type": "code", "source": [ "# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the split is repeatable.\n", "x_train, x_test, y_train, y_test = train_test_split(x_data, y, test_size = 0.2, random_state= 0)" ], "metadata": { "id": "vK1Fycc-WqRj" }, "execution_count": 16, "outputs": [] }, { "cell_type": "code", "source": [ "from sklearn.naive_bayes import GaussianNB\n", "nb = GaussianNB()\n", "nb.fit(x_train, y_train)\n", "print(\"NB accuracy: {:.2f}%\".format(nb.score(x_test, y_test)*100))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "TB8qV9OnkH_5", "outputId": "9575529d-68e5-41d6-da78-9bb95dcf0865" }, "execution_count": 17, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "NB accuracy: 85.48%\n" ] } ] }, { "cell_type": "code", "source": [ "y_pred=nb.predict(x_test)" ], "metadata": { "id": "M66dC8FOXNEt" }, "execution_count": 18, "outputs": [] }, { "cell_type": "code", "source": [ "from sklearn.metrics import classification_report\n", "\n", "# classification_report takes (y_true, y_pred) in that order. With the arguments reversed the\n", "# per-class precision and recall columns are swapped and `support` counts predicted labels\n", "# instead of true labels, so the ground truth must come first.\n", "print(classification_report(y_test, y_pred))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "L06DnXKhXPzS", "outputId": "cd79637c-876e-4d65-c515-f58c8b145481" }, "execution_count": null, "outputs": [] 
}, { "cell_type": "code", "source": [ "import pickle\n", "\n", "# Serialise the fitted classifier to nvd.pkl (path is relative to the working directory).\n", "with open('nvd.pkl', 'wb') as model_out:\n", "    pickle.dump(nb, model_out)\n", "\n", "# Read the file straight back, rebinding nb to the deserialised copy.\n", "with open('nvd.pkl', 'rb') as model_in:\n", "    nb = pickle.load(model_in)" ], "metadata": { "id": "4IrkPQCLXhYw" }, "execution_count": 21, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "50LcOxfPkm4H" }, "execution_count": null, "outputs": [] } ] }