{ "cells": [ { "cell_type": "markdown", "source": [ "## Importing modules and Loading dataset.\n", "This section contains importing the important python modules. Also, the dataset to be used, in this case the \n", "\n" ], "metadata": { "id": "r5yZ0Codo2rF" } }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true, "pycharm": { "name": "#%%\n" }, "id": "Xfbj4kG4UwcC" }, "outputs": [], "source": [ "# importing python module.\n", "import pandas as pd\n", "from lightgbm.sklearn import LGBMClassifier\n", "from sklearn.preprocessing import RobustScaler, OrdinalEncoder\n", "from sklearn.model_selection import train_test_split, StratifiedShuffleSplit\n", "from xgboost.sklearn import XGBClassifier\n", "from sklearn.metrics import f1_score\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "code", "execution_count": 2, "outputs": [ { "data": { "text/plain": " patient_number cholesterol glucose hdl_chol chol_hdl_ratio age gender \\\n0 1 193 77 49 3,9 19 female \n1 2 146 79 41 3,6 19 female \n2 3 217 75 54 4 20 female \n3 4 226 97 70 3,2 20 female \n4 5 164 91 67 2,4 20 female \n\n height weight bmi systolic_bp diastolic_bp waist hip \\\n0 61 119 22,5 118 70 32 38 \n1 60 135 26,4 108 58 33 40 \n2 67 187 29,3 110 72 40 45 \n3 64 114 19,6 122 64 31 39 \n4 70 141 20,2 122 86 32 39 \n\n waist_hip_ratio diabetes \n0 0,84 No diabetes \n1 0,83 No diabetes \n2 0,89 No diabetes \n3 0,79 No diabetes \n4 0,82 No diabetes ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
patient_numbercholesterolglucosehdl_cholchol_hdl_ratioagegenderheightweightbmisystolic_bpdiastolic_bpwaisthipwaist_hip_ratiodiabetes
0119377493,919female6111922,51187032380,84No diabetes
1214679413,619female6013526,41085833400,83No diabetes
232177554420female6718729,31107240450,89No diabetes
3422697703,220female6411419,61226431390,79No diabetes
4516491672,420female7014120,21228632390,82No diabetes
\n
" }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# loading dataset with pandas\n", "dia = pd.read_csv(\"./dataset/diabetes.csv\")\n", "\n", "dia.head()" ], "metadata": { "pycharm": { "name": "#%%\n" }, "colab": { "base_uri": "https://localhost:8080/", "height": 357 }, "id": "yx869JONUwcJ", "outputId": "ccdf4d8b-adf1-40b1-fe75-bc8a0dccfa6c" } }, { "cell_type": "code", "execution_count": 3, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 390 entries, 0 to 389\n", "Data columns (total 16 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 patient_number 390 non-null int64 \n", " 1 cholesterol 390 non-null int64 \n", " 2 glucose 390 non-null int64 \n", " 3 hdl_chol 390 non-null int64 \n", " 4 chol_hdl_ratio 390 non-null object\n", " 5 age 390 non-null int64 \n", " 6 gender 390 non-null object\n", " 7 height 390 non-null int64 \n", " 8 weight 390 non-null int64 \n", " 9 bmi 390 non-null object\n", " 10 systolic_bp 390 non-null int64 \n", " 11 diastolic_bp 390 non-null int64 \n", " 12 waist 390 non-null int64 \n", " 13 hip 390 non-null int64 \n", " 14 waist_hip_ratio 390 non-null object\n", " 15 diabetes 390 non-null object\n", "dtypes: int64(11), object(5)\n", "memory usage: 48.9+ KB\n" ] } ], "source": [ "# wrangling datasets with pandas\n", "dia.info()" ], "metadata": { "pycharm": { "name": "#%%\n" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "rftvNRifUwcL", "outputId": "d0799384-e171-4bc9-c149-873c22252711" } }, { "cell_type": "code", "execution_count": 4, "outputs": [ { "data": { "text/plain": " patient_number cholesterol glucose hdl_chol age \\\ncount 390.000000 390.000000 390.000000 390.000000 390.000000 \nmean 195.500000 207.230769 107.338462 50.266667 46.774359 \nstd 112.727548 44.666005 53.798188 17.279069 16.435911 \nmin 1.000000 78.000000 48.000000 12.000000 19.000000 \n25% 98.250000 179.000000 81.000000 38.000000 34.000000 \n50% 195.500000 203.000000 90.000000 46.000000 44.500000 \n75% 292.750000 229.000000 107.750000 59.000000 60.000000 \nmax 390.000000 443.000000 385.000000 120.000000 92.000000 \n\n height weight systolic_bp diastolic_bp waist \\\ncount 390.000000 390.000000 390.000000 390.000000 390.000000 \nmean 65.951282 177.407692 137.133333 83.289744 37.869231 \nstd 3.918867 40.407824 22.859528 13.498192 5.760947 \nmin 52.000000 99.000000 90.000000 48.000000 26.000000 \n25% 63.000000 150.250000 122.000000 75.000000 33.000000 \n50% 66.000000 173.000000 136.000000 82.000000 37.000000 \n75% 69.000000 200.000000 148.000000 90.000000 41.000000 \nmax 76.000000 325.000000 250.000000 124.000000 56.000000 \n\n hip \ncount 390.000000 \nmean 42.992308 \nstd 5.664342 \nmin 30.000000 \n25% 39.000000 \n50% 42.000000 \n75% 46.000000 \nmax 64.000000 ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
patient_numbercholesterolglucosehdl_cholageheightweightsystolic_bpdiastolic_bpwaisthip
count390.000000390.000000390.000000390.000000390.000000390.000000390.000000390.000000390.000000390.000000390.000000
mean195.500000207.230769107.33846250.26666746.77435965.951282177.407692137.13333383.28974437.86923142.992308
std112.72754844.66600553.79818817.27906916.4359113.91886740.40782422.85952813.4981925.7609475.664342
min1.00000078.00000048.00000012.00000019.00000052.00000099.00000090.00000048.00000026.00000030.000000
25%98.250000179.00000081.00000038.00000034.00000063.000000150.250000122.00000075.00000033.00000039.000000
50%195.500000203.00000090.00000046.00000044.50000066.000000173.000000136.00000082.00000037.00000042.000000
75%292.750000229.000000107.75000059.00000060.00000069.000000200.000000148.00000090.00000041.00000046.000000
max390.000000443.000000385.000000120.00000092.00000076.000000325.000000250.000000124.00000056.00000064.000000
\n
" }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dia.describe()" ], "metadata": { "pycharm": { "name": "#%%\n" }, "colab": { "base_uri": "https://localhost:8080/", "height": 364 }, "id": "mPXcxzuwUwcN", "outputId": "a1107ea3-215d-4400-e6ff-2d8fd7ff8b55" } }, { "cell_type": "markdown", "source": [ "## Wrangling dataset." ], "metadata": { "id": "S2hRdWcPqcrP" } }, { "cell_type": "code", "execution_count": 5, "outputs": [], "source": [ "dia.chol_hdl_ratio = round(dia.cholesterol / dia.hdl_chol,2)" ], "metadata": { "pycharm": { "name": "#%%\n" }, "id": "DCl3woxiUwcO" } }, { "cell_type": "code", "execution_count": 6, "outputs": [ { "data": { "text/plain": " patient_number cholesterol glucose hdl_chol chol_hdl_ratio age \\\n0 1 193 77 49 3.94 19 \n1 2 146 79 41 3.56 19 \n2 3 217 75 54 4.02 20 \n3 4 226 97 70 3.23 20 \n4 5 164 91 67 2.45 20 \n\n gender height weight bmi systolic_bp diastolic_bp waist hip \\\n0 female 61 119 22,5 118 70 32 38 \n1 female 60 135 26,4 108 58 33 40 \n2 female 67 187 29,3 110 72 40 45 \n3 female 64 114 19,6 122 64 31 39 \n4 female 70 141 20,2 122 86 32 39 \n\n waist_hip_ratio diabetes \n0 0,84 No diabetes \n1 0,83 No diabetes \n2 0,89 No diabetes \n3 0,79 No diabetes \n4 0,82 No diabetes ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
patient_numbercholesterolglucosehdl_cholchol_hdl_ratioagegenderheightweightbmisystolic_bpdiastolic_bpwaisthipwaist_hip_ratiodiabetes
0119377493.9419female6111922,51187032380,84No diabetes
1214679413.5619female6013526,41085833400,83No diabetes
2321775544.0220female6718729,31107240450,89No diabetes
3422697703.2320female6411419,61226431390,79No diabetes
4516491672.4520female7014120,21228632390,82No diabetes
\n
" }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dia.head()" ], "metadata": { "pycharm": { "name": "#%%\n" }, "colab": { "base_uri": "https://localhost:8080/", "height": 357 }, "id": "QNlQedszUwcP", "outputId": "63231eb5-798a-4c07-8aae-851004ab3787" } }, { "cell_type": "markdown", "source": [], "metadata": { "collapsed": false, "id": "SFHdSj2YUwcQ" } }, { "cell_type": "code", "execution_count": 7, "outputs": [], "source": [ "dia.waist_hip_ratio= round(dia.waist/dia.hip,2)" ], "metadata": { "pycharm": { "name": "#%%\n" }, "id": "ovJyqVa2UwcX" } }, { "cell_type": "code", "execution_count": 8, "outputs": [ { "data": { "text/plain": " patient_number cholesterol glucose hdl_chol chol_hdl_ratio age \\\n0 1 193 77 49 3.94 19 \n1 2 146 79 41 3.56 19 \n2 3 217 75 54 4.02 20 \n3 4 226 97 70 3.23 20 \n4 5 164 91 67 2.45 20 \n\n gender height weight bmi systolic_bp diastolic_bp waist hip \\\n0 female 61 119 22,5 118 70 32 38 \n1 female 60 135 26,4 108 58 33 40 \n2 female 67 187 29,3 110 72 40 45 \n3 female 64 114 19,6 122 64 31 39 \n4 female 70 141 20,2 122 86 32 39 \n\n waist_hip_ratio diabetes \n0 0.84 No diabetes \n1 0.82 No diabetes \n2 0.89 No diabetes \n3 0.79 No diabetes \n4 0.82 No diabetes ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
patient_numbercholesterolglucosehdl_cholchol_hdl_ratioagegenderheightweightbmisystolic_bpdiastolic_bpwaisthipwaist_hip_ratiodiabetes
0119377493.9419female6111922,51187032380.84No diabetes
1214679413.5619female6013526,41085833400.82No diabetes
2321775544.0220female6718729,31107240450.89No diabetes
3422697703.2320female6411419,61226431390.79No diabetes
4516491672.4520female7014120,21228632390.82No diabetes
\n
" }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dia.head()" ], "metadata": { "pycharm": { "name": "#%%\n" }, "colab": { "base_uri": "https://localhost:8080/", "height": 357 }, "id": "PWqYDcnYUwcZ", "outputId": "d0e278d1-d7ed-4503-ee2b-5f94661e56e5" } }, { "cell_type": "code", "execution_count": 9, "outputs": [], "source": [ "dia.bmi = pd.to_numeric(dia.bmi.str.replace(\",\",\".\"))" ], "metadata": { "pycharm": { "name": "#%%\n" }, "id": "CXAX15VHUwce" } }, { "cell_type": "code", "execution_count": 10, "outputs": [ { "data": { "text/plain": " patient_number cholesterol glucose hdl_chol chol_hdl_ratio age \\\n0 1 193 77 49 3.94 19 \n1 2 146 79 41 3.56 19 \n2 3 217 75 54 4.02 20 \n3 4 226 97 70 3.23 20 \n4 5 164 91 67 2.45 20 \n\n gender height weight bmi systolic_bp diastolic_bp waist hip \\\n0 female 61 119 22.5 118 70 32 38 \n1 female 60 135 26.4 108 58 33 40 \n2 female 67 187 29.3 110 72 40 45 \n3 female 64 114 19.6 122 64 31 39 \n4 female 70 141 20.2 122 86 32 39 \n\n waist_hip_ratio diabetes \n0 0.84 No diabetes \n1 0.82 No diabetes \n2 0.89 No diabetes \n3 0.79 No diabetes \n4 0.82 No diabetes ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
patient_numbercholesterolglucosehdl_cholchol_hdl_ratioagegenderheightweightbmisystolic_bpdiastolic_bpwaisthipwaist_hip_ratiodiabetes
0119377493.9419female6111922.51187032380.84No diabetes
1214679413.5619female6013526.41085833400.82No diabetes
2321775544.0220female6718729.31107240450.89No diabetes
3422697703.2320female6411419.61226431390.79No diabetes
4516491672.4520female7014120.21228632390.82No diabetes
\n
" }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dia.head()" ], "metadata": { "pycharm": { "name": "#%%\n" }, "colab": { "base_uri": "https://localhost:8080/", "height": 357 }, "id": "Y9Rg5DkoUwcf", "outputId": "de5133be-4736-4098-c94c-300eaac58f7d" } }, { "cell_type": "code", "source": [ "dia.weight.describe()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PglRQVOhWq6F", "outputId": "9e50d452-d5c4-41f0-a632-f148bb85c44f" }, "execution_count": 11, "outputs": [ { "data": { "text/plain": "count 390.000000\nmean 177.407692\nstd 40.407824\nmin 99.000000\n25% 150.250000\n50% 173.000000\n75% 200.000000\nmax 325.000000\nName: weight, dtype: float64" }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ] }, { "cell_type": "code", "execution_count": 12, "outputs": [ { "data": { "text/plain": "patient_number 0\ncholesterol 0\nglucose 0\nhdl_chol 0\nchol_hdl_ratio 0\nage 0\ngender 0\nheight 0\nweight 0\nbmi 0\nsystolic_bp 0\ndiastolic_bp 0\nwaist 0\nhip 0\nwaist_hip_ratio 0\ndiabetes 0\ndtype: int64" }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dia.isnull().sum()" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "source": [ "dia.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 357 }, "id": "GY9af0LOoUrQ", "outputId": "5cb087ef-8459-40e3-c65d-515007489006" }, "execution_count": 13, "outputs": [ { "data": { "text/plain": " patient_number cholesterol glucose hdl_chol chol_hdl_ratio age \\\n0 1 193 77 49 3.94 19 \n1 2 146 79 41 3.56 19 \n2 3 217 75 54 4.02 20 \n3 4 226 97 70 3.23 20 \n4 5 164 91 67 2.45 20 \n\n gender height weight bmi systolic_bp diastolic_bp waist hip \\\n0 female 61 119 22.5 118 70 32 38 \n1 female 60 135 26.4 108 58 33 40 \n2 female 67 187 29.3 110 72 40 45 \n3 female 64 114 19.6 122 64 31 39 \n4 female 70 141 20.2 122 86 32 39 \n\n waist_hip_ratio diabetes \n0 0.84 No diabetes \n1 0.82 No diabetes \n2 0.89 No diabetes \n3 0.79 No diabetes \n4 0.82 No diabetes ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
patient_numbercholesterolglucosehdl_cholchol_hdl_ratioagegenderheightweightbmisystolic_bpdiastolic_bpwaisthipwaist_hip_ratiodiabetes
0119377493.9419female6111922.51187032380.84No diabetes
1214679413.5619female6013526.41085833400.82No diabetes
2321775544.0220female6718729.31107240450.89No diabetes
3422697703.2320female6411419.61226431390.79No diabetes
4516491672.4520female7014120.21228632390.82No diabetes
\n
" }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ] }, { "cell_type": "code", "source": [ "s= (dia.dtypes == \"object\")\n", "obj_col= list(s[s].index)" ], "metadata": { "id": "cKtpXdi6pwdJ" }, "execution_count": 14, "outputs": [] }, { "cell_type": "code", "source": [ "obj_col" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_jnwJCli1cA6", "outputId": "563aa279-aae2-4d4e-d5c8-f8d668946539" }, "execution_count": 15, "outputs": [ { "data": { "text/plain": "['gender', 'diabetes']" }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ] }, { "cell_type": "code", "source": [ "orde = OrdinalEncoder()\n", "dia[obj_col] = orde.fit_transform(dia[obj_col])" ], "metadata": { "id": "KvSeVC8K2FvU" }, "execution_count": 16, "outputs": [] }, { "cell_type": "code", "source": [ "dia.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 270 }, "id": "bY5dg9H53eVS", "outputId": "27963a81-6560-455f-8d8d-10240bc5dc33" }, "execution_count": 17, "outputs": [ { "data": { "text/plain": " patient_number cholesterol glucose hdl_chol chol_hdl_ratio age \\\n0 1 193 77 49 3.94 19 \n1 2 146 79 41 3.56 19 \n2 3 217 75 54 4.02 20 \n3 4 226 97 70 3.23 20 \n4 5 164 91 67 2.45 20 \n\n gender height weight bmi systolic_bp diastolic_bp waist hip \\\n0 0.0 61 119 22.5 118 70 32 38 \n1 0.0 60 135 26.4 108 58 33 40 \n2 0.0 67 187 29.3 110 72 40 45 \n3 0.0 64 114 19.6 122 64 31 39 \n4 0.0 70 141 20.2 122 86 32 39 \n\n waist_hip_ratio diabetes \n0 0.84 1.0 \n1 0.82 1.0 \n2 0.89 1.0 \n3 0.79 1.0 \n4 0.82 1.0 ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
patient_numbercholesterolglucosehdl_cholchol_hdl_ratioagegenderheightweightbmisystolic_bpdiastolic_bpwaisthipwaist_hip_ratiodiabetes
0119377493.94190.06111922.51187032380.841.0
1214679413.56190.06013526.41085833400.821.0
2321775544.02200.06718729.31107240450.891.0
3422697703.23200.06411419.61226431390.791.0
4516491672.45200.07014120.21228632390.821.0
\n
" }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ] }, { "cell_type": "code", "source": [ "X = dia.drop([\"patient_number\", \"diabetes\"], axis= 1)\n", "y= dia.diabetes" ], "metadata": { "id": "ZbHayB553gRB" }, "execution_count": 18, "outputs": [] }, { "cell_type": "code", "source": [ "scale = RobustScaler()\n", "scaled_x = scale.fit_transform(X, y=y)" ], "metadata": { "id": "GpABdNvA3_8-" }, "execution_count": 19, "outputs": [] }, { "cell_type": "code", "source": [ "scaled_x" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qgImunKB4r2i", "outputId": "76b3b342-70c9-4aee-b3b6-1c21b9aac379" }, "execution_count": 20, "outputs": [ { "data": { "text/plain": "array([[-0.2 , -0.48598131, 0.14285714, ..., -0.625 ,\n -0.57142857, -0.41025641],\n [-1.14 , -0.41121495, -0.23809524, ..., -0.5 ,\n -0.28571429, -0.61538462],\n [ 0.28 , -0.56074766, 0.38095238, ..., 0.375 ,\n 0.42857143, 0.1025641 ],\n ...,\n [ 1.96 , 0. , 3.42857143, ..., -0.75 ,\n -0.14285714, -1.23076923],\n [ 0.58 , 3.51401869, 3.23809524, ..., -0.25 ,\n -0.57142857, 0.41025641],\n [-0.76 , 0.14953271, 1.0952381 , ..., 1.75 ,\n 1.28571429, 1.23076923]])" }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ] }, { "cell_type": "code", "source": [ "X_train, X_test, y_train, y_test = train_test_split(scaled_x, y, test_size= 0.2, random_state=42)" ], "metadata": { "id": "ZECN7XuJ4uAR" }, "execution_count": 21, "outputs": [] }, { "cell_type": "code", "execution_count": 22, "outputs": [], "source": [ "split = StratifiedShuffleSplit(n_splits=4, random_state=42 )\n", "\n", "for train_index, test_index in split.split(scaled_x, y):\n", " strat_X, strat_test = scaled_x[train_index], scaled_x[test_index]\n", " strat_y, strat_ytest = y[train_index], y[test_index]" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "source": [ "X_train" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Zo2R6TH55gTW", "outputId": "b12de013-4626-4a0e-aaac-f16281bd50b6" }, "execution_count": 23, "outputs": [ { "data": { "text/plain": "array([[-0.22 , -1.27102804, -0.19047619, ..., -0.75 ,\n -0.28571429, -1.02564103],\n [-0.44 , 0.41121495, -0.0952381 , ..., 0.125 ,\n 0. , 0.20512821],\n [ 0.18 , -0.41121495, 0.14285714, ..., 0.125 ,\n 0. , 0.20512821],\n ...,\n [-1.48 , 0.74766355, -0.19047619, ..., -0.375 ,\n -0.57142857, 0.1025641 ],\n [ 0.66 , 0.78504673, 1.71428571, ..., -1. ,\n -0.71428571, -1.02564103],\n [ 2.68 , -0.18691589, 0.76190476, ..., -0.125 ,\n 0.28571429, -0.61538462]])" }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ] }, { "cell_type": "code", "source": [ "lgbm_model = LGBMClassifier(n_estimators=200, max_depth=-2, random_state=42)" ], "metadata": { "id": "S9cGQrMp5iug" }, "execution_count": 24, "outputs": [] }, { "cell_type": "code", "source": [ "lgbm_model.fit(X_train, y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SWj3o6Cg6nUD", "outputId": "b64a97a4-1f1d-46c3-c11e-e429feedd6db" }, "execution_count": 25, "outputs": [ { "data": { "text/plain": "LGBMClassifier(max_depth=-2, n_estimators=200, random_state=42)" }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ] }, { "cell_type": "code", "source": [ "y_pred=lgbm_model.predict(X_test)" ], "metadata": { "id": "8LFEmpW16yNk" }, "execution_count": 26, "outputs": [] }, { "cell_type": "code", "source": [ "f1_score(y_pred, y_test)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "I1pWPR0x6_r9", "outputId": "4ecaee82-9c32-4ca2-f71a-c376ea853419" }, "execution_count": 27, "outputs": [ { "data": { "text/plain": "0.9354838709677419" }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ] }, { "cell_type": "code", "source": [ "xg_model= XGBClassifier(n_estimators=200, max_depth=4, scale_pos_weight=5.5)" ], "metadata": { "id": "e6JqauyE7Luq" }, "execution_count": 28, "outputs": [] }, { "cell_type": "code", "execution_count": 29, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[23:22:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" ] }, { "data": { "text/plain": "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,\n importance_type='gain', interaction_constraints='',\n learning_rate=0.300000012, max_delta_step=0, max_depth=4,\n min_child_weight=1, missing=nan, monotone_constraints='()',\n n_estimators=200, n_jobs=4, num_parallel_tree=1, random_state=0,\n reg_alpha=0, reg_lambda=1, scale_pos_weight=5.5, subsample=1,\n tree_method='exact', validate_parameters=1, verbosity=None)" }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xg_model.fit(X_train, y_train)" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 30, "outputs": [], "source": [ "xg_pred = xg_model.predict(X_test)" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 31, "outputs": [ { "data": { "text/plain": "0.943089430894309" }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f1_score(y_test, xg_pred)" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "markdown", "source": [ "## Stratified Shuffle Test" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } } }, { "cell_type": "code", "execution_count": 32, "outputs": [], "source": [ "lgbm_strat = LGBMClassifier(n_estimators=200, learning_rate=0.0099, max_depth=-2, )" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 33, "outputs": [ { "data": { "text/plain": "LGBMClassifier(learning_rate=0.0099, max_depth=-2, n_estimators=200)" }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lgbm_strat.fit(strat_X, strat_y)" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 34, "outputs": [], "source": [ "strat_pred = lgbm_strat.predict(strat_test)" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 35, "outputs": [ { "data": { "text/plain": "0.955223880597015" }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f1_score(strat_pred, strat_ytest)" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 38, "outputs": [], "source": [ "import numpy as np\n", "\n", "def predict(var_name):\n", " pred = [var_name]\n", " np_pred = np.array(pred)\n", " score = lgbm_strat.predict(np_pred)\n", " return score" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 39, "outputs": [ { "data": { "text/plain": "1.0 330\n0.0 60\nName: diabetes, dtype: int64" }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dia.diabetes.value_counts()" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 40, "outputs": [ { "data": { "text/plain": "5.5" }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "330/60" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 41, "outputs": [], "source": [ "xgb= XGBClassifier(max_depth=7, n_estimators=1000, scale_pos_weight=5.5)" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 42, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[00:36:49] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" ] }, { "data": { "text/plain": "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,\n importance_type='gain', interaction_constraints='',\n learning_rate=0.300000012, max_delta_step=0, max_depth=7,\n min_child_weight=1, missing=nan, monotone_constraints='()',\n n_estimators=1000, n_jobs=4, num_parallel_tree=1, random_state=0,\n reg_alpha=0, reg_lambda=1, scale_pos_weight=5.5, subsample=1,\n tree_method='exact', validate_parameters=1, verbosity=None)" }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xgb.fit(strat_X, strat_y)" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 43, "outputs": [], "source": [ "y=xgb.predict(strat_test)" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 44, "outputs": [ { "data": { "text/plain": "0.955223880597015" }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f1_score(y, strat_ytest)" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 45, "outputs": [], "source": [ "import sqlite3" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 46, "outputs": [], "source": [ "conn = sqlite3.connect(\"diabetes.db\")\n", "c = conn.cursor()" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 47, "outputs": [ { "data": { "text/plain": "('patient_number',\n 'cholesterol',\n 'glucose',\n 'hdl_chol',\n 'chol_hdl_ratio',\n 'age',\n 'gender',\n 'height',\n 'weight',\n 'bmi',\n 'systolic_bp',\n 'diastolic_bp',\n 'waist',\n 'hip',\n 'waist_hip_ratio',\n 'diabetes')" }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "col= tuple(dia.columns)\n", "col" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 48, "outputs": [], "source": [ "conn.commit()" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 49, "outputs": [], "source": [ "dia.to_sql(name=\"diabetes.db\", con=conn, if_exists= \"replace\", index=False)" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" }, "colab": { "provenance": [] } }, "nbformat": 4, "nbformat_minor": 0 }